1a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com/*
2a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com * Copyright 2012 The Android Open Source Project
3a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com *
4a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com * Use of this source code is governed by a BSD-style license that can be
5a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com * found in the LICENSE file.
6a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com */
7a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
8a111e492b84c312a6bd5d5d9ef100dca48f4941ddjsollen@google.com#include "SkBlitRow_opts_arm_neon.h"
9a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
10a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkBlitMask.h"
11a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkBlitRow.h"
12a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkColorPriv.h"
13a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkDither.h"
14a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkMathPriv.h"
15a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include "SkUtils.h"
16a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
170060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org#include "SkColor_opts_neon.h"
18a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#include <arm_neon.h>
19a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
20ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
21ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petitstatic inline uint8x8x4_t sk_vld4_u8_arm64_3(const SkPMColor* SK_RESTRICT & src) {
22ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    uint8x8x4_t vsrc;
23ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    uint8x8_t vsrc_0, vsrc_1, vsrc_2;
24ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
25ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    asm (
26ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
27ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc0].8b, v0.8b             \t\n"
28ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc1].8b, v1.8b             \t\n"
29ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc2].8b, v2.8b             \t\n"
30ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
31ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit          [vsrc2] "=w" (vsrc_2), [src] "+&r" (src)
32ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        : : "v0", "v1", "v2", "v3"
33ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    );
34ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
35ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[0] = vsrc_0;
36ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[1] = vsrc_1;
37ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[2] = vsrc_2;
38ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
39ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    return vsrc;
40ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit}
41ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
42ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petitstatic inline uint8x8x4_t sk_vld4_u8_arm64_4(const SkPMColor* SK_RESTRICT & src) {
43ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    uint8x8x4_t vsrc;
44ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    uint8x8_t vsrc_0, vsrc_1, vsrc_2, vsrc_3;
45ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
46ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    asm (
47ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "ld4    {v0.8b - v3.8b}, [%[src]], #32 \t\n"
48ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc0].8b, v0.8b             \t\n"
49ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc1].8b, v1.8b             \t\n"
50ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc2].8b, v2.8b             \t\n"
51ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        "mov    %[vsrc3].8b, v3.8b             \t\n"
52ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        : [vsrc0] "=w" (vsrc_0), [vsrc1] "=w" (vsrc_1),
53ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit          [vsrc2] "=w" (vsrc_2), [vsrc3] "=w" (vsrc_3),
54ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit          [src] "+&r" (src)
55ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        : : "v0", "v1", "v2", "v3"
56ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    );
57ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
58ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[0] = vsrc_0;
59ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[1] = vsrc_1;
60ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[2] = vsrc_2;
61ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    vsrc.val[3] = vsrc_3;
62ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
63ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit    return vsrc;
64ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit}
65ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
66ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
670060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.orgvoid S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
680060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org                           const SkPMColor* SK_RESTRICT src, int count,
690060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org                           U8CPU alpha, int /*x*/, int /*y*/) {
700060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    SkASSERT(255 == alpha);
710060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
720060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    while (count >= 8) {
730060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        uint8x8x4_t vsrc;
740060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        uint16x8_t vdst;
750060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
760060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        // Load
77ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
78ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_3(src);
79ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
800060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        vsrc = vld4_u8((uint8_t*)src);
81ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        src += 8;
82ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
830060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
840060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        // Convert src to 565
85bc25dfc798fff225ce65355ecda19d2b85bd0e74commit-bot@chromium.org        vdst = SkPixel32ToPixel16_neon8(vsrc);
860060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
870060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        // Store
880060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        vst1q_u16(dst, vdst);
890060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
900060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        // Prepare next iteration
910060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        dst += 8;
920060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        count -= 8;
930060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    };
940060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
950060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    // Leftovers
960060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    while (count > 0) {
970060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        SkPMColor c = *src++;
980060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        SkPMColorAssert(c);
990060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        *dst = SkPixel32ToPixel16_ToU16(c);
1000060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        dst++;
1010060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org        count--;
1020060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    };
1030060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org}
1040060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org
10595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.orgvoid S32_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
10695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org                          const SkPMColor* SK_RESTRICT src, int count,
10795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org                          U8CPU alpha, int /*x*/, int /*y*/) {
10895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    SkASSERT(255 > alpha);
10995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
11095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    uint16x8_t vmask_blue, vscale;
11195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
11295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    // prepare constants
11395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    vscale = vdupq_n_u16(SkAlpha255To256(alpha));
11495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    vmask_blue = vmovq_n_u16(0x1F);
11595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
11695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    while (count >= 8) {
117ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        uint8x8x4_t vsrc;
11895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
11995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        uint16x8_t vres_r, vres_g, vres_b;
12095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
12195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        // Load src
122ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
123ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_3(src);
124ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
12595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        {
12695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        register uint8x8_t d0 asm("d0");
12795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        register uint8x8_t d1 asm("d1");
12895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        register uint8x8_t d2 asm("d2");
12995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        register uint8x8_t d3 asm("d3");
13095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
13195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        asm (
13295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            "vld4.8    {d0-d3},[%[src]]!"
13395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
13495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            :
13595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        );
136ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[0] = d0;
137ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[1] = d1;
138ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[2] = d2;
13995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        }
140ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
14195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
14295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        // Load and unpack dst
14395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vdst = vld1q_u16(dst);
14495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vdst_g = vshlq_n_u16(vdst, 5);        // shift green to top of lanes
14595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vdst_b = vandq_u16(vdst, vmask_blue); // extract blue
14695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vdst_r = vshrq_n_u16(vdst, 6+5);      // extract red
14795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vdst_g = vshrq_n_u16(vdst_g, 5+5);    // extract green
14895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
149ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        // Shift src to 565 range
150ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 3);
151ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 2);
152ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 3);
15395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
15495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        // Scale src - dst
155ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vres_r = vmovl_u8(vsrc.val[NEON_R]) - vdst_r;
156ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vres_g = vmovl_u8(vsrc.val[NEON_G]) - vdst_g;
157ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vres_b = vmovl_u8(vsrc.val[NEON_B]) - vdst_b;
15895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
15995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_r = vshrq_n_u16(vres_r * vscale, 8);
16095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_g = vshrq_n_u16(vres_g * vscale, 8);
16195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_b = vshrq_n_u16(vres_b * vscale, 8);
16295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
16395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_r += vdst_r;
16495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_g += vdst_g;
16595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_b += vdst_b;
16695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
16795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        // Combine
16895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_b = vsliq_n_u16(vres_b, vres_g, 5);    // insert green into blue
16995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vres_b = vsliq_n_u16(vres_b, vres_r, 6+5);  // insert red into green/blue
17095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
17195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        // Store
17295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        vst1q_u16(dst, vres_b);
17395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        dst += 8;
17495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        count -= 8;
17595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    }
17695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    if (count > 0) {
17795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        int scale = SkAlpha255To256(alpha);
17895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        do {
17995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            SkPMColor c = *src++;
18095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            SkPMColorAssert(c);
18195c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            uint16_t d = *dst;
18295c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org            *dst++ = SkPackRGB16(
18395c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org                    SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale),
18495c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org                    SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale),
18595c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org                    SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale));
18695c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org        } while (--count != 0);
18795c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    }
18895c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org}
18995c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org
190ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM32
191a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
192a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                           const SkPMColor* SK_RESTRICT src, int count,
193a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                           U8CPU alpha, int /*x*/, int /*y*/) {
194a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
195a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
196a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= 8) {
1974b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein        uint16_t* SK_RESTRICT keep_dst = 0;
198fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
199a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        asm volatile (
2004b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "ands       ip, %[count], #7            \n\t"
2014b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmov.u8    d31, #1<<7                  \n\t"
2024b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vld1.16    {q12}, [%[dst]]             \n\t"
2034b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
204a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // Thumb does not support the standard ARM conditional
205a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // instructions but instead requires the 'it' instruction
206a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // to signal conditional execution
2074b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "it eq                                  \n\t"
2084b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "moveq      ip, #8                      \n\t"
2094b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "mov        %[keep_dst], %[dst]         \n\t"
2104b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein
2114b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "add        %[src], %[src], ip, LSL#2   \n\t"
2124b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
2134b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "subs       %[count], %[count], ip      \n\t"
2144b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "b          9f                          \n\t"
215a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // LOOP
216a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "2:                                         \n\t"
217fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
2184b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vld1.16    {q12}, [%[dst]]!            \n\t"
2194b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
2204b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
2214b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
2224b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "subs       %[count], %[count], #8      \n\t"
223a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "9:                                         \n\t"
2244b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "pld        [%[dst],#32]                \n\t"
225a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // expand 0565 q12 to 8888 {d4-d7}
2264b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovn.u16  d4, q12                     \n\t"
2274b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshr.u16   q11, q12, #5                \n\t"
2284b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshr.u16   q10, q12, #6+5              \n\t"
2294b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovn.u16  d5, q11                     \n\t"
2304b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovn.u16  d6, q10                     \n\t"
2314b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshl.u8    d4, d4, #3                  \n\t"
2324b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshl.u8    d5, d5, #2                  \n\t"
2334b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshl.u8    d6, d6, #3                  \n\t"
2344b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein
2354b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovl.u8   q14, d31                    \n\t"
2364b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovl.u8   q13, d31                    \n\t"
2374b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmovl.u8   q12, d31                    \n\t"
238fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
239a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // duplicate in 4/2/1 & 8pix vsns
2404b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmvn.8     d30, d3                     \n\t"
2414b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmlal.u8   q14, d30, d6                \n\t"
2424b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmlal.u8   q13, d30, d5                \n\t"
2434b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vmlal.u8   q12, d30, d4                \n\t"
2444b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshr.u16   q8, q14, #5                 \n\t"
2454b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshr.u16   q9, q13, #6                 \n\t"
2464b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vaddhn.u16 d6, q14, q8                 \n\t"
2474b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshr.u16   q8, q12, #5                 \n\t"
2484b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vaddhn.u16 d5, q13, q9                 \n\t"
2494b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vaddhn.u16 d4, q12, q8                 \n\t"
250a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // intentionally don't calculate alpha
251a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // result in d4-d6
252fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
2533f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #ifdef SK_PMCOLOR_IS_RGBA
2544b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d6, d6, d0                  \n\t"
2554b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d5, d5, d1                  \n\t"
2564b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d4, d4, d2                  \n\t"
2573f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #else
2584b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d6, d6, d2                  \n\t"
2594b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d5, d5, d1                  \n\t"
2604b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vqadd.u8   d4, d4, d0                  \n\t"
2613f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #endif
262fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
263a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // pack 8888 {d4-d6} to 0565 q10
2644b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshll.u8   q10, d6, #8                 \n\t"
2654b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshll.u8   q3, d5, #8                  \n\t"
2664b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vshll.u8   q2, d4, #8                  \n\t"
2674b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vsri.u16   q10, q3, #5                 \n\t"
2684b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vsri.u16   q10, q2, #11                \n\t"
269fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
2704b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "bne        2b                          \n\t"
271fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
272a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "1:                                         \n\t"
2734b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
274a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [count] "+r" (count)
2754b19b403944dd4ab70507c0dea2aa3d38f145eacMike Klein                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
276a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
277a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
278a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d30","d31"
279a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      );
280a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
281fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    else
282a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    {   // handle count < 8
283efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint16_t* SK_RESTRICT keep_dst = 0;
284fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
285a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        asm volatile (
286a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmov.u8    d31, #1<<7                  \n\t"
287a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "mov        %[keep_dst], %[dst]         \n\t"
288fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
289a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #4                \n\t"
290a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        14f                         \n\t"
291a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {d25}, [%[dst]]!            \n\t"
292a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {q1}, [%[src]]!             \n\t"
293fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
294a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "14:                                        \n\t"
295a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #2                \n\t"
296a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        12f                         \n\t"
297a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
298a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d1}, [%[src]]!             \n\t"
299fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
300a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "12:                                        \n\t"
301a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #1                \n\t"
302a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        11f                         \n\t"
303a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
304a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
305fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
306a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "11:                                        \n\t"
307a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // unzips achieve the same as a vld4 operation
308ea13afff6e46d8a969611cdd56c996bfb05a27c1thakis                      "vuzp.u16   q0, q1                      \n\t"
309a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vuzp.u8    d0, d1                      \n\t"
310a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vuzp.u8    d2, d3                      \n\t"
311a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // expand 0565 q12 to 8888 {d4-d7}
312a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d4, q12                     \n\t"
313a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q11, q12, #5                \n\t"
314a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q10, q12, #6+5              \n\t"
315a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d5, q11                     \n\t"
316a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovn.u16  d6, q10                     \n\t"
317a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d4, d4, #3                  \n\t"
318a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d5, d5, #2                  \n\t"
319a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshl.u8    d6, d6, #3                  \n\t"
320fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
321a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q14, d31                    \n\t"
322a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q13, d31                    \n\t"
323a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmovl.u8   q12, d31                    \n\t"
324fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
325a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // duplicate in 4/2/1 & 8pix vsns
326a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmvn.8     d30, d3                     \n\t"
327a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q14, d30, d6                \n\t"
328a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q13, d30, d5                \n\t"
329a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vmlal.u8   q12, d30, d4                \n\t"
330a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q14, #5                 \n\t"
331a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q9, q13, #6                 \n\t"
332a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d6, q14, q8                 \n\t"
333a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshr.u16   q8, q12, #5                 \n\t"
334a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d5, q13, q9                 \n\t"
335a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vaddhn.u16 d4, q12, q8                 \n\t"
336a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // intentionally don't calculate alpha
337a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // result in d4-d6
338fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
3393f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #ifdef SK_PMCOLOR_IS_RGBA
3403f55eed73f5af405909c2c10bff179d80526d423Mike Klein                      "vqadd.u8   d6, d6, d0                  \n\t"
341a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d5, d5, d1                  \n\t"
342a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vqadd.u8   d4, d4, d2                  \n\t"
3433f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #else
3443f55eed73f5af405909c2c10bff179d80526d423Mike Klein                      "vqadd.u8   d6, d6, d2                  \n\t"
3453f55eed73f5af405909c2c10bff179d80526d423Mike Klein                      "vqadd.u8   d5, d5, d1                  \n\t"
3463f55eed73f5af405909c2c10bff179d80526d423Mike Klein                      "vqadd.u8   d4, d4, d0                  \n\t"
3473f55eed73f5af405909c2c10bff179d80526d423Mike Klein            #endif
348fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
349a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // pack 8888 {d4-d6} to 0565 q10
350a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q10, d6, #8                 \n\t"
351a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q3, d5, #8                  \n\t"
352a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vshll.u8   q2, d4, #8                  \n\t"
353a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q3, #5                 \n\t"
354a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vsri.u16   q10, q2, #11                \n\t"
355fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
356a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      // store
357a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #4                \n\t"
358a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        24f                         \n\t"
359a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
360fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
361a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "24:                                        \n\t"
362a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #2                \n\t"
363a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        22f                         \n\t"
364a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
365fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
366a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "22:                                        \n\t"
367a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "tst        %[count], #1                \n\t"
368a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "beq        21f                         \n\t"
369a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
370fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
371a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "21:                                        \n\t"
372a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [count] "+r" (count)
373a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
374a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
375a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
376a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      "d30","d31"
377a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                      );
378a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
379a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
3800d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3810d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit#else // #ifdef SK_CPU_ARM32
3820d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3830d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petitvoid S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
3840d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                           const SkPMColor* SK_RESTRICT src, int count,
3850d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                           U8CPU alpha, int /*x*/, int /*y*/) {
3860d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    SkASSERT(255 == alpha);
3870d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3880d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    if (count >= 16) {
3890d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        asm (
3900d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "movi    v4.8h, #0x80                   \t\n"
3910d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit
3920d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "1:                                     \t\n"
393f61088321c0906f62431c029173c1a7a70856ec7mtklein            "sub     %w[count], %w[count], #16      \t\n"
3940d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ld1     {v16.8h-v17.8h}, [%[dst]]      \t\n"
3950d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ld4     {v0.16b-v3.16b}, [%[src]], #64 \t\n"
3960d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "prfm    pldl1keep, [%[src],#512]       \t\n"
3970d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "prfm    pldl1keep, [%[dst],#256]       \t\n"
3980d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v17.8h, #5             \t\n"
3990d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v31.8h, v16.8h, #5             \t\n"
4000d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v6.8b, v31.8h                  \t\n"
4010d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v6.16b, v20.8h                 \t\n"
4020d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v17.8h, #11            \t\n"
4030d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v19.16b, v6.16b, #2            \t\n"
4040d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v31.8h, v16.8h, #11            \t\n"
4050d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v22.8b, v31.8h                 \t\n"
4060d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v22.16b, v20.8h                \t\n"
4070d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v18.16b, v22.16b, #3           \t\n"
4080d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mvn     v3.16b, v3.16b                 \t\n"
4090d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn     v16.8b, v16.8h                 \t\n"
4100d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v7.16b, v4.16b                 \t\n"
4110d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "xtn2    v16.16b, v17.8h                \t\n"
4120d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v7.8h, v3.8b, v19.8b           \t\n"
4130d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shl     v16.16b, v16.16b, #3           \t\n"
4140d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v22.16b, v4.16b                \t\n"
4150d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v24.8h, v7.8h, #6              \t\n"
4160d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v22.8h, v3.8b, v18.8b          \t\n"
4170d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v20.8h, v22.8h, #5             \t\n"
4180d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v20.8b, v22.8h, v20.8h         \t\n"
419f61088321c0906f62431c029173c1a7a70856ec7mtklein            "cmp     %w[count], #16                 \t\n"
4200d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v6.16b, v4.16b                 \t\n"
4210d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v5.16b, v4.16b                 \t\n"
4220d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal   v6.8h, v3.8b, v16.8b           \t\n"
4230d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v5.8h, v3.16b, v19.16b         \t\n"
4240d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v17.16b, v4.16b                \t\n"
4250d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v19.8h, v6.8h, #5              \t\n"
4260d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v17.8h, v3.16b, v18.16b        \t\n"
4270d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v7.8b, v7.8h, v24.8h           \t\n"
4280d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v18.8h, v5.8h, #6              \t\n"
4290d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v21.8h, v17.8h, #5             \t\n"
4300d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v7.16b, v5.8h, v18.8h          \t\n"
4310d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v20.16b, v17.8h, v21.8h        \t\n"
4320d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "mov     v22.16b, v4.16b                \t\n"
4330d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn   v6.8b, v6.8h, v19.8h           \t\n"
4340d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "umlal2  v22.8h, v3.16b, v16.16b        \t\n"
4350d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "ushr    v5.8h, v22.8h, #5              \t\n"
4360d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "addhn2  v6.16b, v22.8h, v5.8h          \t\n"
4370d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v7.16b, v1.16b, v7.16b         \t\n"
4380be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
4390d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v20.16b, v2.16b, v20.16b       \t\n"
4400d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "uqadd   v6.16b, v0.16b, v6.16b         \t\n"
4410be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
4420be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit            "uqadd   v20.16b, v0.16b, v20.16b       \t\n"
4430be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit            "uqadd   v6.16b, v2.16b, v6.16b         \t\n"
4440be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#else
4450be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#error "This function only supports BGRA and RGBA."
4460be677d35c7ef1bf8a7a694d1838fa11333d1beckevin.petit#endif
4470d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v22.8h, v20.8b, #8             \t\n"
4480d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v5.8h, v7.8b, #8               \t\n"
4490d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v22.8h, v5.8h, #5              \t\n"
4500d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll    v17.8h, v6.8b, #8              \t\n"
4510d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v23.8h, v20.16b, #8            \t\n"
4520d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v7.8h, v7.16b, #8              \t\n"
4530d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v22.8h, v17.8h, #11            \t\n"
4540d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v23.8h, v7.8h, #5              \t\n"
4550d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "shll2   v6.8h, v6.16b, #8              \t\n"
4560d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "st1     {v22.8h}, [%[dst]], #16        \t\n"
4570d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "sri     v23.8h, v6.8h, #11             \t\n"
4580d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "st1     {v23.8h}, [%[dst]], #16        \t\n"
4590d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            "b.ge    1b                             \t\n"
4600d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            : [dst] "+&r" (dst), [src] "+&r" (src), [count] "+&r" (count)
4610d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            :: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
4620d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit               "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
4630d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit               "v31"
4640d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        );
4650d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    }
4660d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        // Leftovers
4670d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    if (count > 0) {
4680d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        do {
4690d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            SkPMColor c = *src++;
4700d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            SkPMColorAssert(c);
4710d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            if (c) {
4720d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit                *dst = SkSrcOver32To16(c, *dst);
4730d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            }
4740d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit            dst += 1;
4750d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit        } while (--count != 0);
4760d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit    }
4770d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit}
4780d448303099037d56d8f97ce87d8cae05dd9fdabkevin.petit#endif // #ifdef SK_CPU_ARM32
479a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
480402448d6818cab9d7b7633a0c18fcf574c915357mleestatic uint32_t pmcolor_to_expand16(SkPMColor c) {
481402448d6818cab9d7b7633a0c18fcf574c915357mlee    unsigned r = SkGetPackedR32(c);
482402448d6818cab9d7b7633a0c18fcf574c915357mlee    unsigned g = SkGetPackedG32(c);
483402448d6818cab9d7b7633a0c18fcf574c915357mlee    unsigned b = SkGetPackedB32(c);
484402448d6818cab9d7b7633a0c18fcf574c915357mlee    return (g << 24) | (r << 13) | (b << 2);
485402448d6818cab9d7b7633a0c18fcf574c915357mlee}
486402448d6818cab9d7b7633a0c18fcf574c915357mlee
487402448d6818cab9d7b7633a0c18fcf574c915357mleevoid Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) {
488402448d6818cab9d7b7633a0c18fcf574c915357mlee    uint32_t src_expand;
489402448d6818cab9d7b7633a0c18fcf574c915357mlee    unsigned scale;
490402448d6818cab9d7b7633a0c18fcf574c915357mlee    uint16x8_t vmask_blue;
491402448d6818cab9d7b7633a0c18fcf574c915357mlee
492402448d6818cab9d7b7633a0c18fcf574c915357mlee    if (count <= 0) return;
493402448d6818cab9d7b7633a0c18fcf574c915357mlee    SkASSERT(((size_t)dst & 0x01) == 0);
494402448d6818cab9d7b7633a0c18fcf574c915357mlee
495402448d6818cab9d7b7633a0c18fcf574c915357mlee    /*
496402448d6818cab9d7b7633a0c18fcf574c915357mlee     * This preamble code is in order to make dst aligned to 8 bytes
497402448d6818cab9d7b7633a0c18fcf574c915357mlee     * in the next mutiple bytes read & write access.
498402448d6818cab9d7b7633a0c18fcf574c915357mlee     */
499402448d6818cab9d7b7633a0c18fcf574c915357mlee    src_expand = pmcolor_to_expand16(src);
500402448d6818cab9d7b7633a0c18fcf574c915357mlee    scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;
501402448d6818cab9d7b7633a0c18fcf574c915357mlee
502402448d6818cab9d7b7633a0c18fcf574c915357mlee#define DST_ALIGN 8
503402448d6818cab9d7b7633a0c18fcf574c915357mlee
504402448d6818cab9d7b7633a0c18fcf574c915357mlee    /*
505402448d6818cab9d7b7633a0c18fcf574c915357mlee     * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time.
506402448d6818cab9d7b7633a0c18fcf574c915357mlee     */
507402448d6818cab9d7b7633a0c18fcf574c915357mlee    int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1);
508402448d6818cab9d7b7633a0c18fcf574c915357mlee
509402448d6818cab9d7b7633a0c18fcf574c915357mlee    for (int i = 0; i < preamble_size; i+=2, dst++) {
510402448d6818cab9d7b7633a0c18fcf574c915357mlee        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
511402448d6818cab9d7b7633a0c18fcf574c915357mlee        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
512402448d6818cab9d7b7633a0c18fcf574c915357mlee        if (--count == 0)
513402448d6818cab9d7b7633a0c18fcf574c915357mlee            break;
514402448d6818cab9d7b7633a0c18fcf574c915357mlee    }
515402448d6818cab9d7b7633a0c18fcf574c915357mlee
516402448d6818cab9d7b7633a0c18fcf574c915357mlee    int count16 = 0;
517402448d6818cab9d7b7633a0c18fcf574c915357mlee    count16 = count >> 4;
518402448d6818cab9d7b7633a0c18fcf574c915357mlee    vmask_blue = vmovq_n_u16(SK_B16_MASK);
519402448d6818cab9d7b7633a0c18fcf574c915357mlee
520402448d6818cab9d7b7633a0c18fcf574c915357mlee    if (count16) {
521402448d6818cab9d7b7633a0c18fcf574c915357mlee        uint16x8_t wide_sr;
522402448d6818cab9d7b7633a0c18fcf574c915357mlee        uint16x8_t wide_sg;
523402448d6818cab9d7b7633a0c18fcf574c915357mlee        uint16x8_t wide_sb;
524402448d6818cab9d7b7633a0c18fcf574c915357mlee        uint16x8_t wide_256_sa;
525402448d6818cab9d7b7633a0c18fcf574c915357mlee
526402448d6818cab9d7b7633a0c18fcf574c915357mlee        unsigned sr = SkGetPackedR32(src);
527402448d6818cab9d7b7633a0c18fcf574c915357mlee        unsigned sg = SkGetPackedG32(src);
528402448d6818cab9d7b7633a0c18fcf574c915357mlee        unsigned sb = SkGetPackedB32(src);
529402448d6818cab9d7b7633a0c18fcf574c915357mlee        unsigned sa = SkGetPackedA32(src);
530402448d6818cab9d7b7633a0c18fcf574c915357mlee
531402448d6818cab9d7b7633a0c18fcf574c915357mlee        // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb
532402448d6818cab9d7b7633a0c18fcf574c915357mlee        // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted,
533402448d6818cab9d7b7633a0c18fcf574c915357mlee        //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
534402448d6818cab9d7b7633a0c18fcf574c915357mlee        wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift
535402448d6818cab9d7b7633a0c18fcf574c915357mlee
536402448d6818cab9d7b7633a0c18fcf574c915357mlee        // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted,
537402448d6818cab9d7b7633a0c18fcf574c915357mlee        //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5)
538402448d6818cab9d7b7633a0c18fcf574c915357mlee        wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift
539402448d6818cab9d7b7633a0c18fcf574c915357mlee
540402448d6818cab9d7b7633a0c18fcf574c915357mlee        // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted,
541402448d6818cab9d7b7633a0c18fcf574c915357mlee        //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5)
542402448d6818cab9d7b7633a0c18fcf574c915357mlee        wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift
543402448d6818cab9d7b7633a0c18fcf574c915357mlee
544402448d6818cab9d7b7633a0c18fcf574c915357mlee        wide_256_sa =
545402448d6818cab9d7b7633a0c18fcf574c915357mlee            vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3
546402448d6818cab9d7b7633a0c18fcf574c915357mlee
547402448d6818cab9d7b7633a0c18fcf574c915357mlee        while (count16-- > 0) {
548402448d6818cab9d7b7633a0c18fcf574c915357mlee            uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b;
549402448d6818cab9d7b7633a0c18fcf574c915357mlee            uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b;
550402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1 = vld1q_u16(dst);
551402448d6818cab9d7b7633a0c18fcf574c915357mlee            dst += 8;
552402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2 = vld1q_u16(dst);
553402448d6818cab9d7b7633a0c18fcf574c915357mlee            dst -= 8;    //to store dst again.
554402448d6818cab9d7b7633a0c18fcf574c915357mlee
555402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS);                 // shift green to top of lanes
556402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_b = vdst1 & vmask_blue;                              // extract blue
557402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT);                // extract red
558402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green
559402448d6818cab9d7b7633a0c18fcf574c915357mlee
560402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS);                 // shift green to top of lanes
561402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_b = vdst2 & vmask_blue;                              // extract blue
562402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT);                // extract red
563402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green
564402448d6818cab9d7b7633a0c18fcf574c915357mlee
565402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r);        // sr + (256-sa) x dr1
566402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g);        // sg + (256-sa) x dg1
567402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b);        // sb + (256-sa) x db1
568402448d6818cab9d7b7633a0c18fcf574c915357mlee
569402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r);        // sr + (256-sa) x dr2
570402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g);        // sg + (256-sa) x dg2
571402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b);        // sb + (256-sa) x db2
572402448d6818cab9d7b7633a0c18fcf574c915357mlee
573402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_r = vshrq_n_u16(vdst1_r, 5);                         // 5-bit right shift for 5-bit red
574402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_g = vshrq_n_u16(vdst1_g, 5);                         // 5-bit right shift for 6-bit green
575402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1_b = vshrq_n_u16(vdst1_b, 5);                         // 5-bit right shift for 5-bit blue
576402448d6818cab9d7b7633a0c18fcf574c915357mlee
577402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT);       // insert green into blue
578402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT);         // insert red into green/blue
579402448d6818cab9d7b7633a0c18fcf574c915357mlee
580402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_r = vshrq_n_u16(vdst2_r, 5);                         // 5-bit right shift for 5-bit red
581402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_g = vshrq_n_u16(vdst2_g, 5);                         // 5-bit right shift for 6-bit green
582402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2_b = vshrq_n_u16(vdst2_b, 5);                         // 5-bit right shift for 5-bit blue
583402448d6818cab9d7b7633a0c18fcf574c915357mlee
584402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT);       // insert green into blue
585402448d6818cab9d7b7633a0c18fcf574c915357mlee            vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT);         // insert red into green/blue
586402448d6818cab9d7b7633a0c18fcf574c915357mlee
587402448d6818cab9d7b7633a0c18fcf574c915357mlee            vst1q_u16(dst, vdst1);
588402448d6818cab9d7b7633a0c18fcf574c915357mlee            dst += 8;
589402448d6818cab9d7b7633a0c18fcf574c915357mlee            vst1q_u16(dst, vdst2);
590402448d6818cab9d7b7633a0c18fcf574c915357mlee            dst += 8;
591402448d6818cab9d7b7633a0c18fcf574c915357mlee        }
592402448d6818cab9d7b7633a0c18fcf574c915357mlee    }
593402448d6818cab9d7b7633a0c18fcf574c915357mlee
594402448d6818cab9d7b7633a0c18fcf574c915357mlee    count &= 0xF;
595402448d6818cab9d7b7633a0c18fcf574c915357mlee    if (count > 0) {
596402448d6818cab9d7b7633a0c18fcf574c915357mlee        do {
597402448d6818cab9d7b7633a0c18fcf574c915357mlee            uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;
598402448d6818cab9d7b7633a0c18fcf574c915357mlee            *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);
599402448d6818cab9d7b7633a0c18fcf574c915357mlee            dst += 1;
600402448d6818cab9d7b7633a0c18fcf574c915357mlee        } while (--count != 0);
601402448d6818cab9d7b7633a0c18fcf574c915357mlee    }
602402448d6818cab9d7b7633a0c18fcf574c915357mlee}
603402448d6818cab9d7b7633a0c18fcf574c915357mlee
604be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.orgstatic inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
605be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    prod += vdupq_n_u16(128);
606be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    prod += vshrq_n_u16(prod, 8);
607be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    return vshrq_n_u16(prod, 8);
608be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org}
609be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
610a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
611a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                          const SkPMColor* SK_RESTRICT src, int count,
612a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                          U8CPU alpha, int /*x*/, int /*y*/) {
613be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org   SkASSERT(255 > alpha);
614a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
615be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    /* This code implements a Neon version of S32A_D565_Blend. The results have
616be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org     * a few mismatches compared to the original code. These mismatches never
617be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org     * exceed 1.
618a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com     */
619fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
620be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    if (count >= 8) {
621be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        uint16x8_t valpha_max, vmask_blue;
622be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        uint8x8_t valpha;
623be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
624be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        // prepare constants
625be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        valpha_max = vmovq_n_u16(255);
626be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        valpha = vdup_n_u8(alpha);
627be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        vmask_blue = vmovq_n_u16(SK_B16_MASK);
628be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
629be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        do {
630be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
631be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16x8_t vres_a, vres_r, vres_g, vres_b;
632be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint8x8x4_t vsrc;
633be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
634be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // load pixels
635be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst = vld1q_u16(dst);
636ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
637ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc = sk_vld4_u8_arm64_4(src);
638e683e810a35e4ea91b00104590e15a56cb35ad39mtklein#elif (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
639be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            asm (
640be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                "vld4.u8 %h[vsrc], [%[src]]!"
641be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
642be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : :
643be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            );
644a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#else
645be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d0 asm("d0");
646be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d1 asm("d1");
647be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d2 asm("d2");
648be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            register uint8x8_t d3 asm("d3");
649be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
650be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            asm volatile (
651be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                "vld4.u8    {d0-d3},[%[src]]!;"
652be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
653be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                  [src] "+&r" (src)
654be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org                : :
655be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            );
656be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[0] = d0;
657be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[1] = d1;
658be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[2] = d2;
659be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[3] = d3;
660a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com#endif
661fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
662fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
663be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // deinterleave dst
664be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
665be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_b = vdst & vmask_blue;                     // extract blue
666be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
667be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
668be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
669be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // shift src to 565
670be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
671be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
672be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
673be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
674be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // calc src * src_scale
675be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
676be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
677be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
678be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
679be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
680be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // prepare dst_scale
681be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = SkDiv255Round_neon8(vres_a);
682be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
683be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
684be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // add dst * dst_scale to previous result
685be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
686be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
687be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
688be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
689be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#ifdef S32A_D565_BLEND_EXACT
690be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // It is possible to get exact results with this but it is slow,
691be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // even slower than C code in some cases
692be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = SkDiv255Round_neon8(vres_r);
693be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = SkDiv255Round_neon8(vres_g);
694be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = SkDiv255Round_neon8(vres_b);
695be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#else
696be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_r = vrshrq_n_u16(vres_r, 8);
697be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_g = vrshrq_n_u16(vres_g, 8);
698be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vrshrq_n_u16(vres_b, 8);
699be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org#endif
700be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // pack result
701be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
702be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
703be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org
704be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            // store
705be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            vst1q_u16(dst, vres_b);
706be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            dst += 8;
707be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            count -= 8;
708be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        } while (count >= 8);
709be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    }
710a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
711be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    // leftovers
712be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org    while (count-- > 0) {
713be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        SkPMColor sc = *src++;
714be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        if (sc) {
715be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            uint16_t dc = *dst;
716be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
7173848427d884b72114854c8eef9662691f23fae7bmtklein            unsigned dr = (SkPacked32ToR16(sc) * alpha) + (SkGetPackedR16(dc) * dst_scale);
7183848427d884b72114854c8eef9662691f23fae7bmtklein            unsigned dg = (SkPacked32ToG16(sc) * alpha) + (SkGetPackedG16(dc) * dst_scale);
7193848427d884b72114854c8eef9662691f23fae7bmtklein            unsigned db = (SkPacked32ToB16(sc) * alpha) + (SkGetPackedB16(dc) * dst_scale);
720be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
721be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        }
722be233d63ca015c2991f4fe0802e4a31a71642062commit-bot@chromium.org        dst += 1;
723a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
724a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
725a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
/* NEON-friendly layout of the 4x4 ordered-dither pattern, derived from
 * gDitherMatrix_3Bit_16. Every dither value occupies its own byte, and the
 * four values of a row are written out three times (12 bytes per row), so an
 * 8-byte load that starts at column offset 0, 1, 2 or 3 of a row never runs
 * past the end of that row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    // row 0
    0, 4, 1, 5,   0, 4, 1, 5,   0, 4, 1, 5,
    // row 1
    6, 2, 7, 3,   6, 2, 7, 3,   6, 2, 7, 3,
    // row 2
    1, 5, 0, 4,   1, 5, 0, 4,   1, 5, 0, 4,
    // row 3
    7, 3, 6, 2,   7, 3, 6, 2,   7, 3, 6, 2,
};
738a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
739a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
740a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                int count, U8CPU alpha, int x, int y)
741a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com{
742fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
7434cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    SkASSERT(255 > alpha);
7444cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7454cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    // rescale alpha to range 1 - 256
746a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    int scale = SkAlpha255To256(alpha);
747fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
7484cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    if (count >= 8) {
7494cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        /* select row and offset for dither array */
7504cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
751fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
7524cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
7534cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
754fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
7554cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
7564cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
757fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
7584cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        do {
7594cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
760ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            uint8x8x4_t vsrc;
7614cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
7624cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
7634cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
7644cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
7654cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vdst;
7664cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16x8_t vdst_r, vdst_g, vdst_b;
7674cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int16x8_t vres_r, vres_g, vres_b;
7684cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int8x8_t vres8_r, vres8_g, vres8_b;
7694cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7704cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Load source and add dither
771ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
772ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc = sk_vld4_u8_arm64_3(src);
773ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
7744cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            {
7754cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d0 asm("d0");
7764cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d1 asm("d1");
7774cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d2 asm("d2");
7784cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            register uint8x8_t d3 asm("d3");
7794cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7804cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            asm (
781ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit                "vld4.8    {d0-d3},[%[src]]! "
7824cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
7834cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                :
7844cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            );
785ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[0] = d0;
786ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[1] = d1;
787ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc.val[2] = d2;
7884cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            }
789ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
790ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_r = vsrc.val[NEON_R];
791ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_g = vsrc.val[NEON_G];
792ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            vsrc_b = vsrc.val[NEON_B];
7934cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7944cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
7954cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
7964cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
7974cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
7984cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
7994cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
8004cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
8014cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8024cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
8034cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
8044cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
8054cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8064cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
8074cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
8084cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
8094cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8104cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Load dst and unpack
8114cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst = vld1q_u16(dst);
8124cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
8134cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
8144cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
8154cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8164cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // subtract dst from src and widen
8174cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
8184cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
8194cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
8204cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8214cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // multiply diffs by scale and shift
8224cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vmulq_s16(vres_r, vscale);
8234cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vmulq_s16(vres_g, vscale);
8244cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vmulq_s16(vres_b, vscale);
8254cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8264cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_r = vshrn_n_s16(vres_r, 8);
8274cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_g = vshrn_n_s16(vres_g, 8);
8284cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres8_b = vshrn_n_s16(vres_b, 8);
8294cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8304cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // add dst to result
8314cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
8324cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
8334cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
8344cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8354cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // put result into 565 format
8364cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
8374cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
8384cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8394cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Store result
8404cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
8414cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8424cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            // Next iteration
8434cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            dst += 8;
8444cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            count -= 8;
8454cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8464cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        } while (count >= 8);
8474cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    }
8484cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8494cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    // Leftovers
8504cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org    if (count > 0) {
8514cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        int scale = SkAlpha255To256(alpha);
8524cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        DITHER_565_SCAN(y);
8534cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        do {
8544cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            SkPMColor c = *src++;
8554cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            SkPMColorAssert(c);
8564cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8574cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int dither = DITHER_VALUE(x);
8584cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sr = SkGetPackedR32(c);
8594cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sg = SkGetPackedG32(c);
8604cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            int sb = SkGetPackedB32(c);
8614cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sr = SkDITHER_R32To565(sr, dither);
8624cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sg = SkDITHER_G32To565(sg, dither);
8634cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            sb = SkDITHER_B32To565(sb, dither);
8644cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org
8654cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            uint16_t d = *dst;
8664cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
8674cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
8684cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
8694cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org            DITHER_INC_X(x);
8704cc26324e3be5258fae9dc102aa6a3af7d1c96eacommit-bot@chromium.org        } while (--count != 0);
871a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
872a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
873a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
874a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com/* Neon version of S32_Blend_BlitRow32()
875a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com * portable version is in src/core/SkBlitRow_D32.cpp
876a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com */
877a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
878a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                              const SkPMColor* SK_RESTRICT src,
879a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                              int count, U8CPU alpha) {
880a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(alpha <= 255);
881fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
882374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    if (count <= 0) {
883374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        return;
884374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    }
885dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
886374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    uint16_t src_scale = SkAlpha255To256(alpha);
887374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    uint16_t dst_scale = 256 - src_scale;
888dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
889374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org    while (count >= 2) {
890374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint8x8_t vsrc, vdst, vres;
891374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint16x8_t vsrc_wide, vdst_wide;
892dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
893374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        /* These commented prefetches are a big win for count
894374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
895374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         * They also hurt a little (<5%) on an A15
896374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org         */
897374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        //__builtin_prefetch(src+32);
898374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        //__builtin_prefetch(dst+32);
899dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
900374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Load
901374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_u32(src));
902374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_u32(dst));
903374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
904374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process src
905374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
906374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
907374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
908374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process dst
909374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
910374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
911374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Combine
91240254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vdst_wide += vsrc_wide;
91340254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vres = vshrn_n_u16(vdst_wide, 8);
914374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
915374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Store
916374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vst1_u32(dst, vreinterpret_u32_u8(vres));
917374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
918374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        src += 2;
919374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        dst += 2;
920374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        count -= 2;
921fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
922fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
923fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    if (count == 1) {
924374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
925374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        uint16x8_t vsrc_wide, vdst_wide;
926dfff2737f8ad3e945a4dcbe175380d4b2a91a260commit-bot@chromium.org
927374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Load
928374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
929374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
930374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
931374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Process
932374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
933374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
934374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
93540254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vdst_wide += vsrc_wide;
93640254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vres = vshrn_n_u16(vdst_wide, 8);
937374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org
938374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        // Store
939374ea4ee26b9d537c1b9635544105f915766f61bcommit-bot@chromium.org        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
940a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
941a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
942a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
9433a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#ifdef SK_CPU_ARM32
9441fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.orgvoid S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
9451fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org                         const SkPMColor* SK_RESTRICT src,
9461fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org                         int count, U8CPU alpha) {
9471fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
94840254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman    SkASSERT(255 > alpha);
9491fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9501fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count <= 0) {
9511fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        return;
9521fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
9531fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9541fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    unsigned alpha256 = SkAlpha255To256(alpha);
9551fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9561fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    // First deal with odd counts
9571fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count & 1) {
9581fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
9591fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint16x8_t vdst_wide, vsrc_wide;
9601fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        unsigned dst_scale;
9611fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9621fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Load
9631fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
9641fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
9651fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9661fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Calc dst_scale
9671fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst_scale = vget_lane_u8(vsrc, 3);
96840254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        dst_scale = SkAlphaMulInv256(dst_scale, alpha256);
9691fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9701fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Process src
9711fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc_wide = vmovl_u8(vsrc);
9721fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
9731fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9741fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Process dst
9751fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst_wide = vmovl_u8(vdst);
9761fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
9771fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9781fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        // Combine
97940254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vdst_wide += vsrc_wide;
98040254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman        vres = vshrn_n_u16(vdst_wide, 8);
9811fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9821fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
9831fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        dst++;
9841fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        src++;
9851fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        count--;
9861fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
9871fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9881fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    if (count) {
9891fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        uint8x8_t alpha_mask;
9901fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
9911fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        alpha_mask = vld1_u8(alpha_mask_setup);
9921fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9931fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        do {
9941fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9951fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
9961fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
9971fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
9981fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            __builtin_prefetch(src+32);
9991fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            __builtin_prefetch(dst+32);
10001fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10011fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Load
10021fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc = vreinterpret_u8_u32(vld1_u32(src));
10031fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst = vreinterpret_u8_u32(vld1_u32(dst));
10041fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10051fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Prepare src_scale
10061fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_scale = vdupq_n_u16(alpha256);
10071fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10081fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Calc dst_scale
10091fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
10101fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_scale = vmovl_u8(vsrc_alphas);
101140254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
101240254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            // A 16-bit lane would overflow if we used 0xFFFF here,
101340254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            // so use an approximation with 0xFF00 that is off by 1,
101440254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            // and add back 1 after to get the correct value.
101540254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            // This is valid if alpha256 <= 255.
101640254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
101740254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
101840254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);
10191fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10201fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Process src
10211fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_wide = vmovl_u8(vsrc);
10221fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vsrc_wide *= vsrc_scale;
10231fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10241fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Process dst
10251fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_wide = vmovl_u8(vdst);
10261fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vdst_wide *= vdst_scale;
10271fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10281fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            // Combine
102940254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            vdst_wide += vsrc_wide;
103040254c2c2dc28a34f96294d5a1ad94a99b0be8a6lsalzman            vres = vshrn_n_u16(vdst_wide, 8);
10311fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10321fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            vst1_u32(dst, vreinterpret_u32_u8(vres));
10331fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
10341fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            src += 2;
10351fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            dst += 2;
10361fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org            count -= 2;
10371fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org        } while(count);
10381fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    }
10391fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org}
10401fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org
1041a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1042a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1043ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif // #ifdef SK_CPU_ARM32
1044a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
// Blends `count` 32-bit SkPMColor source pixels onto a row of 16-bit 565
// destination pixels with ordered dithering. Each destination channel is
// scaled by ((256 - source alpha) >> 3) and added to the dithered source
// channel. `alpha` must be 255 (asserted below); `x` and `y` select the
// starting position in the dither pattern.
//
// Groups of 8 pixels are processed by the NEON loop; whatever is left
// (fewer than 8 pixels) is handled by the scalar loop at the end.
1045a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
1046a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                   const SkPMColor* SK_RESTRICT src,
1047a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                   int count, U8CPU alpha, int x, int y) {
1048a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
1049a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1050fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#define    UNROLL    8
1051a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1052a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= UNROLL) {
1053fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1054fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org    uint8x8_t dbase;
    // Load the 8 dither values for this (x, y). Rows of gDitherMatrix_Neon
    // are indexed with a stride of 12 bytes and only the low two bits of
    // x and y are used, so the same 8 values serve every iteration below.
1055fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1056fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    dbase = vld1_u8(dstart);
1057a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1058a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1059ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        uint8x8x4_t vsrc;
1060fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint8x8_t sr, sg, sb, sa, d;
1061fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst8, scale8, alpha8;
1062fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst_r, dst_g, dst_b;
1063fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
        // Load 8 source pixels as four per-channel vectors. Nothing else in
        // this loop advances src, so the load itself must do it.
        // NOTE(review): sk_vld4_u8_arm64_4 is defined outside this view;
        // presumably it post-increments src by 32 bytes like the ARM32 asm
        // below -- confirm.
1064ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
1065ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_4(src);
1066ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
1067fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        {
1068fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d0 asm("d0");
1069fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d1 asm("d1");
1070fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d2 asm("d2");
1071fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d3 asm("d3");
1072a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
        // vld4.8 de-interleaves 32 bytes into d0-d3 (one byte of every
        // pixel per register); the trailing `!` writes the advanced
        // address back into src, which is why src is a "+r" operand.
1073ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        asm ("vld4.8    {d0-d3},[%[src]]! "
1074fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
1075fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org            :
1076fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        );
1077ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[0] = d0;
1078ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[1] = d1;
1079ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[2] = d2;
1080ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[3] = d3;
1081fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1082ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
        // Pick the channels out by index; the NEON_* indices are defined
        // outside this view.
1083ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sa = vsrc.val[NEON_A];
1084ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sr = vsrc.val[NEON_R];
1085ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sg = vsrc.val[NEON_G];
1086ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sb = vsrc.val[NEON_B];
1087a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1088fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        /* calculate 'd', which will be 0..7
1089fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
1090fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         */
        // alpha8 = dbase + sa * dbase = dbase * (sa + 1); the >> 8 below
        // therefore scales the dither value by the source alpha.
1091fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        alpha8 = vmovl_u8(dbase);
1092fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        alpha8 = vmlal_u8(alpha8, sa, dbase);
1093fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        d = vshrn_n_u16(alpha8, 8);    // narrowing too
1094fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1095fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sr = sr - (sr>>5) + d
1096fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* watching for 8-bit overflow.  d is 0..7; risky range of
1097fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
1098fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         * safe  as long as we do ((sr-sr>>5) + d)
1099fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org         */
1100fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1101fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vadd_u8(sr, d);
1102fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1103fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sb = sb - (sb>>5) + d
1104fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1105fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vadd_u8(sb, d);
1106fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1107fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1108fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1109fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vadd_u8(sg, vshr_n_u8(d,1));
1110fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1111fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // need to pick up 8 dst's -- at 16 bits each, 128 bits
1112fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vld1q_u16(dst);
        // Split each 565 value into separate blue, green and red lanes.
1113fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
1114fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
1115fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
1116fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org
1117fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // blend
1118fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
1119a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1120fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // combine the addq and mul, save 3 insns
        // scale8 becomes (256 - sa) >> 3, i.e. 0..32. The source channels
        // are pre-shifted (<<2 for r/b, <<3 for g) so that the >> 5 in the
        // repack step leaves 5/6/5 significant bits.
1121fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        scale8 = vshrq_n_u16(scale8, 3);
1122fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
1123fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
1124fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
1125a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1126fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // repack to store
1127fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst8 = vshrq_n_u16(dst_b, 5);
1128fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
1129fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
1130fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1131fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1q_u16(dst, dst8);
1132fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1133fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        dst += UNROLL;
1134fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        count -= UNROLL;
1135fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org        // skip x += UNROLL, since it's unchanged mod-4
1136a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (count >= UNROLL);
1137a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1138fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    UNROLL
1139a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1140fe68eb6a4081f60caf665ec632180e6d7c26a169commit-bot@chromium.org    // residuals
    // Scalar path for the remaining pixels (fewer than 8), one at a time.
    // DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are macros defined
    // outside this view (presumably in SkDither.h -- confirm).
1141a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count > 0) {
1142a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        DITHER_565_SCAN(y);
1143a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1144a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColor c = *src++;
1145a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColorAssert(c);
            // An all-zero source pixel leaves the destination untouched.
1146a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            if (c) {
1147a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned a = SkGetPackedA32(c);
1148fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1149a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // dither and alpha are just temporary variables to work-around
1150a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // an ICE in debug.
1151a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned dither = DITHER_VALUE(x);
1152a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned alpha = SkAlpha255To256(a);
1153a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                int d = SkAlphaMul(dither, alpha);
1154fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1155a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sr = SkGetPackedR32(c);
1156a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sg = SkGetPackedG32(c);
1157a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                unsigned sb = SkGetPackedB32(c);
1158a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sr = SkDITHER_R32_FOR_565(sr, d);
1159a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sg = SkDITHER_G32_FOR_565(sg, d);
1160a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                sb = SkDITHER_B32_FOR_565(sb, d);
1161fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
                // Spread the three channels into one 32-bit word so a single
                // multiply/add/shift blends all of them at once.
1162a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
1163a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
1164a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
1165a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                // now src and dst expanded are in g:11 r:10 x:1 b:10
1166a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
1167a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            }
1168a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            dst += 1;
1169a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            DITHER_INC_X(x);
1170a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (--count != 0);
1171a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1172a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1173a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1174a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1175a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
// Converts `count` 32-bit SkPMColor source pixels to 16-bit 565 with ordered
// dithering and stores them to dst; dst is only written, never read. The
// source alpha channel is not used by the NEON loop, and the scalar loop
// asserts it is 255. `alpha` must be 255 (asserted below); `x` and `y` select
// the starting position in the dither pattern.
//
// Groups of 8 pixels are processed by the NEON loop; whatever is left
// (fewer than 8 pixels) is handled by the scalar loop at the end.
1176a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comvoid S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
1177a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                 const SkPMColor* SK_RESTRICT src,
1178a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com                                 int count, U8CPU alpha, int x, int y) {
1179a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    SkASSERT(255 == alpha);
1180a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1181fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#define    UNROLL    8
1182a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count >= UNROLL) {
1183fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    uint8x8_t d;
    // Load the 8 dither values for this (x, y). Rows of gDitherMatrix_Neon
    // are indexed with a stride of 12 bytes and only the low two bits of
    // x and y are used; d is loaded once and reused by every iteration.
1184fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
1185fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    d = vld1_u8(dstart);
1186fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1187fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    while (count >= UNROLL) {
1188efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint8x8_t sr, sg, sb;
1189efbe8e9bedda21a3e061ebf3d96431a0f250a654djsollen@google.com        uint16x8_t dr, dg, db;
1190fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        uint16x8_t dst8;
1191ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        uint8x8x4_t vsrc;
1192fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
        // Load 8 source pixels as per-channel vectors and advance src.
        // On ARM64 this uses the sk_vld4_u8_arm64_3 helper defined at the
        // top of the file, which takes src by reference and loads with a
        // 32-byte post-increment.
1193ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#ifdef SK_CPU_ARM64
1194ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc = sk_vld4_u8_arm64_3(src);
1195ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#else
1196fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        {
1197fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d0 asm("d0");
1198fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d1 asm("d1");
1199fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d2 asm("d2");
1200fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        register uint8x8_t d3 asm("d3");
1201fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
        // vld4.8 de-interleaves 32 bytes into d0-d3 (one byte of every
        // pixel per register); the trailing `!` writes the advanced
        // address back into src, hence the read-write src operand.
1202688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        asm (
1203ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit            "vld4.8    {d0-d3},[%[src]]! "
1204688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
1205688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org            :
1206688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        );
1207ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[0] = d0;
1208ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[1] = d1;
1209ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        vsrc.val[2] = d2;
        // vsrc.val[3] is left unassigned; only val[NEON_R/G/B] are read
        // below. NOTE(review): this assumes NEON_R, NEON_G and NEON_B are
        // all in 0..2 -- confirm against their definition.
1210fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        }
1211ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit#endif
1212ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sr = vsrc.val[NEON_R];
1213ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sg = vsrc.val[NEON_G];
1214ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit        sb = vsrc.val[NEON_B];
1215ea6b46b6c06fd9e03b98f01b274733de1eeae89dkevin.petit
1216fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        /* XXX: if we want to prefetch, hide it in the above asm()
1217fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * using the gcc __builtin_prefetch(), the prefetch will
1218fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * fall to the bottom of the loop -- it won't stick up
1219fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         * at the top of the loop, just after the vld4.
1220fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com         */
1221fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
        // The subtractions stay in 8 bits; the additions of the dither
        // value use vaddl_u8, which widens the result to 16-bit lanes.
1222688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sr = sr - (sr>>5) + d
1223fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
1224fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dr = vaddl_u8(sr, d);
1225fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1226688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sb = sb - (sb>>5) + d
1227fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
1228fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        db = vaddl_u8(sb, d);
1229fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1230688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
1231fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
1232688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
1233fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1234688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // pack high bits of each into 565 format  (rgb, b is lsb)
        // blue >> 3 fills bits 0..4, green >> 2 is inserted at bit 5 and
        // red >> 3 at bit 11.
1235fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vshrq_n_u16(db, 3);
1236fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
1237688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
1238fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1239688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // store it
1240fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        vst1q_u16(dst, dst8);
1241fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com
1242fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        dst += UNROLL;
1243688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        // we don't need to increment src as the asm above has already done it
1244fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com        count -= UNROLL;
        // NOTE(review): the NEON loop never re-reads x (d was loaded before
        // the loop); x is next used by the residual loop below.
1245688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org        x += UNROLL;        // probably superfluous
1246fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    }
1247a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1248fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com#undef    UNROLL
1249a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1250688d362b4545e1beadebb7ba5886813d7038883ccommit-bot@chromium.org    // residuals
    // Scalar path for the remaining pixels (fewer than 8), one at a time.
    // DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are macros defined
    // outside this view (presumably in SkDither.h -- confirm).
1251a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    if (count > 0) {
1252a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        DITHER_565_SCAN(y);
1253a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        do {
1254a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColor c = *src++;
1255a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkPMColorAssert(c);
1256a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            SkASSERT(SkGetPackedA32(c) == 255);
1257a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1258a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            unsigned dither = DITHER_VALUE(x);
1259a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            *dst++ = SkDitherRGB32To565(c, dither);
1260a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com            DITHER_INC_X(x);
1261a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com        } while (--count != 0);
1262a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    }
1263a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com}
1264a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1265a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com///////////////////////////////////////////////////////////////////////////////
1266a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
// Table of NEON SkBlitRow::Proc16 row blitters for 565 destinations.
// Entry order matters: four non-dithered entries followed by four dithered
// ones, and within each group S32 opaque, S32 blend, S32A opaque, S32A blend
// (as the function names and trailing comments indicate).
// NOTE(review): a nullptr entry presumably makes the caller fall back to a
// non-NEON implementation -- confirm at the lookup site.
1267a7f11918d92621507f35b228a290f05dcaf0f4b6reedconst SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = {
1268a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    // no dither
12690060159457453ca45a47828648c8f29d5695983ccommit-bot@chromium.org    S32_D565_Opaque_neon,
127095c2e5532b094add82b007bdfcd4c64050b6b366commit-bot@chromium.org    S32_D565_Blend_neon,
1271a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Opaque_neon,
    // S32A_D565_Blend_neon is compiled out of the table; the slot is left
    // empty and the linked issue tracks why.
1272fa115bd4543631244f3b9accb3541b28f4222a96mtklein#if 0
1273a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Blend_neon,
1274fa115bd4543631244f3b9accb3541b28f4222a96mtklein#else
127596fcdcc219d2a0d3579719b84b28bede76efba64halcanary    nullptr,   // https://code.google.com/p/skia/issues/detail?id=2797
12765b2c2c6fd09752641b14766678d62fe50b4e3ef3reed#endif
1277a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
1278a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    // dither
1279a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32_D565_Opaque_Dither_neon,
1280a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32_D565_Blend_Dither_neon,
1281a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com    S32A_D565_Opaque_Dither_neon,
128296fcdcc219d2a0d3579719b84b28bede76efba64halcanary    nullptr,   // S32A_D565_Blend_Dither
1283a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com};
1284a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com
// Table of NEON SkBlitRow::ColorProc16 routines for 565 destinations.
// The trailing comment on each entry names the slot it fills; all four slots
// currently point at the same routine, Color32A_D565_neon.
1285402448d6818cab9d7b7633a0c18fcf574c915357mleeconst SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = {
1286402448d6818cab9d7b7633a0c18fcf574c915357mlee    Color32A_D565_neon,    // Color32_D565,
1287402448d6818cab9d7b7633a0c18fcf574c915357mlee    Color32A_D565_neon,    // Color32A_D565,
1288402448d6818cab9d7b7633a0c18fcf574c915357mlee    Color32A_D565_neon,    // Color32_D565_Dither,
1289402448d6818cab9d7b7633a0c18fcf574c915357mlee    Color32A_D565_neon,    // Color32A_D565_Dither
1290402448d6818cab9d7b7633a0c18fcf574c915357mlee};
1291402448d6818cab9d7b7633a0c18fcf574c915357mlee
// Table of NEON SkBlitRow::Proc32 row blitters for 32-bit destinations.
// Entry order matters; the trailing comments name the slots. Only S32_Blend
// has a NEON routine on every ARM target; S32A_Blend is provided on 32-bit
// ARM only (SK_CPU_ARM32) and is nullptr otherwise.
// NOTE(review): a nullptr entry presumably makes the caller fall back to
// another implementation -- confirm at the lookup site.
1292a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.comconst SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
129396fcdcc219d2a0d3579719b84b28bede76efba64halcanary    nullptr,   // S32_Opaque,
1294fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com    S32_Blend_BlitRow32_neon,        // S32_Blend,
    // Third slot -- presumably S32A_Opaque, given the order of the other
    // entries (confirm); its implementation now lives in SkOpts.
1295b4a7dc99b1a01cdd5c0cd5913b630436ca696210mtklein    nullptr,  // Ported to SkOpts
12963a2682a77f996f649de7699c9f7bee046c6d4f17mtklein#ifdef SK_CPU_ARM32
12971fdc6774280ffc18dd7e1247e430931aa2f58790commit-bot@chromium.org    S32A_Blend_BlitRow32_neon        // S32A_Blend
1298866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#else
129996fcdcc219d2a0d3579719b84b28bede76efba64halcanary    nullptr
1300866b95d65dfc01af372bbed206ec067e04c1f533kevin.petit#endif
1301a8dd1ce930811a51cc841f583424d507d95e7e78digit@google.com};
1302