/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
73a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
83a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett#ifndef SkSwizzler_opts_DEFINED
93a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett#define SkSwizzler_opts_DEFINED
103a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
11a4083c97d48e8a4f88e2797d7363f141e3d42553Cary Clark#include "SkColorData.h"
123a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
13e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
14e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein    #include <immintrin.h>
15e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein#elif defined(SK_ARM_HAS_NEON)
16e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein    #include <arm_neon.h>
17e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein#endif
18e18fa440e74e9af0324de0a1de9b6ffb0fe3c3d3mtklein
193a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarettnamespace SK_OPTS_NS {
203a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
// Premultiplies `count` 32-bit pixels read from vsrc and writes them to dst,
// keeping the channel order.
//
// Each source word is unpacked by shifts: the top byte (bits 24-31) is alpha,
// followed by b (bits 16-23), g (bits 8-15) and r (bits 0-7).  Every color
// channel is scaled by alpha with rounding, (c*a + 127) / 255; alpha itself is
// copied through unchanged.  A count <= 0 writes nothing.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        // The products are computed in int, so they cannot overflow the 8-bit locals.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}
373a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
// Premultiplies `count` 32-bit pixels read from vsrc and writes them to dst
// with the bytes at bits 0-7 and bits 16-23 exchanged.
//
// Each source word is unpacked by shifts: alpha is bits 24-31, then b, g, r
// down to bits 0-7.  Every color channel is scaled by alpha with rounding,
// (c*a + 127) / 255.  On output r is placed at bits 16-23 and b at bits 0-7;
// alpha and g keep their positions.  A count <= 0 writes nothing.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        // The products are computed in int, so they cannot overflow the 8-bit locals.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
543a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
// Copies `count` 32-bit pixels from vsrc to dst while exchanging the byte at
// bits 0-7 with the byte at bits 16-23.  The bytes at bits 8-15 and 24-31 are
// copied through unchanged; no premultiplication is performed.
// A count <= 0 writes nothing.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}
6803108de163354fa574679ad153b58ce57126b2bamsarett
// Expands `count` 3-byte pixels from vsrc into 32-bit pixels in dst.
//
// The three source bytes are read in memory order as r, g, b.  The output word
// holds r in bits 0-7, g in bits 8-15, b in bits 16-23 and a constant 0xFF in
// bits 24-31.  A count <= 0 writes nothing.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;  // Advance to the next packed 3-byte pixel.
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}
82f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
// Expands `count` 3-byte pixels from vsrc into 32-bit pixels in dst, reversing
// the order of the three color bytes.
//
// The three source bytes are read in memory order as r, g, b.  The output word
// holds b in bits 0-7, g in bits 8-15, r in bits 16-23 and a constant 0xFF in
// bits 24-31.  A count <= 0 writes nothing.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;  // Advance to the next packed 3-byte pixel.
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}
96f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
// Expands `count` 8-bit gray samples from vsrc into 32-bit pixels in dst.
// Each output word has the gray value replicated in its three low bytes and a
// constant 0xFF in bits 24-31.  A count <= 0 writes nothing.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}
1062eff71c9b5f984b58961e5a6b4e66774c4385224msarett
// Expands `count` 2-byte (gray, alpha) pixels from vsrc into 32-bit pixels in
// dst without premultiplying.
//
// The first source byte is gray and the second is alpha.  The output word has
// gray replicated in its three low bytes and alpha in bits 24-31.
// A count <= 0 writes nothing.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;  // Advance to the next packed 2-byte pixel.
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
1191e06079b259d1091b735492b2f71d9897c14c608msarett
// Expands `count` 2-byte (gray, alpha) pixels from vsrc into premultiplied
// 32-bit pixels in dst.
//
// The first source byte is gray and the second is alpha.  Gray is scaled by
// alpha with rounding, (g*a + 127) / 255, then replicated into the three low
// bytes of the output word; alpha goes in bits 24-31 unchanged.
// A count <= 0 writes nothing.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;  // Advance to the next packed 2-byte pixel.
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}
1331e06079b259d1091b735492b2f71d9897c14c608msarett
// Converts `count` 32-bit CMYK pixels from vsrc into opaque 32-bit pixels in dst.
//
// Each source word is unpacked by shifts: k is bits 24-31, then y, m, c down
// to bits 0-7.  Each of c, m, y is scaled by k with rounding,
// (v*k + 127) / 255, producing r, g, b respectively.  The output word holds r
// in bits 0-7, g in bits 8-15, b in bits 16-23 and a constant 0xFF in bits
// 24-31.  A count <= 0 writes nothing.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   b << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   r <<  0;
    }
}
151c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
// Converts `count` 32-bit CMYK pixels from vsrc into opaque 32-bit pixels in
// dst, with the r and b bytes in the opposite positions from
// inverted_CMYK_to_RGB1_portable.
//
// Each source word is unpacked by shifts: k is bits 24-31, then y, m, c down
// to bits 0-7.  Each of c, m, y is scaled by k with rounding,
// (v*k + 127) / 255, producing r, g, b respectively.  The output word holds b
// in bits 0-7, g in bits 8-15, r in bits 16-23 and a constant 0xFF in bits
// 24-31.  A count <= 0 writes nothing.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)   r << 16
               | (uint32_t)   g <<  8
               | (uint32_t)   b <<  0;
    }
}
168c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
1693a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett#if defined(SK_ARM_HAS_NEON)
1703a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
1713a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett// Rounded divide by 255, (x + 127) / 255
1723a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarettstatic uint8x8_t div255_round(uint16x8_t x) {
1733a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = (x + 127) / 255
1743a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = (x + 127) / 256 + error1
1753a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    //
1763a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // error1 = (x + 127) / (255 * 256)
1773a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // error1 = (x + 127) / (256 * 256) + error2
1783a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    //
1793a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // error2 = (x + 127) / (255 * 256 * 256)
1803a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    //
1813a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // The maximum value of error2 is too small to matter.  Thus:
1823a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
1833a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = ((x + 127) / 256 + x + 127) / 256
1843a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = ((x + 127) >> 8 + x + 127) >> 8
1853a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    //
1863a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // Use >>> to represent "rounded right shift" which, conveniently,
1873a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // NEON supports in one instruction.
1883a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // result = ((x >>> 8) + x) >>> 8
1893a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    //
1903a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // Note that the second right shift is actually performed as an
1913a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // "add, round, and narrow back to 8-bits" instruction.
1923a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
1933a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
1943a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
1953a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett// Scale a byte by another, (x * y + 127) / 255
1963a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarettstatic uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
1973a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    return div255_round(vmull_u8(x, y));
1983a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
1993a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
2003a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsaretttemplate <bool kSwapRB>
2018bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtkleinstatic void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
2028bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto src = (const uint32_t*)vsrc;
2033a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    while (count >= 8) {
2043a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        // Load 8 pixels.
205f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
2063a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
207f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x8_t a = rgba.val[3],
208f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett                  b = rgba.val[2],
209f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett                  g = rgba.val[1],
210f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett                  r = rgba.val[0];
2113a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
2123a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        // Premultiply.
2133a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        b = scale(b, a);
2148bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        g = scale(g, a);
2158bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        r = scale(r, a);
2163a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
2173a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        // Store 8 premultiplied pixels.
2183a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        if (kSwapRB) {
219f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = r;
220f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[1] = g;
221f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = b;
2228bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        } else {
223f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = b;
224f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[1] = g;
225f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = r;
2263a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        }
227f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        vst4_u8((uint8_t*) dst, rgba);
2283a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        src += 8;
2293a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        dst += 8;
2303a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett        count -= 8;
2313a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    }
2323a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
2333a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    // Call portable code to finish up the tail of [0,8) pixels.
2348bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
2353a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett    proc(dst, src, count);
2363a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
2373a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
238cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
2398bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    premul_should_swapRB<false>(dst, src, count);
2403a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
2413a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
242cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
2438bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    premul_should_swapRB<true>(dst, src, count);
2443a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
2453a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
246cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
2478bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto src = (const uint32_t*)vsrc;
24803108de163354fa574679ad153b58ce57126b2bamsarett    while (count >= 16) {
24903108de163354fa574679ad153b58ce57126b2bamsarett        // Load 16 pixels.
250f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
25103108de163354fa574679ad153b58ce57126b2bamsarett
25203108de163354fa574679ad153b58ce57126b2bamsarett        // Swap r and b.
253f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        SkTSwap(rgba.val[0], rgba.val[2]);
25403108de163354fa574679ad153b58ce57126b2bamsarett
25503108de163354fa574679ad153b58ce57126b2bamsarett        // Store 16 pixels.
256f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        vst4q_u8((uint8_t*) dst, rgba);
25703108de163354fa574679ad153b58ce57126b2bamsarett        src += 16;
25803108de163354fa574679ad153b58ce57126b2bamsarett        dst += 16;
25903108de163354fa574679ad153b58ce57126b2bamsarett        count -= 16;
26003108de163354fa574679ad153b58ce57126b2bamsarett    }
26103108de163354fa574679ad153b58ce57126b2bamsarett
26203108de163354fa574679ad153b58ce57126b2bamsarett    if (count >= 8) {
26303108de163354fa574679ad153b58ce57126b2bamsarett        // Load 8 pixels.
264f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
26503108de163354fa574679ad153b58ce57126b2bamsarett
26603108de163354fa574679ad153b58ce57126b2bamsarett        // Swap r and b.
267f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        SkTSwap(rgba.val[0], rgba.val[2]);
26803108de163354fa574679ad153b58ce57126b2bamsarett
26903108de163354fa574679ad153b58ce57126b2bamsarett        // Store 8 pixels.
270f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        vst4_u8((uint8_t*) dst, rgba);
27103108de163354fa574679ad153b58ce57126b2bamsarett        src += 8;
27203108de163354fa574679ad153b58ce57126b2bamsarett        dst += 8;
27303108de163354fa574679ad153b58ce57126b2bamsarett        count -= 8;
27403108de163354fa574679ad153b58ce57126b2bamsarett    }
27503108de163354fa574679ad153b58ce57126b2bamsarett
2768bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    RGBA_to_BGRA_portable(dst, src, count);
27703108de163354fa574679ad153b58ce57126b2bamsarett}
27803108de163354fa574679ad153b58ce57126b2bamsarett
279f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsaretttemplate <bool kSwapRB>
280f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarettstatic void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
281f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    const uint8_t* src = (const uint8_t*) vsrc;
282f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    while (count >= 16) {
283f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Load 16 pixels.
284f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x16x3_t rgb = vld3q_u8(src);
285f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
286f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Insert an opaque alpha channel and swap if needed.
287f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x16x4_t rgba;
288f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        if (kSwapRB) {
289f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = rgb.val[2];
290f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = rgb.val[0];
291f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        } else {
292f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = rgb.val[0];
293f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = rgb.val[2];
294f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        }
295f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        rgba.val[1] = rgb.val[1];
296f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        rgba.val[3] = vdupq_n_u8(0xFF);
297f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
298f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Store 16 pixels.
299f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        vst4q_u8((uint8_t*) dst, rgba);
300f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        src += 16*3;
301f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        dst += 16;
302f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        count -= 16;
303f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    }
304f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
305f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    if (count >= 8) {
306f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Load 8 pixels.
307f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x8x3_t rgb = vld3_u8(src);
308f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
309f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Insert an opaque alpha channel and swap if needed.
310f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        uint8x8x4_t rgba;
311f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        if (kSwapRB) {
312f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = rgb.val[2];
313f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = rgb.val[0];
314f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        } else {
315f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[0] = rgb.val[0];
316f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett            rgba.val[2] = rgb.val[2];
317f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        }
318f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        rgba.val[1] = rgb.val[1];
319f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        rgba.val[3] = vdup_n_u8(0xFF);
320f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
321f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        // Store 8 pixels.
322f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        vst4_u8((uint8_t*) dst, rgba);
323f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        src += 8*3;
324f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        dst += 8;
325f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett        count -= 8;
326f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    }
327f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
328f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    // Call portable code to finish up the tail of [0,8) pixels.
329f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
330f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    proc(dst, src, count);
331f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
332f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
333cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
334f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    insert_alpha_should_swaprb<false>(dst, src, count);
335f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
336f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
337cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
338f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    insert_alpha_should_swaprb<true>(dst, src, count);
339f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
340f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
341cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
3422eff71c9b5f984b58961e5a6b4e66774c4385224msarett    const uint8_t* src = (const uint8_t*) vsrc;
3432eff71c9b5f984b58961e5a6b4e66774c4385224msarett    while (count >= 16) {
3442eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Load 16 pixels.
3452eff71c9b5f984b58961e5a6b4e66774c4385224msarett        uint8x16_t gray = vld1q_u8(src);
3462eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3472eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Set each of the color channels.
3482eff71c9b5f984b58961e5a6b4e66774c4385224msarett        uint8x16x4_t rgba;
3492eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[0] = gray;
3502eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[1] = gray;
3512eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[2] = gray;
3522eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[3] = vdupq_n_u8(0xFF);
3532eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3542eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Store 16 pixels.
3552eff71c9b5f984b58961e5a6b4e66774c4385224msarett        vst4q_u8((uint8_t*) dst, rgba);
3562eff71c9b5f984b58961e5a6b4e66774c4385224msarett        src += 16;
3572eff71c9b5f984b58961e5a6b4e66774c4385224msarett        dst += 16;
3582eff71c9b5f984b58961e5a6b4e66774c4385224msarett        count -= 16;
3592eff71c9b5f984b58961e5a6b4e66774c4385224msarett    }
3602eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3612eff71c9b5f984b58961e5a6b4e66774c4385224msarett    if (count >= 8) {
3622eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Load 8 pixels.
3632eff71c9b5f984b58961e5a6b4e66774c4385224msarett        uint8x8_t gray = vld1_u8(src);
3642eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3652eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Set each of the color channels.
3662eff71c9b5f984b58961e5a6b4e66774c4385224msarett        uint8x8x4_t rgba;
3672eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[0] = gray;
3682eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[1] = gray;
3692eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[2] = gray;
3702eff71c9b5f984b58961e5a6b4e66774c4385224msarett        rgba.val[3] = vdup_n_u8(0xFF);
3712eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3722eff71c9b5f984b58961e5a6b4e66774c4385224msarett        // Store 8 pixels.
3732eff71c9b5f984b58961e5a6b4e66774c4385224msarett        vst4_u8((uint8_t*) dst, rgba);
3742eff71c9b5f984b58961e5a6b4e66774c4385224msarett        src += 8;
3752eff71c9b5f984b58961e5a6b4e66774c4385224msarett        dst += 8;
3762eff71c9b5f984b58961e5a6b4e66774c4385224msarett        count -= 8;
3772eff71c9b5f984b58961e5a6b4e66774c4385224msarett    }
3782eff71c9b5f984b58961e5a6b4e66774c4385224msarett
3792eff71c9b5f984b58961e5a6b4e66774c4385224msarett    gray_to_RGB1_portable(dst, src, count);
3802eff71c9b5f984b58961e5a6b4e66774c4385224msarett}
3812eff71c9b5f984b58961e5a6b4e66774c4385224msarett
// Expands `count` interleaved (gray, alpha) byte pairs read from vsrc into 32-bit pixels
// written to dst.  The first three bytes of every output pixel hold the gray value and the
// fourth byte holds alpha.  When kPremul is set, gray is first replaced by scale(gray, alpha).
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* in = (const uint8_t*) vsrc;

    // Wide path: 16 pixels per pass.
    for (; count >= 16; count -= 16, in += 16*2, dst += 16) {
        // De-interleaving load: val[0] receives the 16 grays, val[1] the 16 alphas.
        const uint8x16x2_t pairs = vld2q_u8(in);
        uint8x16_t gray = pairs.val[0];
        const uint8x16_t alpha = pairs.val[1];

        if (kPremul) {
            // scale() is applied to each 8-lane half, then the halves are rejoined.
            const uint8x8_t lo = scale(vget_low_u8(gray),  vget_low_u8(alpha));
            const uint8x8_t hi = scale(vget_high_u8(gray), vget_high_u8(alpha));
            gray = vcombine_u8(lo, hi);
        }

        // Interleaving store: gray, gray, gray, alpha for each of the 16 pixels.
        uint8x16x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;
        vst4q_u8((uint8_t*) dst, out);
    }

    // Narrow path: at most one pass of 8 pixels.
    if (count >= 8) {
        const uint8x8x2_t pairs = vld2_u8(in);
        uint8x8_t gray = pairs.val[0];
        const uint8x8_t alpha = pairs.val[1];

        if (kPremul) {
            gray = scale(gray, alpha);
        }

        uint8x8x4_t out;
        out.val[0] = gray;
        out.val[1] = gray;
        out.val[2] = gray;
        out.val[3] = alpha;
        vst4_u8((uint8_t*) dst, out);

        in    += 8*2;
        dst   += 8;
        count -= 8;
    }

    // Whatever is left (fewer than 8 pixels) is handled by the portable routine.
    if (kPremul) {
        grayA_to_rgbA_portable(dst, in, count);
    } else {
        grayA_to_RGBA_portable(dst, in, count);
    }
}
4361e06079b259d1091b735492b2f71d9897c14c608msarett
437cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
4381e06079b259d1091b735492b2f71d9897c14c608msarett    expand_grayA<false>(dst, src, count);
4391e06079b259d1091b735492b2f71d9897c14c608msarett}
4401e06079b259d1091b735492b2f71d9897c14c608msarett
441cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
4421e06079b259d1091b735492b2f71d9897c14c608msarett    expand_grayA<true>(dst, src, count);
4431e06079b259d1091b735492b2f71d9897c14c608msarett}
4441e06079b259d1091b735492b2f71d9897c14c608msarett
// Output byte orders that inverted_cmyk_to() can produce.
enum Format { kRGB1, kBGR1 };

// Converts `count` 4-byte (c, m, y, k) pixels from vsrc into 32-bit pixels in dst.
// Each of c, m, y is passed through scale(channel, k); the fourth output byte is 0xFF.
// `format` picks whether the scaled c lands in byte 0 (kRGB1) or byte 2 (kBGR1).
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    for (; count >= 8; count -= 8, src += 8, dst += 8) {
        // De-interleaving load of 8 pixels: val[0..3] = c, m, y, k.
        const uint8x8x4_t cmyk = vld4_u8((const uint8_t*) src);
        const uint8x8_t black = cmyk.val[3];

        const uint8x8_t blue  = scale(cmyk.val[2], black);
        const uint8x8_t green = scale(cmyk.val[1], black);
        const uint8x8_t red   = scale(cmyk.val[0], black);

        // Interleaving store of 8 opaque pixels in the requested order.
        uint8x8x4_t out;
        out.val[3] = vdup_n_u8(0xFF);
        out.val[1] = green;
        if (kBGR1 == format) {
            out.val[2] = red;
            out.val[0] = blue;
        } else {
            out.val[2] = blue;
            out.val[0] = red;
        }
        vst4_u8((uint8_t*) dst, out);
    }

    // Fewer than 8 pixels remain; let the portable routine finish them.
    if (kBGR1 == format) {
        inverted_CMYK_to_BGR1_portable(dst, src, count);
    } else {
        inverted_CMYK_to_RGB1_portable(dst, src, count);
    }
}
484c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
485cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
486c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_cmyk_to<kRGB1>(dst, src, count);
487c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
488c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
489cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
490c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_cmyk_to<kBGR1>(dst, src, count);
491c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
492c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
49353b9d29b973f2828624f097bf110f1c7acc4b593msarett#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
49453b9d29b973f2828624f097bf110f1c7acc4b593msarett
// Lane-wise "multiply two bytes and divide by 255, rounding".
// x and y are treated as eight 16-bit lanes, each of which must hold a value no larger
// than 8 bits.
static __m128i scale(__m128i x, __m128i y) {
    // Relies on (p+127)/255 == ((p+128)*257)>>16, valid for 0 <= p <= 255*255.
    const __m128i product = _mm_mullo_epi16(x, y);
    const __m128i biased  = _mm_add_epi16(product, _mm_set1_epi16(128));

    // Keeping the high 16 bits of the 32-bit product performs the >>16.
    return _mm_mulhi_epu16(biased, _mm_set1_epi16(257));
}
504095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
50553b9d29b973f2828624f097bf110f1c7acc4b593msaretttemplate <bool kSwapRB>
5068bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtkleinstatic void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
5078bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto src = (const uint32_t*)vsrc;
50853b9d29b973f2828624f097bf110f1c7acc4b593msarett
50953b9d29b973f2828624f097bf110f1c7acc4b593msarett    auto premul8 = [](__m128i* lo, __m128i* hi) {
51053b9d29b973f2828624f097bf110f1c7acc4b593msarett        const __m128i zeros = _mm_setzero_si128();
51153b9d29b973f2828624f097bf110f1c7acc4b593msarett        __m128i planar;
51253b9d29b973f2828624f097bf110f1c7acc4b593msarett        if (kSwapRB) {
51353b9d29b973f2828624f097bf110f1c7acc4b593msarett            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
51453b9d29b973f2828624f097bf110f1c7acc4b593msarett        } else {
51553b9d29b973f2828624f097bf110f1c7acc4b593msarett            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
51653b9d29b973f2828624f097bf110f1c7acc4b593msarett        }
51753b9d29b973f2828624f097bf110f1c7acc4b593msarett
51853b9d29b973f2828624f097bf110f1c7acc4b593msarett        // Swizzle the pixels to 8-bit planar.
5198bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
5208bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
5218bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
5228bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
52353b9d29b973f2828624f097bf110f1c7acc4b593msarett
52453b9d29b973f2828624f097bf110f1c7acc4b593msarett        // Unpack to 16-bit planar.
5258bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
5268bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
5278bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
5288bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
52953b9d29b973f2828624f097bf110f1c7acc4b593msarett
530095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        // Premultiply!
531095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        r = scale(r, a);
532095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        g = scale(g, a);
533095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        b = scale(b, a);
53453b9d29b973f2828624f097bf110f1c7acc4b593msarett
53553b9d29b973f2828624f097bf110f1c7acc4b593msarett        // Repack into interlaced pixels.
5368bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
5378bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
5388bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
5398bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
54053b9d29b973f2828624f097bf110f1c7acc4b593msarett    };
54153b9d29b973f2828624f097bf110f1c7acc4b593msarett
54253b9d29b973f2828624f097bf110f1c7acc4b593msarett    while (count >= 8) {
54353b9d29b973f2828624f097bf110f1c7acc4b593msarett        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
54453b9d29b973f2828624f097bf110f1c7acc4b593msarett                hi = _mm_loadu_si128((const __m128i*) (src + 4));
54553b9d29b973f2828624f097bf110f1c7acc4b593msarett
54653b9d29b973f2828624f097bf110f1c7acc4b593msarett        premul8(&lo, &hi);
54753b9d29b973f2828624f097bf110f1c7acc4b593msarett
54853b9d29b973f2828624f097bf110f1c7acc4b593msarett        _mm_storeu_si128((__m128i*) (dst + 0), lo);
54953b9d29b973f2828624f097bf110f1c7acc4b593msarett        _mm_storeu_si128((__m128i*) (dst + 4), hi);
55053b9d29b973f2828624f097bf110f1c7acc4b593msarett
55153b9d29b973f2828624f097bf110f1c7acc4b593msarett        src += 8;
55253b9d29b973f2828624f097bf110f1c7acc4b593msarett        dst += 8;
55353b9d29b973f2828624f097bf110f1c7acc4b593msarett        count -= 8;
55453b9d29b973f2828624f097bf110f1c7acc4b593msarett    }
55553b9d29b973f2828624f097bf110f1c7acc4b593msarett
55653b9d29b973f2828624f097bf110f1c7acc4b593msarett    if (count >= 4) {
55753b9d29b973f2828624f097bf110f1c7acc4b593msarett        __m128i lo = _mm_loadu_si128((const __m128i*) src),
55853b9d29b973f2828624f097bf110f1c7acc4b593msarett                hi = _mm_setzero_si128();
55953b9d29b973f2828624f097bf110f1c7acc4b593msarett
56053b9d29b973f2828624f097bf110f1c7acc4b593msarett        premul8(&lo, &hi);
56153b9d29b973f2828624f097bf110f1c7acc4b593msarett
56253b9d29b973f2828624f097bf110f1c7acc4b593msarett        _mm_storeu_si128((__m128i*) dst, lo);
56353b9d29b973f2828624f097bf110f1c7acc4b593msarett
56453b9d29b973f2828624f097bf110f1c7acc4b593msarett        src += 4;
56553b9d29b973f2828624f097bf110f1c7acc4b593msarett        dst += 4;
56653b9d29b973f2828624f097bf110f1c7acc4b593msarett        count -= 4;
56753b9d29b973f2828624f097bf110f1c7acc4b593msarett    }
56853b9d29b973f2828624f097bf110f1c7acc4b593msarett
56953b9d29b973f2828624f097bf110f1c7acc4b593msarett    // Call portable code to finish up the tail of [0,4) pixels.
5708bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
57153b9d29b973f2828624f097bf110f1c7acc4b593msarett    proc(dst, src, count);
57253b9d29b973f2828624f097bf110f1c7acc4b593msarett}
57353b9d29b973f2828624f097bf110f1c7acc4b593msarett
574cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
5758bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    premul_should_swapRB<false>(dst, src, count);
57653b9d29b973f2828624f097bf110f1c7acc4b593msarett}
57753b9d29b973f2828624f097bf110f1c7acc4b593msarett
578cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
5798bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    premul_should_swapRB<true>(dst, src, count);
58053b9d29b973f2828624f097bf110f1c7acc4b593msarett}
58153b9d29b973f2828624f097bf110f1c7acc4b593msarett
582cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
5838bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    auto src = (const uint32_t*)vsrc;
58453b9d29b973f2828624f097bf110f1c7acc4b593msarett    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
58553b9d29b973f2828624f097bf110f1c7acc4b593msarett
58653b9d29b973f2828624f097bf110f1c7acc4b593msarett    while (count >= 4) {
5878bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
5888bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
5898bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein        _mm_storeu_si128((__m128i*) dst, bgra);
59053b9d29b973f2828624f097bf110f1c7acc4b593msarett
59153b9d29b973f2828624f097bf110f1c7acc4b593msarett        src += 4;
59253b9d29b973f2828624f097bf110f1c7acc4b593msarett        dst += 4;
59353b9d29b973f2828624f097bf110f1c7acc4b593msarett        count -= 4;
59453b9d29b973f2828624f097bf110f1c7acc4b593msarett    }
59553b9d29b973f2828624f097bf110f1c7acc4b593msarett
5968bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    RGBA_to_BGRA_portable(dst, src, count);
59753b9d29b973f2828624f097bf110f1c7acc4b593msarett}
59853b9d29b973f2828624f097bf110f1c7acc4b593msarett
59913aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsaretttemplate <bool kSwapRB>
60013aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarettstatic void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
60113aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    const uint8_t* src = (const uint8_t*) vsrc;
60213aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
60313aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
60413aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    __m128i expand;
60513aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
60613aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    if (kSwapRB) {
60713aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
60813aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    } else {
60913aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
61013aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    }
61113aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
61213aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    while (count >= 6) {
61313aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        // Load a vector.  While this actually contains 5 pixels plus an
61413aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        // extra component, we will discard all but the first four pixels on
61513aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        // this iteration.
61613aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        __m128i rgb = _mm_loadu_si128((const __m128i*) src);
61713aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
61813aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        // Expand the first four pixels to RGBX and then mask to RGB(FF).
61913aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
62013aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
62113aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        // Store 4 pixels.
62213aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        _mm_storeu_si128((__m128i*) dst, rgba);
62313aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
62413aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        src += 4*3;
62513aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        dst += 4;
62613aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett        count -= 4;
62713aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    }
62813aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
62913aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    // Call portable code to finish up the tail of [0,4) pixels.
63013aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
63113aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    proc(dst, src, count);
63213aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett}
63313aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett
634cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
63513aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    insert_alpha_should_swaprb<false>(dst, src, count);
636f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
637f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
638cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
63913aa1a5ad97156e35184970fc1ce1aaf3c50c91cmsarett    insert_alpha_should_swaprb<true>(dst, src, count);
640f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
641f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
642cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
6430700651128f8c505da65e651f9788589593f07c4msarett    const uint8_t* src = (const uint8_t*) vsrc;
6440700651128f8c505da65e651f9788589593f07c4msarett
6450700651128f8c505da65e651f9788589593f07c4msarett    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
6460700651128f8c505da65e651f9788589593f07c4msarett    while (count >= 16) {
6470700651128f8c505da65e651f9788589593f07c4msarett        __m128i grays = _mm_loadu_si128((const __m128i*) src);
6480700651128f8c505da65e651f9788589593f07c4msarett
6490700651128f8c505da65e651f9788589593f07c4msarett        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
6500700651128f8c505da65e651f9788589593f07c4msarett        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
6510700651128f8c505da65e651f9788589593f07c4msarett        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
6520700651128f8c505da65e651f9788589593f07c4msarett        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
6530700651128f8c505da65e651f9788589593f07c4msarett
6540700651128f8c505da65e651f9788589593f07c4msarett        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
6550700651128f8c505da65e651f9788589593f07c4msarett        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
6560700651128f8c505da65e651f9788589593f07c4msarett        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
6570700651128f8c505da65e651f9788589593f07c4msarett        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
6580700651128f8c505da65e651f9788589593f07c4msarett
6590700651128f8c505da65e651f9788589593f07c4msarett        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
6600700651128f8c505da65e651f9788589593f07c4msarett        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
6610700651128f8c505da65e651f9788589593f07c4msarett        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
6620700651128f8c505da65e651f9788589593f07c4msarett        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
6630700651128f8c505da65e651f9788589593f07c4msarett
6640700651128f8c505da65e651f9788589593f07c4msarett        src += 16;
6650700651128f8c505da65e651f9788589593f07c4msarett        dst += 16;
6660700651128f8c505da65e651f9788589593f07c4msarett        count -= 16;
6670700651128f8c505da65e651f9788589593f07c4msarett    }
6680700651128f8c505da65e651f9788589593f07c4msarett
6692eff71c9b5f984b58961e5a6b4e66774c4385224msarett    gray_to_RGB1_portable(dst, src, count);
6702eff71c9b5f984b58961e5a6b4e66774c4385224msarett}
6712eff71c9b5f984b58961e5a6b4e66774c4385224msarett
672cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
673095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    const uint8_t* src = (const uint8_t*) vsrc;
674095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    while (count >= 8) {
675095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ga = _mm_loadu_si128((const __m128i*) src);
676095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
677095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
678095742419d0277a4fb0d499a05ff29b7506f1c5emsarett                                  _mm_slli_epi16(ga, 8));
679095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
680095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
681095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
682095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
683095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
684095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
685095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
686095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        src += 8*2;
687095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        dst += 8;
688095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        count -= 8;
689095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    }
690095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
6911e06079b259d1091b735492b2f71d9897c14c608msarett    grayA_to_RGBA_portable(dst, src, count);
6921e06079b259d1091b735492b2f71d9897c14c608msarett}
6931e06079b259d1091b735492b2f71d9897c14c608msarett
694cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
695095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    const uint8_t* src = (const uint8_t*) vsrc;
696095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    while (count >= 8) {
697095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i grayA = _mm_loadu_si128((const __m128i*) src);
698095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
699095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
700095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i a0 = _mm_srli_epi16(grayA, 8);
701095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
702095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        // Premultiply
703095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        g0 = scale(g0, a0);
704095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
705095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
706095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
707095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
708095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
709095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
710095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
711095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
712095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
713095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
714095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
715095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        src += 8*2;
716095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        dst += 8;
717095742419d0277a4fb0d499a05ff29b7506f1c5emsarett        count -= 8;
718095742419d0277a4fb0d499a05ff29b7506f1c5emsarett    }
719095742419d0277a4fb0d499a05ff29b7506f1c5emsarett
7201e06079b259d1091b735492b2f71d9897c14c608msarett    grayA_to_rgbA_portable(dst, src, count);
7211e06079b259d1091b735492b2f71d9897c14c608msarett}
7221e06079b259d1091b735492b2f71d9897c14c608msarett
723c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarettenum Format { kRGB1, kBGR1 };
724c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msaretttemplate <Format format>
725c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarettstatic void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
726c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    auto src = (const uint32_t*)vsrc;
727c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
728c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    auto convert8 = [](__m128i* lo, __m128i* hi) {
729c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        const __m128i zeros = _mm_setzero_si128();
730c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i planar;
731c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        if (kBGR1 == format) {
732c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
733c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        } else {
734c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
735c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        }
736c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
737c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        // Swizzle the pixels to 8-bit planar.
738c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
739c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
740c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
741c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
742c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
743c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        // Unpack to 16-bit planar.
744c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
745c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
746c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
747c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
748c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
749c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        // Scale to r, g, b.
750c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i r = scale(c, k),
751c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                g = scale(m, k),
752c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                b = scale(y, k);
753c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
754c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        // Repack into interlaced pixels.
755c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
756c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
757c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
758c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
759c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    };
760c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
761c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    while (count >= 8) {
762c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
763c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                hi = _mm_loadu_si128((const __m128i*) (src + 4));
764c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
765c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        convert8(&lo, &hi);
766c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
767c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        _mm_storeu_si128((__m128i*) (dst + 0), lo);
768c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        _mm_storeu_si128((__m128i*) (dst + 4), hi);
769c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
770c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        src += 8;
771c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        dst += 8;
772c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        count -= 8;
773c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    }
774c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
775c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    if (count >= 4) {
776c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        __m128i lo = _mm_loadu_si128((const __m128i*) src),
777c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett                hi = _mm_setzero_si128();
778c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
779c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        convert8(&lo, &hi);
780c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
781c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        _mm_storeu_si128((__m128i*) dst, lo);
782c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
783c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        src += 4;
784c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        dst += 4;
785c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett        count -= 4;
786c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    }
787c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
788c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
789c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    proc(dst, src, count);
790c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
791c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
792cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
793c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_cmyk_to<kRGB1>(dst, src, count);
794c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
795c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
796cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
797c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_cmyk_to<kBGR1>(dst, src, count);
798c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
799c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
8003a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett#else
8013a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
802cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
8038bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    RGBA_to_rgbA_portable(dst, src, count);
8043a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
8053a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
806cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
8078bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    RGBA_to_bgrA_portable(dst, src, count);
8083a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
8093a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
810cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
8118bf7b79cf9776b4edb3f6810e5ab8c80c49d3480mtklein    RGBA_to_BGRA_portable(dst, src, count);
8123a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
8133a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
814cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
815f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    RGB_to_RGB1_portable(dst, src, count);
816f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
817f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
818cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
819f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett    RGB_to_BGR1_portable(dst, src, count);
820f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett}
821f1b8b6ae34e5a1f4b29e423401da39f88f0c117amsarett
822cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
8232eff71c9b5f984b58961e5a6b4e66774c4385224msarett    gray_to_RGB1_portable(dst, src, count);
8242eff71c9b5f984b58961e5a6b4e66774c4385224msarett}
8252eff71c9b5f984b58961e5a6b4e66774c4385224msarett
826cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
8271e06079b259d1091b735492b2f71d9897c14c608msarett    grayA_to_RGBA_portable(dst, src, count);
8281e06079b259d1091b735492b2f71d9897c14c608msarett}
8291e06079b259d1091b735492b2f71d9897c14c608msarett
830cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
8311e06079b259d1091b735492b2f71d9897c14c608msarett    grayA_to_rgbA_portable(dst, src, count);
8321e06079b259d1091b735492b2f71d9897c14c608msarett}
8331e06079b259d1091b735492b2f71d9897c14c608msarett
834cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
835c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_CMYK_to_RGB1_portable(dst, src, count);
836c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
837c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
838cd71f115a846332d95b29fbeed3f315d8c01753dMike Klein/*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
839c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett    inverted_CMYK_to_BGR1_portable(dst, src, count);
840c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett}
841c5c322d8ecfc05718f9f04360956c4f1f9dc33c1msarett
84203108de163354fa574679ad153b58ce57126b2bamsarett#endif
84303108de163354fa574679ad153b58ce57126b2bamsarett
8443a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett}
8453a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett
8463a24f459582f2665f0e66bd35a0d8f46a1c4c72fmsarett#endif // SkSwizzler_opts_DEFINED
847