SkBlitRow_opts_SSE2.cpp revision 275804782f7b752cc9c25cb556db2a0cfc711dd9
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* 298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project 3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * 4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be 5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file. 69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 8ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com 94e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h" 1083ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h" 119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h" 12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h" 13275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#include "SkDither.h" 14c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h" 159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h> 179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 234e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Move scale factors to upper byte of word 4798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 4898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Interleave Atom port 0/1 operations based on the execution port 5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // constraints that multiply can only be executed on port 0 (while 5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // boolean operations can be executed on either port 0 or port 1) 5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // because GCC currently doesn't do a good job scheduling 5898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // instructions based on these constraints. 5998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 6198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Multiply by scale. 6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * scale, and 6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // bs.h the higher byte of b * scale. 6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Get alpha and green pixels into higher byte of each word. 7198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 7298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by scale. 7598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, as.l, gs.h, gs.l)) 7698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Clear the lower byte of the a*scale and g*scale results 7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_and_si128(src_ag, ag_mask); 8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Operations the destination pixels are the same as on the 8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // source pixels. See the comments above. 8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 8798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 8898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_and_si128(dst_ag, ag_mask); 89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 9198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, rs.h, gs.h, bs.h)) 92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1064e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org while (count > 0) { 1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 1139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1154e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 1164e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha == 255); 1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 1199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 1209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 1339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 143f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org #else 183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i alpha = _mm_srli_epi16(src_pixel, 8); 195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, g2, a3, g3) 197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflehi_epi16(alpha, 0xF5); 198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, a2, a3, a3) 200f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflelo_epi16(alpha, 0xF5); 201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out high bits (already in the right place) 214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 224dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif 227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 229dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 2389272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2404e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2414e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 2459272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2469272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 25998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 27698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // After the following two statements, the dst_alpha looks like 27798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Because src_scales are in the higher byte of each word and 28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // we use mulhi here, the resulting alpha values are already 28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // in the right place and don't need to be divided by 256. 28598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 28698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by global alpha. 29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * src_scale, 29998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // and bs.h the higher byte of b * src_scale. 30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Again, because we use mulhi, the resuling red and blue 30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // values are already in the right place and don't need to 30298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // be divided by 256. 30398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 304dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by global alpha. 30598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, as.h, 0, gs.h)) 30698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 311dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 312dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Shift alpha and green to higher byte of each word. 31498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 31598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_slli_epi16(src_ag, 8); 316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add two pixels into result. 322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 328dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 329dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 3309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 331dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 3379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 3389272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32() 341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */ 343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkPMColor color) { 345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count <= 0) { 347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org return; 348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (0 == color) { 351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (src != dst) { 352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org memcpy(dst, src, count * sizeof(SkPMColor)); 353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 354c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com return; 355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned colorA = SkGetPackedA32(color); 358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (255 == colorA) { 359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org sk_memset32(dst, color, count); 360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } else { 361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned scale = 256 - SkAlpha255To256(colorA); 362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count >= 4) { 364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src++; 368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst++; 369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(scale); 376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i color_wide = _mm_set1_epi32(color); 377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count >= 4) { 378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Load 4 pixels each of src and dest. 379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 382c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 383981d4798007b91e2e19c13b171583927a56df63breed@google.com 384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get alpha and green into lower byte of each word. 385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Multiply by scale. 388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Divide by 256. 392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Combine back into RGBA. 396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Add color to result. 399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i result = _mm_add_epi8(color_wide, src_pixel); 400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Store result. 402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org _mm_store_si128(d, result); 403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org s++; 404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org d++; 405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count -= 4; 406c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count > 0) { 412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 413c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src += 1; 414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst += 1; 415c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 416981d4798007b91e2e19c13b171583927a56df63breed@google.com } 417c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 418c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org} 419981d4798007b91e2e19c13b171583927a56df63breed@google.com 420edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 421edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com size_t maskRB, SkColor origColor, 422d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com int width, int height) { 423ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com SkPMColor color = SkPreMultiplyColor(origColor); 424981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t dstOffset = dstRB - (width << 2); 425981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t maskOffset = maskRB - width; 426981d4798007b91e2e19c13b171583927a56df63breed@google.com SkPMColor* dst = (SkPMColor *)device; 427edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com const uint8_t* mask = (const uint8_t*)maskPtr; 428981d4798007b91e2e19c13b171583927a56df63breed@google.com do { 429981d4798007b91e2e19c13b171583927a56df63breed@google.com int count = width; 430981d4798007b91e2e19c13b171583927a56df63breed@google.com if (count >= 4) { 431981d4798007b91e2e19c13b171583927a56df63breed@google.com while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 432981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst = SkBlendARGB32(color, *dst, *mask); 433981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 434981d4798007b91e2e19c13b171583927a56df63breed@google.com dst++; 435981d4798007b91e2e19c13b171583927a56df63breed@google.com count--; 436981d4798007b91e2e19c13b171583927a56df63breed@google.com } 437981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 438981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 439981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_256 = _mm_set1_epi16(256); 440981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_1 = _mm_set1_epi16(1); 441981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_pixel = _mm_set1_epi32(color); 442981d4798007b91e2e19c13b171583927a56df63breed@google.com while (count >= 4) { 443981d4798007b91e2e19c13b171583927a56df63breed@google.com // Load 4 pixels each of src and dest. 444981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_pixel = _mm_load_si128(d); 445981d4798007b91e2e19c13b171583927a56df63breed@google.com 446981d4798007b91e2e19c13b171583927a56df63breed@google.com //set the aphla value 447981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 448981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *(mask+3),0, \ 449981d4798007b91e2e19c13b171583927a56df63breed@google.com *(mask+2),0, *(mask+2),\ 450981d4798007b91e2e19c13b171583927a56df63breed@google.com 0,*(mask+1), 0,*(mask+1),\ 451981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *mask,0,*mask); 452981d4798007b91e2e19c13b171583927a56df63breed@google.com 453981d4798007b91e2e19c13b171583927a56df63breed@google.com //call SkAlpha255To256() 454981d4798007b91e2e19c13b171583927a56df63breed@google.com src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 455981d4798007b91e2e19c13b171583927a56df63breed@google.com 456981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get red and blue pixels into lower byte of each word. 457981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 458981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 459981d4798007b91e2e19c13b171583927a56df63breed@google.com 460981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get alpha and green into lower byte of each word. 461981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 462981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 463981d4798007b91e2e19c13b171583927a56df63breed@google.com 464981d4798007b91e2e19c13b171583927a56df63breed@google.com // Put per-pixel alpha in low byte of each word. 465981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 466981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 467981d4798007b91e2e19c13b171583927a56df63breed@google.com 468981d4798007b91e2e19c13b171583927a56df63breed@google.com // dst_alpha = dst_alpha * src_scale 469981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 470981d4798007b91e2e19c13b171583927a56df63breed@google.com 471981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 472981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_srli_epi16(dst_alpha, 8); 473981d4798007b91e2e19c13b171583927a56df63breed@google.com 474981d4798007b91e2e19c13b171583927a56df63breed@google.com // Subtract alphas from 256, to get 1..256 475981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 476981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by dst pixel alpha. 477981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 478981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by dst pixel alpha. 479981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 480981d4798007b91e2e19c13b171583927a56df63breed@google.com 481981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by global alpha. 482981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 483981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by global alpha. 484981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 485981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 486981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_srli_epi16(dst_rb, 8); 487981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_srli_epi16(src_rb, 8); 488981d4798007b91e2e19c13b171583927a56df63breed@google.com 489981d4798007b91e2e19c13b171583927a56df63breed@google.com // Mask out low bits (goodies already in the right place; no need to divide) 490981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 491981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_andnot_si128(rb_mask, src_ag); 492981d4798007b91e2e19c13b171583927a56df63breed@google.com 493981d4798007b91e2e19c13b171583927a56df63breed@google.com // Combine back into RGBA. 494981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_pixel = _mm_or_si128(dst_rb, dst_ag); 495981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 496981d4798007b91e2e19c13b171583927a56df63breed@google.com 497981d4798007b91e2e19c13b171583927a56df63breed@google.com // Add two pixels into result. 498981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 499981d4798007b91e2e19c13b171583927a56df63breed@google.com _mm_store_si128(d, result); 500981d4798007b91e2e19c13b171583927a56df63breed@google.com // load the next 4 pixel 501981d4798007b91e2e19c13b171583927a56df63breed@google.com mask = mask + 4; 502981d4798007b91e2e19c13b171583927a56df63breed@google.com d++; 503981d4798007b91e2e19c13b171583927a56df63breed@google.com count -= 4; 504981d4798007b91e2e19c13b171583927a56df63breed@google.com } 505981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = reinterpret_cast<SkPMColor *>(d); 506981d4798007b91e2e19c13b171583927a56df63breed@google.com } 507981d4798007b91e2e19c13b171583927a56df63breed@google.com while(count > 0) { 508981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst= SkBlendARGB32(color, *dst, *mask); 509981d4798007b91e2e19c13b171583927a56df63breed@google.com dst += 1; 510981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 511981d4798007b91e2e19c13b171583927a56df63breed@google.com count --; 512981d4798007b91e2e19c13b171583927a56df63breed@google.com } 513981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = (SkPMColor *)((char*)dst + dstOffset); 514981d4798007b91e2e19c13b171583927a56df63breed@google.com mask += maskOffset; 515981d4798007b91e2e19c13b171583927a56df63breed@google.com } while (--height != 0); 516981d4798007b91e2e19c13b171583927a56df63breed@google.com} 517d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 5188cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// The following (left) shifts cause the top 5 bits of the mask components to 5198cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// line up with the corresponding components in an SkPMColor. 5208cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// Note that the mask's RGB16 order may differ from the SkPMColor order. 5218cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 5228cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 5238cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 5248cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5258cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_R16x5_R32x5_SHIFT == 0 5268cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 5278cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_R16x5_R32x5_SHIFT > 0 5288cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 5298cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5308cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 5318cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5328cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5338cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_G16x5_G32x5_SHIFT == 0 5348cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 5358cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_G16x5_G32x5_SHIFT > 0 5368cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 5378cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5388cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 5398cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5408cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5418cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_B16x5_B32x5_SHIFT == 0 5428cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 5438cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_B16x5_B32x5_SHIFT > 0 5448cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 5458cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5468cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 5478cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5488cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i &mask, __m128i &srcA) { 55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (compressed three channels) interleaved with zeros. 56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 56476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 56676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 567d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 56876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 5698cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 5718cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 57276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 5738cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 5748cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 575fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 57676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 5778cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 5788cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 579fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 580d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 58276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 58376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 58476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 585d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 586d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 587fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 58976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 590d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 59176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 592d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 59376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 594d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 595d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 59776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 59876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 59976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 600d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 601d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 602d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Multiply each component of maskLo and maskHi by srcA 60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, srcA); 60576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, srcA); 606d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left shift mask components by 8 (divide by 256) 608d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srli_epi16(maskLo, 8); 609d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srli_epi16(maskHi, 8); 610d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 61276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 613d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 61476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 615d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 616d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 61976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 620d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 62176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 623d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 625d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 62676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 627d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 628d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 629d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Pack into 4 32bit dst pixels. 63176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 63276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 63376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. 634d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return _mm_packus_epi16(resultLo, resultHi); 635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 636d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 63776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 638d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i &mask) { 63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (shown as high and low bytes) interleaved with 64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // zeros 65076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 65276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 653d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 65476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 6558cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 6568cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 657d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 65876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 6598cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 6608cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 661fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 66276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 6638cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 6648cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 665d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 666d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 66876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 66976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 67076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 671d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 672d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 673fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 67576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 676d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 67776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 678d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 67976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 680d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 681d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 68376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 68476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 68576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 686d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 687d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 688d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 69076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 691d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 69276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 693d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 694d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 69776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 698d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 701d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 703d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 70476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 705d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 706d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 707d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 70827123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com // Pack into 4 32bit dst pixels and force opaque. 70976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 71076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 71176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. Set alpha components to 0xFF. 71227123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 71327123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 714d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 715d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 71676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 71776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor) { 718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 719d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 720d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 721d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcA = SkColorGetA(src); 72376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 72476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 72576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 726fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com srcA = SkAlpha255To256(srcA); 728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 729d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 730d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 731d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 73276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 73376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 736d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 737d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 738d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave with zeros to get two sets of four 16-bit values. 74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 74376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 74576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i srcA_sse = _mm_set1_epi16(srcA); 746d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 75276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 75476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 755d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 75676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 758d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 759d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 760d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 761fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 76376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 76476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 76576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 766d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 767d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 76876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 76976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse, srcA_sse); 770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 771d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 773d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 77476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 779d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 780d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 781d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 78276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 78376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 784d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 785fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 786d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 787d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 788d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 78976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 79076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor opaqueDst) { 791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 792d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 793d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 794d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 79576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 79676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 79776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 799d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 800d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 801d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 80276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 80376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 806d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 807d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 808d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 81176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 81376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 814d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 82076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 82276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 823d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 82476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 826d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 827d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 828d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 829fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 83176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 83276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 83376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 834d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 835d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 83676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 83776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse); 838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 839d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 841d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 84276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 847d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 848d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 849d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 85076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 85176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 852d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 853fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 854d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 855d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 856475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque() 85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */ 86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, int count, 86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org U8CPU alpha, int /*x*/, int /*y*/) { 86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkASSERT(255 == alpha); 86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count <= 0) { 86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org return; 86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count >= 8) { 87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count--; 87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); 88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); 88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); 88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (count >= 8) { 88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Load 8 pixels of src. 88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result r. 89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r1 = _mm_srli_epi32(src_pixel1, 89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r1 = _mm_and_si128(r1, r16_mask); 89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r2 = _mm_srli_epi32(src_pixel2, 89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r2 = _mm_and_si128(r2, r16_mask); 89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r = _mm_packs_epi32(r1, r2); 89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result g. 89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g1 = _mm_srli_epi32(src_pixel1, 90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g1 = _mm_and_si128(g1, g16_mask); 90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g2 = _mm_srli_epi32(src_pixel2, 90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g2 = _mm_and_si128(g2, g16_mask); 90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g = _mm_packs_epi32(g1, g2); 90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result b. 90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b1 = _mm_srli_epi32(src_pixel1, 90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b1 = _mm_and_si128(b1, b16_mask); 91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b2 = _mm_srli_epi32(src_pixel2, 91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b2 = _mm_and_si128(b2, b16_mask); 91439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b = _mm_packs_epi32(b1, b2); 91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Store 8 16-bit colors in dst. 91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE(r, g, b); 91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count -= 8; 92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count > 0) { 92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org do { 92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } while (--count != 0); 93139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 93239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org} 93339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque() 935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */ 937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int count, U8CPU alpha, int /*x*/, int /*y*/) { 940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkASSERT(255 == alpha); 941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count <= 0) { 943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org return; 944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count >= 8) { 947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Make dst 16 bytes alignment 948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count--; 955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i var255 = _mm_set1_epi16(255); 960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (count >= 8) { 965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of src. 966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Check whether src pixels are equal to 0 and get the highest bit 970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // of each byte of result, if src pixels are all zero, src_cmp1 and 971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // src_cmp2 will be 0xFFFF. 972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org d++; 978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org continue; 980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of dst. 983475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dst_pixel = _mm_load_si128(d); 984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 985475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract A from src. 986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); 987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa1 = _mm_srli_epi32(sa1, 24); 988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); 989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa2 = _mm_srli_epi32(sa2, 24); 990475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa = _mm_packs_epi32(sa1, sa2); 991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 992475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R from src. 993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); 994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); 996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 997475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 999475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract G from src. 1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); 1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); 1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1004475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1006475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract B from src. 1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); 1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); 1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1011475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1013475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R G B from dst. 1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); 1015475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_and_si128(dr, r16_mask); 1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); 1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_and_si128(dg, g16_mask); 1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); 1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_and_si128(db, b16_mask); 1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 1022475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Calculate R G B of result. 1024475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Original algorithm is in SkSrcOver32To16(). 1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); 1026475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); 1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); 1030475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Pack R G B into 16-bit color. 1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); 1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Store 8 16-bit colors in dst. 1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count > 0) { 1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org do { 1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColorAssert(c); 1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 1052475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } while (--count != 0); 1053475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1054475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org} 1055275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1056275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.orgvoid S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 1057275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 1058275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org int count, U8CPU alpha, int x, int y) { 1059275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkASSERT(255 == alpha); 1060275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1061275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count <= 0) { 1062275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org return; 1063275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1064275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1065275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count >= 8) { 1066275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 1067275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_565_SCAN(y); 1068275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColor c = *src++; 1069275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColorAssert(c); 1070275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1071275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned dither = DITHER_VALUE(x); 1072275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org *dst++ = SkDitherRGB32To565(c, dither); 1073275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_INC_X(x); 1074275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org count--; 1075275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1076275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1077275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned short dither_value[8]; 1078275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i dither; 1079275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4 1080275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 1081275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 1082275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 1083275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 1084275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 1085275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#else 1086275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 1087275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[0] = dither_value[4] = (dither_scan 1088275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x) & 3) << 2)) & 0xF; 1089275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[1] = dither_value[5] = (dither_scan 1090275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 1) & 3) << 2)) & 0xF; 1091275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[2] = dither_value[6] = (dither_scan 1092275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 2) & 3) << 2)) & 0xF; 1093275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[3] = dither_value[7] = (dither_scan 1094275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 3) & 3) << 2)) & 0xF; 1095275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#endif 1096275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither = _mm_loadu_si128((__m128i*) dither_value); 1097275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1098275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 1099275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 1100275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1101275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org while (count >= 8) { 1102275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Load 8 pixels of src. 1103275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 1104275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 1105275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1106275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract R from src. 1107275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 1108275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 1109275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 1110275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 1111275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 1112275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1113275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sr, dither) 1114275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr_offset = _mm_srli_epi16(sr, 5); 1115275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_add_epi16(sr, dither); 1116275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_sub_epi16(sr, sr_offset); 1117275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); 1118275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1119275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract G from src. 1120275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 1121275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 1122275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 1123275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1124275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1125275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1126275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sg, dither) 1127275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg_offset = _mm_srli_epi16(sg, 6); 1128275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); 1129275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_sub_epi16(sg, sg_offset); 1130275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); 1131275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1132275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract B from src. 1133275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 1134275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1135275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 1136275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1137275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1138275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1139275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sb, dither) 1140275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb_offset = _mm_srli_epi16(sb, 5); 1141275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_add_epi16(sb, dither); 1142275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_sub_epi16(sb, sb_offset); 1143275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); 1144275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1145275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Pack and store 16-bit dst pixel. 1146275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE(sr, sg, sb); 1147275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1148275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1149275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org count -= 8; 1150275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org x += 8; 1151275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1152275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1153275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1154275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1155275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1156275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1157275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count > 0) { 1158275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_565_SCAN(y); 1159275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org do { 1160275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColor c = *src++; 1161275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColorAssert(c); 1162275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1163275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned dither = DITHER_VALUE(x); 1164275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org *dst++ = SkDitherRGB32To565(c, dither); 1165275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_INC_X(x); 1166275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } while (--count != 0); 1167275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1168275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org} 1169