19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* 298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project 3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * 4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be 5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file. 69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 88c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#include <emmintrin.h> 983ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h" 108c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#include "SkBlitRow_opts_SSE2.h" 119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h" 12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h" 13275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#include "SkDither.h" 14c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h" 159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 4298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 4398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Move scale factors to upper byte of word 4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 5298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Interleave Atom port 0/1 operations based on the execution port 5398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // constraints that multiply can only be executed on port 0 (while 5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // boolean operations can be executed on either port 0 or port 1) 5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // because GCC currently doesn't do a good job scheduling 5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // instructions based on these constraints. 5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 5998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 6298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Multiply by scale. 6398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * scale, and 6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // bs.h the higher byte of b * scale. 6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Get alpha and green pixels into higher byte of each word. 6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by scale. 7398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, as.l, gs.h, gs.l)) 7498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 7698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Clear the lower byte of the a*scale and g*scale results 7798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_and_si128(src_ag, ag_mask); 7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Operations the destination pixels are the same as on the 8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // source pixels. See the comments above. 8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_and_si128(dst_ag, ag_mask); 87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 8998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, rs.h, gs.h, bs.h)) 90dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 1029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1044e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org while (count > 0) { 1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1124e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1134e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 1159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha == 255); 1169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 1319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 141f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 1808c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org#else 181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 189f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 191f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i alpha = _mm_srli_epi16(src_pixel, 8); 193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, g2, a3, g3) 195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflehi_epi16(alpha, 0xF5); 196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, a2, a3, a3) 198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflelo_epi16(alpha, 0xF5); 199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out high bits (already in the right place) 212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif 225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2374e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2384e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 2409272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 2419272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 25798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 27498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // After the following two statements, the dst_alpha looks like 27598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 28098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Because src_scales are in the higher byte of each word and 28198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // we use mulhi here, the resulting alpha values are already 28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // in the right place and don't need to be divided by 256. 28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by global alpha. 29598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 29698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * src_scale, 29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // and bs.h the higher byte of b * src_scale. 29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Again, because we use mulhi, the resuling red and blue 29998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // values are already in the right place and don't need to 30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // be divided by 256. 30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 302dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by global alpha. 30398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, as.h, 0, gs.h)) 30498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 305dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 306dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 31198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Shift alpha and green to higher byte of each word. 31298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_slli_epi16(src_ag, 8); 314dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 315dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add two pixels into result. 320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 3289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 329dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 3309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 3319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 337c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32() 339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */ 341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkPMColor color) { 343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count <= 0) { 344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org return; 345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (0 == color) { 348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (src != dst) { 349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org memcpy(dst, src, count * sizeof(SkPMColor)); 350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 351c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com return; 352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned colorA = SkGetPackedA32(color); 355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (255 == colorA) { 356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org sk_memset32(dst, color, count); 357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } else { 358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned scale = 256 - SkAlpha255To256(colorA); 359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count >= 4) { 361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src++; 365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst++; 366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(scale); 373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i color_wide = _mm_set1_epi32(color); 374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count >= 4) { 375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Load 4 pixels each of src and dest. 376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 380981d4798007b91e2e19c13b171583927a56df63breed@google.com 381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get alpha and green into lower byte of each word. 382c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Multiply by scale. 385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Divide by 256. 389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Combine back into RGBA. 393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Add color to result. 396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i result = _mm_add_epi8(color_wide, src_pixel); 397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Store result. 399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org _mm_store_si128(d, result); 400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org s++; 401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org d++; 402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count -= 4; 403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 4068c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org } 407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count > 0) { 409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src += 1; 411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst += 1; 412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 413981d4798007b91e2e19c13b171583927a56df63breed@google.com } 414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 415c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org} 416981d4798007b91e2e19c13b171583927a56df63breed@google.com 417edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 418edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com size_t maskRB, SkColor origColor, 419d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com int width, int height) { 420ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com SkPMColor color = SkPreMultiplyColor(origColor); 421981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t dstOffset = dstRB - (width << 2); 422981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t maskOffset = maskRB - width; 423981d4798007b91e2e19c13b171583927a56df63breed@google.com SkPMColor* dst = (SkPMColor *)device; 424edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com const uint8_t* mask = (const uint8_t*)maskPtr; 425981d4798007b91e2e19c13b171583927a56df63breed@google.com do { 426981d4798007b91e2e19c13b171583927a56df63breed@google.com int count = width; 427981d4798007b91e2e19c13b171583927a56df63breed@google.com if (count >= 4) { 428981d4798007b91e2e19c13b171583927a56df63breed@google.com while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 429981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst = SkBlendARGB32(color, *dst, *mask); 430981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 431981d4798007b91e2e19c13b171583927a56df63breed@google.com dst++; 432981d4798007b91e2e19c13b171583927a56df63breed@google.com count--; 433981d4798007b91e2e19c13b171583927a56df63breed@google.com } 434981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 435981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 436981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_256 = _mm_set1_epi16(256); 437981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_1 = _mm_set1_epi16(1); 438981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_pixel = _mm_set1_epi32(color); 439981d4798007b91e2e19c13b171583927a56df63breed@google.com while (count >= 4) { 440981d4798007b91e2e19c13b171583927a56df63breed@google.com // Load 4 pixels each of src and dest. 441981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_pixel = _mm_load_si128(d); 442981d4798007b91e2e19c13b171583927a56df63breed@google.com 443981d4798007b91e2e19c13b171583927a56df63breed@google.com //set the aphla value 444981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 445981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *(mask+3),0, \ 446981d4798007b91e2e19c13b171583927a56df63breed@google.com *(mask+2),0, *(mask+2),\ 447981d4798007b91e2e19c13b171583927a56df63breed@google.com 0,*(mask+1), 0,*(mask+1),\ 448981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *mask,0,*mask); 449981d4798007b91e2e19c13b171583927a56df63breed@google.com 450981d4798007b91e2e19c13b171583927a56df63breed@google.com //call SkAlpha255To256() 451981d4798007b91e2e19c13b171583927a56df63breed@google.com src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 452981d4798007b91e2e19c13b171583927a56df63breed@google.com 453981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get red and blue pixels into lower byte of each word. 454981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 455981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 456981d4798007b91e2e19c13b171583927a56df63breed@google.com 457981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get alpha and green into lower byte of each word. 458981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 459981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 460981d4798007b91e2e19c13b171583927a56df63breed@google.com 461981d4798007b91e2e19c13b171583927a56df63breed@google.com // Put per-pixel alpha in low byte of each word. 462981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 463981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 464981d4798007b91e2e19c13b171583927a56df63breed@google.com 465981d4798007b91e2e19c13b171583927a56df63breed@google.com // dst_alpha = dst_alpha * src_scale 466981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 467981d4798007b91e2e19c13b171583927a56df63breed@google.com 468981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 469981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_srli_epi16(dst_alpha, 8); 470981d4798007b91e2e19c13b171583927a56df63breed@google.com 471981d4798007b91e2e19c13b171583927a56df63breed@google.com // Subtract alphas from 256, to get 1..256 472981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 473981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by dst pixel alpha. 474981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 475981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by dst pixel alpha. 476981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 477981d4798007b91e2e19c13b171583927a56df63breed@google.com 478981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by global alpha. 479981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 480981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by global alpha. 481981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 482981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 483981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_srli_epi16(dst_rb, 8); 484981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_srli_epi16(src_rb, 8); 485981d4798007b91e2e19c13b171583927a56df63breed@google.com 486981d4798007b91e2e19c13b171583927a56df63breed@google.com // Mask out low bits (goodies already in the right place; no need to divide) 487981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 488981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_andnot_si128(rb_mask, src_ag); 489981d4798007b91e2e19c13b171583927a56df63breed@google.com 490981d4798007b91e2e19c13b171583927a56df63breed@google.com // Combine back into RGBA. 491981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_pixel = _mm_or_si128(dst_rb, dst_ag); 492981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 493981d4798007b91e2e19c13b171583927a56df63breed@google.com 494981d4798007b91e2e19c13b171583927a56df63breed@google.com // Add two pixels into result. 495981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 496981d4798007b91e2e19c13b171583927a56df63breed@google.com _mm_store_si128(d, result); 497981d4798007b91e2e19c13b171583927a56df63breed@google.com // load the next 4 pixel 498981d4798007b91e2e19c13b171583927a56df63breed@google.com mask = mask + 4; 499981d4798007b91e2e19c13b171583927a56df63breed@google.com d++; 500981d4798007b91e2e19c13b171583927a56df63breed@google.com count -= 4; 501981d4798007b91e2e19c13b171583927a56df63breed@google.com } 502981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = reinterpret_cast<SkPMColor *>(d); 503981d4798007b91e2e19c13b171583927a56df63breed@google.com } 5048c4953c6f176469ad287c3270ab146e292b23badcommit-bot@chromium.org while (count > 0) { 505981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst= SkBlendARGB32(color, *dst, *mask); 506981d4798007b91e2e19c13b171583927a56df63breed@google.com dst += 1; 507981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 508981d4798007b91e2e19c13b171583927a56df63breed@google.com count --; 509981d4798007b91e2e19c13b171583927a56df63breed@google.com } 510981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = (SkPMColor *)((char*)dst + dstOffset); 511981d4798007b91e2e19c13b171583927a56df63breed@google.com mask += maskOffset; 512981d4798007b91e2e19c13b171583927a56df63breed@google.com } while (--height != 0); 513981d4798007b91e2e19c13b171583927a56df63breed@google.com} 514d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 5158cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// The following (left) shifts cause the top 5 bits of the mask components to 5168cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// line up with the corresponding components in an SkPMColor. 5178cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// Note that the mask's RGB16 order may differ from the SkPMColor order. 5188cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 5198cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 5208cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 5218cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5228cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_R16x5_R32x5_SHIFT == 0 5238cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 5248cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_R16x5_R32x5_SHIFT > 0 5258cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 5268cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5278cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 5288cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5298cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5308cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_G16x5_G32x5_SHIFT == 0 5318cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 5328cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_G16x5_G32x5_SHIFT > 0 5338cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 5348cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5358cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 5368cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5378cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5388cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_B16x5_B32x5_SHIFT == 0 5398cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 5408cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_B16x5_B32x5_SHIFT > 0 5418cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 5428cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5438cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 5448cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5458cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 54676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 54776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i &mask, __m128i &srcA) { 54876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (compressed three channels) interleaved with zeros. 56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 564d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 5668cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 5678cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 5688cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 56976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 5718cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 572fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 57376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 5748cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 5758cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 576fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 577d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 57876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 57976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 58076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 582d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 583d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 584fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 58576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 58676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 587d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 589d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 59076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 591d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 592d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 59376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 59476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 59576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 597d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 598d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 599d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Multiply each component of maskLo and maskHi by srcA 60176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, srcA); 60276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, srcA); 603d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left shift mask components by 8 (divide by 256) 605d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srli_epi16(maskLo, 8); 606d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srli_epi16(maskHi, 8); 607d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 60976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 610d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 612d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 613d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 61576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 61676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 617d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 619d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 620d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 621d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 62376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 625d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 626d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 62776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Pack into 4 32bit dst pixels. 62876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 62976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. 631d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return _mm_packus_epi16(resultLo, resultHi); 632d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 633d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 63476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i &mask) { 63676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 63776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 63876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (shown as high and low bytes) interleaved with 64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // zeros 64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 650d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 6528cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 6538cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 654d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 65576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 6568cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 6578cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 658fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 65976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 6608cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 6618cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 662d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 663d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 66476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 66576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 66676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 668d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 669d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 670fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 67176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 67276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 673d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 675d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 67676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 677d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 678d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 67976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 68076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 68176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 683d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 684d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 685d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 68676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 68776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 688d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 690d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 691d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 69376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 69476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 695d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 697d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 698d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 699d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 70176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 703d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 704d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 70527123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com // Pack into 4 32bit dst pixels and force opaque. 70676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 70776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 70876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. Set alpha components to 0xFF. 70927123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 71027123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 711d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 712d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 71376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 71476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor) { 715d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 716d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 717d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 71976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcA = SkColorGetA(src); 72076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 72176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 723fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 724d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com srcA = SkAlpha255To256(srcA); 725d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 726d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 72976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 73076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 731d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 732d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 733d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 73676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 73776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 73876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave with zeros to get two sets of four 16-bit values. 73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i srcA_sse = _mm_set1_epi16(srcA); 743d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 74576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 74676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 752d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 754d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 755d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 756d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 758fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 75976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 76076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 76176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 763d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 764d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 76576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 76676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse, srcA_sse); 767d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 768d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 769d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 77176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 773d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 774d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 77976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 78076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 781d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 782fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 783d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 784d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 785d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 78676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 78776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor opaqueDst) { 788d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 789d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 790d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 79276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 79376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 79476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 795d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 796d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 797d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 79976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 80076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 801d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 802d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 803d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 80676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 80776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 80876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 811d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 81376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 81476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 820d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 822d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 823d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 824d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 826fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 82776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 82876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 82976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 831d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 832d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 83376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 83476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse); 835d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 836d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 837d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 83976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 841d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 842d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 84776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 84876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 849d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 850fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 851d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 852d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 853475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 85439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque() 85539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 85639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */ 85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, int count, 85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org U8CPU alpha, int /*x*/, int /*y*/) { 86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkASSERT(255 == alpha); 86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count <= 0) { 86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org return; 86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count >= 8) { 86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count--; 87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); 87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); 87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); 88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (count >= 8) { 88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Load 8 pixels of src. 88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result r. 88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r1 = _mm_srli_epi32(src_pixel1, 88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r1 = _mm_and_si128(r1, r16_mask); 89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r2 = _mm_srli_epi32(src_pixel2, 89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r2 = _mm_and_si128(r2, r16_mask); 89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r = _mm_packs_epi32(r1, r2); 89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result g. 89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g1 = _mm_srli_epi32(src_pixel1, 89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g1 = _mm_and_si128(g1, g16_mask); 89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g2 = _mm_srli_epi32(src_pixel2, 90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g2 = _mm_and_si128(g2, g16_mask); 90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g = _mm_packs_epi32(g1, g2); 90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result b. 90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b1 = _mm_srli_epi32(src_pixel1, 90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b1 = _mm_and_si128(b1, b16_mask); 90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b2 = _mm_srli_epi32(src_pixel2, 90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b2 = _mm_and_si128(b2, b16_mask); 91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b = _mm_packs_epi32(b1, b2); 91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Store 8 16-bit colors in dst. 914c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE2(r, g, b); 91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count -= 8; 91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count > 0) { 92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org do { 92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } while (--count != 0); 92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org} 93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 931475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque() 932475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 933475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */ 934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int count, U8CPU alpha, int /*x*/, int /*y*/) { 937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkASSERT(255 == alpha); 938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count <= 0) { 940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org return; 941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count >= 8) { 944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Make dst 16 bytes alignment 945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count--; 952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i var255 = _mm_set1_epi16(255); 957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (count >= 8) { 962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of src. 963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Check whether src pixels are equal to 0 and get the highest bit 967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // of each byte of result, if src pixels are all zero, src_cmp1 and 968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // src_cmp2 will be 0xFFFF. 969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org d++; 975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org continue; 977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of dst. 980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dst_pixel = _mm_load_si128(d); 981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract A from src. 983c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa1 = _mm_srli_epi32(sa1, 24); 985c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa2 = _mm_srli_epi32(sa2, 24); 987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa = _mm_packs_epi32(sa1, sa2); 988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R from src. 990c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 992c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract G from src. 997c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 999c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract B from src. 1004c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1006c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R G B from dst. 1011c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_and_si128(dr, r16_mask); 1013c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_and_si128(dg, g16_mask); 1015c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_and_si128(db, b16_mask); 1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Calculate R G B of result. 1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Original algorithm is in SkSrcOver32To16(). 1022c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)); 1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 1024c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)); 1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 1026c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)); 1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Pack R G B into 16-bit color. 1030c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); 1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Store 8 16-bit colors in dst. 1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count > 0) { 1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org do { 1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColorAssert(c); 1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } while (--count != 0); 1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org} 1052275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1053275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.orgvoid S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 1054275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 1055275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org int count, U8CPU alpha, int x, int y) { 1056275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkASSERT(255 == alpha); 1057275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1058275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count <= 0) { 1059275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org return; 1060275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1061275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1062275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count >= 8) { 1063275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 1064275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_565_SCAN(y); 1065275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColor c = *src++; 1066275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColorAssert(c); 1067275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1068275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned dither = DITHER_VALUE(x); 1069275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org *dst++ = SkDitherRGB32To565(c, dither); 1070275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_INC_X(x); 1071275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org count--; 1072275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1073275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1074275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned short dither_value[8]; 1075275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i dither; 1076275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4 1077275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 1078275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 1079275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 1080275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 1081275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 1082275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#else 1083275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 1084275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[0] = dither_value[4] = (dither_scan 1085275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x) & 3) << 2)) & 0xF; 1086275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[1] = dither_value[5] = (dither_scan 1087275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 1) & 3) << 2)) & 0xF; 1088275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[2] = dither_value[6] = (dither_scan 1089275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 2) & 3) << 2)) & 0xF; 1090275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither_value[3] = dither_value[7] = (dither_scan 1091275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org >> (((x + 3) & 3) << 2)) & 0xF; 1092275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org#endif 1093275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dither = _mm_loadu_si128((__m128i*) dither_value); 1094275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1095275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 1096275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 1097275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1098275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org while (count >= 8) { 1099275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Load 8 pixels of src. 1100275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 1101275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 1102275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1103275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract R from src. 1104275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 1105275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 1106275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 1107275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 1108275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 1109275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1110275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sr, dither) 1111275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sr_offset = _mm_srli_epi16(sr, 5); 1112275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_add_epi16(sr, dither); 1113275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_sub_epi16(sr, sr_offset); 1114275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); 1115275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1116275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract G from src. 1117275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 1118275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 1119275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 1120275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1121275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1122275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1123275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sg, dither) 1124275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sg_offset = _mm_srli_epi16(sg, 6); 1125275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); 1126275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_sub_epi16(sg, sg_offset); 1127275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); 1128275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1129275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Extract B from src. 1130275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 1131275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1132275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 1133275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1134275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1135275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1136275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // SkDITHER_R32To565(sb, dither) 1137275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org __m128i sb_offset = _mm_srli_epi16(sb, 5); 1138275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_add_epi16(sb, dither); 1139275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_sub_epi16(sb, sb_offset); 1140275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); 1141275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1142275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org // Pack and store 16-bit dst pixel. 1143c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); 1144275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1145275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1146275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org count -= 8; 1147275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org x += 8; 1148275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1149275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1150275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1151275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1152275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1153275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1154275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org if (count > 0) { 1155275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_565_SCAN(y); 1156275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org do { 1157275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColor c = *src++; 1158275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org SkPMColorAssert(c); 1159275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org 1160275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org unsigned dither = DITHER_VALUE(x); 1161275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org *dst++ = SkDitherRGB32To565(c, dither); 1162275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org DITHER_INC_X(x); 1163275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } while (--count != 0); 1164275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org } 1165275804782f7b752cc9c25cb556db2a0cfc711dd9commit-bot@chromium.org} 1166fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1167fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque_Dither() 1168fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 1169fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org */ 1170fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.orgvoid S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 1171fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 1172fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org int count, U8CPU alpha, int x, int y) { 1173fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org SkASSERT(255 == alpha); 1174fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1175fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org if (count <= 0) { 1176fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org return; 1177fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1178fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1179fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org if (count >= 8) { 1180fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 1181fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org DITHER_565_SCAN(y); 1182fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org SkPMColor c = *src++; 1183fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org SkPMColorAssert(c); 1184fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org if (c) { 1185fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned a = SkGetPackedA32(c); 1186fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1187fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); 1188fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1189fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sr = SkGetPackedR32(c); 1190fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sg = SkGetPackedG32(c); 1191fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sb = SkGetPackedB32(c); 1192fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr = SkDITHER_R32_FOR_565(sr, d); 1193fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg = SkDITHER_G32_FOR_565(sg, d); 1194fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb = SkDITHER_B32_FOR_565(sb, d); 1195fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1196fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1197fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1198fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1199fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // now src and dst expanded are in g:11 r:10 x:1 b:10 1200fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1201fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1202fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dst += 1; 1203fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org DITHER_INC_X(x); 1204fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org count--; 1205fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1206fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1207fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned short dither_value[8]; 1208fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i dither, dither_cur; 1209fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#ifdef ENABLE_DITHER_MATRIX_4X4 1210fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 1211fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 1212fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 1213fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 1214fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 1215fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#else 1216fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 1217fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[0] = dither_value[4] = (dither_scan 1218fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org >> (((x) & 3) << 2)) & 0xF; 1219fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[1] = dither_value[5] = (dither_scan 1220fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org >> (((x + 1) & 3) << 2)) & 0xF; 1221fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[2] = dither_value[6] = (dither_scan 1222fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org >> (((x + 2) & 3) << 2)) & 0xF; 1223fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_value[3] = dither_value[7] = (dither_scan 1224fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org >> (((x + 3) & 3) << 2)) & 0xF; 1225fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org#endif 1226fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither = _mm_loadu_si128((__m128i*) dither_value); 1227fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1228fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 1229fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 1230fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i var256 = _mm_set1_epi16(256); 1231fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 1232fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 1233fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 1234fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1235fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org while (count >= 8) { 1236fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Load 8 pixels of src and dst. 1237fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 1238fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 1239fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i dst_pixel = _mm_load_si128(d); 1240fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1241fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Extract A from src. 1242c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 1243fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sa1 = _mm_srli_epi32(sa1, 24); 1244c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 1245fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sa2 = _mm_srli_epi32(sa2, 24); 1246fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sa = _mm_packs_epi32(sa1, sa2); 1247fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1248fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Calculate current dither value. 1249fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_cur = _mm_mullo_epi16(dither, 1250fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org _mm_add_epi16(sa, _mm_set1_epi16(1))); 1251fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dither_cur = _mm_srli_epi16(dither_cur, 8); 1252fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1253fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Extract R from src. 1254fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 1255fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 1256fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 1257fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 1258fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 1259fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1260fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // SkDITHER_R32_FOR_565(sr, d) 1261fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sr_offset = _mm_srli_epi16(sr, 5); 1262fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr = _mm_add_epi16(sr, dither_cur); 1263fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr = _mm_sub_epi16(sr, sr_offset); 1264fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1265fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Expand sr. 1266fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr = _mm_slli_epi16(sr, 2); 1267fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1268fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Extract G from src. 1269fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 1270fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 1271fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 1272fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1273fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1274fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1275fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // sg = SkDITHER_G32_FOR_565(sg, d). 1276fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sg_offset = _mm_srli_epi16(sg, 6); 1277fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1)); 1278fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg = _mm_sub_epi16(sg, sg_offset); 1279fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1280fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Expand sg. 1281fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg = _mm_slli_epi16(sg, 3); 1282fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1283fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Extract B from src. 1284fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 1285fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1286fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 1287fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1288fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1289fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1290fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // sb = SkDITHER_B32_FOR_565(sb, d). 1291fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i sb_offset = _mm_srli_epi16(sb, 5); 1292fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb = _mm_add_epi16(sb, dither_cur); 1293fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb = _mm_sub_epi16(sb, sb_offset); 1294fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1295fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Expand sb. 1296fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb = _mm_slli_epi16(sb, 2); 1297fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1298fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Extract R G B from dst. 1299fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 1300fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dr = _mm_and_si128(dr, r16_mask); 1301fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 1302fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dg = _mm_and_si128(dg, g16_mask); 1303fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 1304fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org db = _mm_and_si128(db, b16_mask); 1305fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1306fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // SkAlpha255To256(255 - a) >> 3 1307fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org __m128i isa = _mm_sub_epi16(var256, sa); 1308fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org isa = _mm_srli_epi16(isa, 3); 1309fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1310fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dr = _mm_mullo_epi16(dr, isa); 1311fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dr = _mm_add_epi16(dr, sr); 1312fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dr = _mm_srli_epi16(dr, 5); 1313fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1314fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dg = _mm_mullo_epi16(dg, isa); 1315fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dg = _mm_add_epi16(dg, sg); 1316fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dg = _mm_srli_epi16(dg, 5); 1317fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1318fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org db = _mm_mullo_epi16(db, isa); 1319fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org db = _mm_add_epi16(db, sb); 1320fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org db = _mm_srli_epi16(db, 5); 1321fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1322fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // Package and store dst pixel. 1323c524e98f1edf06b53e65543f5f28217fa13b7aa9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); 1324fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1325fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1326fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org count -= 8; 1327fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org x += 8; 1328fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1329fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1330fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1331fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1332fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1333fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1334fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org if (count > 0) { 1335fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org DITHER_565_SCAN(y); 1336fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org do { 1337fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org SkPMColor c = *src++; 1338fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org SkPMColorAssert(c); 1339fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org if (c) { 1340fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned a = SkGetPackedA32(c); 1341fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1342fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); 1343fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1344fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sr = SkGetPackedR32(c); 1345fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sg = SkGetPackedG32(c); 1346fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org unsigned sb = SkGetPackedB32(c); 1347fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sr = SkDITHER_R32_FOR_565(sr, d); 1348fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sg = SkDITHER_G32_FOR_565(sg, d); 1349fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org sb = SkDITHER_B32_FOR_565(sb, d); 1350fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org 1351fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1352fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1353fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1354fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org // now src and dst expanded are in g:11 r:10 x:1 b:10 1355fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1356fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1357fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org dst += 1; 1358fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org DITHER_INC_X(x); 1359fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } while (--count != 0); 1360fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org } 1361fe089b383aeae512ee39678a667c81867f730cd0commit-bot@chromium.org} 1362