SkBlitRow_opts_SSE2.cpp revision 39ce33a1facae795eb2f02e35674702de7eb23b5
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* 298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com * Copyright 2012 The Android Open Source Project 3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * 4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be 5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file. 69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 8ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com 94e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h" 1083ecdc3ac69c9208493c4c3fc8ea9f84b1350535caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h" 119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h" 12475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org#include "SkColor_opts_SSE2.h" 13c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h" 149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h> 169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 4398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 4498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 4598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Move scale factors to upper byte of word 4698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 4798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 5398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Interleave Atom port 0/1 operations based on the execution port 5498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // constraints that multiply can only be executed on port 0 (while 5598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // boolean operations can be executed on either port 0 or port 1) 5698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // because GCC currently doesn't do a good job scheduling 5798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // instructions based on these constraints. 5898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 6098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 6398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Multiply by scale. 6498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 6598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * scale, and 6698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // bs.h the higher byte of b * scale. 6798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 6898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 6998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Get alpha and green pixels into higher byte of each word. 7098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 7198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by scale. 7498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, as.l, gs.h, gs.l)) 7598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 7798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Clear the lower byte of the a*scale and g*scale results 7898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 7998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_and_si128(src_ag, ag_mask); 8098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com 8198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Operations the destination pixels are the same as on the 8298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // source pixels. See the comments above. 8398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 8498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 8598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 8698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 8798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_ag = _mm_and_si128(dst_ag, ag_mask); 88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 89dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 9098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, rs.h, gs.h, bs.h)) 91dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 92dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 93dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 94dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 95dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 96dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 97dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 98dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1054e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org while (count > 0) { 1069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 1079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 1089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 1099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 1109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 1129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 1134e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1144e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 1154e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 1169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha == 255); 1179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 1199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 1329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 142f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org #else 182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 190f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 192f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 193f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i alpha = _mm_srli_epi16(src_pixel, 8); 194f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 195f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, g2, a3, g3) 196f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflehi_epi16(alpha, 0xF5); 197f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 198f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, a2, a3, a3) 199f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflelo_epi16(alpha, 0xF5); 200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 209dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out high bits (already in the right place) 213dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 214dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 215dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 216dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 217dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 218dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 219dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 220dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 221dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 222dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 223dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 224dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif 226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 2319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 2329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 2339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 2349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 2359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 2379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2384e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2394e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2404e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 2419272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 2429272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 2439272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 2449272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2459272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 25898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 27598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // After the following two statements, the dst_alpha looks like 27698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 28198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Because src_scales are in the higher byte of each word and 28298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // we use mulhi here, the resulting alpha values are already 28398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // in the right place and don't need to be divided by 256. 28498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 28598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by global alpha. 29698a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 29798a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // where rs.h stands for the higher byte of r * src_scale, 29898a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // and bs.h the higher byte of b * src_scale. 29998a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Again, because we use mulhi, the resuling red and blue 30098a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // values are already in the right place and don't need to 30198a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // be divided by 256. 30298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 303dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by global alpha. 30498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (0, as.h, 0, gs.h)) 30598a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 306dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 307dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 308dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 309dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 310dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 311dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 31298a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // Shift alpha and green to higher byte of each word. 31398a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 31498a5b420aa41b02a4fbf77eaa378e039defc62bbtomhudson@google.com src_ag = _mm_slli_epi16(src_ag, 8); 315dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 316dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 317dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 318dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 319dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 320dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add two pixels into result. 321dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 322dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 323dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 324dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 325dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 326dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 327dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 328dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 3299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 330dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 3319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 3329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 3339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 3349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 3359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 3369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 3379272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32() 340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */ 342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkPMColor color) { 344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count <= 0) { 346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org return; 347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 349c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (0 == color) { 350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (src != dst) { 351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org memcpy(dst, src, count * sizeof(SkPMColor)); 352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 353c909a1ecadd422d91ff97d10ce08865290223b14reed@google.com return; 354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned colorA = SkGetPackedA32(color); 357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (255 == colorA) { 358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org sk_memset32(dst, color, count); 359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } else { 360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned scale = 256 - SkAlpha255To256(colorA); 361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count >= 4) { 363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src++; 367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst++; 368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(scale); 375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i color_wide = _mm_set1_epi32(color); 376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count >= 4) { 377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Load 4 pixels each of src and dest. 378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 382981d4798007b91e2e19c13b171583927a56df63breed@google.com 383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get alpha and green into lower byte of each word. 384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 385c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 386c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Multiply by scale. 387c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 388c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 389c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 390c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Divide by 256. 391c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 392c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 393c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 394c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Combine back into RGBA. 395c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 396c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 397c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Add color to result. 398c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i result = _mm_add_epi8(color_wide, src_pixel); 399c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 400c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Store result. 401c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org _mm_store_si128(d, result); 402c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org s++; 403c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org d++; 404c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count -= 4; 405c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 406c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 407c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 408c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 409c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 410c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count > 0) { 411c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 412c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src += 1; 413c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst += 1; 414c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 415981d4798007b91e2e19c13b171583927a56df63breed@google.com } 416c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 417c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org} 418981d4798007b91e2e19c13b171583927a56df63breed@google.com 419edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 420edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com size_t maskRB, SkColor origColor, 421d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com int width, int height) { 422ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com SkPMColor color = SkPreMultiplyColor(origColor); 423981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t dstOffset = dstRB - (width << 2); 424981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t maskOffset = maskRB - width; 425981d4798007b91e2e19c13b171583927a56df63breed@google.com SkPMColor* dst = (SkPMColor *)device; 426edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com const uint8_t* mask = (const uint8_t*)maskPtr; 427981d4798007b91e2e19c13b171583927a56df63breed@google.com do { 428981d4798007b91e2e19c13b171583927a56df63breed@google.com int count = width; 429981d4798007b91e2e19c13b171583927a56df63breed@google.com if (count >= 4) { 430981d4798007b91e2e19c13b171583927a56df63breed@google.com while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 431981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst = SkBlendARGB32(color, *dst, *mask); 432981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 433981d4798007b91e2e19c13b171583927a56df63breed@google.com dst++; 434981d4798007b91e2e19c13b171583927a56df63breed@google.com count--; 435981d4798007b91e2e19c13b171583927a56df63breed@google.com } 436981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 437981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 438981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_256 = _mm_set1_epi16(256); 439981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_1 = _mm_set1_epi16(1); 440981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_pixel = _mm_set1_epi32(color); 441981d4798007b91e2e19c13b171583927a56df63breed@google.com while (count >= 4) { 442981d4798007b91e2e19c13b171583927a56df63breed@google.com // Load 4 pixels each of src and dest. 443981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_pixel = _mm_load_si128(d); 444981d4798007b91e2e19c13b171583927a56df63breed@google.com 445981d4798007b91e2e19c13b171583927a56df63breed@google.com //set the aphla value 446981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 447981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *(mask+3),0, \ 448981d4798007b91e2e19c13b171583927a56df63breed@google.com *(mask+2),0, *(mask+2),\ 449981d4798007b91e2e19c13b171583927a56df63breed@google.com 0,*(mask+1), 0,*(mask+1),\ 450981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *mask,0,*mask); 451981d4798007b91e2e19c13b171583927a56df63breed@google.com 452981d4798007b91e2e19c13b171583927a56df63breed@google.com //call SkAlpha255To256() 453981d4798007b91e2e19c13b171583927a56df63breed@google.com src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 454981d4798007b91e2e19c13b171583927a56df63breed@google.com 455981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get red and blue pixels into lower byte of each word. 456981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 457981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 458981d4798007b91e2e19c13b171583927a56df63breed@google.com 459981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get alpha and green into lower byte of each word. 460981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 461981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 462981d4798007b91e2e19c13b171583927a56df63breed@google.com 463981d4798007b91e2e19c13b171583927a56df63breed@google.com // Put per-pixel alpha in low byte of each word. 464981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 465981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 466981d4798007b91e2e19c13b171583927a56df63breed@google.com 467981d4798007b91e2e19c13b171583927a56df63breed@google.com // dst_alpha = dst_alpha * src_scale 468981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 469981d4798007b91e2e19c13b171583927a56df63breed@google.com 470981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 471981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_srli_epi16(dst_alpha, 8); 472981d4798007b91e2e19c13b171583927a56df63breed@google.com 473981d4798007b91e2e19c13b171583927a56df63breed@google.com // Subtract alphas from 256, to get 1..256 474981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 475981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by dst pixel alpha. 476981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 477981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by dst pixel alpha. 478981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 479981d4798007b91e2e19c13b171583927a56df63breed@google.com 480981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by global alpha. 481981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 482981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by global alpha. 483981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 484981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 485981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_srli_epi16(dst_rb, 8); 486981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_srli_epi16(src_rb, 8); 487981d4798007b91e2e19c13b171583927a56df63breed@google.com 488981d4798007b91e2e19c13b171583927a56df63breed@google.com // Mask out low bits (goodies already in the right place; no need to divide) 489981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 490981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_andnot_si128(rb_mask, src_ag); 491981d4798007b91e2e19c13b171583927a56df63breed@google.com 492981d4798007b91e2e19c13b171583927a56df63breed@google.com // Combine back into RGBA. 493981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_pixel = _mm_or_si128(dst_rb, dst_ag); 494981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 495981d4798007b91e2e19c13b171583927a56df63breed@google.com 496981d4798007b91e2e19c13b171583927a56df63breed@google.com // Add two pixels into result. 497981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 498981d4798007b91e2e19c13b171583927a56df63breed@google.com _mm_store_si128(d, result); 499981d4798007b91e2e19c13b171583927a56df63breed@google.com // load the next 4 pixel 500981d4798007b91e2e19c13b171583927a56df63breed@google.com mask = mask + 4; 501981d4798007b91e2e19c13b171583927a56df63breed@google.com d++; 502981d4798007b91e2e19c13b171583927a56df63breed@google.com count -= 4; 503981d4798007b91e2e19c13b171583927a56df63breed@google.com } 504981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = reinterpret_cast<SkPMColor *>(d); 505981d4798007b91e2e19c13b171583927a56df63breed@google.com } 506981d4798007b91e2e19c13b171583927a56df63breed@google.com while(count > 0) { 507981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst= SkBlendARGB32(color, *dst, *mask); 508981d4798007b91e2e19c13b171583927a56df63breed@google.com dst += 1; 509981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 510981d4798007b91e2e19c13b171583927a56df63breed@google.com count --; 511981d4798007b91e2e19c13b171583927a56df63breed@google.com } 512981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = (SkPMColor *)((char*)dst + dstOffset); 513981d4798007b91e2e19c13b171583927a56df63breed@google.com mask += maskOffset; 514981d4798007b91e2e19c13b171583927a56df63breed@google.com } while (--height != 0); 515981d4798007b91e2e19c13b171583927a56df63breed@google.com} 516d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 5178cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// The following (left) shifts cause the top 5 bits of the mask components to 5188cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// line up with the corresponding components in an SkPMColor. 5198cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com// Note that the mask's RGB16 order may differ from the SkPMColor order. 5208cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 5218cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 5228cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 5238cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5248cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_R16x5_R32x5_SHIFT == 0 5258cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 5268cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_R16x5_R32x5_SHIFT > 0 5278cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 5288cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5298cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 5308cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5318cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5328cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_G16x5_G32x5_SHIFT == 0 5338cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 5348cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_G16x5_G32x5_SHIFT > 0 5358cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 5368cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5378cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 5388cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5398cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 5408cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#if SK_B16x5_B32x5_SHIFT == 0 5418cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 5428cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#elif SK_B16x5_B32x5_SHIFT > 0 5438cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 5448cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#else 5458cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 5468cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com#endif 5478cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 54876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 54976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i &mask, __m128i &srcA) { 55076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 55176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 55276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 55376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 55476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 55576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 55676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 55776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 55876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 55976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 56076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 56176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (compressed three channels) interleaved with zeros. 56276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 56376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 56476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 56576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 566d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 56776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 5688cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 5698cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 5708cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com 57176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 5728cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 5738cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 574fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 57576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 5768cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 5778cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 578fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 579d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 58076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 58176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 58276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 58376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 584d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 585d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 586fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 58776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 58876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 589d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 59076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 591d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 59276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 593d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 594d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 59576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 59676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 59776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 59876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 599d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 600d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 601d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Multiply each component of maskLo and maskHi by srcA 60376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, srcA); 60476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, srcA); 605d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 60676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left shift mask components by 8 (divide by 256) 607d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srli_epi16(maskLo, 8); 608d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srli_epi16(maskHi, 8); 609d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 61176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 612d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 61376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 614d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 615d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 61676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 61776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 61876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 619d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 62076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 621d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 622d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 623d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 624d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 62576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 626d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 627d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 628d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 62976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Pack into 4 32bit dst pixels. 63076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 63176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 63276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. 633d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return _mm_packus_epi16(resultLo, resultHi); 634d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 635d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 63676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgstatic __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 637d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i &mask) { 63876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // In the following comments, the components of src, dst and mask are 63976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 64076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // by an R, G, B, or A suffix. Components of one of the four pixels that 64176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 64276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // example is the blue channel of the second destination pixel. Memory 64376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // layout is shown for an ARGB byte order in a color value. 64476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 64576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src and srcA store 8-bit values interleaved with zeros. 64676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 64776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask stores 16-bit values (shown as high and low bytes) interleaved with 64876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // zeros 64976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 65076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 65176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 652d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 65376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 6548cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 6558cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 656d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 65776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 6588cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 6598cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 660fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 66176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 6628cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 6638cd5ae79c6aaa20188ac6f34318c2f358d87e103bungeman@google.com _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 664d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 665d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 66676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 66776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 8-bit position 66876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 66976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 670d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 671d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 672fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Interleave R,G,B into the lower byte of word. 67376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 67476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // 16-bit values, padded by zero. 675d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i maskLo, maskHi; 67676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 677d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 67876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 679d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 680d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 68176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Upscale from 0..31 to 0..32 68276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // (allows to replace division by left-shift further down) 68376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 68476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 685d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 686d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 687d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 68876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 68976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 690d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 69176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 692d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 693d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask 69576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 69676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 697d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 69876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask = (src - dst) * mask >> 5 699d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 700d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 701d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 702d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Add two pixels into result. 70376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 704d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 705d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 706d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 70727123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com // Pack into 4 32bit dst pixels and force opaque. 70876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 70976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 71076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // clamping to 255 if necessary. Set alpha components to 0xFF. 71127123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 71227123cd59fb6d6fc240d327efd9fc068c0e3a495bungeman@google.com _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 713d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 714d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 71576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 71676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor) { 717d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 718d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 719d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 720d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 72176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcA = SkColorGetA(src); 72276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 72376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 72476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 725fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com 726d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com srcA = SkAlpha255To256(srcA); 727d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 728d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 729d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 730d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 73176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 73276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 733d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 734d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 735d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 736d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 737d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 73876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 73976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 74076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Interleave with zeros to get two sets of four 16-bit values. 74176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 74276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 74376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 74476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i srcA_sse = _mm_set1_epi16(srcA); 745d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 74676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 74776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 74876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 74976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 75076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 75176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 75276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 75376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 754d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 75576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 756d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 757d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 758d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 759d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 760fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 76176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 76276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 76376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 76476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 765d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 766d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 76776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 76876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse, srcA_sse); 769d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 770d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 771d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 772d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 77376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 774d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 775d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 776d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 777d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 778d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 779d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 780d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 78176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 78276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 783d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 784fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 785d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 786d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 787d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 78876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 78976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org SkColor src, int width, SkPMColor opaqueDst) { 790d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width <= 0) { 791d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com return; 792d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 793d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 79476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcR = SkColorGetR(src); 79576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcG = SkColorGetG(src); 79676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int srcB = SkColorGetB(src); 797d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 798d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (width >= 4) { 799d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 800d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 80176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 80276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 803d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 804d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width--; 805d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 806d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 807d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 80876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 80976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 81076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 81176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 81276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 813d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width >= 4) { 81476e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four destination pixels into dst_sse. 81576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 81676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 81776e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 81876e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 81976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org 82076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 82176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 822d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // pack_cmp to 0xFFFF 82376e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 824d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_setzero_si128())); 825d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 826d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 827d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com if (pack_cmp != 0xFFFF) { 828fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com // Unpack 4 16bit mask pixels to 82976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 83076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 83176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 83276e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org _mm_setzero_si128()); 833d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 834d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com // Process 4 32bit dst pixels 83576e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 83676e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask_sse); 837d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com _mm_store_si128(d, result); 838d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 839d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 840d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com d++; 84176e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask += 4; 842d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com width -= 4; 843d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 844d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 845d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 846d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 847d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com 848d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com while (width > 0) { 84976e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 85076e0d137892f6a4f3bce278aceb99f9a0d37317ccommit-bot@chromium.org mask++; 851d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com dst++; 852fbfcd5602128ec010c82cb733c9cdc0a3254f9f3rmistry@google.com width--; 853d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com } 854d6770e69e05c9dcc12f2a1a2d509c0b174372ee7tomhudson@google.com} 855475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 85639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org/* SSE2 version of S32_D565_Opaque() 85739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 85839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org */ 85939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.orgvoid S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 86039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, int count, 86139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org U8CPU alpha, int /*x*/, int /*y*/) { 86239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkASSERT(255 == alpha); 86339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count <= 0) { 86539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org return; 86639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 86739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 86839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count >= 8) { 86939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 87039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 87139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 87239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 87439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count--; 87539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 87639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 87739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 87839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 87939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK); 88039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK); 88139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK); 88239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org while (count >= 8) { 88439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Load 8 pixels of src. 88539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 88639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 88739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 88839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result r. 88939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r1 = _mm_srli_epi32(src_pixel1, 89039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 89139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r1 = _mm_and_si128(r1, r16_mask); 89239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r2 = _mm_srli_epi32(src_pixel2, 89339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_R32_SHIFT + (8 - SK_R16_BITS)); 89439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org r2 = _mm_and_si128(r2, r16_mask); 89539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i r = _mm_packs_epi32(r1, r2); 89639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 89739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result g. 89839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g1 = _mm_srli_epi32(src_pixel1, 89939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 90039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g1 = _mm_and_si128(g1, g16_mask); 90139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g2 = _mm_srli_epi32(src_pixel2, 90239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_G32_SHIFT + (8 - SK_G16_BITS)); 90339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org g2 = _mm_and_si128(g2, g16_mask); 90439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i g = _mm_packs_epi32(g1, g2); 90539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 90639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Calculate result b. 90739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b1 = _mm_srli_epi32(src_pixel1, 90839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 90939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b1 = _mm_and_si128(b1, b16_mask); 91039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b2 = _mm_srli_epi32(src_pixel2, 91139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SK_B32_SHIFT + (8 - SK_B16_BITS)); 91239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org b2 = _mm_and_si128(b2, b16_mask); 91339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i b = _mm_packs_epi32(b1, b2); 91439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 91539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org // Store 8 16-bit colors in dst. 91639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE(r, g, b); 91739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 91839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org count -= 8; 91939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 92139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 92239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 92339ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 92439ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org if (count > 0) { 92539ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org do { 92639ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColor c = *src++; 92739ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org SkPMColorAssert(c); 92839ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org *dst++ = SkPixel32ToPixel16_ToU16(c); 92939ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } while (--count != 0); 93039ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org } 93139ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org} 93239ce33a1facae795eb2f02e35674702de7eb23b5commit-bot@chromium.org 933475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org/* SSE2 version of S32A_D565_Opaque() 934475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org * portable version is in core/SkBlitRow_D16.cpp 935475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org */ 936475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.orgvoid S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 937475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const SkPMColor* SK_RESTRICT src, 938475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int count, U8CPU alpha, int /*x*/, int /*y*/) { 939475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkASSERT(255 == alpha); 940475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 941475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count <= 0) { 942475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org return; 943475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 944475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 945475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count >= 8) { 946475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Make dst 16 bytes alignment 947475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (((size_t)dst & 0x0F) != 0) { 948475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 949475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 950475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 951475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 952475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 953475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count--; 954475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 955475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 956475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org const __m128i* s = reinterpret_cast<const __m128i*>(src); 957475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i* d = reinterpret_cast<__m128i*>(dst); 958475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i var255 = _mm_set1_epi16(255); 959475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 960475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 961475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 962475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 963475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org while (count >= 8) { 964475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of src. 965475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel1 = _mm_loadu_si128(s++); 966475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i src_pixel2 = _mm_loadu_si128(s++); 967475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 968475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Check whether src pixels are equal to 0 and get the highest bit 969475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // of each byte of result, if src pixels are all zero, src_cmp1 and 970475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // src_cmp2 will be 0xFFFF. 971475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 972475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 973475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 974475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_setzero_si128())); 975475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 976475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org d++; 977475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 978475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org continue; 979475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 980475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 981475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Load 8 pixels of dst. 982475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dst_pixel = _mm_load_si128(d); 983475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 984475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract A from src. 985475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa1 = _mm_slli_epi32(src_pixel1,(24 - SK_A32_SHIFT)); 986475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa1 = _mm_srli_epi32(sa1, 24); 987475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa2 = _mm_slli_epi32(src_pixel2,(24 - SK_A32_SHIFT)); 988475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sa2 = _mm_srli_epi32(sa2, 24); 989475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sa = _mm_packs_epi32(sa1, sa2); 990475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 991475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R from src. 992475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr1 = _mm_slli_epi32(src_pixel1,(24 - SK_R32_SHIFT)); 993475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr1 = _mm_srli_epi32(sr1, 24); 994475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr2 = _mm_slli_epi32(src_pixel2,(24 - SK_R32_SHIFT)); 995475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sr2 = _mm_srli_epi32(sr2, 24); 996475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sr = _mm_packs_epi32(sr1, sr2); 997475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 998475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract G from src. 999475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg1 = _mm_slli_epi32(src_pixel1,(24 - SK_G32_SHIFT)); 1000475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg1 = _mm_srli_epi32(sg1, 24); 1001475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg2 = _mm_slli_epi32(src_pixel2,(24 - SK_G32_SHIFT)); 1002475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sg2 = _mm_srli_epi32(sg2, 24); 1003475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sg = _mm_packs_epi32(sg1, sg2); 1004475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1005475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract B from src. 1006475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb1 = _mm_slli_epi32(src_pixel1,(24 - SK_B32_SHIFT)); 1007475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb1 = _mm_srli_epi32(sb1, 24); 1008475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb2 = _mm_slli_epi32(src_pixel2,(24 - SK_B32_SHIFT)); 1009475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org sb2 = _mm_srli_epi32(sb2, 24); 1010475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i sb = _mm_packs_epi32(sb1, sb2); 1011475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1012475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Extract R G B from dst. 1013475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dr = _mm_srli_epi16(dst_pixel,SK_R16_SHIFT); 1014475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_and_si128(dr, r16_mask); 1015475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i dg = _mm_srli_epi16(dst_pixel,SK_G16_SHIFT); 1016475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_and_si128(dg, g16_mask); 1017475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i db = _mm_srli_epi16(dst_pixel,SK_B16_SHIFT); 1018475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_and_si128(db, b16_mask); 1019475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1020475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 1021475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1022475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Calculate R G B of result. 1023475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Original algorithm is in SkSrcOver32To16(). 1024475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE(dr, isa, SK_R16_BITS)); 1025475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 1026475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE(dg, isa, SK_G16_BITS)); 1027475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 1028475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE(db, isa, SK_B16_BITS)); 1029475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 1030475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1031475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Pack R G B into 16-bit color. 1032475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org __m128i d_pixel = SkPackRGB16_SSE(dr, dg, db); 1033475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1034475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org // Store 8 16-bit colors in dst. 1035475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org _mm_store_si128(d++, d_pixel); 1036475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org count -= 8; 1037475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1038475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1039475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1040475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst = reinterpret_cast<uint16_t*>(d); 1041475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1042475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org 1043475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (count > 0) { 1044475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org do { 1045475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColor c = *src++; 1046475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org SkPMColorAssert(c); 1047475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org if (c) { 1048475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org *dst = SkSrcOver32To16(c, *dst); 1049475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1050475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org dst += 1; 1051475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } while (--count != 0); 1052475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org } 1053475910750cdc7d14da3071d4052ba9ab98383be9commit-bot@chromium.org} 1054