1f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org/* 23838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com * Copyright 2012 The Android Open Source Project 3685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com * 4685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com * Use of this source code is governed by a BSD-style license that can be 5685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com * found in the LICENSE file. 6f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org */ 7f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 8685cfc0ee13d7c355ae2f4f3d225ad45e945763fepoger@google.com 997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h" 10e642bdf5215599ce5d4a6f43502b396c3b9aef36caryclark@google.com#include "SkBitmapProcState_opts_SSE2.h" 11f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#include "SkColorPriv.h" 121b5fde98ede97ec9dd5de4213bc9a79e91385046senorblanco@chromium.org#include "SkUtils.h" 13f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 14f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#include <emmintrin.h> 15f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 16f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 17f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 18f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org */ 1997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2097c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2197c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org int count, U8CPU alpha) { 22f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org SkASSERT(alpha <= 255); 
23f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org if (count <= 0) { 24f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org return; 25f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 26f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 27f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 28f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 29f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 301d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org if (count >= 4) { 311d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 321d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 331d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 341d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src++; 351d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst++; 361d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count--; 371d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 381d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 391d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 401d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 411d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 423838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 433838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com 443838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Move scale factors to upper byte of word 
453838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 463838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (count >= 4) { 481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Load 4 pixels each of src and dest. 491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 523838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Interleave Atom port 0/1 operations based on the execution port 533838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // constraints that multiply can only be executed on port 0 (while 543838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // boolean operations can be executed on either port 0 or port 1) 553838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // because GCC currently doesn't do a good job scheduling 563838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // instructions based on these constraints. 573838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com 581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 593838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 623838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Multiply by scale. 
633838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 643838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // where rs.h stands for the higher byte of r * scale, and 653838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // bs.h the higher byte of b * scale. 663838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 673838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com 683838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Get alpha and green pixels into higher byte of each word. 693838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 703838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply by scale. 733838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (as.h, as.l, gs.h, gs.l)) 743838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 751d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 763838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Clear the lower byte of the a*scale and g*scale results 773838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 783838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_ag = _mm_and_si128(src_ag, ag_mask); 793838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com 803838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Operations the destination pixels are the same as on the 813838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // source pixels. See the comments above. 
823838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 833838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 843838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 853838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 863838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com dst_ag = _mm_and_si128(dst_ag, ag_mask); 871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Combine back into RGBA. 893838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (as.h, rs.h, gs.h, bs.h)) 901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 911d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 921d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 931d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Add result 941d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 951d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org _mm_store_si128(d, result); 961d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org s++; 971d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org d++; 981d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count -= 4; 991d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 1001d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 1011d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 102f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 
103f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 10497c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org while (count > 0) { 105f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 106f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org src++; 107f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org dst++; 108f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org count--; 109f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 110f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org} 111f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 11297c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 11397c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 11497c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org int count, U8CPU alpha) { 115f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org SkASSERT(alpha == 255); 116f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org if (count <= 0) { 117f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org return; 118f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 1191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1201d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org if (count >= 4) { 1211d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 1221d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 1231d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 1241d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src++; 1251d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst++; 
1261d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count--; 1271d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 1281d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1291d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 1301d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 131f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 1321d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1331d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 1341d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 1351d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (count >= 4) { 1361d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Load 4 pixels 1371d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 1381d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 1391d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1401d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 141444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1421d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 
1431d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 1441d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1451d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 1461d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 1471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 1491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 1501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply by red and blue by src alpha. 1521d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 1531d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply by alpha and green by src alpha. 
1541d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 1551d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1561d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 1571d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 1581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 1591d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 1611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 1621d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 1631d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 1641d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1651d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 1661d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 1671d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 1681d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 1691d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1701d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Combine back into RGBA. 
1711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 1721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1731d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Add result 1741d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 1751d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org _mm_store_si128(d, result); 1761d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org s++; 1771d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org d++; 1781d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count -= 4; 1791d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 1801d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org #else 1811d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 1821d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 1831d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (count >= 4) { 1841d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Load 4 pixels 1851d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 1861d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 1871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 1881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 189444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 1901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 191444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 
192444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org __m128i alpha = _mm_srli_epi16(src_pixel, 8); 193444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org 194444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org // (a0, a0, a1, a1, a2, g2, a3, g3) 195444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org alpha = _mm_shufflehi_epi16(alpha, 0xF5); 196444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org 197444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org // (a0, a0, a1, a1, a2, a2, a3, a3) 198444278bc14b33f80ab7e00431820a34f972776c1senorblanco@chromium.org alpha = _mm_shufflelo_epi16(alpha, 0xF5); 1991d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2001d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 2011d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 2021d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2031d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply by red and blue by src alpha. 2041d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 2051d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply by alpha and green by src alpha. 2061d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 2071d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2081d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Divide by 256. 
2091d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 2101d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2111d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Mask out high bits (already in the right place) 2121d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 2131d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2141d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Combine back into RGBA. 2151d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 2161d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2171d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Add result 2181d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 2191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org _mm_store_si128(d, result); 2201d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org s++; 2211d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org d++; 2221d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count -= 4; 2231d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 224f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org#endif 2251d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 2261d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 2271d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 228f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 229f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org while (count > 0) { 230f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 
231f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org src++; 232f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org dst++; 233f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org count--; 234f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 235f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org} 236f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 23797c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 23897c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 23997c06343a254f88ccfcdaaee157e40f3340320a6senorblanco@chromium.org int count, U8CPU alpha) { 240f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org SkASSERT(alpha <= 255); 241f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org if (count <= 0) { 242f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org return; 243f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org } 244f3a2fe6b359a2168af286156d4d5c2b20dda5b76senorblanco@chromium.org 2451d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org if (count >= 4) { 2461d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 2471d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 2481d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src++; 2491d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst++; 2501d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org count--; 2511d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org } 2521d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2531d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 2541d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 
2551d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 2561d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 2573838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 2581d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 2591d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 2601d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org while (count >= 4) { 2611d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Load 4 pixels each of src and dest. 2621d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 2631d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 2641d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2651d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 2661d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 2671d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 2681d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2691d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Get alpha and green into lower byte of each word. 
2701d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 2711d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 2721d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2731d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 2743838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // After the following two statements, the dst_alpha looks like 2753838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 2761d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 2771d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 2781d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2791d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 2803838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Because src_scales are in the higher byte of each word and 2813838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // we use mulhi here, the resulting alpha values are already 2823838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // in the right place and don't need to be divided by 256. 
2833838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 2843838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 2851d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2861d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 2871d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 2881d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2891d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 2901d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 2911d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 2921d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 2931d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 2941d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply red and blue by global alpha. 2953838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (0, rs.h, 0, bs.h)) 2963838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // where rs.h stands for the higher byte of r * src_scale, 2973838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // and bs.h the higher byte of b * src_scale. 2983838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Again, because we use mulhi, the resuling red and blue 2993838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // values are already in the right place and don't need to 3003838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // be divided by 256. 
3013838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 3021d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Multiply alpha and green by global alpha. 3033838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (0, as.h, 0, gs.h)) 3043838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 3051d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 3061d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Divide by 256. 3071d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 3081d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 3091d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 3101d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 3113838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // Shift alpha and green to higher byte of each word. 3123838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com // (4 x (as.h, 0, gs.h, 0)) 3133838d305d71baa7591e4b854e3c164e3f051b9b8tomhudson@google.com src_ag = _mm_slli_epi16(src_ag, 8); 3141d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 3151d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Combine back into RGBA. 3161d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 3171d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 3181d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org 3191d9655b31718b94c168dd388e9d1aaffd8d1ea82senorblanco@chromium.org // Add two pixels into result. 
// Byte-wise add of the two pixel vectors; the sum is written back through d.
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (count > 0) {
        *dst = SkBlendARGB32(*src, *dst, alpha);
        src++;
        dst++;
        count--;
    }
}

/* SSE2 version of Color32()
 * portable version is in core/SkBlitRow_D32.cpp
 *
 * For every i in [0, count) writes
 *     dst[i] = color + SkAlphaMulQ(src[i], scale)
 * with scale = 256 - SkAlpha255To256(SkGetPackedA32(color)).
 *
 * Shortcuts:
 *   - count <= 0      : nothing is written.
 *   - color == 0      : src is copied to dst (skipped when src == dst).
 *   - alpha(color)==255: dst is filled with color; src is not read.
 *
 * dst is asserted to be 4-byte aligned when count >= 4. src is read with an
 * unaligned load, so it does not need 16-byte alignment.
 */
void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                  SkPMColor color) {

    if (count <= 0) {
        return;
    }

    // A zero color contributes nothing, so the result is just a copy of src.
    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        // Alpha of 255: every destination pixel becomes exactly color.
        sk_memset32(dst, color, count);
    } else {
        // Weight applied to each src pixel before color is added.
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 4) {
            SkASSERT(((size_t)dst & 0x03) == 0);
            // Scalar pixels until dst reaches a 16-byte boundary. With dst
            // 4-byte aligned this takes at most three iterations (assuming a
            // 4-byte SkPMColor), so count, which is >= 4 here, stays positive
            // without an explicit check.
            while (((size_t)dst & 0x0F) != 0) {
                *dst = color + SkAlphaMulQ(*src, scale);
                src++;
                dst++;
                count--;
            }

            const __m128i *s = reinterpret_cast<const __m128i*>(src);
            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Selects the low byte of every 16-bit word.
            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
            // scale replicated into all eight 16-bit words.
            __m128i src_scale_wide = _mm_set1_epi16(scale);
            // color replicated into all four 32-bit lanes.
            __m128i color_wide = _mm_set1_epi32(color);
            while (count >= 4) {
                // Load 4 source pixels. The load is unaligned because only
                // dst was brought to a 16-byte boundary above.
                __m128i src_pixel = _mm_loadu_si128(s);

                // Get red and blue pixels into lower byte of each word.
                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

                // Get alpha and green into lower byte of each word.
                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

                // Multiply by scale. Each product is at most 255 * 256 and
                // therefore fits in its 16-bit word.
                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

                // Divide red/blue by 256. For alpha/green the high byte of
                // the product is already the divided value in its final byte
                // position, so only the low byte is cleared.
                src_rb = _mm_srli_epi16(src_rb, 8);
                src_ag = _mm_andnot_si128(rb_mask, src_ag);

                // Combine back into RGBA.
                src_pixel = _mm_or_si128(src_rb, src_ag);

                // Add color to the scaled source pixels (byte-wise add).
                __m128i result = _mm_add_epi8(color_wide, src_pixel);

                // Store result (aligned store through d).
                _mm_store_si128(d, result);
                s++;
                d++;
                count -= 4;
            }
            src = reinterpret_cast<const SkPMColor*>(s);
            dst = reinterpret_cast<SkPMColor*>(d);
        }

        // Remaining pixels (fewer than four, or all of them when the
        // original count was below four).
        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

/* Blends one constant color into a width x height block of 32-bit pixels,
 * using an 8-bit mask value per pixel as the blend weight.
 *
 *   device    : first destination pixel (treated as SkPMColor*)
 *   dstRB     : bytes from the start of one destination row to the next
 *   maskPtr   : first mask byte (one uint8_t per pixel)
 *   maskRB    : bytes from the start of one mask row to the next
 *   origColor : color to blend; premultiplied via SkPreMultiplyColor() first
 *   width     : pixels per row
 *   height    : number of rows; must be >= 1 because the row loop is a
 *               do/while that tests --height != 0 only after the first row
 *
 * Pixels before the first 16-byte boundary of a row and the leftovers at the
 * end go through SkBlendARGB32(color, *dst, *mask); aligned groups of four
 * pixels are blended with SSE2.
 */
void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
                               size_t maskRB, SkColor origColor,
                               int width, int height) {
    SkPMColor color = SkPreMultiplyColor(origColor);
    // Bytes to skip from the end of one row's pixels to the start of the next
    // row (width << 2 is the row's pixel data size at 4 bytes per pixel).
    size_t dstOffset = dstRB - (width << 2);
    // Same for the mask, which has one byte per pixel.
    size_t maskOffset = maskRB - width;
    SkPMColor* dst = (SkPMColor *)device;
    const uint8_t* mask = (const uint8_t*)maskPtr;
    do {
        int count = width;
        if (count >= 4) {
            // Scalar pixels until dst reaches a 16-byte boundary.
            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
                *dst = SkBlendARGB32(color, *dst, *mask);
                mask++;
                dst++;
                count--;
            }
            __m128i *d = reinterpret_cast<__m128i*>(dst);
            // Selects the low byte of every 16-bit word.
            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
            __m128i c_256 = _mm_set1_epi16(256);
            __m128i c_1 = _mm_set1_epi16(1);
            // The constant source color in all four 32-bit lanes.
            __m128i src_pixel = _mm_set1_epi32(color);
            while (count >= 4) {
                // Load 4 destination pixels (aligned load). The source is the
                // constant color already held in src_pixel.
                __m128i dst_pixel = _mm_load_si128(d);

                // Spread the four mask bytes so that mask[k] sits in the low
                // byte of both 16-bit words of pixel k's 32-bit lane.
                __m128i src_scale_wide =  _mm_set_epi8(0, *(mask+3),\
                                0, *(mask+3),0, \
                                *(mask+2),0, *(mask+2),\
                                0,*(mask+1), 0,*(mask+1),\
                                0, *mask,0,*mask);

                // Vector form of SkAlpha255To256(): add 1 to every 16-bit
                // value, giving a scale in 1..256.
                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

                // Get red and blue pixels into lower byte of each word.
                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

                // Get alpha and green into lower byte of each word.
                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

                // Copy the odd 16-bit word of each pixel of src_ag over the
                // even one, so both words of a lane hold the same value.
                // NOTE(review): the odd word is the color's alpha only if
                // alpha is the top byte of SkPMColor - confirm SK_A32_SHIFT.
                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

                // dst_alpha = dst_alpha * src_scale
                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

                // Divide by 256.
                dst_alpha = _mm_srli_epi16(dst_alpha, 8);

                // Subtract alphas from 256, to get 1..256
                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
                // Multiply red and blue by dst pixel alpha.
                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
                // Multiply alpha and green by dst pixel alpha.
                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

                // Multiply the color's red and blue by the per-pixel mask scale.
                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
                // Multiply the color's alpha and green by the per-pixel mask scale.
                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
                // Divide by 256.
                dst_rb = _mm_srli_epi16(dst_rb, 8);
                src_rb = _mm_srli_epi16(src_rb, 8);

                // Mask out low bits (goodies already in the right place; no need to divide)
                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
                src_ag = _mm_andnot_si128(rb_mask, src_ag);

                // Combine back into RGBA.
                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

                // Add two pixels into result.
                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
                _mm_store_si128(d, result);
                // Advance to the next 4 pixels.
                mask = mask + 4;
                d++;
                count -= 4;
            }
            dst = reinterpret_cast<SkPMColor *>(d);
        }
        // Leftover pixels of this row (or the whole row when width < 4).
        while(count > 0) {
            *dst= SkBlendARGB32(color, *dst, *mask);
            dst += 1;
            mask++;
            count --;
        }
        // Step both pointers from the end of this row to the start of the next.
        dst = (SkPMColor *)((char*)dst + dstOffset);
        mask += maskOffset;
    } while (--height != 0);
}

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each SkPacked?16x5ToUnmasked?32x5_SSE2(x) applies the matching shift above
// to every 32-bit lane of x: a left shift when the amount is positive, a right
// shift by its magnitude when it is negative, and no shift when it is zero.
// The direction is chosen at preprocessing time so that the intrinsic always
// receives a non-negative count. The result is "unmasked": bits belonging to
// the other components are still present and the caller has to AND them away.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
#endif

// Blends four destination pixels toward a source color using a 16-bit,
// per-channel mask for each pixel, additionally weighted by srcA. Per channel:
//     m      = 5-bit mask component, upscaled to 0..32
//     m      = (m * srcA) >> 8
//     result = dst + (((src - dst) * m) >> 5)
// The four results are returned packed into one register, each component
// saturated to 0..255.
//
// NOTE: mask is taken by non-const reference and is overwritten with the
// expanded per-channel mask; its original value is lost to the caller.
// NOTE(review): the ">> 8" after the multiply suggests srcA holds values in
// 0..256 in each 16-bit word - confirm against the caller.
static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
                                 __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    // (This assignment is what overwrites the caller's mask.)
    mask = _mm_or_si128(_mm_or_si128(r, g), b);

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by right-shift further down)
    // Right-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Right shift mask components by 8 (divide by 256)
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    // (src - dst can be negative; the 16-bit words are treated as signed
    // from here on.)
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5
    // (arithmetic shift, so the sign of a negative difference is preserved)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}

static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
                                       __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
64543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 64643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask stores 16-bit values (shown as high and low bytes) interleaved with 64743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // zeros 64843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 64943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 65043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org 65140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 65243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 653bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 654bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 65540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 65643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 657bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 658bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 659935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com 66043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 661bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 662bfd42da5af2b038e1be834a59325636709089a1ebungeman@google.com 
_mm_set1_epi32(0x1F << SK_B32_SHIFT)); 66340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 66440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 66543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 66643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // 8-bit position 66743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 66843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 66940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com mask = _mm_or_si128(_mm_or_si128(r, g), b); 67040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 671935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com // Interleave R,G,B into the lower byte of word. 67243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // i.e. split the sixteen 8-bit values from mask into two sets of eight 67343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // 16-bit values, padded by zero. 
67440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i maskLo, maskHi; 67543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 67640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 67743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 67840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 67940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 68043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Upscale from 0..31 to 0..32 68143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // (allows to replace division by left-shift further down) 68243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Left-shift each component by 4 and add the result back to that component, 68343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 68440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 68540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 68640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 68743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Interleave R,G,B into the lower byte of the word 68843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 68940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 69043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, 
d3B, 0) 69140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 69240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 69343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask = (src - dst) * mask 69443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 69543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 69640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 69743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask = (src - dst) * mask >> 5 69840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskLo = _mm_srai_epi16(maskLo, 5); 69940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com maskHi = _mm_srai_epi16(maskHi, 5); 70040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 70140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // Add two pixels into result. 70243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // result = dst + ((src - dst) * mask >> 5) 70340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 70440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 70540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 706b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com // Pack into 4 32bit dst pixels and force opaque. 70743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // resultLo and resultHi contain eight 16-bit components (two pixels) each. 70843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 70943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // clamping to 255 if necessary. Set alpha components to 0xFF. 
710b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 711b2229f21294a4a185be6bba86395f088f3100aa0bungeman@google.com _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 71240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com} 71340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 71443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgvoid SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 71543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org SkColor src, int width, SkPMColor) { 71640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (width <= 0) { 71740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com return; 71840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 71940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 72043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcA = SkColorGetA(src); 72143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcR = SkColorGetR(src); 72243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcG = SkColorGetG(src); 72343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcB = SkColorGetB(src); 724935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com 72540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com srcA = SkAlpha255To256(srcA); 72640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 72740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (width >= 4) { 72840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 72940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 73043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 73143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask++; 
73240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst++; 73340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com width--; 73440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 73540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 73640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 73743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 73843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 73943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Interleave with zeros to get two sets of four 16-bit values. 74043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 74143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 74243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 74343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i srcA_sse = _mm_set1_epi16(srcA); 74440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (width >= 4) { 74543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Load four destination pixels into dst_sse. 74643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 74743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 
74843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 74943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 75043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org 75143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 75243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 75340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // pack_cmp to 0xFFFF 75443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 75540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com _mm_setzero_si128())); 75640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 75740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 75840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (pack_cmp != 0xFFFF) { 759935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com // Unpack 4 16bit mask pixels to 76043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 76143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 76243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 76343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org _mm_setzero_si128()); 76440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 76540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // Process 4 32bit dst pixels 76643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 
76743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask_sse, srcA_sse); 76840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com _mm_store_si128(d, result); 76940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 77040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 77140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com d++; 77243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask += 4; 77340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com width -= 4; 77440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 77540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 77640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 77740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 77840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 77940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (width > 0) { 78043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 78143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask++; 78240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst++; 783935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com width--; 78440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 78540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com} 78640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 78743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.orgvoid SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 78843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org SkColor src, int width, SkPMColor opaqueDst) { 78940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (width <= 0) { 79040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com return; 79140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 
79240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 79343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcR = SkColorGetR(src); 79443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcG = SkColorGetG(src); 79543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int srcB = SkColorGetB(src); 79640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 79740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (width >= 4) { 79840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com SkASSERT(((size_t)dst & 0x03) == 0); 79940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (((size_t)dst & 0x0F) != 0) { 80043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 80143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask++; 80240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst++; 80340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com width--; 80440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 80540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 80640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 80743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Set alpha to 0xFF and replicate source four times in SSE register. 80843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 80943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Set srcA_sse to contain eight copies of srcA, padded with zero. 
81043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 81143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 81240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (width >= 4) { 81343fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Load four destination pixels into dst_sse. 81443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i dst_sse = _mm_load_si128(d); 81543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Load four 16-bit masks into lower half of mask_sse. 81643fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i mask_sse = _mm_loadl_epi64( 81743fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org reinterpret_cast<const __m128i*>(mask)); 81843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org 81943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // Check whether masks are equal to 0 and get the highest bit 82043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // of each byte of result, if masks are all zero, we will get 82140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // pack_cmp to 0xFFFF 82243fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 82340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com _mm_setzero_si128())); 82440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 82540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // if mask pixels are not all zero, we will blend the dst pixels 82640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com if (pack_cmp != 0xFFFF) { 827935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com // Unpack 4 16bit mask pixels to 82843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // mask_sse = (m0RGBLo, 
m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 82943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 83043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask_sse = _mm_unpacklo_epi16(mask_sse, 83143fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org _mm_setzero_si128()); 83240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 83340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com // Process 4 32bit dst pixels 83443fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 83543fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask_sse); 83640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com _mm_store_si128(d, result); 83740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 83840d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 83940d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com d++; 84043fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask += 4; 84140d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com width -= 4; 84240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 84340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 84440d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst = reinterpret_cast<SkPMColor*>(d); 84540d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 84640d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com 84740d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com while (width > 0) { 84843fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 84943fa426de0e9fa87575fc5083dc55246da5ae8f8commit-bot@chromium.org mask++; 85040d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com dst++; 851935e9f4fafdfc64130e6be9ea2bb30e3bafd852armistry@google.com width--; 
85240d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com } 85340d908d434dcd277aab3d2885f417f8b060cabfdtomhudson@google.com} 854