SkBlitRow_opts_SSE2.cpp revision edb606cb999887d54629f361bcbf57c5fede1bb0
1ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com 29272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* 3ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Copyright 2009 The Android Open Source Project 4ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * 5ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * Use of this source code is governed by a BSD-style license that can be 6ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com * found in the LICENSE file. 79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 89272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 9ec3ed6a5ebf6f2c406d7bcf94b6bc34fcaeb976eepoger@google.com 104e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h" 119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h" 12c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org#include "SkUtils.h" 139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h> 159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 204e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 214e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 30dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 31dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 32dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 33dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 34dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 35dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 36dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(src_scale); 43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); 44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by scale. 58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); 61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); 62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 829272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 839272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 844e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org while (count > 0) { 859272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 869272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 879272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 889272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 899272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 909272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 919272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 924e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 934e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 944e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 959272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha == 255); 969272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 989272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 99dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 100dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 101dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 102dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 103dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 104dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 105dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 1119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 118dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 121f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org #else 161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 169f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 171f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 172f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org __m128i alpha = _mm_srli_epi16(src_pixel, 8); 173f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 174f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, g2, a3, g3) 175f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflehi_epi16(alpha, 0xF5); 176f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org 177f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org // (a0, a0, a1, a1, a2, a2, a3, a3) 178f3f0bd71b81097f6c640e7f60805de7eacbc98c6senorblanco@chromium.org alpha = _mm_shufflelo_epi16(alpha, 0xF5); 179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out high bits (already in the right place) 192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 193dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 194dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 195dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 196dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 197dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 198dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif 205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 2109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 2119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 2129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 2139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 2149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 2169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2174e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2184e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2194e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 2209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 2219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 2229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 2239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 225dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 226dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 227dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 228dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 229dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 230dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 231dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 232dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 233dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 234dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 235dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 236dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 237dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(src_scale); 238dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 239dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 240dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 241dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 242dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 243dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 244dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_srli_epi16(dst_alpha, 8); 262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by global alpha. 272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by global alpha. 274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 282dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 283dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 284dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add two pixels into result. 289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 2979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 298dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 2999272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 3009272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 3019272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 3029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 3039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 3049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 3059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 306c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 307c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org/* SSE2 version of Color32() 308c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 309c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org */ 310c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.orgvoid Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 311c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkPMColor color) { 312c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 313c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count <= 0) { 314c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org return; 315c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 316c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 317c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (0 == color) { 318c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (src != dst) { 319c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org memcpy(dst, src, count * sizeof(SkPMColor)); 320c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 321c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 322c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 323c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned colorA = SkGetPackedA32(color); 324c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (255 == colorA) { 325c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org sk_memset32(dst, color, count); 326c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } else { 327c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org unsigned scale = 256 - SkAlpha255To256(colorA); 328c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 329c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org if (count >= 4) { 330c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 331c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 332c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 333c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src++; 334c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst++; 335c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 336c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 337c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 338c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 339c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 340c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 341c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(scale); 342c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i color_wide = _mm_set1_epi32(color); 343c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count >= 4) { 344c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Load 4 pixels each of src and dest. 345c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 346c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 347c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 348c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 349981d4798007b91e2e19c13b171583927a56df63breed@google.com 350c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Get alpha and green into lower byte of each word. 351c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 352c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 353c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Multiply by scale. 354c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 355c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 356c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 357c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Divide by 256. 358c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 359c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 360c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 361c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Combine back into RGBA. 362c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 363c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 364c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Add color to result. 365c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org __m128i result = _mm_add_epi8(color_wide, src_pixel); 366c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 367c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org // Store result. 368c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org _mm_store_si128(d, result); 369c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org s++; 370c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org d++; 371c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count -= 4; 372c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 373c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 374c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 375c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 376c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org 377c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org while (count > 0) { 378c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org *dst = color + SkAlphaMulQ(*src, scale); 379c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org src += 1; 380c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org dst += 1; 381c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org count--; 382981d4798007b91e2e19c13b171583927a56df63breed@google.com } 383c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org } 384c3856384e4ab9a7ad5902696a5c972ab595b8467senorblanco@chromium.org} 385981d4798007b91e2e19c13b171583927a56df63breed@google.com 386edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.comvoid SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 387edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com size_t maskRB, SkColor origColor, 388edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com int width, int height) 389981d4798007b91e2e19c13b171583927a56df63breed@google.com{ 390ee467ee79d449ebe6ae7f7946e613cc70a479c69reed@google.com SkPMColor color = SkPreMultiplyColor(origColor); 391981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t dstOffset = dstRB - (width << 2); 392981d4798007b91e2e19c13b171583927a56df63breed@google.com size_t maskOffset = maskRB - width; 393981d4798007b91e2e19c13b171583927a56df63breed@google.com SkPMColor* dst = (SkPMColor *)device; 394edb606cb999887d54629f361bcbf57c5fede1bb0reed@google.com const uint8_t* mask = (const uint8_t*)maskPtr; 395981d4798007b91e2e19c13b171583927a56df63breed@google.com do { 396981d4798007b91e2e19c13b171583927a56df63breed@google.com int count = width; 397981d4798007b91e2e19c13b171583927a56df63breed@google.com if (count >= 4) { 398981d4798007b91e2e19c13b171583927a56df63breed@google.com while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 399981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst = SkBlendARGB32(color, *dst, *mask); 400981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 401981d4798007b91e2e19c13b171583927a56df63breed@google.com dst++; 402981d4798007b91e2e19c13b171583927a56df63breed@google.com count--; 403981d4798007b91e2e19c13b171583927a56df63breed@google.com } 404981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i *d = reinterpret_cast<__m128i*>(dst); 405981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 406981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_256 = _mm_set1_epi16(256); 407981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i c_1 = _mm_set1_epi16(1); 408981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_pixel = _mm_set1_epi32(color); 409981d4798007b91e2e19c13b171583927a56df63breed@google.com while (count >= 4) { 410981d4798007b91e2e19c13b171583927a56df63breed@google.com // Load 4 pixels each of src and dest. 411981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_pixel = _mm_load_si128(d); 412981d4798007b91e2e19c13b171583927a56df63breed@google.com 413981d4798007b91e2e19c13b171583927a56df63breed@google.com //set the aphla value 414981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 415981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *(mask+3),0, \ 416981d4798007b91e2e19c13b171583927a56df63breed@google.com *(mask+2),0, *(mask+2),\ 417981d4798007b91e2e19c13b171583927a56df63breed@google.com 0,*(mask+1), 0,*(mask+1),\ 418981d4798007b91e2e19c13b171583927a56df63breed@google.com 0, *mask,0,*mask); 419981d4798007b91e2e19c13b171583927a56df63breed@google.com 420981d4798007b91e2e19c13b171583927a56df63breed@google.com //call SkAlpha255To256() 421981d4798007b91e2e19c13b171583927a56df63breed@google.com src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 422981d4798007b91e2e19c13b171583927a56df63breed@google.com 423981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get red and blue pixels into lower byte of each word. 424981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 425981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 426981d4798007b91e2e19c13b171583927a56df63breed@google.com 427981d4798007b91e2e19c13b171583927a56df63breed@google.com // Get alpha and green into lower byte of each word. 428981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 429981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 430981d4798007b91e2e19c13b171583927a56df63breed@google.com 431981d4798007b91e2e19c13b171583927a56df63breed@google.com // Put per-pixel alpha in low byte of each word. 432981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 433981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 434981d4798007b91e2e19c13b171583927a56df63breed@google.com 435981d4798007b91e2e19c13b171583927a56df63breed@google.com // dst_alpha = dst_alpha * src_scale 436981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 437981d4798007b91e2e19c13b171583927a56df63breed@google.com 438981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 439981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_srli_epi16(dst_alpha, 8); 440981d4798007b91e2e19c13b171583927a56df63breed@google.com 441981d4798007b91e2e19c13b171583927a56df63breed@google.com // Subtract alphas from 256, to get 1..256 442981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 443981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by dst pixel alpha. 444981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 445981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by dst pixel alpha. 446981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 447981d4798007b91e2e19c13b171583927a56df63breed@google.com 448981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply red and blue by global alpha. 449981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 450981d4798007b91e2e19c13b171583927a56df63breed@google.com // Multiply alpha and green by global alpha. 451981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 452981d4798007b91e2e19c13b171583927a56df63breed@google.com // Divide by 256. 453981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_rb = _mm_srli_epi16(dst_rb, 8); 454981d4798007b91e2e19c13b171583927a56df63breed@google.com src_rb = _mm_srli_epi16(src_rb, 8); 455981d4798007b91e2e19c13b171583927a56df63breed@google.com 456981d4798007b91e2e19c13b171583927a56df63breed@google.com // Mask out low bits (goodies already in the right place; no need to divide) 457981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 458981d4798007b91e2e19c13b171583927a56df63breed@google.com src_ag = _mm_andnot_si128(rb_mask, src_ag); 459981d4798007b91e2e19c13b171583927a56df63breed@google.com 460981d4798007b91e2e19c13b171583927a56df63breed@google.com // Combine back into RGBA. 461981d4798007b91e2e19c13b171583927a56df63breed@google.com dst_pixel = _mm_or_si128(dst_rb, dst_ag); 462981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 463981d4798007b91e2e19c13b171583927a56df63breed@google.com 464981d4798007b91e2e19c13b171583927a56df63breed@google.com // Add two pixels into result. 465981d4798007b91e2e19c13b171583927a56df63breed@google.com __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 466981d4798007b91e2e19c13b171583927a56df63breed@google.com _mm_store_si128(d, result); 467981d4798007b91e2e19c13b171583927a56df63breed@google.com // load the next 4 pixel 468981d4798007b91e2e19c13b171583927a56df63breed@google.com mask = mask + 4; 469981d4798007b91e2e19c13b171583927a56df63breed@google.com d++; 470981d4798007b91e2e19c13b171583927a56df63breed@google.com count -= 4; 471981d4798007b91e2e19c13b171583927a56df63breed@google.com } 472981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = reinterpret_cast<SkPMColor *>(d); 473981d4798007b91e2e19c13b171583927a56df63breed@google.com } 474981d4798007b91e2e19c13b171583927a56df63breed@google.com while(count > 0) { 475981d4798007b91e2e19c13b171583927a56df63breed@google.com *dst= SkBlendARGB32(color, *dst, *mask); 476981d4798007b91e2e19c13b171583927a56df63breed@google.com dst += 1; 477981d4798007b91e2e19c13b171583927a56df63breed@google.com mask++; 478981d4798007b91e2e19c13b171583927a56df63breed@google.com count --; 479981d4798007b91e2e19c13b171583927a56df63breed@google.com } 480981d4798007b91e2e19c13b171583927a56df63breed@google.com dst = (SkPMColor *)((char*)dst + dstOffset); 481981d4798007b91e2e19c13b171583927a56df63breed@google.com mask += maskOffset; 482981d4798007b91e2e19c13b171583927a56df63breed@google.com } while (--height != 0); 483981d4798007b91e2e19c13b171583927a56df63breed@google.com} 484