SkBlitRow_opts_SSE2.cpp revision dc7de745dd142cdc00ffed7963ebb030a0506f72
19272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* 29272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** 39272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Copyright 2009, The Android Open Source Project 49272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** 59272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Licensed under the Apache License, Version 2.0 (the "License"); 69272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** you may not use this file except in compliance with the License. 79272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** You may obtain a copy of the License at 89272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** 99272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** http://www.apache.org/licenses/LICENSE-2.0 109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** 119272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** Unless required by applicable law or agreed to in writing, software 129272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** distributed under the License is distributed on an "AS IS" BASIS, 139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** See the License for the specific language governing permissions and 159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org ** limitations under the License. 169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 184e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org#include "SkBlitRow_opts_SSE2.h" 199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include "SkColorPriv.h" 209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#include <emmintrin.h> 229272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 239272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org/* SSE2 version of S32_Blend_BlitRow32() 249272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org * portable version is in core/SkBlitRow_D32.cpp 259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org */ 264e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 274e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 284e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 309272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 319272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 329272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 339272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 349272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 359272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org uint32_t dst_scale = 256 - src_scale; 369272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 37dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 38dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 39dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 40dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 41dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 42dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 43dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 44dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 45dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 46dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 47dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 48dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 49dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(src_scale); 50dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); 51dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 52dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 53dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 54dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 55dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 56dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 57dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 58dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 59dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 60dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 61dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 62dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 63dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 64dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by scale. 65dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 66dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 67dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); 68dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); 69dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 70dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 71dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 72dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 73dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 74dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 75dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 76dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 77dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 78dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 79dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 80dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 81dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 82dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 83dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 84dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 85dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 86dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 87dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 88dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 899272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 909272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 914e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org while (count > 0) { 929272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 939272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 949272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 959272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 969272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 979272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 989272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 994e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 1004e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 1014e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 1029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha == 255); 1039272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 1049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 1059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 106dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 107dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 108dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org SkASSERT(((size_t)dst & 0x03) == 0); 109dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 110dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 111dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 112dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 113dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 114dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 115dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 116dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 117dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 1189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#ifdef SK_USE_ACCURATE_BLENDING 119dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 120dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 121dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 122dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 123dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 124dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 125dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 126dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 127dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 128dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); 129dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_srli_epi16(dst_ag, 8); 130dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 131dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 132dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 133dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 134dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 135dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 136dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 255, to get 0..255 137dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_255, alpha); 138dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 139dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 140dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 141dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 142dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 143dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 144dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb_low = (dst_rb >> 8) 145dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 146dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 147dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 148dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 149dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 150dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_add_epi16(dst_rb, c_128); 151dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 152dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 153dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 154dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 155dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_add_epi16(dst_ag, c_128); 156dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 157dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 158dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 159dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 160dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 161dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 162dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 163dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 164dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 165dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 166dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 167dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 168dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org #else 169dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 170dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 171dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 172dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels 173dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 174dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 175dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 176dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 177dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); 178dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_srli_epi16(dst_ag, 8); 179dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Shift alphas down to lower 8 bits of each quad. 180dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i alpha = _mm_srli_epi32(src_pixel, 24); 181dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 182dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Copy alpha to upper 3rd byte of each quad 183dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 184dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 185dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 186dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org alpha = _mm_sub_epi16(c_256, alpha); 187dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 188dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by red and blue by src alpha. 189dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, alpha); 190dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply by alpha and green by src alpha. 191dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, alpha); 192dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 193dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 194dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 195dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 196dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out high bits (already in the right place) 197dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 198dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 199dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 200dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 201dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 202dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add result 203dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 204dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 205dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 206dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 207dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 208dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org#endif 210dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 211dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 212dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 2139272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2149272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 2159272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkPMSrcOver(*src, *dst); 2169272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 2179272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 2189272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 2199272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2209272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 2219272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 2224e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.orgvoid S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 2234e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org const SkPMColor* SK_RESTRICT src, 2244e753558fc8cc2f77cbcd46fba80d8612e836a1esenorblanco@chromium.org int count, U8CPU alpha) { 2259272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org SkASSERT(alpha <= 255); 2269272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org if (count <= 0) { 2279272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org return; 2289272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 2299272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org 230dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org if (count >= 4) { 231dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (((size_t)dst & 0x0F) != 0) { 232dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 233dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src++; 234dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst++; 235dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count--; 236dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 237dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 238dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org uint32_t src_scale = SkAlpha255To256(alpha); 239dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 240dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org const __m128i *s = reinterpret_cast<const __m128i*>(src); 241dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i *d = reinterpret_cast<__m128i*>(dst); 242dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_scale_wide = _mm_set1_epi16(src_scale); 243dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 244dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 245dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org while (count >= 4) { 246dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Load 4 pixels each of src and dest. 247dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_pixel = _mm_loadu_si128(s); 248dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_pixel = _mm_load_si128(d); 249dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 250dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get red and blue pixels into lower byte of each word. 251dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 252dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 253dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 254dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Get alpha and green into lower byte of each word. 255dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 256dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 257dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 258dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Put per-pixel alpha in low byte of each word. 259dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 260dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 261dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 262dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // dst_alpha = dst_alpha * src_scale 263dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 264dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 265dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 266dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_srli_epi16(dst_alpha, 8); 267dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 268dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Subtract alphas from 256, to get 1..256 269dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 270dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 271dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by dst pixel alpha. 272dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 273dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by dst pixel alpha. 274dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 275dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 276dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply red and blue by global alpha. 277dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 278dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Multiply alpha and green by global alpha. 279dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 280dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 281dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Divide by 256. 282dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_rb = _mm_srli_epi16(dst_rb, 8); 283dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_rb = _mm_srli_epi16(src_rb, 8); 284dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 285dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Mask out low bits (goodies already in the right place; no need to divide) 286dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 287dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_ag = _mm_andnot_si128(rb_mask, src_ag); 288dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 289dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Combine back into RGBA. 290dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst_pixel = _mm_or_si128(dst_rb, dst_ag); 291dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src_pixel = _mm_or_si128(src_rb, src_ag); 292dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 293dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org // Add two pixels into result. 294dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 295dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org _mm_store_si128(d, result); 296dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org s++; 297dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org d++; 298dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org count -= 4; 299dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org } 300dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org src = reinterpret_cast<const SkPMColor*>(s); 301dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org dst = reinterpret_cast<SkPMColor*>(d); 3029272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 303dc7de745dd142cdc00ffed7963ebb030a0506f72senorblanco@chromium.org 3049272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org while (count > 0) { 3059272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org *dst = SkBlendARGB32(*src, *dst, alpha); 3069272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org src++; 3079272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org dst++; 3089272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org count--; 3099272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org } 3109272761b22746d2d22439c26f5555028f8e824dasenorblanco@chromium.org} 311