10e72b7e2c68083333e90024cafd15d4084abd3a3reed/* 20e72b7e2c68083333e90024cafd15d4084abd3a3reed * Copyright 2015 Google Inc. 30e72b7e2c68083333e90024cafd15d4084abd3a3reed * 40e72b7e2c68083333e90024cafd15d4084abd3a3reed * Use of this source code is governed by a BSD-style license that can be 50e72b7e2c68083333e90024cafd15d4084abd3a3reed * found in the LICENSE file. 60e72b7e2c68083333e90024cafd15d4084abd3a3reed */ 70e72b7e2c68083333e90024cafd15d4084abd3a3reed 84bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkBlitRow_opts_SSE4.h" 94bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 104bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana// Some compilers can't compile SSSE3 or SSE4 intrinsics. We give them stub methods. 114bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana// The stubs should never be called, so we make them crash just to confirm that. 124bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41 134bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephanavoid S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) { 144bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana sk_throw(); 154bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana} 164bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 174bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#else 184bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 1970840cbd898df67f603987213164c798415d76bfhenrik.smiding#include <smmintrin.h> // SSE4.1 intrinsics 204bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkColorPriv.h" 214bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkColor_opts_SSE2.h" 221059b1fc9f8711592a81836512850d123d75146dmtklein#include "SkMSAN.h" 234bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 244bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephanavoid S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst, 254bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana const SkPMColor* SK_RESTRICT src, 264bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana int count, 274bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana U8CPU alpha) { 281059b1fc9f8711592a81836512850d123d75146dmtklein sk_msan_assert_initialized(src, src+count); 291059b1fc9f8711592a81836512850d123d75146dmtklein 304bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana SkASSERT(alpha == 255); 314bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // As long as we can, we'll work on 16 pixel pairs at once. 324bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana int count16 = count / 16; 334bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana __m128i* dst4 = (__m128i*)dst; 344bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana const __m128i* src4 = (const __m128i*)src; 354bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 364bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana for (int i = 0; i < count16 * 4; i += 4) { 374bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // Load 16 source pixels. 384bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana __m128i s0 = _mm_loadu_si128(src4+i+0), 394bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana s1 = _mm_loadu_si128(src4+i+1), 404bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana s2 = _mm_loadu_si128(src4+i+2), 414bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana s3 = _mm_loadu_si128(src4+i+3); 424bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 434bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT); 444bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); 454bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana if (_mm_testz_si128(ORed, alphaMask)) { 464bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // All 16 source pixels are fully transparent. There's nothing to do! 474bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana continue; 484bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana } 494bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); 504bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana if (_mm_testc_si128(ANDed, alphaMask)) { 514bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // All 16 source pixels are fully opaque. There's no need to read dst or blend it. 524bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+0, s0); 534bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+1, s1); 544bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+2, s2); 554bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+3, s3); 564bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana continue; 574bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana } 584bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // The general slow case: do the blend for all 16 pixels. 594bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0))); 604bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1))); 614bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2))); 624bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3))); 634bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana } 644bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 654bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // Wrap up the last <= 15 pixels. 664bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana for (int i = count16*16; i < count; i++) { 674bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana // This check is not really necessarily, but it prevents pointless autovectorization. 684bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana if (src[i] & 0xFF000000) { 694bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana dst[i] = SkPMSrcOver(src[i], dst[i]); 704bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana } 714bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana } 724bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana} 734bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana 744bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#endif 75