/*
 * Copyright 2015 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
70e72b7e2c68083333e90024cafd15d4084abd3a3reed
84bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkBlitRow_opts_SSE4.h"
94bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
// Some compilers can't compile SSSE3 or SSE4 intrinsics.  We give them stub methods.
// The stubs should never be called, so we make them crash just to confirm that.
124bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#if SK_CPU_SSE_LEVEL < SK_CPU_SSE_LEVEL_SSE41
134bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephanavoid S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT, const SkPMColor* SK_RESTRICT, int, U8CPU) {
144bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    sk_throw();
154bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana}
164bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
174bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#else
184bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
1970840cbd898df67f603987213164c798415d76bfhenrik.smiding#include <smmintrin.h>      // SSE4.1 intrinsics
204bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkColorPriv.h"
214bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#include "SkColor_opts_SSE2.h"
221059b1fc9f8711592a81836512850d123d75146dmtklein#include "SkMSAN.h"
234bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
244bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephanavoid S32A_Opaque_BlitRow32_SSE4(SkPMColor* SK_RESTRICT dst,
254bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                                const SkPMColor* SK_RESTRICT src,
264bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                                int count,
274bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                                U8CPU alpha) {
281059b1fc9f8711592a81836512850d123d75146dmtklein    sk_msan_assert_initialized(src, src+count);
291059b1fc9f8711592a81836512850d123d75146dmtklein
304bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    SkASSERT(alpha == 255);
314bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    // As long as we can, we'll work on 16 pixel pairs at once.
324bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    int count16 = count / 16;
334bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    __m128i* dst4 = (__m128i*)dst;
344bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    const __m128i* src4 = (const __m128i*)src;
354bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
364bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    for (int i = 0; i < count16 * 4; i += 4) {
374bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        // Load 16 source pixels.
384bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        __m128i s0 = _mm_loadu_si128(src4+i+0),
394bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                s1 = _mm_loadu_si128(src4+i+1),
404bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                s2 = _mm_loadu_si128(src4+i+2),
414bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana                s3 = _mm_loadu_si128(src4+i+3);
424bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
434bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
444bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
454bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        if (_mm_testz_si128(ORed, alphaMask)) {
464bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            // All 16 source pixels are fully transparent.  There's nothing to do!
474bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            continue;
484bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        }
494bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
504bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        if (_mm_testc_si128(ANDed, alphaMask)) {
514bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            // All 16 source pixels are fully opaque.  There's no need to read dst or blend it.
524bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            _mm_storeu_si128(dst4+i+0, s0);
534bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            _mm_storeu_si128(dst4+i+1, s1);
544bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            _mm_storeu_si128(dst4+i+2, s2);
554bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            _mm_storeu_si128(dst4+i+3, s3);
564bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            continue;
574bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        }
584bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        // The general slow case: do the blend for all 16 pixels.
594bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
604bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
614bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
624bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
634bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    }
644bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
654bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    // Wrap up the last <= 15 pixels.
664bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    for (int i = count16*16; i < count; i++) {
674bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        // This check is not really necessarily, but it prevents pointless autovectorization.
684bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        if (src[i] & 0xFF000000) {
694bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana            dst[i] = SkPMSrcOver(src[i], dst[i]);
704bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana        }
714bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana    }
724bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana}
734bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana
744bf1ce2709f5f4a4850d9b04b3213be732cbdf89stephana#endif
75