/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#include <stdint.h>
187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#include <x86intrin.h>
197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James/* Unsigned extend packed 8-bit integer (in LBS) into packed 32-bit integer */
217b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i cvtepu8_epi32(__m128i x) {
227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__)
237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_cvtepu8_epi32(x);
247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__)
257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_shuffle_epi8(x, M8to32);
277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return x;
287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else
297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#   error "Require at least SSSE3"
307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
337b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i packus_epi32(__m128i lo, __m128i hi) {
347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__)
357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_packus_epi32(lo, hi);
367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__)
377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                        _mm_shuffle_epi8(hi, M32to16H));
477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else
487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#   error "Require at least SSSE3"
497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
527b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i mullo_epi32(__m128i x, __m128i y) {
537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__)
547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_mullo_epi32(x, y);
557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__)
567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i even = _mm_mul_epu32(x, y);
587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                _mm_srli_si128(y, 4));
607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    even = _mm_and_si128(even, Meven);
617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    odd = _mm_and_si128(odd, Meven);
627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else
647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#   error "Require at least SSSE3"
657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James/* 'mask' must packed 8-bit of 0x00 or 0xff */
697b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__)
717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_blendv_epi8(x, y, mask);
727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__)
737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else
757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#   error "Require at least SSSE3"
767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
79ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
80ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert                                          const void *y1, const void *y2,
81ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert                                          const short *coef, uint32_t count) {
827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i x;
837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c2, c4, c6, c8;
847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i r0, r1, r2;
857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i o0, o1;
877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+0));
907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_shuffle_epi32(x, 0x00);
917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_shuffle_epi32(x, 0x55);
927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+4));
937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c4 = _mm_shuffle_epi32(x, 0x00);
947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c6 = _mm_shuffle_epi32(x, 0x55);
957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+8));
967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c8 = _mm_shuffle_epi32(x, 0x00);
977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count; ++i) {
997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
1017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
1027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
1037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
1047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
1057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
1067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
1077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
1087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
1097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
1107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
1117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
1127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
1147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
1157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
1177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
1187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
1207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
1217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
1237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
1247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
1267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
1277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_srai_epi32(o0, 8);
1297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_srai_epi32(o1, 8);
1307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = packus_epi32(o0, o1);
1327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_packus_epi16(o0, o0);
1337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storel_epi64((__m128i *)dst, o0);
1347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y0 = (const char *)y0 + 8;
1367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y1 = (const char *)y1 + 8;
1377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = (const char *)y2 + 8;
1387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 8;
1397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
1407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
1417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1427b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
1437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                  const short *coef, uint32_t count) {
1447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
1457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      14, 10, 6, 2,
1467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      13,  9, 5, 1,
1477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      12,  8, 4, 0);
1487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
1507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
1517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3;
1527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i i4, o4;
1537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i xy, zw;
1547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i x2, y2, z2, w2;
1557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
1567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
1587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
1597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_unpacklo_epi16(c0, c1);
1607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
1627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
1637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_unpacklo_epi16(c2, c3);
1647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count; ++i) {
1667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        i4 = _mm_load_si128((const __m128i *)src);
1677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        xy = _mm_shuffle_epi8(i4, Mxy);
1687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        zw = _mm_shuffle_epi8(i4, Mzw);
1697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
1717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
1727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
1737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
1747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
1767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
1777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
1787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
1797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_srai_epi32(x2, 8);
1817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = _mm_srai_epi32(y2, 8);
1827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = _mm_srai_epi32(z2, 8);
1837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        w2 = _mm_srai_epi32(w2, 8);
1847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = packus_epi32(x2, y2);
1867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = packus_epi32(z2, w2);
1877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_packus_epi16(x2, z2);
1887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_shuffle_epi8(o4, T4x4);
1907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, o4);
1917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        src = (const char *)src + 16;
1937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 16;
1947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
1957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
1967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
1977b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
1987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                  const short *coef, uint32_t count) {
1997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
2007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      14, 10, 6, 2,
2017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      13,  9, 5, 1,
2027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      12,  8, 4, 0);
2037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
2057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
2067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3;
2087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i i4, o4;
2097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i xy, zw;
2107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i x2, y2, z2, w2;
2117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
2127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
2147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
2157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_unpacklo_epi16(c0, c1);
2167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
2187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
2197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_unpacklo_epi16(c2, c3);
2207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count; ++i) {
2227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        i4 = _mm_loadu_si128((const __m128i *)src);
2237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        xy = _mm_shuffle_epi8(i4, Mxy);
2247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        zw = _mm_shuffle_epi8(i4, Mzw);
2257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
2277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
2287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
2297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
2317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
2327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
2337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_srai_epi32(x2, 8);
2357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = _mm_srai_epi32(y2, 8);
2367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = _mm_srai_epi32(z2, 8);
2377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        w2 = _mm_srli_epi32(zw, 16);
2387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = packus_epi32(x2, y2);
2407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = packus_epi32(z2, w2);
2417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_packus_epi16(x2, z2);
2427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_shuffle_epi8(o4, T4x4);
2447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, o4);
2457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        src = (const char *)src + 16;
2477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 16;
2487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
2497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
2507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2517b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
2527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                  const short *coef, uint32_t count) {
2537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
2547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      14, 10, 6, 2,
2557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      13,  9, 5, 1,
2567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                      12,  8, 4, 0);
2577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
2587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
2597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3;
2607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i i4, o4;
2617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i xy, zw;
2627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i x2, y2, z2, w2;
2637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
2647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
2667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_shufflelo_epi16(c0, 0);
2677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
2687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_shufflelo_epi16(c1, 0);
2697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_unpacklo_epi16(c0, c1);
2707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
2727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_shufflelo_epi16(c2, 0);
2737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
2747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_shufflelo_epi16(c3, 0);
2757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_unpacklo_epi16(c2, c3);
2767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count; ++i) {
2787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        i4 = _mm_loadu_si128((const __m128i *)src);
2797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        xy = _mm_shuffle_epi8(i4, Mxy);
2817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        zw = _mm_shuffle_epi8(i4, Mzw);
2827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 =  _mm_madd_epi16(xy, c0);
2847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
2857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = _mm_srai_epi32(x2, 8);
2877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = x2;
2887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = x2;
2897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        w2 = _mm_srli_epi32(zw, 16);
2907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        x2 = packus_epi32(x2, y2);
2927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        z2 = packus_epi32(z2, w2);
2937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_packus_epi16(x2, z2);
2947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o4 = _mm_shuffle_epi8(o4, T4x4);
2967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, o4);
2977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
2987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        src = (const char *)src + 16;
2997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 16;
3007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
3017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
3027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3037b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicBlurVFU4_K(void *dst,
3047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                          const void *pin, int stride, const void *gptr,
3057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                          int rct, int x1, int x2) {
3067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const char *pi;
3077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i pi0, pi1;
3087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128 pf0, pf1;
3097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128 bp0, bp1;
3107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128 x;
3117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    int r;
3127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (; x1 < x2; x1 += 2) {
3147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pi = (const char *)pin + (x1 << 2);
3157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        bp0 = _mm_setzero_ps();
3167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        bp1 = _mm_setzero_ps();
3177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        for (r = 0; r < rct; ++r) {
3197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            x = _mm_load_ss((const float *)gptr + r);
3207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
3217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
3237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
3247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
3267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
3277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
3297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
3307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            pi += stride;
3327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        }
3337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
3347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_ps((float *)dst, bp0);
3357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_ps((float *)dst + 4, bp1);
3367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 32;
3377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
3387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
3397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Horizontal weighted sum over float data, one 4-float group per step.
 *
 * For every x in [x1, x2), the 4-float groups at pin + 4*(x + r) are scaled
 * by weight r and summed over r = 0..rct-1. The sums are converted to 32-bit
 * integers, the low byte of each is kept (no saturation), and the 4 resulting
 * bytes are written to dst, which then advances by 4 bytes.
 *
 * dst     byte output, written 4 bytes at a time through an int store.
 * pin     float input; every accessed group uses an aligned 16-byte load.
 * gptr    array of float weights.
 * rct     tap count; the caller defines it as 2*r+1. Weights after the first
 *         are consumed in pairs, so an even rct would read weight index rct.
 * x1, x2  half-open range of group indices.
 */
void rsdIntrinsicBlurHFU4_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    /* Collects byte 0 of every dword into the low 32 bits. */
    const __m128i lowBytes = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                          -1, -1, -1, -1, 12, 8, 4, 0);
    const float *weights = (const float *)gptr;
    int *out = (int *)dst;

    for (; x1 < x2; ++x1) {
        const float *base = (const float *)pin + (x1 << 2);
        __m128 acc = _mm_mul_ps(_mm_set1_ps(weights[0]), _mm_load_ps(base));

        for (int r = 1; r < rct; r += 2) {
            const float *tap = base + (r << 2);
            const __m128 wa = _mm_set1_ps(weights[r]);
            const __m128 wb = _mm_set1_ps(weights[r + 1]);

            acc = _mm_add_ps(acc, _mm_mul_ps(wa, _mm_load_ps(tap)));
            acc = _mm_add_ps(acc, _mm_mul_ps(wb, _mm_load_ps(tap + 4)));
        }

        *out++ = _mm_cvtsi128_si32(_mm_shuffle_epi8(_mm_cvtps_epi32(acc), lowBytes));
    }
}
3727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Horizontal weighted sum over single-float pixels, four outputs per iteration.
 *
 * For each group of four consecutive x starting at x1 (x1 advances by 4 while
 * x1 < x2), computes out[x] = sum_k gptr[k] * pin[x + k], converts the four
 * sums with _mm_cvtps_epi32, keeps the low byte of each lane (no saturation
 * is applied here) and writes the 4 bytes to dst.
 *
 * After gptr[0], weights are consumed four at a time: every inner iteration
 * loads gptr[r..r+3] and the eight input floats pi[r..r+7]. When rct - 1 is
 * not a multiple of 4 this reads weights at indices >= rct, so the caller
 * must supply readable (presumably zero) padding there -- TODO confirm
 * against the caller.
 *
 * dst   - output, advanced by 4 bytes per group of four pixels
 * pin   - input floats (unaligned loads are used)
 * gptr  - float weights
 * rct   - number of weights
 * x1,x2 - half-open range of pixel indices; processed in steps of 4
 */
void rsdIntrinsicBlurHFU1_K(void *dst,
                          const void *pin, const void *gptr,
                          int rct, int x1, int x2) {
    /* Gathers bytes 0, 4, 8 and 12 into the low dword; remaining bytes are zeroed. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0;
    __m128i p0i, p1i, o;
    int r;

    for (; x1 < x2; x1+=4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);

            /*
             * _mm_alignr_epi8 operates on __m128i, so reinterpret the float
             * vectors explicitly instead of relying on lax vector conversions.
             * The casts generate no instructions.
             */
            p0i = _mm_castps_si128(p0);
            p1i = _mm_castps_si128(_mm_loadu_ps(pi + r + 4));

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            /* Byte shifts of 4/8/12 slide the 8-float window by 1/2/3 floats. */
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(p1i, p0i, 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(p1i, p0i, 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(p1i, p0i, 12))));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
4097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4107b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuv_K(void *dst,
4117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       const unsigned char *pY, const unsigned char *pUV,
4127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       uint32_t count, const short *param) {
4137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i biasY, biasUV;
4147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3, c4;
4157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasY = _mm_set1_epi32(param[8]);   /*  16 */
4177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasUV = _mm_set1_epi32(param[16]); /* 128 */
4187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_set1_epi32(param[0]);  /*  298 */
4207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_set1_epi32(param[1]);  /*  409 */
4217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_set1_epi32(param[2]);  /* -100 */
4227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_set1_epi32(param[3]);  /*  516 */
4237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c4 = _mm_set1_epi32(param[4]);  /* -208 */
4247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i Y, UV, U, V, R, G, B, A;
4267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    A = _mm_set1_epi32(255);
4287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
4297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < (count << 1); ++i) {
4317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
4327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
4337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = _mm_sub_epi32(Y, biasY);
4357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        UV = _mm_sub_epi32(UV, biasUV);
4367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        U = _mm_shuffle_epi32(UV, 0xf5);
4387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        V = _mm_shuffle_epi32(UV, 0xa0);
4397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = mullo_epi32(Y, c0);
4417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
4437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(R, biasUV);
4447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_srai_epi32(R, 8);
4457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
4477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, mullo_epi32(V, c4));
4487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, biasUV);
4497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_srai_epi32(G, 8);
4507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
4527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(B, biasUV);
4537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_srai_epi32(B, 8);
4547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        __m128i y1, y2, y3, y4;
4567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y1 = packus_epi32(R, G);
4587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = packus_epi32(B, A);
4597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y3 = _mm_packus_epi16(y1, y2);
4607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
4617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          14, 10, 6, 2,
4627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          13,  9, 5, 1,
4637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          12,  8, 4, 0);
4647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y4 = _mm_shuffle_epi8(y3, T4x4);
4657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, y4);
4667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pY += 4;
4677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pUV += 4;
4687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (__m128i *)dst + 1;
4697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
4707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
4717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4727b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuvR_K(void *dst,
4737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       const unsigned char *pY, const unsigned char *pUV,
4747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       uint32_t count, const short *param) {
4757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i biasY, biasUV;
4767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3, c4;
4777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasY = _mm_set1_epi32(param[8]);   /*  16 */
4797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasUV = _mm_set1_epi32(param[16]); /* 128 */
4807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_set1_epi32(param[0]);  /*  298 */
4827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_set1_epi32(param[1]);  /*  409 */
4837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_set1_epi32(param[2]);  /* -100 */
4847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_set1_epi32(param[3]);  /*  516 */
4857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c4 = _mm_set1_epi32(param[4]);  /* -208 */
4867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i Y, UV, U, V, R, G, B, A;
4887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    A = _mm_set1_epi32(255);
4907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
4917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < (count << 1); ++i) {
4937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
4947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
4957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = _mm_sub_epi32(Y, biasY);
4977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        UV = _mm_sub_epi32(UV, biasUV);
4987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        V = _mm_shuffle_epi32(UV, 0xf5);
5007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        U = _mm_shuffle_epi32(UV, 0xa0);
5017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = mullo_epi32(Y, c0);
5037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
5057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(R, biasUV);
5067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_srai_epi32(R, 8);
5077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
5097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, mullo_epi32(V, c4));
5107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, biasUV);
5117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_srai_epi32(G, 8);
5127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
5147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(B, biasUV);
5157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_srai_epi32(B, 8);
5167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        __m128i y1, y2, y3, y4;
5187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y1 = packus_epi32(R, G);
5207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = packus_epi32(B, A);
5217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y3 = _mm_packus_epi16(y1, y2);
5227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
5237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          14, 10, 6, 2,
5247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          13,  9, 5, 1,
5257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          12,  8, 4, 0);
5267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y4 = _mm_shuffle_epi8(y3, T4x4);
5277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, y4);
5287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pY += 4;
5297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pUV += 4;
5307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (__m128i *)dst + 1;
5317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
5327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
5337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5347b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuv2_K(void *dst,
5357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       const unsigned char *pY, const unsigned char *pU,
5367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                       const unsigned char *pV, uint32_t count, const short *param) {
5377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i biasY, biasUV;
5387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c1, c2, c3, c4;
5397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasY = _mm_set1_epi32(param[8]);   /*  16 */
5417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    biasUV = _mm_set1_epi32(param[16]); /* 128 */
5427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0 = _mm_set1_epi32(param[0]);  /*  298 */
5447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c1 = _mm_set1_epi32(param[1]);  /*  409 */
5457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2 = _mm_set1_epi32(param[2]);  /* -100 */
5467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c3 = _mm_set1_epi32(param[3]);  /*  516 */
5477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c4 = _mm_set1_epi32(param[4]);  /* -208 */
5487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i Y, U, V, R, G, B, A;
5507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    A = _mm_set1_epi32(255);
5527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
5537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < (count << 1); ++i) {
5557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
5567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
5577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James		V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
5587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = _mm_sub_epi32(Y, biasY);
5607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        U = _mm_sub_epi32(U, biasUV);
5617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James		V = _mm_sub_epi32(V, biasUV);
5627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        Y = mullo_epi32(Y, c0);
5647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
5667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_add_epi32(R, biasUV);
5677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        R = _mm_srai_epi32(R, 8);
5687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
5707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, mullo_epi32(V, c4));
5717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_add_epi32(G, biasUV);
5727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        G = _mm_srai_epi32(G, 8);
5737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
5757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_add_epi32(B, biasUV);
5767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        B = _mm_srai_epi32(B, 8);
5777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        __m128i y1, y2, y3, y4;
5797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
5807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y1 = packus_epi32(R, G);
5817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = packus_epi32(B, A);
5827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y3 = _mm_packus_epi16(y1, y2);
5837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
5847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          14, 10, 6, 2,
5857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          13,  9, 5, 1,
5867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James                                          12,  8, 4, 0);
5877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y4 = _mm_shuffle_epi8(y3, T4x4);
5887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, y4);
5897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pY += 4;
5907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        pU += 4;
5917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James		pV += 4;
5927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (__m128i *)dst + 1;
5937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
5947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
5957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
596ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
597ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert                                          const void *y1, const void *y2,
598ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert                                          const void *y3, const void *y4,
599ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert                                          const short *coef, uint32_t count) {
6007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i x;
6017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c0, c2, c4, c6, c8, c10, c12;
6027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i c14, c16, c18, c20, c22, c24;
6037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
6047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
6057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
6067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
6077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
6087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
6097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i o0, o1, o2, o3;
6107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
6117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+0));
6137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c0  = _mm_shuffle_epi32(x, 0x00);
6147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c2  = _mm_shuffle_epi32(x, 0x55);
6157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+4));
6177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c4  = _mm_shuffle_epi32(x, 0x00);
6187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c6  = _mm_shuffle_epi32(x, 0x55);
6197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+8));
6217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c8  = _mm_shuffle_epi32(x, 0x00);
6227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c10  = _mm_shuffle_epi32(x, 0x55);
6237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+12));
6257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c12  = _mm_shuffle_epi32(x, 0x00);
6267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c14  = _mm_shuffle_epi32(x, 0x55);
6277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+16));
6297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c16  = _mm_shuffle_epi32(x, 0x00);
6307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c18  = _mm_shuffle_epi32(x, 0x55);
6317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+20));
6337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c20  = _mm_shuffle_epi32(x, 0x00);
6347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c22  = _mm_shuffle_epi32(x, 0x55);
6357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    x = _mm_loadl_epi64((const __m128i *)(coef+24));
6377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    c24  = _mm_shuffle_epi32(x, 0x00);
6387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count; ++i) {
6407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
6427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
6437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
6447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
6457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
6467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
6477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
6487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
6497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
6517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
6527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
6537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
6547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
6557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
6567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
6577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
6587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
6607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
6617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
6627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
6637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
6647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
6657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
6667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
6677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
6697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
6707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
6717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
6727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
6737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
6747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
6757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
6767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
6787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
6797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
6807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
6817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
6827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
6837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
6847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
6857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
6867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
6877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
6887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
6897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
6907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
6917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
6927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
6937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
6947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
6957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
6967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
6977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
6987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
6997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_srai_epi32(o0, 8);
7007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
7017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
7027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
7037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
7047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
7057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
7067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
7077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
7087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
7097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
7107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
7117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
7127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
7137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
7147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o1 = _mm_srai_epi32(o1, 8);
7157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
7167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
7177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
7187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
7197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
7207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
7217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
7227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
7237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
7247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
7257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
7267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
7277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
7287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
7297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = _mm_srai_epi32(o2, 8);
7307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
7317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
7327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
7337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
7347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
7357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
7367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
7377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
7387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
7397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
7407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
7417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
7427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
7437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
7447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o3 = _mm_srai_epi32(o3, 8);
7457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
7467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = packus_epi32(o0, o1);
7477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o2 = packus_epi32(o2, o3);
7487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        o0 = _mm_packus_epi16(o0, o2);
7497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, o0);
7507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
7517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y0 = (const char *)y0 + 16;
7527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y1 = (const char *)y1 + 16;
7537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y2 = (const char *)y2 + 16;
7547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y3 = (const char *)y3 + 16;
7557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        y4 = (const char *)y4 + 16;
7567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (char *)dst + 16;
7577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
7587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
7597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Source-over blend of src onto dst, in place, eight 4-byte pixels
 * (32 bytes) per loop iteration.
 *
 * Every byte is computed as
 *     dst = min(255, src + ((dst * (255 - a)) >> 8))
 * where a is byte 3 of the source pixel the byte belongs to (presumably the
 * alpha channel, going by the local naming).  The scale is a shift by 8, not
 * a division by 255, so a weight of 255 maps 200 to 199 rather than 200.
 *
 * dst    - destination pixels; read and overwritten.  Accessed with unaligned
 *          loads/stores, so no 16-byte alignment is required.
 * src    - source pixels; only read.
 * count8 - number of 8-pixel groups to process; 0 touches nothing.
 */
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all1s = _mm_set1_epi16(255);
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);
        __m128i ins, ina, t0, t1, t2, t3;

        /*
         * Each step widens two pixels to 16-bit lanes, then the 0xFF shuffles
         * copy word 3 / word 7 (byte 3 of each pixel) across that pixel's
         * four lanes so it can be used as a per-pixel weight.
         */

        /* Pixels 0-1. */
        ins = _mm_unpacklo_epi8(in0, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ins, 0xFF), 0xFF);
        t0 = _mm_mullo_epi16(_mm_unpacklo_epi8(out0, zero),
                             _mm_sub_epi16(all1s, ina));
        /* Logical shift: the 16-bit product is unsigned and may exceed 32767. */
        t0 = _mm_add_epi16(_mm_srli_epi16(t0, 8), ins);

        /* Pixels 2-3. */
        ins = _mm_unpackhi_epi8(in0, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ins, 0xFF), 0xFF);
        t1 = _mm_mullo_epi16(_mm_unpackhi_epi8(out0, zero),
                             _mm_sub_epi16(all1s, ina));
        t1 = _mm_add_epi16(_mm_srli_epi16(t1, 8), ins);

        /* Pixels 4-5. */
        ins = _mm_unpacklo_epi8(in1, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ins, 0xFF), 0xFF);
        t2 = _mm_mullo_epi16(_mm_unpacklo_epi8(out1, zero),
                             _mm_sub_epi16(all1s, ina));
        t2 = _mm_add_epi16(_mm_srli_epi16(t2, 8), ins);

        /* Pixels 6-7. */
        ins = _mm_unpackhi_epi8(in1, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ins, 0xFF), 0xFF);
        t3 = _mm_mullo_epi16(_mm_unpackhi_epi8(out1, zero),
                             _mm_sub_epi16(all1s, ina));
        t3 = _mm_add_epi16(_mm_srli_epi16(t3, 8), ins);

        /* Narrow back to bytes; the pack saturates sums above 255. */
        _mm_storeu_si128(out, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(t2, t3));
    }
}
8157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Destination-over blend, in place, eight 4-byte pixels (32 bytes) per loop
 * iteration.
 *
 * Every byte is computed as
 *     dst = min(255, dst + ((src * (255 - a)) >> 8))
 * where a is byte 3 of the destination pixel the byte belongs to (presumably
 * the alpha channel, going by the local naming).  The scale is a shift by 8,
 * not a division by 255, so a weight of 255 maps 200 to 199 rather than 200.
 *
 * dst    - destination pixels; read and overwritten.  Accessed with unaligned
 *          loads/stores, so no 16-byte alignment is required.
 * src    - source pixels; only read.
 * count8 - number of 8-pixel groups to process; 0 touches nothing.
 */
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all1s = _mm_set1_epi16(255);
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);
        __m128i outs, outa, t0, t1, t2, t3;

        /*
         * Each step widens two pixels to 16-bit lanes, then the 0xFF shuffles
         * copy word 3 / word 7 (byte 3 of each pixel) across that pixel's
         * four lanes so it can be used as a per-pixel weight.
         */

        /* Pixels 0-1. */
        outs = _mm_unpacklo_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outs, 0xFF), 0xFF);
        t0 = _mm_mullo_epi16(_mm_unpacklo_epi8(in0, zero),
                             _mm_sub_epi16(all1s, outa));
        /* Logical shift: the 16-bit product is unsigned and may exceed 32767. */
        t0 = _mm_add_epi16(_mm_srli_epi16(t0, 8), outs);

        /* Pixels 2-3. */
        outs = _mm_unpackhi_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outs, 0xFF), 0xFF);
        t1 = _mm_mullo_epi16(_mm_unpackhi_epi8(in0, zero),
                             _mm_sub_epi16(all1s, outa));
        t1 = _mm_add_epi16(_mm_srli_epi16(t1, 8), outs);

        /* Pixels 4-5. */
        outs = _mm_unpacklo_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outs, 0xFF), 0xFF);
        t2 = _mm_mullo_epi16(_mm_unpacklo_epi8(in1, zero),
                             _mm_sub_epi16(all1s, outa));
        t2 = _mm_add_epi16(_mm_srli_epi16(t2, 8), outs);

        /* Pixels 6-7. */
        outs = _mm_unpackhi_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outs, 0xFF), 0xFF);
        t3 = _mm_mullo_epi16(_mm_unpackhi_epi8(in1, zero),
                             _mm_sub_epi16(all1s, outa));
        t3 = _mm_add_epi16(_mm_srli_epi16(t3, 8), outs);

        /* Narrow back to bytes; the pack saturates sums above 255. */
        _mm_storeu_si128(out, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(t2, t3));
    }
}
8727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Source-in blend, in place, eight 4-byte pixels (32 bytes) per loop
 * iteration.
 *
 * Every byte is computed as
 *     dst = (src * a) >> 8
 * where a is byte 3 of the destination pixel the byte belongs to (presumably
 * the alpha channel, going by the local naming).  The scale is a shift by 8,
 * not a division by 255, so a weight of 255 maps 200 to 199 rather than 200.
 *
 * dst    - destination pixels; read (for the weight) and overwritten.
 *          Accessed with unaligned loads/stores, so no 16-byte alignment is
 *          required.
 * src    - source pixels; only read.
 * count8 - number of 8-pixel groups to process; 0 touches nothing.
 */
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);
        __m128i outa, t0, t1, t2, t3;

        /*
         * Each step widens two pixels to 16-bit lanes, then the 0xFF shuffles
         * copy word 3 / word 7 (byte 3 of each pixel) across that pixel's
         * four lanes so it can be used as a per-pixel weight.
         */

        /* Pixels 0-1. */
        outa = _mm_unpacklo_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        /* Logical shift: the 16-bit product is unsigned and may exceed 32767. */
        t0 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(in0, zero), outa), 8);

        /* Pixels 2-3. */
        outa = _mm_unpackhi_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t1 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(in0, zero), outa), 8);

        /* Pixels 4-5. */
        outa = _mm_unpacklo_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t2 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(in1, zero), outa), 8);

        /* Pixels 6-7. */
        outa = _mm_unpackhi_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t3 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(in1, zero), outa), 8);

        /* Narrow the 16-bit lanes back to bytes and write the result. */
        _mm_storeu_si128(out, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(t2, t3));
    }
}
9227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Destination-in blend, in place, eight 4-byte pixels (32 bytes) per loop
 * iteration.
 *
 * Every byte is computed as
 *     dst = (dst * a) >> 8
 * where a is byte 3 of the source pixel the byte belongs to (presumably the
 * alpha channel, going by the local naming).  The scale is a shift by 8, not
 * a division by 255, so a weight of 255 maps 200 to 199 rather than 200.
 *
 * dst    - destination pixels; read and overwritten.  Accessed with unaligned
 *          loads/stores, so no 16-byte alignment is required.
 * src    - source pixels; only read (only byte 3 of each pixel affects the
 *          result).
 * count8 - number of 8-pixel groups to process; 0 touches nothing.
 */
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);
        __m128i ina, t0, t1, t2, t3;

        /*
         * Each step widens two pixels to 16-bit lanes, then the 0xFF shuffles
         * copy word 3 / word 7 (byte 3 of each pixel) across that pixel's
         * four lanes so it can be used as a per-pixel weight.
         */

        /* Pixels 0-1. */
        ina = _mm_unpacklo_epi8(in0, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ina, 0xFF), 0xFF);
        /* Logical shift: the 16-bit product is unsigned and may exceed 32767. */
        t0 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(out0, zero), ina), 8);

        /* Pixels 2-3. */
        ina = _mm_unpackhi_epi8(in0, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ina, 0xFF), 0xFF);
        t1 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(out0, zero), ina), 8);

        /* Pixels 4-5. */
        ina = _mm_unpacklo_epi8(in1, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ina, 0xFF), 0xFF);
        t2 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpacklo_epi8(out1, zero), ina), 8);

        /* Pixels 6-7. */
        ina = _mm_unpackhi_epi8(in1, zero);
        ina = _mm_shufflehi_epi16(_mm_shufflelo_epi16(ina, 0xFF), 0xFF);
        t3 = _mm_srli_epi16(
                _mm_mullo_epi16(_mm_unpackhi_epi8(out1, zero), ina), 8);

        /* Narrow the 16-bit lanes back to bytes and write the result. */
        _mm_storeu_si128(out, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(t2, t3));
    }
}
9727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Source-out blend, in place, eight 4-byte pixels (32 bytes) per loop
 * iteration.
 *
 * Every byte is computed as
 *     dst = (src * (255 - a)) >> 8
 * where a is byte 3 of the destination pixel the byte belongs to (presumably
 * the alpha channel, going by the local naming).  The scale is a shift by 8,
 * not a division by 255, so a weight of 255 maps 200 to 199 rather than 200.
 *
 * dst    - destination pixels; read (for the weight) and overwritten.
 *          Accessed with unaligned loads/stores, so no 16-byte alignment is
 *          required.
 * src    - source pixels; only read.
 * count8 - number of 8-pixel groups to process; 0 touches nothing.
 */
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all1s = _mm_set1_epi16(255);
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);
        __m128i outa, t0, t1, t2, t3;

        /*
         * Each step widens two pixels to 16-bit lanes, then the 0xFF shuffles
         * copy word 3 / word 7 (byte 3 of each pixel) across that pixel's
         * four lanes so it can be used as a per-pixel weight.
         */

        /* Pixels 0-1. */
        outa = _mm_unpacklo_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t0 = _mm_mullo_epi16(_mm_unpacklo_epi8(in0, zero),
                             _mm_sub_epi16(all1s, outa));
        /* Logical shift: the 16-bit product is unsigned and may exceed 32767. */
        t0 = _mm_srli_epi16(t0, 8);

        /* Pixels 2-3. */
        outa = _mm_unpackhi_epi8(out0, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t1 = _mm_mullo_epi16(_mm_unpackhi_epi8(in0, zero),
                             _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        /* Pixels 4-5. */
        outa = _mm_unpacklo_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t2 = _mm_mullo_epi16(_mm_unpacklo_epi8(in1, zero),
                             _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        /* Pixels 6-7. */
        outa = _mm_unpackhi_epi8(out1, zero);
        outa = _mm_shufflehi_epi16(_mm_shufflelo_epi16(outa, 0xFF), 0xFF);
        t3 = _mm_mullo_epi16(_mm_unpackhi_epi8(in1, zero),
                             _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        /* Narrow the 16-bit lanes back to bytes and write the result. */
        _mm_storeu_si128(out, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(t2, t3));
    }
}
10247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * Scale eight 16-bit lanes of destination data by (255 - factor) >> 8, where
 * the factor for lanes 0-3 is lane 3 of src16 and the factor for lanes 4-7 is
 * lane 7 of src16 (the fourth channel of each 4-channel pixel, i.e. alpha if
 * the data is RGBA -- layout assumed from the blend name, confirm with caller).
 */
static inline __m128i blend_dst_out_epi16(__m128i dst16, __m128i src16, __m128i all1s) {
    /* Broadcast lane 3 over lanes 0-3, then lane 7 over lanes 4-7. */
    __m128i factor = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src16, 0xFF), 0xFF);

    factor = _mm_sub_epi16(all1s, factor);
    /* Both operands are <= 255, so the low 16 bits of the product are exact. */
    return _mm_srli_epi16(_mm_mullo_epi16(dst16, factor), 8);
}

/*
 * In-place "destination out" blend:
 *     dst[c] = (dst[c] * (255 - src[fourth channel of the same pixel])) >> 8
 * applied to all four bytes of every 4-byte pixel.
 *
 * dst     buffer read and overwritten; count8 * 32 bytes are touched.
 * src     buffer read; count8 * 32 bytes are touched.
 * count8  number of 32-byte groups (eight 4-byte pixels each) to process;
 *         0 leaves dst untouched.
 *
 * Unaligned loads/stores are used, so neither pointer needs 16-byte alignment.
 */
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i all1s = _mm_set1_epi16(255);
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        /* Read the whole 32-byte group before writing any of it back. */
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);

        /* Widen bytes to 16 bits (two pixels per half-vector) and scale. */
        const __m128i r0lo = blend_dst_out_epi16(_mm_unpacklo_epi8(out0, zero),
                                                 _mm_unpacklo_epi8(in0, zero), all1s);
        const __m128i r0hi = blend_dst_out_epi16(_mm_unpackhi_epi8(out0, zero),
                                                 _mm_unpackhi_epi8(in0, zero), all1s);
        const __m128i r1lo = blend_dst_out_epi16(_mm_unpacklo_epi8(out1, zero),
                                                 _mm_unpacklo_epi8(in1, zero), all1s);
        const __m128i r1hi = blend_dst_out_epi16(_mm_unpackhi_epi8(out1, zero),
                                                 _mm_unpackhi_epi8(in1, zero), all1s);

        /* Narrow back to bytes; every lane is already <= 255 after the shift. */
        _mm_storeu_si128(out, _mm_packus_epi16(r0lo, r0hi));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(r1lo, r1hi));
    }
}
10767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
10777b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
10787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
10797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i all1s, ina, outa, ins, outs;
10807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i in0, in1, out0, out1;
10817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i t0, t1, t2, t3;
10827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
10837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
10847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    all1s = _mm_set1_epi16(255);
10857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
10867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count8; ++i) {
10877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        in0 = _mm_loadu_si128((const __m128i *)src);
10887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        in1 = _mm_loadu_si128((const __m128i *)src + 1);
10897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        out0 = _mm_loadu_si128((const __m128i *)dst);
10907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
10917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
10927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
10937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
10947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
10957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
10967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
10977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
10987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_sub_epi16(all1s, ina);
10997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_mullo_epi16(t0, outs);
11007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
11017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_srli_epi16(t0, 8);
11027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
11047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
11077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_sub_epi16(all1s, ina);
11107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_mullo_epi16(t1, outs);
11117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
11127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_srli_epi16(t1, 8);
11137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
11157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
11187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_sub_epi16(all1s, ina);
11217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_mullo_epi16(t2, outs);
11227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
11237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_srli_epi16(t2, 8);
11247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
11267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
11297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_sub_epi16(all1s, ina);
11327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_mullo_epi16(t3, outs);
11337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
11347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_srli_epi16(t3, 8);
11357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_packus_epi16(t0, t1);
11377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = blendv_epi8(t0, out0, M0001);
11387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_packus_epi16(t2, t3);
11397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = blendv_epi8(t2, out1, M0001);
11407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, t0);
11417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst + 1, t2);
11427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        src = (const __m128i *)src + 2;
11447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (__m128i *)dst + 2;
11457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
11467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
11477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11487b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
11497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
11507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i all1s, ina, ins, outa, outs;
11517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i in0, in1, out0, out1;
11527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    __m128i t0, t1, t2, t3;
11537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    uint32_t i;
11547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    all1s = _mm_set1_epi16(255);
11567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    for (i = 0; i < count8; ++i) {
11587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        in0 = _mm_loadu_si128((const __m128i *)src);
11597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        in1 = _mm_loadu_si128((const __m128i *)src + 1);
11607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        out0 = _mm_loadu_si128((const __m128i *)dst);
11617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
11627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
11647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
11677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_sub_epi16(all1s, outa);
11707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_mullo_epi16(t0, ins);
11717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
11727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_srli_epi16(t0, 8);
11737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
11757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
11787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_sub_epi16(all1s, outa);
11817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_mullo_epi16(t1, ins);
11827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
11837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t1 = _mm_srli_epi16(t1, 8);
11847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
11867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
11897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
11907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
11917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_sub_epi16(all1s, outa);
11927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_mullo_epi16(t2, ins);
11937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
11947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_srli_epi16(t2, 8);
11957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
11967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
11977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflelo_epi16(ins, 0xFF);
11987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        ina = _mm_shufflehi_epi16(ina, 0xFF);
11997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
12007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflelo_epi16(outs, 0xFF);
12017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        outa = _mm_shufflehi_epi16(outa, 0xFF);
12027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_sub_epi16(all1s, outa);
12037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_mullo_epi16(t3, ins);
12047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
12057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t3 = _mm_srli_epi16(t3, 8);
12067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
12077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t0 = _mm_packus_epi16(t0, t1);
12087c045dff08287a0273e6a0340d5ca88a90030363Yong Chen        t0 = blendv_epi8(t0, in0, M0001);
12097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        t2 = _mm_packus_epi16(t2, t3);
12107c045dff08287a0273e6a0340d5ca88a90030363Yong Chen        t2 = blendv_epi8(t2, in1, M0001);
12117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst, t0);
12127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        _mm_storeu_si128((__m128i *)dst + 1, t2);
12137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
12147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        src = (const __m128i *)src + 2;
12157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        dst = (__m128i *)dst + 2;
12167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
12177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
12187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * In-place bitwise XOR blend: dst[i] ^= src[i] for count8 * 32 bytes.
 *
 * dst     buffer read and overwritten; count8 * 32 bytes are touched.
 * src     buffer read; count8 * 32 bytes are touched.
 * count8  number of 32-byte groups (eight 4-byte pixels each, going by the
 *         parameter name) to process; 0 leaves dst untouched.
 *
 * Unaligned loads/stores are used, so neither pointer needs 16-byte alignment.
 */
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        /* Read the whole 32-byte group before writing any of it back. */
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, _mm_xor_si128(out0, in0));
        _mm_storeu_si128(out + 1, _mm_xor_si128(out1, in1));
    }
}
12397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * In-place multiply blend: dst[i] = (src[i] * dst[i]) >> 8 for every byte of
 * count8 * 32 bytes.
 *
 * dst     buffer read and overwritten; count8 * 32 bytes are touched.
 * src     buffer read; count8 * 32 bytes are touched.
 * count8  number of 32-byte groups (eight 4-byte pixels each, going by the
 *         parameter name) to process; 0 leaves dst untouched.
 *
 * Unaligned loads/stores are used, so neither pointer needs 16-byte alignment.
 */
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        /* Read the whole 32-byte group before writing any of it back. */
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);

        /*
         * Widen bytes to 16 bits so the product (at most 255 * 255) fits in a
         * lane, multiply, and keep the high byte of each product.
         */
        const __m128i p0lo = _mm_srli_epi16(
            _mm_mullo_epi16(_mm_unpacklo_epi8(in0, zero), _mm_unpacklo_epi8(out0, zero)), 8);
        const __m128i p0hi = _mm_srli_epi16(
            _mm_mullo_epi16(_mm_unpackhi_epi8(in0, zero), _mm_unpackhi_epi8(out0, zero)), 8);
        const __m128i p1lo = _mm_srli_epi16(
            _mm_mullo_epi16(_mm_unpacklo_epi8(in1, zero), _mm_unpacklo_epi8(out1, zero)), 8);
        const __m128i p1hi = _mm_srli_epi16(
            _mm_mullo_epi16(_mm_unpackhi_epi8(in1, zero), _mm_unpackhi_epi8(out1, zero)), 8);

        /* Narrow back to bytes; every lane is already <= 255 after the shift. */
        _mm_storeu_si128(out, _mm_packus_epi16(p0lo, p0hi));
        _mm_storeu_si128(out + 1, _mm_packus_epi16(p1lo, p1hi));
    }
}
12767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * In-place additive blend with unsigned saturation:
 *     dst[i] = min(255, dst[i] + src[i])
 * for every byte of count8 * 32 bytes.
 *
 * dst     buffer read and overwritten; count8 * 32 bytes are touched.
 * src     buffer read; count8 * 32 bytes are touched.
 * count8  number of 32-byte groups (eight 4-byte pixels each, going by the
 *         parameter name) to process; 0 leaves dst untouched.
 *
 * Unaligned loads/stores are used, so neither pointer needs 16-byte alignment.
 */
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        /* Read the whole 32-byte group before writing any of it back. */
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, _mm_adds_epu8(out0, in0));
        _mm_storeu_si128(out + 1, _mm_adds_epu8(out1, in1));
    }
}
12977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
/*
 * In-place subtractive blend with unsigned saturation:
 *     dst[i] = max(0, dst[i] - src[i])
 * for every byte of count8 * 32 bytes.
 *
 * dst     buffer read and overwritten; count8 * 32 bytes are touched.
 * src     buffer read; count8 * 32 bytes are touched.
 * count8  number of 32-byte groups (eight 4-byte pixels each, going by the
 *         parameter name) to process; 0 leaves dst untouched.
 *
 * Unaligned loads/stores are used, so neither pointer needs 16-byte alignment.
 */
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t i;

    for (i = 0; i < count8; ++i, in += 2, out += 2) {
        /* Read the whole 32-byte group before writing any of it back. */
        const __m128i in0 = _mm_loadu_si128(in);
        const __m128i in1 = _mm_loadu_si128(in + 1);
        const __m128i out0 = _mm_loadu_si128(out);
        const __m128i out1 = _mm_loadu_si128(out + 1);

        /* Destination minus source, clamped at zero. */
        _mm_storeu_si128(out, _mm_subs_epu8(out0, in0));
        _mm_storeu_si128(out + 1, _mm_subs_epu8(out1, in1));
    }
}
1318