17b7060c61e4182b29186849c5a857ea5f0898e56Rose, James/* 27b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * Copyright (C) 2011 The Android Open Source Project 37b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * 47b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * Licensed under the Apache License, Version 2.0 (the "License"); 57b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * you may not use this file except in compliance with the License. 67b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * You may obtain a copy of the License at 77b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * 87b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * http://www.apache.org/licenses/LICENSE-2.0 97b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * 107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * Unless required by applicable law or agreed to in writing, software 117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * distributed under the License is distributed on an "AS IS" BASIS, 127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * See the License for the specific language governing permissions and 147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James * limitations under the License. 
157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James */ 167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#include <stdint.h> 187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#include <x86intrin.h> 197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James/* Unsigned extend packed 8-bit integer (in LBS) into packed 32-bit integer */ 217b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i cvtepu8_epi32(__m128i x) { 227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__) 237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_cvtepu8_epi32(x); 247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__) 257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00); 267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_shuffle_epi8(x, M8to32); 277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return x; 287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else 297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James# error "Require at least SSSE3" 307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 337b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i packus_epi32(__m128i lo, __m128i hi) { 347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__) 357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_packus_epi32(lo, hi); 367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__) 377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000); 387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff); 
397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100); 407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff); 417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0)); 427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1)); 437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0)); 447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1)); 457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L), 467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_shuffle_epi8(hi, M32to16H)); 477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else 487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James# error "Require at least SSSE3" 497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 527b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i mullo_epi32(__m128i x, __m128i y) { 537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__) 547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_mullo_epi32(x, y); 557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__) 567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff); 577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i even = _mm_mul_epu32(x, y); 587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4), 597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_srli_si128(y, 4)); 
607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James even = _mm_and_si128(even, Meven); 617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James odd = _mm_and_si128(odd, Meven); 627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_or_si128(even, _mm_slli_si128(odd, 4)); 637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else 647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James# error "Require at least SSSE3" 657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James/* 'mask' must packed 8-bit of 0x00 or 0xff */ 697b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesstatic inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) { 707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(__SSE4_1__) 717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_blendv_epi8(x, y, mask); 727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#elif defined(__SSSE3__) 737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask)); 747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else 757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James# error "Require at least SSSE3" 767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 79ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, 80ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert const void *y1, const void *y2, 81ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert const short *coef, uint32_t count) { 827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i x; 837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c2, c4, c6, c8; 847b7060c61e4182b29186849c5a857ea5f0898e56Rose, 
James __m128i r0, r1, r2; 857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11; 867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i o0, o1; 877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+0)); 907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_shuffle_epi32(x, 0x00); 917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_shuffle_epi32(x, 0x55); 927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+4)); 937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c4 = _mm_shuffle_epi32(x, 0x00); 947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c6 = _mm_shuffle_epi32(x, 0x55); 957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+8)); 967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c8 = _mm_shuffle_epi32(x, 0x00); 977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < count; ++i) { 997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128()); 1017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); 1027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); 1037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); 1047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128()); 
1057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); 1067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); 1077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); 1087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128()); 1097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); 1107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); 1117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); 1127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0); 1147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0); 1157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2)); 1177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2)); 1187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4)); 1207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4)); 1217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, 
_mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6)); 1237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6)); 1247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8)); 1267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8)); 1277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_srai_epi32(o0, 8); 1297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_srai_epi32(o1, 8); 1307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = packus_epi32(o0, o1); 1327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_packus_epi16(o0, o0); 1337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storel_epi64((__m128i *)dst, o0); 1347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y0 = (const char *)y0 + 8; 1367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y1 = (const char *)y1 + 8; 1377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = (const char *)y2 + 8; 1387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 8; 1397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 1407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 1417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1427b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, 1437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count) { 1447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, 1457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 
1467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 1477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 1487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); 1507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); 1517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3; 1527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i i4, o4; 1537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i xy, zw; 1547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i x2, y2, z2, w2; 1557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 1567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); 1587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); 1597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_unpacklo_epi16(c0, c1); 1607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); 1627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); 1637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_unpacklo_epi16(c2, c3); 1647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < count; ++i) { 1667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James i4 = _mm_load_si128((const __m128i *)src); 1677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James xy = _mm_shuffle_epi8(i4, Mxy); 1687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James zw = _mm_shuffle_epi8(i4, Mzw); 1697b7060c61e4182b29186849c5a857ea5f0898e56Rose, 
James 1707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); 1717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); 1727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); 1737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff)); 1747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); 1767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); 1777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); 1787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff))); 1797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_srai_epi32(x2, 8); 1817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_srai_epi32(y2, 8); 1827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_srai_epi32(z2, 8); 1837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James w2 = _mm_srai_epi32(w2, 8); 1847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = packus_epi32(x2, y2); 1867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = packus_epi32(z2, w2); 1877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_packus_epi16(x2, z2); 1887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_shuffle_epi8(o4, T4x4); 1907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, o4); 1917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 
1927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James src = (const char *)src + 16; 1937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 16; 1947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 1957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 1967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 1977b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, 1987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count) { 1997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, 2007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 2017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 2027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 2037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); 2057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); 2067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3; 2087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i i4, o4; 2097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i xy, zw; 2107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i x2, y2, z2, w2; 2117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 2127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); 2147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); 2157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_unpacklo_epi16(c0, c1); 
2167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); 2187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); 2197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_unpacklo_epi16(c2, c3); 2207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < count; ++i) { 2227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James i4 = _mm_loadu_si128((const __m128i *)src); 2237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James xy = _mm_shuffle_epi8(i4, Mxy); 2247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James zw = _mm_shuffle_epi8(i4, Mzw); 2257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00)); 2277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55)); 2287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa)); 2297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00))); 2317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55))); 2327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa))); 2337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_srai_epi32(x2, 8); 2357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = _mm_srai_epi32(y2, 8); 2367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = _mm_srai_epi32(z2, 8); 2377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James w2 = _mm_srli_epi32(zw, 16); 
2387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = packus_epi32(x2, y2); 2407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = packus_epi32(z2, w2); 2417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_packus_epi16(x2, z2); 2427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_shuffle_epi8(o4, T4x4); 2447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, o4); 2457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James src = (const char *)src + 16; 2477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 16; 2487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 2497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 2507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2517b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, 2527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count) { 2537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, 2547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 2557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 2567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 2577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00); 2587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02); 2597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3; 2607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i i4, o4; 2617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i xy, zw; 2627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 
__m128i x2, y2, z2, w2; 2637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 2647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_loadl_epi64((const __m128i *)(coef+0)); 2667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_shufflelo_epi16(c0, 0); 2677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_loadl_epi64((const __m128i *)(coef+4)); 2687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_shufflelo_epi16(c1, 0); 2697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_unpacklo_epi16(c0, c1); 2707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_loadl_epi64((const __m128i *)(coef+8)); 2727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_shufflelo_epi16(c2, 0); 2737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_loadl_epi64((const __m128i *)(coef+12)); 2747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_shufflelo_epi16(c3, 0); 2757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_unpacklo_epi16(c2, c3); 2767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < count; ++i) { 2787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James i4 = _mm_loadu_si128((const __m128i *)src); 2797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James xy = _mm_shuffle_epi8(i4, Mxy); 2817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James zw = _mm_shuffle_epi8(i4, Mzw); 2827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_madd_epi16(xy, c0); 2847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2)); 2857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = _mm_srai_epi32(x2, 8); 
2877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = x2; 2887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = x2; 2897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James w2 = _mm_srli_epi32(zw, 16); 2907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x2 = packus_epi32(x2, y2); 2927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James z2 = packus_epi32(z2, w2); 2937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_packus_epi16(x2, z2); 2947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o4 = _mm_shuffle_epi8(o4, T4x4); 2967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, o4); 2977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 2987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James src = (const char *)src + 16; 2997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 16; 3007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 3017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 3027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3037b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicBlurVFU4_K(void *dst, 3047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const void *pin, int stride, const void *gptr, 3057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James int rct, int x1, int x2) { 3067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const char *pi; 3077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i pi0, pi1; 3087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128 pf0, pf1; 3097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128 bp0, bp1; 3107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128 x; 3117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James int r; 3127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (; x1 < x2; x1 += 2) { 
3147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pi = (const char *)pin + (x1 << 2); 3157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James bp0 = _mm_setzero_ps(); 3167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James bp1 = _mm_setzero_ps(); 3177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (r = 0; r < rct; ++r) { 3197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_load_ss((const float *)gptr + r); 3207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0)); 3217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pi0 = _mm_cvtsi32_si128(*(const int *)pi); 3237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1)); 3247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0)); 3267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1)); 3277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x)); 3297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x)); 3307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pi += stride; 3327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 3337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 3347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_ps((float *)dst, bp0); 3357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_ps((float *)dst + 4, bp1); 3367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 32; 3377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 3387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 
/*
 * Horizontal blur pass over 4-channel pixels stored as floats.
 *
 * For every x in [x1, x2) the kernel accumulates
 *     sum(k = 0 .. rct-1) gptr[k] * pixel[x + k]
 * per channel, converts the four channel sums to 32-bit integers with
 * _mm_cvtps_epi32 and writes the low byte of each one to dst (4 bytes per x,
 * emitted as a single 32-bit store; no saturation is applied here).
 *
 * dst  - output bytes, advanced by 4 for every x.
 * pin  - float4 pixels; read with aligned 16-byte loads, so it must be
 *        16-byte aligned.
 * gptr - rct float weights.
 * rct  - number of taps. The caller defines it as 2*r+1; taps after the
 *        first are consumed two at a time, so an even rct would read one
 *        weight and one pixel more than rct.
 */
void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    /* Gathers the low byte of each 32-bit lane into bytes 0..3; the 0xff
     * selectors zero every other output byte. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *weights = (const float *)gptr;
    /* The centre weight does not depend on x, so broadcast it once. */
    const __m128 w0 = _mm_set1_ps(weights[0]);
    unsigned char *out = (unsigned char *)dst;

    for (; x1 < x2; ++x1) {
        const float *px = (const float *)pin + 4 * x1;
        __m128 acc = _mm_mul_ps(w0, _mm_load_ps(px));
        int r;

        /* rct is defined as 2*r+1 by the caller, so the remaining taps pair up. */
        for (r = 1; r < rct; r += 2) {
            const __m128 wa = _mm_set1_ps(weights[r]);
            const __m128 wb = _mm_set1_ps(weights[r + 1]);

            acc = _mm_add_ps(acc, _mm_mul_ps(wa, _mm_load_ps(px + 4 * r)));
            acc = _mm_add_ps(acc, _mm_mul_ps(wb, _mm_load_ps(px + 4 * r + 4)));
        }

        *(int *)out = _mm_cvtsi128_si32(_mm_shuffle_epi8(_mm_cvtps_epi32(acc), Mu8));
        out += 4;
    }
}

/*
 * Horizontal blur pass over single-channel float samples, four outputs per
 * iteration.
 *
 * For every group of four positions x = x1, x1+4, ... below x2, lane j of the
 * accumulator receives
 *     sum(k) gptr[k] * pin[x + j + k]
 * The four sums are converted with _mm_cvtps_epi32 and the low byte of each
 * is written to dst. A full group of 4 bytes is always written, even when
 * x2 - x1 is not a multiple of 4.
 *
 * dst  - output bytes, advanced by 4 per group.
 * pin  - float samples; read with unaligned loads.
 * gptr - float weights. Taps after the first are consumed four at a time
 *        (r = 1, 5, 9, ... while r < rct), each step reading gptr[r..r+3] and
 *        pin[x+r .. x+r+7]; both buffers must be readable that far.
 * rct  - number of taps.
 */
void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    /* Gathers the low byte of each 32-bit lane into bytes 0..3. */
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *weights = (const float *)gptr;
    const __m128 w0 = _mm_set1_ps(weights[0]);
    unsigned char *out = (unsigned char *)dst;

    for (; x1 < x2; x1 += 4) {
        const float *px = (const float *)pin + x1;
        __m128 acc = _mm_mul_ps(w0, _mm_loadu_ps(px));
        int r;

        for (r = 1; r < rct; r += 4) {
            const __m128 wv = _mm_loadu_ps(weights + r);
            const __m128 lo = _mm_loadu_ps(px + r);
            const __m128 hi = _mm_loadu_ps(px + r + 4);
            /* _mm_alignr_epi8 is an integer-domain instruction: reinterpret
             * the float vectors explicitly instead of relying on lax
             * vector conversions between __m128 and __m128i. */
            const __m128i lo_i = _mm_castps_si128(lo);
            const __m128i hi_i = _mm_castps_si128(hi);
            /* Windows of four samples starting 1, 2 and 3 floats into lo:hi. */
            const __m128 win1 = _mm_castsi128_ps(_mm_alignr_epi8(hi_i, lo_i, 4));
            const __m128 win2 = _mm_castsi128_ps(_mm_alignr_epi8(hi_i, lo_i, 8));
            const __m128 win3 = _mm_castsi128_ps(_mm_alignr_epi8(hi_i, lo_i, 12));

            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(wv, wv, _MM_SHUFFLE(0, 0, 0, 0)), lo));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(wv, wv, _MM_SHUFFLE(1, 1, 1, 1)), win1));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(wv, wv, _MM_SHUFFLE(2, 2, 2, 2)), win2));
            acc = _mm_add_ps(acc, _mm_mul_ps(_mm_shuffle_ps(wv, wv, _MM_SHUFFLE(3, 3, 3, 3)), win3));
        }

        *(int *)out = _mm_cvtsi128_si32(_mm_shuffle_epi8(_mm_cvtps_epi32(acc), Mu8));
        out += 4;
    }
}

James 4107b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuv_K(void *dst, 4117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const unsigned char *pY, const unsigned char *pUV, 4127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t count, const short *param) { 4137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i biasY, biasUV; 4147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3, c4; 4157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasY = _mm_set1_epi32(param[8]); /* 16 */ 4177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasUV = _mm_set1_epi32(param[16]); /* 128 */ 4187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_set1_epi32(param[0]); /* 298 */ 4207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_set1_epi32(param[1]); /* 409 */ 4217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_set1_epi32(param[2]); /* -100 */ 4227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_set1_epi32(param[3]); /* 516 */ 4237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c4 = _mm_set1_epi32(param[4]); /* -208 */ 4247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i Y, UV, U, V, R, G, B, A; 4267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James A = _mm_set1_epi32(255); 4287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 4297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < (count << 1); ++i) { 4317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); 4327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); 
4337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = _mm_sub_epi32(Y, biasY); 4357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James UV = _mm_sub_epi32(UV, biasUV); 4367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James U = _mm_shuffle_epi32(UV, 0xf5); 4387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James V = _mm_shuffle_epi32(UV, 0xa0); 4397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = mullo_epi32(Y, c0); 4417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(Y, mullo_epi32(V, c1)); 4437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(R, biasUV); 4447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_srai_epi32(R, 8); 4457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(Y, mullo_epi32(U, c2)); 4477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, mullo_epi32(V, c4)); 4487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, biasUV); 4497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_srai_epi32(G, 8); 4507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(Y, mullo_epi32(U, c3)); 4527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(B, biasUV); 4537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_srai_epi32(B, 8); 4547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i y1, y2, y3, y4; 4567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y1 = packus_epi32(R, G); 4587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = packus_epi32(B, A); 
4597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y3 = _mm_packus_epi16(y1, y2); 4607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, 4617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 4627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 4637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 4647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y4 = _mm_shuffle_epi8(y3, T4x4); 4657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, y4); 4667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pY += 4; 4677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pUV += 4; 4687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (__m128i *)dst + 1; 4697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 4707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 4717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4727b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuvR_K(void *dst, 4737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const unsigned char *pY, const unsigned char *pUV, 4747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t count, const short *param) { 4757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i biasY, biasUV; 4767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3, c4; 4777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasY = _mm_set1_epi32(param[8]); /* 16 */ 4797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasUV = _mm_set1_epi32(param[16]); /* 128 */ 4807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_set1_epi32(param[0]); /* 298 */ 4827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_set1_epi32(param[1]); /* 409 */ 4837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_set1_epi32(param[2]); /* -100 
*/ 4847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_set1_epi32(param[3]); /* 516 */ 4857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c4 = _mm_set1_epi32(param[4]); /* -208 */ 4867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i Y, UV, U, V, R, G, B, A; 4887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James A = _mm_set1_epi32(255); 4907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 4917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < (count << 1); ++i) { 4937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); 4947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV)); 4957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = _mm_sub_epi32(Y, biasY); 4977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James UV = _mm_sub_epi32(UV, biasUV); 4987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James V = _mm_shuffle_epi32(UV, 0xf5); 5007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James U = _mm_shuffle_epi32(UV, 0xa0); 5017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = mullo_epi32(Y, c0); 5037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(Y, mullo_epi32(V, c1)); 5057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(R, biasUV); 5067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_srai_epi32(R, 8); 5077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(Y, mullo_epi32(U, c2)); 
5097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, mullo_epi32(V, c4)); 5107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, biasUV); 5117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_srai_epi32(G, 8); 5127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(Y, mullo_epi32(U, c3)); 5147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(B, biasUV); 5157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_srai_epi32(B, 8); 5167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i y1, y2, y3, y4; 5187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y1 = packus_epi32(R, G); 5207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = packus_epi32(B, A); 5217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y3 = _mm_packus_epi16(y1, y2); 5227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3, 5237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 5247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 5257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 5267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y4 = _mm_shuffle_epi8(y3, T4x4); 5277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, y4); 5287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pY += 4; 5297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pUV += 4; 5307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (__m128i *)dst + 1; 5317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 5327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 5337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5347b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicYuv2_K(void *dst, 
5357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const unsigned char *pY, const unsigned char *pU, 5367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const unsigned char *pV, uint32_t count, const short *param) { 5377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i biasY, biasUV; 5387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c1, c2, c3, c4; 5397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasY = _mm_set1_epi32(param[8]); /* 16 */ 5417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James biasUV = _mm_set1_epi32(param[16]); /* 128 */ 5427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_set1_epi32(param[0]); /* 298 */ 5447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c1 = _mm_set1_epi32(param[1]); /* 409 */ 5457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_set1_epi32(param[2]); /* -100 */ 5467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c3 = _mm_set1_epi32(param[3]); /* 516 */ 5477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c4 = _mm_set1_epi32(param[4]); /* -208 */ 5487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i Y, U, V, R, G, B, A; 5507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James A = _mm_set1_epi32(255); 5527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 5537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < (count << 1); ++i) { 5557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY)); 5567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU)); 5577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV)); 
5587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = _mm_sub_epi32(Y, biasY); 5607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James U = _mm_sub_epi32(U, biasUV); 5617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James V = _mm_sub_epi32(V, biasUV); 5627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James Y = mullo_epi32(Y, c0); 5647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(Y, mullo_epi32(V, c1)); 5667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_add_epi32(R, biasUV); 5677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James R = _mm_srai_epi32(R, 8); 5687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(Y, mullo_epi32(U, c2)); 5707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, mullo_epi32(V, c4)); 5717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_add_epi32(G, biasUV); 5727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James G = _mm_srai_epi32(G, 8); 5737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(Y, mullo_epi32(U, c3)); 5757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_add_epi32(B, biasUV); 5767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James B = _mm_srai_epi32(B, 8); 5777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i y1, y2, y3, y4; 5797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 5807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y1 = packus_epi32(R, G); 5817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = packus_epi32(B, A); 5827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y3 = _mm_packus_epi16(y1, y2); 5837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i 
T4x4 = _mm_set_epi8(15, 11, 7, 3, 5847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 14, 10, 6, 2, 5857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 13, 9, 5, 1, 5867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 12, 8, 4, 0); 5877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y4 = _mm_shuffle_epi8(y3, T4x4); 5887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, y4); 5897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pY += 4; 5907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pU += 4; 5917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James pV += 4; 5927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (__m128i *)dst + 1; 5937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 5947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 5957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 596ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, 597ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert const void *y1, const void *y2, 598ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert const void *y3, const void *y4, 599ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albert const short *coef, uint32_t count) { 6007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i x; 6017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c0, c2, c4, c6, c8, c10, c12; 6027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i c14, c16, c18, c20, c22, c24; 6037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9; 6047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p0, p1, p2, p3, p4, p5, p6, p7; 6057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p8, p9, p10, p11, p12, p13, p14, p15; 6067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p16, p17, p18, p19, p20, p21, p22, p23; 6077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p24, p25, p26, p27, p28, p29, p30, 
p31; 6087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i p32, p33, p34, p35, p36, p37, p38, p39; 6097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i o0, o1, o2, o3; 6107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 6117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+0)); 6137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c0 = _mm_shuffle_epi32(x, 0x00); 6147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c2 = _mm_shuffle_epi32(x, 0x55); 6157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+4)); 6177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c4 = _mm_shuffle_epi32(x, 0x00); 6187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c6 = _mm_shuffle_epi32(x, 0x55); 6197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+8)); 6217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c8 = _mm_shuffle_epi32(x, 0x00); 6227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c10 = _mm_shuffle_epi32(x, 0x55); 6237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+12)); 6257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c12 = _mm_shuffle_epi32(x, 0x00); 6267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c14 = _mm_shuffle_epi32(x, 0x55); 6277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+16)); 6297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c16 = _mm_shuffle_epi32(x, 0x00); 6307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c18 = _mm_shuffle_epi32(x, 0x55); 6317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 
6327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+20)); 6337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c20 = _mm_shuffle_epi32(x, 0x00); 6347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c22 = _mm_shuffle_epi32(x, 0x55); 6357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x = _mm_loadl_epi64((const __m128i *)(coef+24)); 6377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James c24 = _mm_shuffle_epi32(x, 0x00); 6387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0; i < count; ++i) { 6407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128()); 6427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128()); 6437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128()); 6447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128()); 6457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128()); 6467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128()); 6477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128()); 6487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128()); 6497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t 
*)y1)), _mm_setzero_si128()); 6517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128()); 6527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128()); 6537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128()); 6547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128()); 6557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128()); 6567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128()); 6577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128()); 6587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128()); 6607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128()); 6617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128()); 6627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128()); 6637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128()); 6647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128()); 6657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p22 = 
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128()); 6667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128()); 6677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128()); 6697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128()); 6707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128()); 6717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128()); 6727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128()); 6737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128()); 6747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128()); 6757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128()); 6767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128()); 6787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128()); 6797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128()); 6807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), 
_mm_setzero_si128()); 6817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128()); 6827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128()); 6837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128()); 6847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128()); 6857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 6867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1), c0); 6877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3), c2)); 6887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8), c4)); 6897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10), c6)); 6907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c8)); 6917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10)); 6927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12)); 6937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14)); 6947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16)); 6957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18)); 6967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, 
_mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20)); 6977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22)); 6987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24)); 6997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_srai_epi32(o0, 8); 7007b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 7017b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2), c0); 7027b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c2)); 7037b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9), c4)); 7047b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11), c6)); 7057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13), c8)); 7067b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10)); 7077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12)); 7087b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14)); 7097b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16)); 7107b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18)); 7117b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20)); 7127b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), 
c22)); 7137b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24)); 7147b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o1 = _mm_srai_epi32(o1, 8); 7157b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 7167b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3), c0); 7177b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5), c2)); 7187b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10), c4)); 7197b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12), c6)); 7207b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14), c8)); 7217b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10)); 7227b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12)); 7237b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14)); 7247b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16)); 7257b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18)); 7267b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20)); 7277b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22)); 7287b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24)); 
7297b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = _mm_srai_epi32(o2, 8); 7307b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 7317b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4), c0); 7327b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6), c2)); 7337b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11), c4)); 7347b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13), c6)); 7357b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15), c8)); 7367b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10)); 7377b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12)); 7387b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14)); 7397b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16)); 7407b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18)); 7417b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20)); 7427b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22)); 7437b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24)); 7447b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o3 = _mm_srai_epi32(o3, 8); 7457b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 
7467b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = packus_epi32(o0, o1); 7477b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o2 = packus_epi32(o2, o3); 7487b7060c61e4182b29186849c5a857ea5f0898e56Rose, James o0 = _mm_packus_epi16(o0, o2); 7497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James _mm_storeu_si128((__m128i *)dst, o0); 7507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 7517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y0 = (const char *)y0 + 16; 7527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y1 = (const char *)y1 + 16; 7537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y2 = (const char *)y2 + 16; 7547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y3 = (const char *)y3 + 16; 7557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James y4 = (const char *)y4 + 16; 7567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James dst = (char *)dst + 16; 7577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 7587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}

/*
 * 8-bit blend kernels.
 *
 * Each rsdIntrinsicBlend*_K(dst, src, count8) runs count8 iterations. One
 * iteration reads 32 bytes from src and 32 bytes from dst with unaligned
 * 128-bit loads, combines them and writes 32 bytes back to dst; all four
 * loads are issued before the first store. Bytes are widened to 16-bit lanes
 * for the arithmetic and narrowed again with unsigned saturation
 * (_mm_packus_epi16).
 *
 * The data is handled as groups of four bytes, and the fourth byte of each
 * group is the weight the original code called "alpha" (ina/outa) --
 * presumably 8-bit RGBA pixels, eight per iteration; confirm against callers.
 *
 * In the formulas below ".a" means that fourth byte broadcast over its group,
 * and ">> 8" is a logical shift of the 16-bit lane.
 */

/* Zero-extends the low eight bytes of px8 into eight 16-bit lanes. */
static inline __m128i blend_widen_lo(__m128i px8) {
    return _mm_unpacklo_epi8(px8, _mm_setzero_si128());
}

/* Zero-extends the high eight bytes of px8 into eight 16-bit lanes. */
static inline __m128i blend_widen_hi(__m128i px8) {
    return _mm_unpackhi_epi8(px8, _mm_setzero_si128());
}

/* Copies lane 3 of each four-lane group (the alpha slot) over the whole group. */
static inline __m128i blend_splat_alpha16(__m128i px16) {
    px16 = _mm_shufflelo_epi16(px16, 0xFF);
    return _mm_shufflehi_epi16(px16, 0xFF);
}

/* Returns 255 - px16.a in every lane. */
static inline __m128i blend_inv_alpha16(__m128i px16) {
    return _mm_sub_epi16(_mm_set1_epi16(255), blend_splat_alpha16(px16));
}

/* top + ((bottom * (255 - top.a)) >> 8), on widened lanes. */
static inline __m128i blend_over16(__m128i top, __m128i bottom) {
    __m128i acc = _mm_mullo_epi16(bottom, blend_inv_alpha16(top));
    acc = _mm_srli_epi16(acc, 8);
    return _mm_add_epi16(acc, top);
}

/* (color * mask.a) >> 8, on widened lanes. */
static inline __m128i blend_in16(__m128i color, __m128i mask) {
    const __m128i acc = _mm_mullo_epi16(color, blend_splat_alpha16(mask));
    return _mm_srli_epi16(acc, 8);
}

/* (color * (255 - mask.a)) >> 8, on widened lanes. */
static inline __m128i blend_out16(__m128i color, __m128i mask) {
    const __m128i acc = _mm_mullo_epi16(color, blend_inv_alpha16(mask));
    return _mm_srli_epi16(acc, 8);
}

/*
 * (bottom * (255 - top.a) + top * bottom.a) >> 8, on widened lanes.
 * The sum of the two products uses unsigned 16-bit saturation.
 */
static inline __m128i blend_atop16(__m128i top, __m128i bottom) {
    __m128i acc = _mm_mullo_epi16(blend_inv_alpha16(top), bottom);
    acc = _mm_adds_epu16(acc, _mm_mullo_epi16(blend_splat_alpha16(bottom), top));
    return _mm_srli_epi16(acc, 8);
}

/* blend_over16 applied to both halves of a 16-byte register, then repacked. */
static inline __m128i blend_over8(__m128i top, __m128i bottom) {
    const __m128i lo = blend_over16(blend_widen_lo(top), blend_widen_lo(bottom));
    const __m128i hi = blend_over16(blend_widen_hi(top), blend_widen_hi(bottom));
    return _mm_packus_epi16(lo, hi);
}

/* blend_in16 applied to both halves of a 16-byte register, then repacked. */
static inline __m128i blend_in8(__m128i color, __m128i mask) {
    const __m128i lo = blend_in16(blend_widen_lo(color), blend_widen_lo(mask));
    const __m128i hi = blend_in16(blend_widen_hi(color), blend_widen_hi(mask));
    return _mm_packus_epi16(lo, hi);
}

/* blend_out16 applied to both halves of a 16-byte register, then repacked. */
static inline __m128i blend_out8(__m128i color, __m128i mask) {
    const __m128i lo = blend_out16(blend_widen_lo(color), blend_widen_lo(mask));
    const __m128i hi = blend_out16(blend_widen_hi(color), blend_widen_hi(mask));
    return _mm_packus_epi16(lo, hi);
}

/* blend_atop16 applied to both halves of a 16-byte register, then repacked. */
static inline __m128i blend_atop8(__m128i top, __m128i bottom) {
    const __m128i lo = blend_atop16(blend_widen_lo(top), blend_widen_lo(bottom));
    const __m128i hi = blend_atop16(blend_widen_hi(top), blend_widen_hi(bottom));
    return _mm_packus_epi16(lo, hi);
}

/*
 * dst = src + ((dst * (255 - src.a)) >> 8)
 *
 * dst    - read and overwritten in place, 32 bytes per iteration
 * src    - read only, 32 bytes per iteration
 * count8 - number of 32-byte iterations
 */
void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        /* Load everything first so the stores cannot affect this iteration. */
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_over8(s0, d0));
        _mm_storeu_si128(out + 1, blend_over8(s1, d1));
    }
}

/*
 * dst = dst + ((src * (255 - dst.a)) >> 8)
 *
 * Same buffers and iteration scheme as rsdIntrinsicBlendSrcOver_K, with the
 * roles of src and dst swapped in the formula.
 */
void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_over8(d0, s0));
        _mm_storeu_si128(out + 1, blend_over8(d1, s1));
    }
}

/*
 * dst = (src * dst.a) >> 8
 *
 * dst is read (for its alpha) and overwritten; src is read only; count8 is
 * the number of 32-byte iterations.
 */
void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_in8(s0, d0));
        _mm_storeu_si128(out + 1, blend_in8(s1, d1));
    }
}

/*
 * dst = (dst * src.a) >> 8
 *
 * dst is read and overwritten; src is read only (for its alpha); count8 is
 * the number of 32-byte iterations.
 */
void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_in8(d0, s0));
        _mm_storeu_si128(out + 1, blend_in8(d1, s1));
    }
}

/*
 * dst = (src * (255 - dst.a)) >> 8
 *
 * dst is read (for its alpha) and overwritten; src is read only; count8 is
 * the number of 32-byte iterations.
 */
void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_out8(s0, d0));
        _mm_storeu_si128(out + 1, blend_out8(s1, d1));
    }
}

/*
 * dst = (dst * (255 - src.a)) >> 8
 *
 * dst is read and overwritten; src is read only (for its alpha); count8 is
 * the number of 32-byte iterations.
 */
void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        _mm_storeu_si128(out, blend_out8(d0, s0));
        _mm_storeu_si128(out + 1, blend_out8(d1, s1));
    }
}

/*
 * blended = (dst * (255 - src.a) + src * dst.a) >> 8   (saturating 16-bit sum)
 * dst     = blendv_epi8(blended, original dst, M0001)
 *
 * M0001 sets only the top byte of every 32-bit group. blendv_epi8 is defined
 * elsewhere in this file; presumably it takes the masked bytes from its
 * second argument, which would leave dst's own alpha byte unchanged --
 * confirm against that helper.
 *
 * dst is read and overwritten; src is read only; count8 is the number of
 * 32-byte iterations.
 */
void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    const __m128i *in = (const __m128i *)src;
    __m128i *out = (__m128i *)dst;
    uint32_t n;

    for (n = 0; n < count8; ++n, in += 2, out += 2) {
        const __m128i s0 = _mm_loadu_si128(in);
        const __m128i s1 = _mm_loadu_si128(in + 1);
        const __m128i d0 = _mm_loadu_si128(out);
        const __m128i d1 = _mm_loadu_si128(out + 1);

        const __m128i r0 = blendv_epi8(blend_atop8(s0, d0), d0, M0001);
        const __m128i r1 = blendv_epi8(blend_atop8(s1, d1), d1, M0001);

        _mm_storeu_si128(out, r0);
        _mm_storeu_si128(out + 1, r1);
    }
}

11487b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) { 11497b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000); 11507b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i all1s, ina, ins, outa, outs; 11517b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i in0, in1, out0, out1; 11527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James __m128i t0, t1, t2, t3; 11537b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t i; 11547b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James all1s = _mm_set1_epi16(255); 11567b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James for (i = 0;
i < count8; ++i) { 11587b7060c61e4182b29186849c5a857ea5f0898e56Rose, James in0 = _mm_loadu_si128((const __m128i *)src); 11597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James in1 = _mm_loadu_si128((const __m128i *)src + 1); 11607b7060c61e4182b29186849c5a857ea5f0898e56Rose, James out0 = _mm_loadu_si128((const __m128i *)dst); 11617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James out1 = _mm_loadu_si128((const __m128i *)dst + 1); 11627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128()); 11647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflelo_epi16(ins, 0xFF); 11657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflehi_epi16(ina, 0xFF); 11667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128()); 11677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = _mm_shufflelo_epi16(outs, 0xFF); 11687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = _mm_shufflehi_epi16(outa, 0xFF); 11697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t0 = _mm_sub_epi16(all1s, outa); 11707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t0 = _mm_mullo_epi16(t0, ins); 11717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs)); 11727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t0 = _mm_srli_epi16(t0, 8); 11737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128()); 11757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflelo_epi16(ins, 0xFF); 11767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflehi_epi16(ina, 0xFF); 11777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128()); 11787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = 
_mm_shufflelo_epi16(outs, 0xFF); 11797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = _mm_shufflehi_epi16(outa, 0xFF); 11807b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t1 = _mm_sub_epi16(all1s, outa); 11817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t1 = _mm_mullo_epi16(t1, ins); 11827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs)); 11837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t1 = _mm_srli_epi16(t1, 8); 11847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128()); 11867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflelo_epi16(ins, 0xFF); 11877b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflehi_epi16(ina, 0xFF); 11887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128()); 11897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = _mm_shufflelo_epi16(outs, 0xFF); 11907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outa = _mm_shufflehi_epi16(outa, 0xFF); 11917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t2 = _mm_sub_epi16(all1s, outa); 11927b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t2 = _mm_mullo_epi16(t2, ins); 11937b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs)); 11947b7060c61e4182b29186849c5a857ea5f0898e56Rose, James t2 = _mm_srli_epi16(t2, 8); 11957b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 11967b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128()); 11977b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflelo_epi16(ins, 0xFF); 11987b7060c61e4182b29186849c5a857ea5f0898e56Rose, James ina = _mm_shufflehi_epi16(ina, 0xFF); 11997b7060c61e4182b29186849c5a857ea5f0898e56Rose, James outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128()); 
/*
 * XOR blend: every byte of dst is replaced by (dst byte ^ src byte).
 *
 *   dst    - buffer that is read and overwritten (unaligned access is used)
 *   src    - buffer that is only read
 *   count8 - number of loop iterations; each one covers 32 bytes
 *            (two 128-bit registers) of both buffers
 */
void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *s = (const __m128i *)src;
    __m128i *d = (__m128i *)dst;
    uint32_t left;

    for (left = count8; left != 0; --left) {
        /* All four loads happen before either store, as in a plain load/compute/store kernel. */
        const __m128i s_lo = _mm_loadu_si128(s);
        const __m128i s_hi = _mm_loadu_si128(s + 1);
        const __m128i d_lo = _mm_loadu_si128(d);
        const __m128i d_hi = _mm_loadu_si128(d + 1);

        _mm_storeu_si128(d, _mm_xor_si128(d_lo, s_lo));
        _mm_storeu_si128(d + 1, _mm_xor_si128(d_hi, s_hi));

        s += 2;
        d += 2;
    }
}
/*
 * Lane-wise (a * b) >> 8 on eight unsigned 16-bit lanes.
 * With both inputs in 0..255 the product fits in 16 bits, so the low half
 * kept by _mm_mullo_epi16 is the exact product and the result is in 0..254.
 */
static inline __m128i blend_mul_shift8_epi16(__m128i a, __m128i b) {
    return _mm_srli_epi16(_mm_mullo_epi16(a, b), 8);
}

/*
 * Multiply blend: every byte of dst is replaced by (src byte * dst byte) >> 8.
 * All four bytes of each 32-bit element are treated the same way.
 *
 *   dst    - buffer that is read and overwritten (unaligned access is used)
 *   src    - buffer that is only read
 *   count8 - number of loop iterations; each one covers 32 bytes
 *            (two 128-bit registers) of both buffers
 */
void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    const __m128i zero = _mm_setzero_si128();
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        const __m128i in0 = _mm_loadu_si128((const __m128i *)src);
        const __m128i in1 = _mm_loadu_si128((const __m128i *)src + 1);
        const __m128i out0 = _mm_loadu_si128((const __m128i *)dst);
        const __m128i out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        /* Interleaving with zero widens each byte to a 16-bit lane. */
        const __m128i t0 = blend_mul_shift8_epi16(_mm_unpacklo_epi8(in0, zero),
                                                  _mm_unpacklo_epi8(out0, zero));
        const __m128i t1 = blend_mul_shift8_epi16(_mm_unpackhi_epi8(in0, zero),
                                                  _mm_unpackhi_epi8(out0, zero));
        const __m128i t2 = blend_mul_shift8_epi16(_mm_unpacklo_epi8(in1, zero),
                                                  _mm_unpacklo_epi8(out1, zero));
        const __m128i t3 = blend_mul_shift8_epi16(_mm_unpackhi_epi8(in1, zero),
                                                  _mm_unpackhi_epi8(out1, zero));

        /* Narrow back to bytes; the values are already <= 254, so no clamping occurs. */
        _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(t0, t1));
        _mm_storeu_si128((__m128i *)dst + 1, _mm_packus_epi16(t2, t3));

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
/*
 * Additive blend: every byte of dst is replaced by dst byte + src byte,
 * clamped to 255 (unsigned saturating add).
 *
 *   dst    - buffer that is read and overwritten (unaligned access is used)
 *   src    - buffer that is only read
 *   count8 - number of loop iterations; each one covers 32 bytes
 *            (two 128-bit registers) of both buffers
 */
void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *s = (const __m128i *)src;
    __m128i *d = (__m128i *)dst;
    uint32_t left;

    for (left = count8; left != 0; --left) {
        /* All four loads happen before either store. */
        const __m128i s_lo = _mm_loadu_si128(s);
        const __m128i s_hi = _mm_loadu_si128(s + 1);
        const __m128i d_lo = _mm_loadu_si128(d);
        const __m128i d_hi = _mm_loadu_si128(d + 1);

        _mm_storeu_si128(d, _mm_adds_epu8(d_lo, s_lo));
        _mm_storeu_si128(d + 1, _mm_adds_epu8(d_hi, s_hi));

        s += 2;
        d += 2;
    }
}
/*
 * Subtractive blend: every byte of dst is replaced by dst byte - src byte,
 * clamped at 0 (unsigned saturating subtract).
 *
 *   dst    - buffer that is read and overwritten (unaligned access is used)
 *   src    - buffer that is only read
 *   count8 - number of loop iterations; each one covers 32 bytes
 *            (two 128-bit registers) of both buffers
 */
void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    const __m128i *s = (const __m128i *)src;
    __m128i *d = (__m128i *)dst;
    uint32_t left;

    for (left = count8; left != 0; --left) {
        /* All four loads happen before either store. */
        const __m128i s_lo = _mm_loadu_si128(s);
        const __m128i s_hi = _mm_loadu_si128(s + 1);
        const __m128i d_lo = _mm_loadu_si128(d);
        const __m128i d_hi = _mm_loadu_si128(d + 1);

        _mm_storeu_si128(d, _mm_subs_epu8(d_lo, s_lo));
        _mm_storeu_si128(d + 1, _mm_subs_epu8(d_hi, s_hi));

        s += 2;
        d += 2;
    }
}