/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <x86intrin.h>

/* Unsigned extend packed 8-bit integers (in the LSBs) into packed 32-bit integers */
static inline __m128i cvtepu8_epi32(__m128i x) {
#if defined(__SSE4_1__)
    return _mm_cvtepu8_epi32(x);
#elif defined(__SSSE3__)
    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
    x = _mm_shuffle_epi8(x, M8to32);
    return x;
#else
#   error "Require at least SSSE3"
#endif
}
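
/* In the SSSE3 path of cvtepu8_epi32, _mm_shuffle_epi8 zeroes every
 * destination byte whose mask byte has its high bit set, so the 0xff entries
 * in M8to32 clear the upper three bytes of each 32-bit lane while source byte
 * n is moved into lane n, which is the same zero extension SSE4.1 performs in
 * a single instruction. */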

static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
#if defined(__SSE4_1__)
    return _mm_packus_epi32(lo, hi);
#elif defined(__SSSE3__)
    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
                        _mm_shuffle_epi8(hi, M32to16H));
#else
#   error "Require at least SSSE3"
#endif
}
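
/* The SSSE3 path of packus_epi32 emulates PACKUSDW: each signed 32-bit lane
 * is clamped to [0, 0xffff] (lanes <= 0 are zeroed by the cmpgt/and pair,
 * lanes > 0xffff are forced to all ones by the cmpgt/or pair), and the low
 * 16 bits of every lane are then gathered with the two pshufb masks.
 * Per lane this is roughly: r = v < 0 ? 0 : (v > 0xffff ? 0xffff : v). */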

static inline __m128i mullo_epi32(__m128i x, __m128i y) {
#if defined(__SSE4_1__)
    return _mm_mullo_epi32(x, y);
#elif defined(__SSSE3__)
    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
    __m128i even = _mm_mul_epu32(x, y);
    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
                                _mm_srli_si128(y, 4));
    even = _mm_and_si128(even, Meven);
    odd = _mm_and_si128(odd, Meven);
    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
#else
#   error "Require at least SSSE3"
#endif
}
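
/* The SSSE3 path of mullo_epi32 emulates PMULLD: _mm_mul_epu32 only
 * multiplies lanes 0 and 2, so the odd lanes are multiplied after shifting
 * them down one lane, and the two partial results are masked and merged.
 * Only the low 32 bits of each product are kept, which is the same for
 * signed and unsigned operands. */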

/* 'mask' must be packed 8-bit values of 0x00 or 0xff */
static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
#if defined(__SSE4_1__)
    return _mm_blendv_epi8(x, y, mask);
#elif defined(__SSSE3__)
    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
#else
#   error "Require at least SSSE3"
#endif
}
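
/* Without SSE4.1 the byte select is just (x & ~mask) | (y & mask), which is
 * equivalent to PBLENDVB as long as every mask byte is 0x00 or 0xff. */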

extern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8;
    __m128i r0, r1, r2;
    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
    __m128i o0, o1;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shuffle_epi32(x, 0x00);
    c2 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4 = _mm_shuffle_epi32(x, 0x00);
    c6 = _mm_shuffle_epi32(x, 0x55);
    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());

        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));

        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));

        o0 = _mm_srai_epi32(o0, 8);
        o1 = _mm_srai_epi32(o1, 8);

        o0 = packus_epi32(o0, o1);
        o0 = _mm_packus_epi16(o0, o0);
        _mm_storel_epi64((__m128i *)dst, o0);

        y0 = (const char *)y0 + 8;
        y1 = (const char *)y1 + 8;
        y2 = (const char *)y2 + 8;
        dst = (char *)dst + 8;
    }
}
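
/* Per output pixel and channel, rsdIntrinsicConvolve3x3_K computes roughly
 * (coef is Q8 fixed point; edge handling is the caller's responsibility):
 *
 *     t = coef[0]*y0[x] + coef[1]*y0[x+1] + coef[2]*y0[x+2]
 *       + coef[3]*y1[x] + coef[4]*y1[x+1] + coef[5]*y1[x+2]
 *       + coef[6]*y2[x] + coef[7]*y2[x+1] + coef[8]*y2[x+2];
 *     out[x] = clamp(t >> 8, 0, 255);
 *
 * Each iteration emits two RGBA pixels (8 bytes). */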

void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srai_epi32(w2, 8);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
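
/* As the unpack/madd sequence implies, rsdIntrinsicColorMatrix4x4_K produces
 * for every pixel and output channel j (coef in Q8 fixed point):
 *
 *     out[j] = clamp((in[0]*coef[j]   + in[1]*coef[j+4] +
 *                     in[2]*coef[j+8] + in[3]*coef[j+12]) >> 8, 0, 255)
 *
 * and T4x4 converts the channel-planar result of the two packs back to
 * interleaved RGBA. */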

void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);

    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);

    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));

        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = _mm_srai_epi32(y2, 8);
        z2 = _mm_srai_epi32(z2, 8);
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
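
/* Same as the 4x4 case for the first three channels; alpha is not run
 * through the matrix.  _mm_srli_epi32(zw, 16) leaves each pixel's original
 * alpha in the low 16 bits of its lane so it survives the packs unchanged. */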

void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count) {
    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                      14, 10, 6, 2,
                                      13,  9, 5, 1,
                                      12,  8, 4, 0);
    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
    __m128i c0, c1, c2, c3;
    __m128i i4, o4;
    __m128i xy, zw;
    __m128i x2, y2, z2, w2;
    uint32_t i;

    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0 = _mm_shufflelo_epi16(c0, 0);
    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
    c1 = _mm_shufflelo_epi16(c1, 0);
    c0 = _mm_unpacklo_epi16(c0, c1);

    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
    c2 = _mm_shufflelo_epi16(c2, 0);
    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
    c3 = _mm_shufflelo_epi16(c3, 0);
    c2 = _mm_unpacklo_epi16(c2, c3);

    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);

        xy = _mm_shuffle_epi8(i4, Mxy);
        zw = _mm_shuffle_epi8(i4, Mzw);

        x2 = _mm_madd_epi16(xy, c0);
        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));

        x2 = _mm_srai_epi32(x2, 8);
        y2 = x2;
        z2 = x2;
        w2 = _mm_srli_epi32(zw, 16);

        x2 = packus_epi32(x2, y2);
        z2 = packus_epi32(z2, w2);
        o4 = _mm_packus_epi16(x2, z2);

        o4 = _mm_shuffle_epi8(o4, T4x4);
        _mm_storeu_si128((__m128i *)dst, o4);

        src = (const char *)src + 16;
        dst = (char *)dst + 16;
    }
}
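
/* Dot-product variant: coef[0], coef[4], coef[8] and coef[12] are broadcast,
 * so R, G and B all receive the same value
 * (in[0]*coef[0] + in[1]*coef[4] + in[2]*coef[8] + in[3]*coef[12]) >> 8,
 * while alpha is passed through as in the 3x3 case. */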

void rsdIntrinsicBlurVFU4_K(void *dst,
                            const void *pin, int stride, const void *gptr,
                            int rct, int x1, int x2) {
    const char *pi;
    __m128i pi0, pi1;
    __m128 pf0, pf1;
    __m128 bp0, bp1;
    __m128 x;
    int r;

    for (; x1 < x2; x1 += 2) {
        pi = (const char *)pin + (x1 << 2);
        bp0 = _mm_setzero_ps();
        bp1 = _mm_setzero_ps();

        for (r = 0; r < rct; ++r) {
            x = _mm_load_ss((const float *)gptr + r);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));

            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));

            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));

            pi += stride;
        }

        _mm_storeu_ps((float *)dst, bp0);
        _mm_storeu_ps((float *)dst + 4, bp1);
        dst = (char *)dst + 32;
    }
}
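
/* Vertical blur pass: for two pixels at a time, the kernel weights in gptr
 * are applied down rct rows of 8-bit RGBA input, accumulating in float, and
 * the two float4 sums are stored (32 bytes per iteration). */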

void rsdIntrinsicBlurHFU4_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, x, y;
    __m128i o;
    int r;

    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
        x = _mm_load_ss((const float *)gptr);
        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + (x1 << 2);
        pf = _mm_mul_ps(x, _mm_load_ps(pi));

        for (r = 1; r < rct; r += 2) {
            x = _mm_load_ss((const float *)gptr + r);
            y = _mm_load_ss((const float *)gptr + r + 1);
            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));

            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
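
/* Horizontal blur pass over the float4 output of the vertical pass, two taps
 * per iteration after tap 0 (rct is odd, so the pairs line up exactly).
 * Mu8 gathers byte 0 of each 32-bit lane, turning the rounded result back
 * into an 8-bit RGBA pixel. */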

void rsdIntrinsicBlurHFU1_K(void *dst,
                            const void *pin, const void *gptr,
                            int rct, int x1, int x2) {
    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
    const float *pi;
    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
    __m128i o;
    int r;

    for (; x1 < x2; x1 += 4) {
        g0 = _mm_load_ss((const float *)gptr);
        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));

        pi = (const float *)pin + x1;
        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));

        for (r = 1; r < rct; r += 4) {
            gx = _mm_loadu_ps((const float *)gptr + r);
            p0 = _mm_loadu_ps(pi + r);
            p1 = _mm_loadu_ps(pi + r + 4);

            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(_mm_alignr_epi8(
                    _mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(_mm_alignr_epi8(
                    _mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(_mm_alignr_epi8(
                    _mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
        }

        o = _mm_cvtps_epi32(pf);
        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
        dst = (char *)dst + 4;
    }
}
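
/* Single-channel horizontal pass: four float outputs per iteration, with
 * _mm_alignr_epi8 sliding an 8-float window one element at a time so the two
 * loads are reused for all four taps of the group. */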

void rsdIntrinsicYuv_K(void *dst,
                       const unsigned char *pY, const unsigned char *pUV,
                       uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        U = _mm_shuffle_epi32(UV, 0xf5);
        V = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
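
/* With the parameter values noted above this is the usual fixed-point
 * YUV-to-RGBA conversion, per pixel:
 *
 *     R = clamp((298*(Y-16)               + 409*(V-128) + 128) >> 8)
 *     G = clamp((298*(Y-16) - 100*(U-128) - 208*(V-128) + 128) >> 8)
 *     B = clamp((298*(Y-16) + 516*(U-128)               + 128) >> 8)
 *     A = 255
 *
 * Four pixels are produced per iteration; each chroma pair from the
 * interleaved UV plane is shared by two horizontally adjacent pixels. */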

void rsdIntrinsicYuvR_K(void *dst,
                        const unsigned char *pY, const unsigned char *pUV,
                        uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, UV, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));

        Y = _mm_sub_epi32(Y, biasY);
        UV = _mm_sub_epi32(UV, biasUV);

        V = _mm_shuffle_epi32(UV, 0xf5);
        U = _mm_shuffle_epi32(UV, 0xa0);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pUV += 4;
        dst = (__m128i *)dst + 1;
    }
}
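
/* Identical to rsdIntrinsicYuv_K except that the two chroma bytes of each UV
 * pair are taken in the opposite order (the 0xf5/0xa0 shuffles are swapped),
 * i.e. the interleaved chroma plane with the other U/V byte ordering. */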

void rsdIntrinsicYuv2_K(void *dst,
                        const unsigned char *pY, const unsigned char *pU,
                        const unsigned char *pV, uint32_t count, const short *param) {
    __m128i biasY, biasUV;
    __m128i c0, c1, c2, c3, c4;

    biasY = _mm_set1_epi32(param[8]);   /*  16 */
    biasUV = _mm_set1_epi32(param[16]); /* 128 */

    c0 = _mm_set1_epi32(param[0]);  /*  298 */
    c1 = _mm_set1_epi32(param[1]);  /*  409 */
    c2 = _mm_set1_epi32(param[2]);  /* -100 */
    c3 = _mm_set1_epi32(param[3]);  /*  516 */
    c4 = _mm_set1_epi32(param[4]);  /* -208 */

    __m128i Y, U, V, R, G, B, A;

    A = _mm_set1_epi32(255);
    uint32_t i;

    for (i = 0; i < (count << 1); ++i) {
        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));

        Y = _mm_sub_epi32(Y, biasY);
        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);

        Y = mullo_epi32(Y, c0);

        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
        R = _mm_add_epi32(R, biasUV);
        R = _mm_srai_epi32(R, 8);

        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
        G = _mm_add_epi32(G, mullo_epi32(V, c4));
        G = _mm_add_epi32(G, biasUV);
        G = _mm_srai_epi32(G, 8);

        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
        B = _mm_add_epi32(B, biasUV);
        B = _mm_srai_epi32(B, 8);

        __m128i y1, y2, y3, y4;

        y1 = packus_epi32(R, G);
        y2 = packus_epi32(B, A);
        y3 = _mm_packus_epi16(y1, y2);
        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
                                          14, 10, 6, 2,
                                          13,  9, 5, 1,
                                          12,  8, 4, 0);
        y4 = _mm_shuffle_epi8(y3, T4x4);
        _mm_storeu_si128((__m128i *)dst, y4);
        pY += 4;
        pU += 4;
        pV += 4;
        dst = (__m128i *)dst + 1;
    }
}
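
/* Planar variant: Y, U and V come from three separate pointers; the
 * arithmetic is the same as above. */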

extern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0,
                                          const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count) {
    __m128i x;
    __m128i c0, c2, c4, c6, c8, c10, c12;
    __m128i c14, c16, c18, c20, c22, c24;
    __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9;
    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
    __m128i o0, o1, o2, o3;
    uint32_t i;

    x = _mm_loadl_epi64((const __m128i *)(coef+0));
    c0  = _mm_shuffle_epi32(x, 0x00);
    c2  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+4));
    c4  = _mm_shuffle_epi32(x, 0x00);
    c6  = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+8));
    c8  = _mm_shuffle_epi32(x, 0x00);
    c10 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+12));
    c12 = _mm_shuffle_epi32(x, 0x00);
    c14 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+16));
    c16 = _mm_shuffle_epi32(x, 0x00);
    c18 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+20));
    c20 = _mm_shuffle_epi32(x, 0x00);
    c22 = _mm_shuffle_epi32(x, 0x55);

    x = _mm_loadl_epi64((const __m128i *)(coef+24));
    c24 = _mm_shuffle_epi32(x, 0x00);

    for (i = 0; i < count; ++i) {

        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());

        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());

        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());

        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());

        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());

        o0 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1),   c0);
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p3),   c2));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p4, p8),   c4));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10),  c6));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c8));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p16, p17), c10));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c12));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p24), c14));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p25, p26), c16));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c18));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p32, p33), c20));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c22));
        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
        o0 = _mm_srai_epi32(o0, 8);

        o1 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2),   c0);
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p4),   c2));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p9),   c4));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p10, p11), c6));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c8));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p17, p18), c10));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c12));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p25), c14));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p26, p27), c16));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c18));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p33, p34), c20));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c22));
        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
        o1 = _mm_srai_epi32(o1, 8);

        o2 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p2, p3),   c0);
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p4, p5),   c2));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p10),  c4));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p11, p12), c6));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p13, p14), c8));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p18, p19), c10));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p20, p21), c12));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p22, p26), c14));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p27, p28), c16));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p29, p30), c18));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p34, p35), c20));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p36, p37), c22));
        o2 = _mm_add_epi32(o2, _mm_madd_epi16(_mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
        o2 = _mm_srai_epi32(o2, 8);

        o3 =                   _mm_madd_epi16(_mm_unpacklo_epi16(p3, p4),   c0);
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6),   c2));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p7, p11),  c4));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p12, p13), c6));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p14, p15), c8));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p19, p20), c10));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p21, p22), c12));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p23, p27), c14));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p28, p29), c16));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p30, p31), c18));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p35, p36), c20));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p37, p38), c22));
        o3 = _mm_add_epi32(o3, _mm_madd_epi16(_mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
        o3 = _mm_srai_epi32(o3, 8);

        o0 = packus_epi32(o0, o1);
        o2 = packus_epi32(o2, o3);
        o0 = _mm_packus_epi16(o0, o2);
        _mm_storeu_si128((__m128i *)dst, o0);

        y0 = (const char *)y0 + 16;
        y1 = (const char *)y1 + 16;
        y2 = (const char *)y2 + 16;
        y3 = (const char *)y3 + 16;
        y4 = (const char *)y4 + 16;
        dst = (char *)dst + 16;
    }
}
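
/* 5x5 convolution with the same structure as the 3x3 version: for each of
 * four output pixels the 25 Q8 taps in coef are applied across rows y0..y4,
 * the 32-bit sums are shifted right by 8 and packed back to RGBA with
 * unsigned saturation.  Each iteration writes four pixels and advances the
 * row pointers by 16 bytes. */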

void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina, ins;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, ins);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, ins);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, ins);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, ins);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
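
/* rsdIntrinsicBlendSrcOver_K: dst = src + ((dst * (255 - src.a)) >> 8) per
 * channel, on 16-bit intermediates, eight pixels per iteration. */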

void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);
        t0 = _mm_add_epi16(t0, outs);

        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);
        t1 = _mm_add_epi16(t1, outs);

        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);
        t2 = _mm_add_epi16(t2, outs);

        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);
        t3 = _mm_add_epi16(t3, outs);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
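
/* rsdIntrinsicBlendDstOver_K: dst = dst + ((src * (255 - dst.a)) >> 8) per
 * channel. */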

void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, outa);
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, outa);
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, outa);
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, outa);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
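
/* rsdIntrinsicBlendSrcIn_K: dst = (src * dst.a) >> 8 per channel. */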

void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
    __m128i ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, ina);
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, ina);
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, ina);
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, ina);
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
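
/* rsdIntrinsicBlendDstIn_K: dst = (dst * src.a) >> 8 per channel. */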

void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, outa;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
        t0 = _mm_srli_epi16(t0, 8);

        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
        t1 = _mm_srli_epi16(t1, 8);

        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
        t2 = _mm_srli_epi16(t2, 8);

        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outa, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
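
/* rsdIntrinsicBlendSrcOut_K: dst = (src * (255 - dst.a)) >> 8 per channel. */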

void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
    __m128i all1s, ina;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
        t0 = _mm_srli_epi16(t0, 8);

        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
        t1 = _mm_srli_epi16(t1, 8);

        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
        t2 = _mm_srli_epi16(t2, 8);

        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ina, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
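
/* rsdIntrinsicBlendDstOut_K: dst = (dst * (255 - src.a)) >> 8 per channel. */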

void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, outa, ins, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, ina);
        t0 = _mm_mullo_epi16(t0, outs);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, ina);
        t1 = _mm_mullo_epi16(t1, outs);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, ina);
        t2 = _mm_mullo_epi16(t2, outs);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, ina);
        t3 = _mm_mullo_epi16(t3, outs);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, out0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, out1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
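
/* rsdIntrinsicBlendSrcAtop_K: dst.rgb = (src.rgb * dst.a +
 * dst.rgb * (255 - src.a)) >> 8, with the destination alpha byte restored
 * afterwards through blendv_epi8 and the M0001 mask (0xff in the alpha
 * position of every pixel). */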

void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
    __m128i all1s, ina, ins, outa, outs;
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    all1s = _mm_set1_epi16(255);

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t0 = _mm_sub_epi16(all1s, outa);
        t0 = _mm_mullo_epi16(t0, ins);
        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
        t0 = _mm_srli_epi16(t0, 8);

        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t1 = _mm_sub_epi16(all1s, outa);
        t1 = _mm_mullo_epi16(t1, ins);
        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
        t1 = _mm_srli_epi16(t1, 8);

        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t2 = _mm_sub_epi16(all1s, outa);
        t2 = _mm_mullo_epi16(t2, ins);
        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
        t2 = _mm_srli_epi16(t2, 8);

        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        ina = _mm_shufflelo_epi16(ins, 0xFF);
        ina = _mm_shufflehi_epi16(ina, 0xFF);
        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
        outa = _mm_shufflelo_epi16(outs, 0xFF);
        outa = _mm_shufflehi_epi16(outa, 0xFF);
        t3 = _mm_sub_epi16(all1s, outa);
        t3 = _mm_mullo_epi16(t3, ins);
        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t0 = blendv_epi8(t0, in0, M0001);
        t2 = _mm_packus_epi16(t2, t3);
        t2 = blendv_epi8(t2, in1, M0001);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
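
/* rsdIntrinsicBlendDstAtop_K: dst.rgb = (dst.rgb * src.a +
 * src.rgb * (255 - dst.a)) >> 8, with the alpha byte taken from the source
 * instead. */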

void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_xor_si128(out0, in0);
        out1 = _mm_xor_si128(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
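
/* Note that this is a bitwise XOR of the pixel values, not the Porter-Duff
 * "xor" operator. */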

void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    __m128i t0, t1, t2, t3;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
        t0 = _mm_srli_epi16(t0, 8);

        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
        t1 = _mm_srli_epi16(t1, 8);

        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
        t2 = _mm_srli_epi16(t2, 8);

        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
        t3 = _mm_srli_epi16(t3, 8);

        t0 = _mm_packus_epi16(t0, t1);
        t2 = _mm_packus_epi16(t2, t3);
        _mm_storeu_si128((__m128i *)dst, t0);
        _mm_storeu_si128((__m128i *)dst + 1, t2);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
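
/* rsdIntrinsicBlendMultiply_K: dst = (src * dst) >> 8 per channel, including
 * alpha. */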

void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_adds_epu8(out0, in0);
        out1 = _mm_adds_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
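
/* Additive blend with unsigned 8-bit saturation: dst = min(dst + src, 255). */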

void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
    __m128i in0, in1, out0, out1;
    uint32_t i;

    for (i = 0; i < count8; ++i) {
        in0 = _mm_loadu_si128((const __m128i *)src);
        in1 = _mm_loadu_si128((const __m128i *)src + 1);
        out0 = _mm_loadu_si128((const __m128i *)dst);
        out1 = _mm_loadu_si128((const __m128i *)dst + 1);

        out0 = _mm_subs_epu8(out0, in0);
        out1 = _mm_subs_epu8(out1, in1);

        _mm_storeu_si128((__m128i *)dst, out0);
        _mm_storeu_si128((__m128i *)dst + 1, out1);

        src = (const __m128i *)src + 2;
        dst = (__m128i *)dst + 2;
    }
}
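
/* Subtractive blend with unsigned 8-bit saturation: dst = max(dst - src, 0). */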