1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <stdint.h>
18#include <x86intrin.h>
19
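/*
 * Compatibility helpers: each wrapper below maps to a single SSE4.1
 * intrinsic when available and falls back to an SSSE3 sequence otherwise.
 */
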
/* Zero-extend the four 8-bit integers in the low 32 bits into packed 32-bit integers */
21static inline __m128i cvtepu8_epi32(__m128i x) {
22#if defined(__SSE4_1__)
23    return _mm_cvtepu8_epi32(x);
24#elif defined(__SSSE3__)
25    const __m128i M8to32 = _mm_set_epi32(0xffffff03, 0xffffff02, 0xffffff01, 0xffffff00);
26    x = _mm_shuffle_epi8(x, M8to32);
27    return x;
28#else
#   error "Requires at least SSSE3"
30#endif
31}
32
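/* Pack signed 32-bit integers into unsigned 16-bit with saturation, like
 * _mm_packus_epi32: lanes below zero clamp to 0 and lanes above 0xffff clamp
 * to 0xffff before the low halves of 'lo' and 'hi' are packed together. */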
33static inline __m128i packus_epi32(__m128i lo, __m128i hi) {
34#if defined(__SSE4_1__)
35    return _mm_packus_epi32(lo, hi);
36#elif defined(__SSSE3__)
37    const __m128i C0 = _mm_set_epi32(0x0000, 0x0000, 0x0000, 0x0000);
38    const __m128i C1 = _mm_set_epi32(0xffff, 0xffff, 0xffff, 0xffff);
39    const __m128i M32to16L = _mm_set_epi32(0xffffffff, 0xffffffff, 0x0d0c0908, 0x05040100);
40    const __m128i M32to16H = _mm_set_epi32(0x0d0c0908, 0x05040100, 0xffffffff, 0xffffffff);
41    lo = _mm_and_si128(lo, _mm_cmpgt_epi32(lo, C0));
42    lo = _mm_or_si128(lo, _mm_cmpgt_epi32(lo, C1));
43    hi = _mm_and_si128(hi, _mm_cmpgt_epi32(hi, C0));
44    hi = _mm_or_si128(hi, _mm_cmpgt_epi32(hi, C1));
45    return _mm_or_si128(_mm_shuffle_epi8(lo, M32to16L),
46                        _mm_shuffle_epi8(hi, M32to16H));
47#else
#   error "Requires at least SSSE3"
49#endif
50}
51
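/* Multiply packed 32-bit integers and keep the low 32 bits of each product,
 * like _mm_mullo_epi32.  The SSSE3 path multiplies the even and odd lanes
 * with _mm_mul_epu32 and merges the two results. */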
52static inline __m128i mullo_epi32(__m128i x, __m128i y) {
53#if defined(__SSE4_1__)
54    return _mm_mullo_epi32(x, y);
55#elif defined(__SSSE3__)
56    const __m128i Meven = _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0xffffffff);
57    __m128i even = _mm_mul_epu32(x, y);
58    __m128i odd = _mm_mul_epu32(_mm_srli_si128(x, 4),
59                                _mm_srli_si128(y, 4));
60    even = _mm_and_si128(even, Meven);
61    odd = _mm_and_si128(odd, Meven);
62    return _mm_or_si128(even, _mm_slli_si128(odd, 4));
63#else
#   error "Requires at least SSSE3"
65#endif
66}
67
/* 'mask' must be packed 8-bit values of either 0x00 or 0xff */
69static inline __m128i blendv_epi8(__m128i x, __m128i y, __m128i mask) {
70#if defined(__SSE4_1__)
71    return _mm_blendv_epi8(x, y, mask);
72#elif defined(__SSSE3__)
73    return _mm_or_si128(_mm_andnot_si128(mask, x), _mm_and_si128(y, mask));
74#else
#   error "Requires at least SSSE3"
76#endif
77}
78
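/*
 * 3x3 convolution over 8-bit RGBA pixels.  y0/y1/y2 point at the three input
 * rows and coef holds nine 16-bit coefficients in 8.8 fixed point; each
 * iteration produces two output pixels, roughly (per channel)
 *
 *   out = clamp((sum over the 3x3 taps of coef[k] * in[k]) >> 8, 0, 255)
 */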
79void rsdIntrinsicConvolve3x3_K(void *dst,
80                               const void *y0, const void *y1, const void *y2,
81                               const short *coef, uint32_t count) {
82    __m128i x;
83    __m128i c0, c2, c4, c6, c8;
85    __m128i p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11;
86    __m128i o0, o1;
87    uint32_t i;
88
89    x = _mm_loadl_epi64((const __m128i *)(coef+0));
90    c0 = _mm_shuffle_epi32(x, 0x00);
91    c2 = _mm_shuffle_epi32(x, 0x55);
92    x = _mm_loadl_epi64((const __m128i *)(coef+4));
93    c4 = _mm_shuffle_epi32(x, 0x00);
94    c6 = _mm_shuffle_epi32(x, 0x55);
95    x = _mm_loadl_epi64((const __m128i *)(coef+8));
96    c8 = _mm_shuffle_epi32(x, 0x00);
97
98    for (i = 0; i < count; ++i) {
99
100        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0)), _mm_setzero_si128());
101        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
102        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
103        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
104        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
105        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
106        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
107        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
108        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
109        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
110        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
111        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
112
113        o0 = _mm_madd_epi16(_mm_unpacklo_epi16(p0, p1), c0);
114        o1 = _mm_madd_epi16(_mm_unpacklo_epi16(p1, p2), c0);
115
116        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p2, p4), c2));
117        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p3, p5), c2));
118
119        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p5, p6), c4));
120        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p6, p7), c4));
121
122        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p8, p9), c6));
123        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p9, p10), c6));
124
125        o0 = _mm_add_epi32(o0, _mm_madd_epi16(_mm_unpacklo_epi16(p10, _mm_setzero_si128()), c8));
126        o1 = _mm_add_epi32(o1, _mm_madd_epi16(_mm_unpacklo_epi16(p11, _mm_setzero_si128()), c8));
127
128        o0 = _mm_srai_epi32(o0, 8);
129        o1 = _mm_srai_epi32(o1, 8);
130
131        o0 = packus_epi32(o0, o1);
132        o0 = _mm_packus_epi16(o0, o0);
133        _mm_storel_epi64((__m128i *)dst, o0);
134
135        y0 = (const char *)y0 + 8;
136        y1 = (const char *)y1 + 8;
137        y2 = (const char *)y2 + 8;
138        dst = (char *)dst + 8;
139    }
140}
141
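/*
 * Apply a 4x4 color matrix in 8.8 fixed point to four RGBA pixels per
 * iteration.  From the coefficient shuffles below, output channel i is
 *
 *   out[i] = clamp((x*coef[i] + y*coef[i+4] + z*coef[i+8] + w*coef[i+12]) >> 8)
 *
 * where x, y, z, w are the four input channels.
 */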
142void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
143                                  const short *coef, uint32_t count) {
144    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
145                                      14, 10, 6, 2,
146                                      13,  9, 5, 1,
147                                      12,  8, 4, 0);
148
149    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
150    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
151    __m128i c0, c1, c2, c3;
152    __m128i i4, o4;
153    __m128i xy, zw;
154    __m128i x2, y2, z2, w2;
155    uint32_t i;
156
157    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
158    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
159    c0 = _mm_unpacklo_epi16(c0, c1);
160
161    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
162    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
163    c2 = _mm_unpacklo_epi16(c2, c3);
164
165    for (i = 0; i < count; ++i) {
        i4 = _mm_loadu_si128((const __m128i *)src);
167        xy = _mm_shuffle_epi8(i4, Mxy);
168        zw = _mm_shuffle_epi8(i4, Mzw);
169
170        x2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
171        y2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
172        z2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
173        w2 = _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xff));
174
175        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
176        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
177        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
178        w2 = _mm_add_epi32(w2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xff)));
179
180        x2 = _mm_srai_epi32(x2, 8);
181        y2 = _mm_srai_epi32(y2, 8);
182        z2 = _mm_srai_epi32(z2, 8);
183        w2 = _mm_srai_epi32(w2, 8);
184
185        x2 = packus_epi32(x2, y2);
186        z2 = packus_epi32(z2, w2);
187        o4 = _mm_packus_epi16(x2, z2);
188
189        o4 = _mm_shuffle_epi8(o4, T4x4);
190        _mm_storeu_si128((__m128i *)dst, o4);
191
192        src = (const char *)src + 16;
193        dst = (char *)dst + 16;
194    }
195}
196
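/*
 * Same as the 4x4 variant above, but only the first three output channels are
 * computed; the fourth input channel is passed through unchanged.
 */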
197void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
198                                  const short *coef, uint32_t count) {
199    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
200                                      14, 10, 6, 2,
201                                      13,  9, 5, 1,
202                                      12,  8, 4, 0);
203
204    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
205    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
206
207    __m128i c0, c1, c2, c3;
208    __m128i i4, o4;
209    __m128i xy, zw;
210    __m128i x2, y2, z2, w2;
211    uint32_t i;
212
213    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
214    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
215    c0 = _mm_unpacklo_epi16(c0, c1);
216
217    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
218    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
219    c2 = _mm_unpacklo_epi16(c2, c3);
220
221    for (i = 0; i < count; ++i) {
222        i4 = _mm_loadu_si128((const __m128i *)src);
223        xy = _mm_shuffle_epi8(i4, Mxy);
224        zw = _mm_shuffle_epi8(i4, Mzw);
225
226        x2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x00));
227        y2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0x55));
228        z2 =  _mm_madd_epi16(xy, _mm_shuffle_epi32(c0, 0xaa));
229
230        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x00)));
231        y2 = _mm_add_epi32(y2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0x55)));
232        z2 = _mm_add_epi32(z2, _mm_madd_epi16(zw, _mm_shuffle_epi32(c2, 0xaa)));
233
234        x2 = _mm_srai_epi32(x2, 8);
235        y2 = _mm_srai_epi32(y2, 8);
236        z2 = _mm_srai_epi32(z2, 8);
237        w2 = _mm_srli_epi32(zw, 16);
238
239        x2 = packus_epi32(x2, y2);
240        z2 = packus_epi32(z2, w2);
241        o4 = _mm_packus_epi16(x2, z2);
242
243        o4 = _mm_shuffle_epi8(o4, T4x4);
244        _mm_storeu_si128((__m128i *)dst, o4);
245
246        src = (const char *)src + 16;
247        dst = (char *)dst + 16;
248    }
249}
250
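/*
 * Dot-product variant of the color matrix: a single weighted sum of the four
 * input channels is broadcast to the first three output channels, and the
 * fourth input channel is passed through unchanged.
 */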
251void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
252                                  const short *coef, uint32_t count) {
253    const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
254                                      14, 10, 6, 2,
255                                      13,  9, 5, 1,
256                                      12,  8, 4, 0);
257    const __m128i Mxy = _mm_set_epi32(0xff0dff0c, 0xff09ff08, 0xff05ff04, 0xff01ff00);
258    const __m128i Mzw = _mm_set_epi32(0xff0fff0e, 0xff0bff0a, 0xff07ff06, 0xff03ff02);
259    __m128i c0, c1, c2, c3;
260    __m128i i4, o4;
261    __m128i xy, zw;
262    __m128i x2, y2, z2, w2;
263    uint32_t i;
264
265    c0 = _mm_loadl_epi64((const __m128i *)(coef+0));
266    c0 = _mm_shufflelo_epi16(c0, 0);
267    c1 = _mm_loadl_epi64((const __m128i *)(coef+4));
268    c1 = _mm_shufflelo_epi16(c1, 0);
269    c0 = _mm_unpacklo_epi16(c0, c1);
270
271    c2 = _mm_loadl_epi64((const __m128i *)(coef+8));
272    c2 = _mm_shufflelo_epi16(c2, 0);
273    c3 = _mm_loadl_epi64((const __m128i *)(coef+12));
274    c3 = _mm_shufflelo_epi16(c3, 0);
275    c2 = _mm_unpacklo_epi16(c2, c3);
276
277    for (i = 0; i < count; ++i) {
278        i4 = _mm_loadu_si128((const __m128i *)src);
279
280        xy = _mm_shuffle_epi8(i4, Mxy);
281        zw = _mm_shuffle_epi8(i4, Mzw);
282
283        x2 =  _mm_madd_epi16(xy, c0);
284        x2 = _mm_add_epi32(x2, _mm_madd_epi16(zw, c2));
285
286        x2 = _mm_srai_epi32(x2, 8);
287        y2 = x2;
288        z2 = x2;
289        w2 = _mm_srli_epi32(zw, 16);
290
291        x2 = packus_epi32(x2, y2);
292        z2 = packus_epi32(z2, w2);
293        o4 = _mm_packus_epi16(x2, z2);
294
295        o4 = _mm_shuffle_epi8(o4, T4x4);
296        _mm_storeu_si128((__m128i *)dst, o4);
297
298        src = (const char *)src + 16;
299        dst = (char *)dst + 16;
300    }
301}
302
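/*
 * Vertical blur pass: for two pixels at a time, accumulate rct rows of 8-bit
 * RGBA input (stride bytes apart) weighted by the float kernel in gptr, and
 * write the sums as two groups of four floats.
 */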
303void rsdIntrinsicBlurVFU4_K(void *dst,
304                          const void *pin, int stride, const void *gptr,
305                          int rct, int x1, int x2) {
306    const char *pi;
307    __m128i pi0, pi1;
308    __m128 pf0, pf1;
309    __m128 bp0, bp1;
310    __m128 x;
311    int r;
312
313    for (; x1 < x2; x1 += 2) {
314        pi = (const char *)pin + (x1 << 2);
315        bp0 = _mm_setzero_ps();
316        bp1 = _mm_setzero_ps();
317
318        for (r = 0; r < rct; ++r) {
319            x = _mm_load_ss((const float *)gptr + r);
320            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
321
322            pi0 = _mm_cvtsi32_si128(*(const int *)pi);
323            pi1 = _mm_cvtsi32_si128(*((const int *)pi + 1));
324
325            pf0 = _mm_cvtepi32_ps(cvtepu8_epi32(pi0));
326            pf1 = _mm_cvtepi32_ps(cvtepu8_epi32(pi1));
327
328            bp0 = _mm_add_ps(bp0, _mm_mul_ps(pf0, x));
329            bp1 = _mm_add_ps(bp1, _mm_mul_ps(pf1, x));
330
331            pi += stride;
332        }
333
334        _mm_storeu_ps((float *)dst, bp0);
335        _mm_storeu_ps((float *)dst + 4, bp1);
336        dst = (char *)dst + 32;
337    }
338}
339
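/*
 * Horizontal blur pass over the float RGBA intermediate produced above; the
 * tap loop is unrolled by two, and the result is converted back to four
 * 8-bit channels per output pixel.
 */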
340void rsdIntrinsicBlurHFU4_K(void *dst,
341                          const void *pin, const void *gptr,
342                          int rct, int x1, int x2) {
343    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
344    const float *pi;
345    __m128 pf, x, y;
346    __m128i o;
347    int r;
348
349    for (; x1 < x2; ++x1) {
        /* rct is defined as 2*r+1 by the caller */
351        x = _mm_load_ss((const float *)gptr);
352        x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
353
354        pi = (const float *)pin + (x1 << 2);
355        pf = _mm_mul_ps(x, _mm_load_ps(pi));
356
357        for (r = 1; r < rct; r += 2) {
358            x = _mm_load_ss((const float *)gptr + r);
359            y = _mm_load_ss((const float *)gptr + r + 1);
360            x = _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 0, 0, 0));
361            y = _mm_shuffle_ps(y, y, _MM_SHUFFLE(0, 0, 0, 0));
362
363            pf = _mm_add_ps(pf, _mm_mul_ps(x, _mm_load_ps(pi + (r << 2))));
364            pf = _mm_add_ps(pf, _mm_mul_ps(y, _mm_load_ps(pi + (r << 2) + 4)));
365        }
366
367        o = _mm_cvtps_epi32(pf);
368        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
369        dst = (char *)dst + 4;
370    }
371}
372
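/*
 * Single-channel horizontal blur pass: four outputs per iteration, with
 * _mm_alignr_epi8 used to form the shifted four-element windows of the
 * float input.
 */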
373void rsdIntrinsicBlurHFU1_K(void *dst,
374                          const void *pin, const void *gptr,
375                          int rct, int x1, int x2) {
376    const __m128i Mu8 = _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0x0c080400);
377    const float *pi;
378    __m128 pf, g0, g1, g2, g3, gx, p0, p1;
379    __m128i o;
380    int r;
381
382    for (; x1 < x2; x1+=4) {
383        g0 = _mm_load_ss((const float *)gptr);
384        g0 = _mm_shuffle_ps(g0, g0, _MM_SHUFFLE(0, 0, 0, 0));
385
386        pi = (const float *)pin + x1;
387        pf = _mm_mul_ps(g0, _mm_loadu_ps(pi));
388
389        for (r = 1; r < rct; r += 4) {
390            gx = _mm_loadu_ps((const float *)gptr + r);
391            p0 = _mm_loadu_ps(pi + r);
392            p1 = _mm_loadu_ps(pi + r + 4);
393
394            g0 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(0, 0, 0, 0));
395            pf = _mm_add_ps(pf, _mm_mul_ps(g0, p0));
            g1 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(1, 1, 1, 1));
            pf = _mm_add_ps(pf, _mm_mul_ps(g1, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 4))));
            g2 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(2, 2, 2, 2));
            pf = _mm_add_ps(pf, _mm_mul_ps(g2, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 8))));
            g3 = _mm_shuffle_ps(gx, gx, _MM_SHUFFLE(3, 3, 3, 3));
            pf = _mm_add_ps(pf, _mm_mul_ps(g3, _mm_castsi128_ps(
                    _mm_alignr_epi8(_mm_castps_si128(p1), _mm_castps_si128(p0), 12))));
402        }
403
404        o = _mm_cvtps_epi32(pf);
405        *(int *)dst = _mm_cvtsi128_si32(_mm_shuffle_epi8(o, Mu8));
406        dst = (char *)dst + 4;
407    }
408}
409
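/*
 * YUV to RGBA conversion, four pixels per iteration.  pUV holds interleaved
 * chroma; here the even byte is used as V and the odd byte as U (the *R
 * variant below assumes the opposite order).  With the coefficient values
 * noted in the comments, each channel is roughly
 *
 *   R = clamp((298*(Y-16) + 409*(V-128) + 128) >> 8)
 *
 * and similarly for G and B; alpha is set to 255.
 */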
410void rsdIntrinsicYuv_K(void *dst,
411                       const unsigned char *pY, const unsigned char *pUV,
412                       uint32_t count, const short *param) {
413    __m128i biasY, biasUV;
414    __m128i c0, c1, c2, c3, c4;
415
416    biasY = _mm_set1_epi32(param[8]);   /*  16 */
417    biasUV = _mm_set1_epi32(param[16]); /* 128 */
418
419    c0 = _mm_set1_epi32(param[0]);  /*  298 */
420    c1 = _mm_set1_epi32(param[1]);  /*  409 */
421    c2 = _mm_set1_epi32(param[2]);  /* -100 */
422    c3 = _mm_set1_epi32(param[3]);  /*  516 */
423    c4 = _mm_set1_epi32(param[4]);  /* -208 */
424
425    __m128i Y, UV, U, V, R, G, B, A;
426
427    A = _mm_set1_epi32(255);
428    uint32_t i;
429
430    for (i = 0; i < (count << 1); ++i) {
431        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
432        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
433
434        Y = _mm_sub_epi32(Y, biasY);
435        UV = _mm_sub_epi32(UV, biasUV);
436
437        U = _mm_shuffle_epi32(UV, 0xf5);
438        V = _mm_shuffle_epi32(UV, 0xa0);
439
440        Y = mullo_epi32(Y, c0);
441
442        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
443        R = _mm_add_epi32(R, biasUV);
444        R = _mm_srai_epi32(R, 8);
445
446        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
447        G = _mm_add_epi32(G, mullo_epi32(V, c4));
448        G = _mm_add_epi32(G, biasUV);
449        G = _mm_srai_epi32(G, 8);
450
451        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
452        B = _mm_add_epi32(B, biasUV);
453        B = _mm_srai_epi32(B, 8);
454
455        __m128i y1, y2, y3, y4;
456
457        y1 = packus_epi32(R, G);
458        y2 = packus_epi32(B, A);
459        y3 = _mm_packus_epi16(y1, y2);
460        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
461                                          14, 10, 6, 2,
462                                          13,  9, 5, 1,
463                                          12,  8, 4, 0);
464        y4 = _mm_shuffle_epi8(y3, T4x4);
465        _mm_storeu_si128((__m128i *)dst, y4);
466        pY += 4;
467        pUV += 4;
468        dst = (__m128i *)dst + 1;
469    }
470}
471
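/*
 * Same conversion as rsdIntrinsicYuv_K, but with the two chroma bytes in pUV
 * interpreted in the opposite order (U in the even byte, V in the odd byte).
 */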
472void rsdIntrinsicYuvR_K(void *dst,
473                       const unsigned char *pY, const unsigned char *pUV,
474                       uint32_t count, const short *param) {
475    __m128i biasY, biasUV;
476    __m128i c0, c1, c2, c3, c4;
477
478    biasY = _mm_set1_epi32(param[8]);   /*  16 */
479    biasUV = _mm_set1_epi32(param[16]); /* 128 */
480
481    c0 = _mm_set1_epi32(param[0]);  /*  298 */
482    c1 = _mm_set1_epi32(param[1]);  /*  409 */
483    c2 = _mm_set1_epi32(param[2]);  /* -100 */
484    c3 = _mm_set1_epi32(param[3]);  /*  516 */
485    c4 = _mm_set1_epi32(param[4]);  /* -208 */
486
487    __m128i Y, UV, U, V, R, G, B, A;
488
489    A = _mm_set1_epi32(255);
490    uint32_t i;
491
492    for (i = 0; i < (count << 1); ++i) {
493        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
494        UV = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pUV));
495
496        Y = _mm_sub_epi32(Y, biasY);
497        UV = _mm_sub_epi32(UV, biasUV);
498
499        V = _mm_shuffle_epi32(UV, 0xf5);
500        U = _mm_shuffle_epi32(UV, 0xa0);
501
502        Y = mullo_epi32(Y, c0);
503
504        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
505        R = _mm_add_epi32(R, biasUV);
506        R = _mm_srai_epi32(R, 8);
507
508        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
509        G = _mm_add_epi32(G, mullo_epi32(V, c4));
510        G = _mm_add_epi32(G, biasUV);
511        G = _mm_srai_epi32(G, 8);
512
513        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
514        B = _mm_add_epi32(B, biasUV);
515        B = _mm_srai_epi32(B, 8);
516
517        __m128i y1, y2, y3, y4;
518
519        y1 = packus_epi32(R, G);
520        y2 = packus_epi32(B, A);
521        y3 = _mm_packus_epi16(y1, y2);
522        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
523                                          14, 10, 6, 2,
524                                          13,  9, 5, 1,
525                                          12,  8, 4, 0);
526        y4 = _mm_shuffle_epi8(y3, T4x4);
527        _mm_storeu_si128((__m128i *)dst, y4);
528        pY += 4;
529        pUV += 4;
530        dst = (__m128i *)dst + 1;
531    }
532}
533
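/*
 * Planar variant of the YUV conversion above: Y, U and V are read from three
 * separate planes instead of an interleaved chroma plane.
 */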
534void rsdIntrinsicYuv2_K(void *dst,
535                       const unsigned char *pY, const unsigned char *pU,
536                       const unsigned char *pV, uint32_t count, const short *param) {
537    __m128i biasY, biasUV;
538    __m128i c0, c1, c2, c3, c4;
539
540    biasY = _mm_set1_epi32(param[8]);   /*  16 */
541    biasUV = _mm_set1_epi32(param[16]); /* 128 */
542
543    c0 = _mm_set1_epi32(param[0]);  /*  298 */
544    c1 = _mm_set1_epi32(param[1]);  /*  409 */
545    c2 = _mm_set1_epi32(param[2]);  /* -100 */
546    c3 = _mm_set1_epi32(param[3]);  /*  516 */
547    c4 = _mm_set1_epi32(param[4]);  /* -208 */
548
549    __m128i Y, U, V, R, G, B, A;
550
551    A = _mm_set1_epi32(255);
552    uint32_t i;
553
554    for (i = 0; i < (count << 1); ++i) {
555        Y = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pY));
556        U = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pU));
        V = cvtepu8_epi32(_mm_set1_epi32(*(const int *)pV));
558
559        Y = _mm_sub_epi32(Y, biasY);
560        U = _mm_sub_epi32(U, biasUV);
        V = _mm_sub_epi32(V, biasUV);
562
563        Y = mullo_epi32(Y, c0);
564
565        R = _mm_add_epi32(Y, mullo_epi32(V, c1));
566        R = _mm_add_epi32(R, biasUV);
567        R = _mm_srai_epi32(R, 8);
568
569        G = _mm_add_epi32(Y, mullo_epi32(U, c2));
570        G = _mm_add_epi32(G, mullo_epi32(V, c4));
571        G = _mm_add_epi32(G, biasUV);
572        G = _mm_srai_epi32(G, 8);
573
574        B = _mm_add_epi32(Y, mullo_epi32(U, c3));
575        B = _mm_add_epi32(B, biasUV);
576        B = _mm_srai_epi32(B, 8);
577
578        __m128i y1, y2, y3, y4;
579
580        y1 = packus_epi32(R, G);
581        y2 = packus_epi32(B, A);
582        y3 = _mm_packus_epi16(y1, y2);
583        const __m128i T4x4 = _mm_set_epi8(15, 11, 7, 3,
584                                          14, 10, 6, 2,
585                                          13,  9, 5, 1,
586                                          12,  8, 4, 0);
587        y4 = _mm_shuffle_epi8(y3, T4x4);
588        _mm_storeu_si128((__m128i *)dst, y4);
589        pY += 4;
590        pU += 4;
        pV += 4;
592        dst = (__m128i *)dst + 1;
593    }
594}
595
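/*
 * 5x5 convolution over 8-bit RGBA pixels.  y0..y4 point at the five input
 * rows and coef holds 25 coefficients in 8.8 fixed point; each iteration
 * computes four output pixels, shifting the accumulators right by 8 and
 * saturating to bytes.
 */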
596void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1,
597                               const void *y2, const void *y3, const void *y4,
598                               const short *coef, uint32_t count) {
599    __m128i x;
600    __m128i c0, c2, c4, c6, c8, c10, c12;
601    __m128i c14, c16, c18, c20, c22, c24;
603    __m128i p0,  p1,  p2,  p3,  p4,  p5,  p6,  p7;
604    __m128i p8,  p9, p10, p11, p12, p13, p14, p15;
605    __m128i p16, p17, p18, p19, p20, p21, p22, p23;
606    __m128i p24, p25, p26, p27, p28, p29, p30, p31;
607    __m128i p32, p33, p34, p35, p36, p37, p38, p39;
608    __m128i o0, o1, o2, o3;
609    uint32_t i;
610
611    x = _mm_loadl_epi64((const __m128i *)(coef+0));
612    c0  = _mm_shuffle_epi32(x, 0x00);
613    c2  = _mm_shuffle_epi32(x, 0x55);
614
615    x = _mm_loadl_epi64((const __m128i *)(coef+4));
616    c4  = _mm_shuffle_epi32(x, 0x00);
617    c6  = _mm_shuffle_epi32(x, 0x55);
618
619    x = _mm_loadl_epi64((const __m128i *)(coef+8));
620    c8  = _mm_shuffle_epi32(x, 0x00);
621    c10  = _mm_shuffle_epi32(x, 0x55);
622
623    x = _mm_loadl_epi64((const __m128i *)(coef+12));
624    c12  = _mm_shuffle_epi32(x, 0x00);
625    c14  = _mm_shuffle_epi32(x, 0x55);
626
627    x = _mm_loadl_epi64((const __m128i *)(coef+16));
628    c16  = _mm_shuffle_epi32(x, 0x00);
629    c18  = _mm_shuffle_epi32(x, 0x55);
630
631    x = _mm_loadl_epi64((const __m128i *)(coef+20));
632    c20  = _mm_shuffle_epi32(x, 0x00);
633    c22  = _mm_shuffle_epi32(x, 0x55);
634
635    x = _mm_loadl_epi64((const __m128i *)(coef+24));
636    c24  = _mm_shuffle_epi32(x, 0x00);
637
638    for (i = 0; i < count; ++i) {
639
640        p0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t *)y0), _mm_setzero_si128());
641        p1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+1)), _mm_setzero_si128());
642        p2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+2)), _mm_setzero_si128());
643        p3 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+3)), _mm_setzero_si128());
644        p4 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+4)), _mm_setzero_si128());
645        p5 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+5)), _mm_setzero_si128());
646        p6 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+6)), _mm_setzero_si128());
647        p7 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y0+7)), _mm_setzero_si128());
648
649        p8 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1)), _mm_setzero_si128());
650        p9 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+1)), _mm_setzero_si128());
651        p10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+2)), _mm_setzero_si128());
652        p11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+3)), _mm_setzero_si128());
653        p12 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+4)), _mm_setzero_si128());
654        p13 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+5)), _mm_setzero_si128());
655        p14 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+6)), _mm_setzero_si128());
656        p15 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y1+7)), _mm_setzero_si128());
657
658        p16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2)), _mm_setzero_si128());
659        p17 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+1)), _mm_setzero_si128());
660        p18 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+2)), _mm_setzero_si128());
661        p19 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+3)), _mm_setzero_si128());
662        p20 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+4)), _mm_setzero_si128());
663        p21 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+5)), _mm_setzero_si128());
664        p22 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+6)), _mm_setzero_si128());
665        p23 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y2+7)), _mm_setzero_si128());
666
667        p24 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3)), _mm_setzero_si128());
668        p25 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+1)), _mm_setzero_si128());
669        p26 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+2)), _mm_setzero_si128());
670        p27 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+3)), _mm_setzero_si128());
671        p28 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+4)), _mm_setzero_si128());
672        p29 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+5)), _mm_setzero_si128());
673        p30 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+6)), _mm_setzero_si128());
674        p31 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y3+7)), _mm_setzero_si128());
675
676        p32 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4)), _mm_setzero_si128());
677        p33 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+1)), _mm_setzero_si128());
678        p34 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+2)), _mm_setzero_si128());
679        p35 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+3)), _mm_setzero_si128());
680        p36 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+4)), _mm_setzero_si128());
681        p37 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+5)), _mm_setzero_si128());
682        p38 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+6)), _mm_setzero_si128());
683        p39 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*((int32_t *)y4+7)), _mm_setzero_si128());
684
685        o0 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p0, p1),  c0);
686        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p2, p3),  c2));
687        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p8),  c4));
688        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p9,p10),  c6));
689        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c8));
690        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p16, p17), c10));
691        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c12));
692        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p24), c14));
693        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p25,p26), c16));
694        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c18));
695        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p32, p33), c20));
696        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c22));
697        o0 = _mm_add_epi32(o0, _mm_madd_epi16( _mm_unpacklo_epi16(p36, _mm_setzero_si128()), c24));
698        o0 = _mm_srai_epi32(o0, 8);
699
700        o1 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p1, p2),  c0);
701        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c2));
702        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p9),  c4));
703        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p10,p11),  c6));
704        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p12,p13),  c8));
705        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p17,p18), c10));
706        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p19,p20), c12));
707        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p21,p25), c14));
708        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p26, p27), c16));
709        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c18));
710        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p33, p34), c20));
711        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c22));
712        o1 = _mm_add_epi32(o1, _mm_madd_epi16( _mm_unpacklo_epi16(p37, _mm_setzero_si128()), c24));
713        o1 = _mm_srai_epi32(o1, 8);
714
715        o2 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p2,p3),  c0);
716        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p4, p5),  c2));
717        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p6, p10),  c4));
718        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p11, p12),  c6));
719        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p13, p14),  c8));
720        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p18, p19), c10));
721        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p20, p21), c12));
722        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p22, p26), c14));
723        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p27, p28), c16));
724        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p29, p30), c18));
725        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p34, p35), c20));
726        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p36, p37), c22));
727        o2 = _mm_add_epi32(o2, _mm_madd_epi16( _mm_unpacklo_epi16(p38, _mm_setzero_si128()), c24));
728        o2 = _mm_srai_epi32(o2, 8);
729
730        o3 =                   _mm_madd_epi16( _mm_unpacklo_epi16(p3,p4),  c0);
731        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p5, p6),  c2));
732        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p7, p11),  c4));
733        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p12, p13),  c6));
734        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p14, p15),  c8));
735        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p19, p20), c10));
736        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p21, p22), c12));
737        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p23, p27), c14));
738        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p28, p29), c16));
739        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p30, p31), c18));
740        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p35, p36), c20));
741        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p37,p38), c22));
742        o3 = _mm_add_epi32(o3, _mm_madd_epi16( _mm_unpacklo_epi16(p39, _mm_setzero_si128()), c24));
743        o3 = _mm_srai_epi32(o3, 8);
744
745        o0 = packus_epi32(o0, o1);
746        o2 = packus_epi32(o2, o3);
747        o0 = _mm_packus_epi16(o0, o2);
748        _mm_storeu_si128((__m128i *)dst, o0);
749
750        y0 = (const char *)y0 + 16;
751        y1 = (const char *)y1 + 16;
752        y2 = (const char *)y2 + 16;
753        y3 = (const char *)y3 + 16;
754        y4 = (const char *)y4 + 16;
755        dst = (char *)dst + 16;
756    }
757}
758
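/*
 * Porter-Duff style blends.  Each kernel below processes count8 groups of
 * eight RGBA pixels (two 16-byte vectors per iteration) and uses >>8 as an
 * approximation of dividing by 255.
 *
 * Src over: out = src + ((dst * (255 - src.a)) >> 8)
 */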
759void rsdIntrinsicBlendSrcOver_K(void *dst, const void *src, uint32_t count8) {
760    __m128i all1s, ina, ins;
761    __m128i in0, in1, out0, out1;
762    __m128i t0, t1, t2, t3;
763    uint32_t i;
764
765    all1s = _mm_set1_epi16(255);
766
767    for (i = 0; i < count8; ++i) {
768        in0 = _mm_loadu_si128((const __m128i *)src);
769        in1 = _mm_loadu_si128((const __m128i *)src + 1);
770        out0 = _mm_loadu_si128((const __m128i *)dst);
771        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
772
773        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
774        ina = _mm_shufflelo_epi16(ins, 0xFF);
775        ina = _mm_shufflehi_epi16(ina, 0xFF);
776        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
777        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
778        t0 = _mm_srai_epi16(t0, 8);
779        t0 = _mm_add_epi16(t0, ins);
780
781        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
782        ina = _mm_shufflelo_epi16(ins, 0xFF);
783        ina = _mm_shufflehi_epi16(ina, 0xFF);
784        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
785        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
786        t1 = _mm_srai_epi16(t1, 8);
787        t1 = _mm_add_epi16(t1, ins);
788
789        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
790        ina = _mm_shufflelo_epi16(ins, 0xFF);
791        ina = _mm_shufflehi_epi16(ina, 0xFF);
792        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
793        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
794        t2 = _mm_srai_epi16(t2, 8);
795        t2 = _mm_add_epi16(t2, ins);
796
797        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
798        ina = _mm_shufflelo_epi16(ins, 0xFF);
799        ina = _mm_shufflehi_epi16(ina, 0xFF);
800        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
801        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
802        t3 = _mm_srai_epi16(t3, 8);
803        t3 = _mm_add_epi16(t3, ins);
804
805        t0 = _mm_packus_epi16(t0, t1);
806        t2 = _mm_packus_epi16(t2, t3);
807        _mm_storeu_si128((__m128i *)dst, t0);
808        _mm_storeu_si128((__m128i *)dst + 1, t2);
809
810        src = (const __m128i *)src + 2;
811        dst = (__m128i *)dst + 2;
812    }
813}
814
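/* Dst over: out = dst + ((src * (255 - dst.a)) >> 8) */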
815void rsdIntrinsicBlendDstOver_K(void *dst, const void *src, uint32_t count8) {
816    __m128i all1s, outa, outs;
817    __m128i in0, in1, out0, out1;
818    __m128i t0, t1, t2, t3;
819    uint32_t i;
820
821    all1s = _mm_set1_epi16(255);
822
823    for (i = 0; i < count8; ++i) {
824        in0 = _mm_loadu_si128((const __m128i *)src);
825        in1 = _mm_loadu_si128((const __m128i *)src + 1);
826        out0 = _mm_loadu_si128((const __m128i *)dst);
827        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
828
829
830        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
831        outa = _mm_shufflelo_epi16(outs, 0xFF);
832        outa = _mm_shufflehi_epi16(outa, 0xFF);
833        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
834        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
835        t0 = _mm_srai_epi16(t0, 8);
836        t0 = _mm_add_epi16(t0, outs);
837
838        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
839        outa = _mm_shufflelo_epi16(outs, 0xFF);
840        outa = _mm_shufflehi_epi16(outa, 0xFF);
841        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
842        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
843        t1 = _mm_srai_epi16(t1, 8);
844        t1 = _mm_add_epi16(t1, outs);
845
846        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
847        outa = _mm_shufflelo_epi16(outs, 0xFF);
848        outa = _mm_shufflehi_epi16(outa, 0xFF);
849        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
850        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
851        t2 = _mm_srai_epi16(t2, 8);
852        t2 = _mm_add_epi16(t2, outs);
853
854        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
855        outa = _mm_shufflelo_epi16(outs, 0xFF);
856        outa = _mm_shufflehi_epi16(outa, 0xFF);
857        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
858        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
859        t3 = _mm_srai_epi16(t3, 8);
860        t3 = _mm_add_epi16(t3, outs);
861
862        t0 = _mm_packus_epi16(t0, t1);
863        t2 = _mm_packus_epi16(t2, t3);
864        _mm_storeu_si128((__m128i *)dst, t0);
865        _mm_storeu_si128((__m128i *)dst + 1, t2);
866
867        src = (const __m128i *)src + 2;
868        dst = (__m128i *)dst + 2;
869    }
870}
871
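/* Src in: out = (src * dst.a) >> 8 */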
872void rsdIntrinsicBlendSrcIn_K(void *dst, const void *src, uint32_t count8) {
873    __m128i outa;
874    __m128i in0, in1, out0, out1;
875    __m128i t0, t1, t2, t3;
876    uint32_t i;
877
878    for (i = 0; i < count8; ++i) {
879        in0 = _mm_loadu_si128((const __m128i *)src);
880        in1 = _mm_loadu_si128((const __m128i *)src + 1);
881        out0 = _mm_loadu_si128((const __m128i *)dst);
882        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
883
884        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
885        outa = _mm_shufflelo_epi16(outa, 0xFF);
886        outa = _mm_shufflehi_epi16(outa, 0xFF);
887        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
888        t0 = _mm_mullo_epi16(t0, outa);
889        t0 = _mm_srai_epi16(t0, 8);
890
891        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
892        outa = _mm_shufflelo_epi16(outa, 0xFF);
893        outa = _mm_shufflehi_epi16(outa, 0xFF);
894        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
895        t1 = _mm_mullo_epi16(t1, outa);
896        t1 = _mm_srai_epi16(t1, 8);
897
898        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
899        outa = _mm_shufflelo_epi16(outa, 0xFF);
900        outa = _mm_shufflehi_epi16(outa, 0xFF);
901        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
902        t2 = _mm_mullo_epi16(t2, outa);
903        t2 = _mm_srai_epi16(t2, 8);
904
905        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
906        outa = _mm_shufflelo_epi16(outa, 0xFF);
907        outa = _mm_shufflehi_epi16(outa, 0xFF);
908        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
909        t3 = _mm_mullo_epi16(t3, outa);
910        t3 = _mm_srai_epi16(t3, 8);
911
912        t0 = _mm_packus_epi16(t0, t1);
913        t2 = _mm_packus_epi16(t2, t3);
914        _mm_storeu_si128((__m128i *)dst, t0);
915        _mm_storeu_si128((__m128i *)dst + 1, t2);
916
917        src = (const __m128i *)src + 2;
918        dst = (__m128i *)dst + 2;
919    }
920}
921
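/* Dst in: out = (dst * src.a) >> 8 */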
922void rsdIntrinsicBlendDstIn_K(void *dst, const void *src, uint32_t count8) {
923    __m128i ina;
924    __m128i in0, in1, out0, out1;
925    __m128i t0, t1, t2, t3;
926    uint32_t i;
927
928    for (i = 0; i < count8; ++i) {
929        in0 = _mm_loadu_si128((const __m128i *)src);
930        in1 = _mm_loadu_si128((const __m128i *)src + 1);
931        out0 = _mm_loadu_si128((const __m128i *)dst);
932        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
933
934        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
935        ina = _mm_shufflelo_epi16(ina, 0xFF);
936        ina = _mm_shufflehi_epi16(ina, 0xFF);
937        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
938        t0 = _mm_mullo_epi16(t0, ina);
939        t0 = _mm_srai_epi16(t0, 8);
940
941        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
942        ina = _mm_shufflelo_epi16(ina, 0xFF);
943        ina = _mm_shufflehi_epi16(ina, 0xFF);
944        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
945        t1 = _mm_mullo_epi16(t1, ina);
946        t1 = _mm_srai_epi16(t1, 8);
947
948        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
949        ina = _mm_shufflelo_epi16(ina, 0xFF);
950        ina = _mm_shufflehi_epi16(ina, 0xFF);
951        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
952        t2 = _mm_mullo_epi16(t2, ina);
953        t2 = _mm_srai_epi16(t2, 8);
954
955        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
956        ina = _mm_shufflelo_epi16(ina, 0xFF);
957        ina = _mm_shufflehi_epi16(ina, 0xFF);
958        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
959        t3 = _mm_mullo_epi16(t3, ina);
960        t3 = _mm_srai_epi16(t3, 8);
961
962        t0 = _mm_packus_epi16(t0, t1);
963        t2 = _mm_packus_epi16(t2, t3);
964        _mm_storeu_si128((__m128i *)dst, t0);
965        _mm_storeu_si128((__m128i *)dst + 1, t2);
966
967        src = (const __m128i *)src + 2;
968        dst = (__m128i *)dst + 2;
969    }
970}
971
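/* Src out: out = (src * (255 - dst.a)) >> 8 */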
972void rsdIntrinsicBlendSrcOut_K(void *dst, const void *src, uint32_t count8) {
973    __m128i all1s, outa;
974    __m128i in0, in1, out0, out1;
975    __m128i t0, t1, t2, t3;
976    uint32_t i;
977
978    all1s = _mm_set1_epi16(255);
979
980    for (i = 0; i < count8; ++i) {
981        in0 = _mm_loadu_si128((const __m128i *)src);
982        in1 = _mm_loadu_si128((const __m128i *)src + 1);
983        out0 = _mm_loadu_si128((const __m128i *)dst);
984        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
985
986        outa = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
987        outa = _mm_shufflelo_epi16(outa, 0xFF);
988        outa = _mm_shufflehi_epi16(outa, 0xFF);
989        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
990        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, outa));
991        t0 = _mm_srai_epi16(t0, 8);
992
993        outa = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
994        outa = _mm_shufflelo_epi16(outa, 0xFF);
995        outa = _mm_shufflehi_epi16(outa, 0xFF);
996        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
997        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, outa));
998        t1 = _mm_srai_epi16(t1, 8);
999
1000        outa = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1001        outa = _mm_shufflelo_epi16(outa, 0xFF);
1002        outa = _mm_shufflehi_epi16(outa, 0xFF);
1003        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1004        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, outa));
1005        t2 = _mm_srai_epi16(t2, 8);
1006
1007        outa = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1008        outa = _mm_shufflelo_epi16(outa, 0xFF);
1009        outa = _mm_shufflehi_epi16(outa, 0xFF);
1010        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1011        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, outa));
1012        t3 = _mm_srai_epi16(t3, 8);
1013
1014        t0 = _mm_packus_epi16(t0, t1);
1015        t2 = _mm_packus_epi16(t2, t3);
1016        _mm_storeu_si128((__m128i *)dst, t0);
1017        _mm_storeu_si128((__m128i *)dst + 1, t2);
1018
1019        src = (const __m128i *)src + 2;
1020        dst = (__m128i *)dst + 2;
1021    }
1022}
1023
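/* Dst out: out = (dst * (255 - src.a)) >> 8 */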
1024void rsdIntrinsicBlendDstOut_K(void *dst, const void *src, uint32_t count8) {
1025    __m128i all1s, ina;
1026    __m128i in0, in1, out0, out1;
1027    __m128i t0, t1, t2, t3;
1028    uint32_t i;
1029
1030    all1s = _mm_set1_epi16(255);
1031
1032    for (i = 0; i < count8; ++i) {
1033        in0 = _mm_loadu_si128((const __m128i *)src);
1034        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1035        out0 = _mm_loadu_si128((const __m128i *)dst);
1036        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1037
1038        ina = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1039        ina = _mm_shufflelo_epi16(ina, 0xFF);
1040        ina = _mm_shufflehi_epi16(ina, 0xFF);
1041        t0 = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1042        t0 = _mm_mullo_epi16(t0, _mm_sub_epi16(all1s, ina));
1043        t0 = _mm_srai_epi16(t0, 8);
1044
1045        ina = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1046        ina = _mm_shufflelo_epi16(ina, 0xFF);
1047        ina = _mm_shufflehi_epi16(ina, 0xFF);
1048        t1 = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1049        t1 = _mm_mullo_epi16(t1, _mm_sub_epi16(all1s, ina));
1050        t1 = _mm_srai_epi16(t1, 8);
1051
1052        ina = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1053        ina = _mm_shufflelo_epi16(ina, 0xFF);
1054        ina = _mm_shufflehi_epi16(ina, 0xFF);
1055        t2 = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1056        t2 = _mm_mullo_epi16(t2, _mm_sub_epi16(all1s, ina));
1057        t2 = _mm_srai_epi16(t2, 8);
1058
1059        ina = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1060        ina = _mm_shufflelo_epi16(ina, 0xFF);
1061        ina = _mm_shufflehi_epi16(ina, 0xFF);
1062        t3 = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1063        t3 = _mm_mullo_epi16(t3, _mm_sub_epi16(all1s, ina));
1064        t3 = _mm_srai_epi16(t3, 8);
1065
1066        t0 = _mm_packus_epi16(t0, t1);
1067        t2 = _mm_packus_epi16(t2, t3);
1068        _mm_storeu_si128((__m128i *)dst, t0);
1069        _mm_storeu_si128((__m128i *)dst + 1, t2);
1070
1071        src = (const __m128i *)src + 2;
1072        dst = (__m128i *)dst + 2;
1073    }
1074}
1075
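/* Src atop: out.rgb = (dst.rgb * (255 - src.a) + src.rgb * dst.a) >> 8,
 * with the destination alpha byte kept unchanged. */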
1076void rsdIntrinsicBlendSrcAtop_K(void *dst, const void *src, uint32_t count8) {
1077    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1078    __m128i all1s, ina, outa, ins, outs;
1079    __m128i in0, in1, out0, out1;
1080    __m128i t0, t1, t2, t3;
1081    uint32_t i;
1082
1083    all1s = _mm_set1_epi16(255);
1084
1085    for (i = 0; i < count8; ++i) {
1086        in0 = _mm_loadu_si128((const __m128i *)src);
1087        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1088        out0 = _mm_loadu_si128((const __m128i *)dst);
1089        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1090
1091        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1092        ina = _mm_shufflelo_epi16(ins, 0xFF);
1093        ina = _mm_shufflehi_epi16(ina, 0xFF);
1094        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1095        outa = _mm_shufflelo_epi16(outs, 0xFF);
1096        outa = _mm_shufflehi_epi16(outa, 0xFF);
1097        t0 = _mm_sub_epi16(all1s, ina);
1098        t0 = _mm_mullo_epi16(t0, outs);
1099        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(outa, ins));
1100        t0 = _mm_srli_epi16(t0, 8);
1101
1102        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1103        ina = _mm_shufflelo_epi16(ins, 0xFF);
1104        ina = _mm_shufflehi_epi16(ina, 0xFF);
1105        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1106        outa = _mm_shufflelo_epi16(outs, 0xFF);
1107        outa = _mm_shufflehi_epi16(outa, 0xFF);
1108        t1 = _mm_sub_epi16(all1s, ina);
1109        t1 = _mm_mullo_epi16(t1, outs);
1110        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(outa, ins));
1111        t1 = _mm_srli_epi16(t1, 8);
1112
1113        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1114        ina = _mm_shufflelo_epi16(ins, 0xFF);
1115        ina = _mm_shufflehi_epi16(ina, 0xFF);
1116        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1117        outa = _mm_shufflelo_epi16(outs, 0xFF);
1118        outa = _mm_shufflehi_epi16(outa, 0xFF);
1119        t2 = _mm_sub_epi16(all1s, ina);
1120        t2 = _mm_mullo_epi16(t2, outs);
1121        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(outa, ins));
1122        t2 = _mm_srli_epi16(t2, 8);
1123
1124        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1125        ina = _mm_shufflelo_epi16(ins, 0xFF);
1126        ina = _mm_shufflehi_epi16(ina, 0xFF);
1127        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1128        outa = _mm_shufflelo_epi16(outs, 0xFF);
1129        outa = _mm_shufflehi_epi16(outa, 0xFF);
1130        t3 = _mm_sub_epi16(all1s, ina);
1131        t3 = _mm_mullo_epi16(t3, outs);
1132        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(outa, ins));
1133        t3 = _mm_srli_epi16(t3, 8);
1134
1135        t0 = _mm_packus_epi16(t0, t1);
1136        t0 = blendv_epi8(t0, out0, M0001);
1137        t2 = _mm_packus_epi16(t2, t3);
1138        t2 = blendv_epi8(t2, out1, M0001);
1139        _mm_storeu_si128((__m128i *)dst, t0);
1140        _mm_storeu_si128((__m128i *)dst + 1, t2);
1141
1142        src = (const __m128i *)src + 2;
1143        dst = (__m128i *)dst + 2;
1144    }
1145}
1146
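/* Dst atop: out.rgb = (src.rgb * (255 - dst.a) + dst.rgb * src.a) >> 8;
 * the alpha byte is taken from dst. */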
1147void rsdIntrinsicBlendDstAtop_K(void *dst, const void *src, uint32_t count8) {
1148    const __m128i M0001 = _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000);
1149    __m128i all1s, ina, ins, outa, outs;
1150    __m128i in0, in1, out0, out1;
1151    __m128i t0, t1, t2, t3;
1152    uint32_t i;
1153
1154    all1s = _mm_set1_epi16(255);
1155
1156    for (i = 0; i < count8; ++i) {
1157        in0 = _mm_loadu_si128((const __m128i *)src);
1158        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1159        out0 = _mm_loadu_si128((const __m128i *)dst);
1160        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1161
1162        ins = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1163        ina = _mm_shufflelo_epi16(ins, 0xFF);
1164        ina = _mm_shufflehi_epi16(ina, 0xFF);
1165        outs = _mm_unpacklo_epi8(out0, _mm_setzero_si128());
1166        outa = _mm_shufflelo_epi16(outs, 0xFF);
1167        outa = _mm_shufflehi_epi16(outa, 0xFF);
1168        t0 = _mm_sub_epi16(all1s, outa);
1169        t0 = _mm_mullo_epi16(t0, ins);
1170        t0 = _mm_adds_epu16(t0, _mm_mullo_epi16(ina, outs));
1171        t0 = _mm_srli_epi16(t0, 8);
1172
1173        ins = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1174        ina = _mm_shufflelo_epi16(ins, 0xFF);
1175        ina = _mm_shufflehi_epi16(ina, 0xFF);
1176        outs = _mm_unpackhi_epi8(out0, _mm_setzero_si128());
1177        outa = _mm_shufflelo_epi16(outs, 0xFF);
1178        outa = _mm_shufflehi_epi16(outa, 0xFF);
1179        t1 = _mm_sub_epi16(all1s, outa);
1180        t1 = _mm_mullo_epi16(t1, ins);
1181        t1 = _mm_adds_epu16(t1, _mm_mullo_epi16(ina, outs));
1182        t1 = _mm_srli_epi16(t1, 8);
1183
1184        ins = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1185        ina = _mm_shufflelo_epi16(ins, 0xFF);
1186        ina = _mm_shufflehi_epi16(ina, 0xFF);
1187        outs = _mm_unpacklo_epi8(out1, _mm_setzero_si128());
1188        outa = _mm_shufflelo_epi16(outs, 0xFF);
1189        outa = _mm_shufflehi_epi16(outa, 0xFF);
1190        t2 = _mm_sub_epi16(all1s, outa);
1191        t2 = _mm_mullo_epi16(t2, ins);
1192        t2 = _mm_adds_epu16(t2, _mm_mullo_epi16(ina, outs));
1193        t2 = _mm_srli_epi16(t2, 8);
1194
1195        ins = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1196        ina = _mm_shufflelo_epi16(ins, 0xFF);
1197        ina = _mm_shufflehi_epi16(ina, 0xFF);
1198        outs = _mm_unpackhi_epi8(out1, _mm_setzero_si128());
1199        outa = _mm_shufflelo_epi16(outs, 0xFF);
1200        outa = _mm_shufflehi_epi16(outa, 0xFF);
1201        t3 = _mm_sub_epi16(all1s, outa);
1202        t3 = _mm_mullo_epi16(t3, ins);
1203        t3 = _mm_adds_epu16(t3, _mm_mullo_epi16(ina, outs));
1204        t3 = _mm_srli_epi16(t3, 8);
1205
1206        t0 = _mm_packus_epi16(t0, t1);
1207        t0 = blendv_epi8(t0, out0, M0001);
1208        t2 = _mm_packus_epi16(t2, t3);
1209        t2 = blendv_epi8(t2, out1, M0001);
1210        _mm_storeu_si128((__m128i *)dst, t0);
1211        _mm_storeu_si128((__m128i *)dst + 1, t2);
1212
1213        src = (const __m128i *)src + 2;
1214        dst = (__m128i *)dst + 2;
1215    }
1216}
1217
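/* Xor: out = dst ^ src (bitwise). */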
1218void rsdIntrinsicBlendXor_K(void *dst, const void *src, uint32_t count8) {
1219    __m128i in0, in1, out0, out1;
1220    uint32_t i;
1221
1222    for (i = 0; i < count8; ++i) {
1223        in0 = _mm_loadu_si128((const __m128i *)src);
1224        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1225        out0 = _mm_loadu_si128((const __m128i *)dst);
1226        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1227
1228        out0 = _mm_xor_si128(out0, in0);
1229        out1 = _mm_xor_si128(out1, in1);
1230
1231        _mm_storeu_si128((__m128i *)dst, out0);
1232        _mm_storeu_si128((__m128i *)dst + 1, out1);
1233
1234        src = (const __m128i *)src + 2;
1235        dst = (__m128i *)dst + 2;
1236    }
1237}
1238
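/* Multiply: out = (src * dst) >> 8, per channel. */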
1239void rsdIntrinsicBlendMultiply_K(void *dst, const void *src, uint32_t count8) {
1240    __m128i in0, in1, out0, out1;
1241    __m128i t0, t1, t2, t3;
1242    uint32_t i;
1243
1244    for (i = 0; i < count8; ++i) {
1245        in0 = _mm_loadu_si128((const __m128i *)src);
1246        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1247        out0 = _mm_loadu_si128((const __m128i *)dst);
1248        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1249
1250        t0 = _mm_unpacklo_epi8(in0, _mm_setzero_si128());
1251        t0 = _mm_mullo_epi16(t0, _mm_unpacklo_epi8(out0, _mm_setzero_si128()));
1252        t0 = _mm_srli_epi16(t0, 8);
1253
1254        t1 = _mm_unpackhi_epi8(in0, _mm_setzero_si128());
1255        t1 = _mm_mullo_epi16(t1, _mm_unpackhi_epi8(out0, _mm_setzero_si128()));
1256        t1 = _mm_srli_epi16(t1, 8);
1257
1258        t2 = _mm_unpacklo_epi8(in1, _mm_setzero_si128());
1259        t2 = _mm_mullo_epi16(t2, _mm_unpacklo_epi8(out1, _mm_setzero_si128()));
1260        t2 = _mm_srli_epi16(t2, 8);
1261
1262        t3 = _mm_unpackhi_epi8(in1, _mm_setzero_si128());
1263        t3 = _mm_mullo_epi16(t3, _mm_unpackhi_epi8(out1, _mm_setzero_si128()));
1264        t3 = _mm_srli_epi16(t3, 8);
1265
1266        t0 = _mm_packus_epi16(t0, t1);
1267        t2 = _mm_packus_epi16(t2, t3);
1268        _mm_storeu_si128((__m128i *)dst, t0);
1269        _mm_storeu_si128((__m128i *)dst + 1, t2);
1270
1271        src = (const __m128i *)src + 2;
1272        dst = (__m128i *)dst + 2;
1273    }
1274}
1275
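/* Add: out = saturate(dst + src), per channel. */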
1276void rsdIntrinsicBlendAdd_K(void *dst, const void *src, uint32_t count8) {
1277    __m128i in0, in1, out0, out1;
1278    uint32_t i;
1279
1280    for (i = 0; i < count8; ++i) {
1281        in0 = _mm_loadu_si128((const __m128i *)src);
1282        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1283        out0 = _mm_loadu_si128((const __m128i *)dst);
1284        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1285
1286        out0 = _mm_adds_epu8(out0, in0);
1287        out1 = _mm_adds_epu8(out1, in1);
1288
1289        _mm_storeu_si128((__m128i *)dst, out0);
1290        _mm_storeu_si128((__m128i *)dst + 1, out1);
1291
1292        src = (const __m128i *)src + 2;
1293        dst = (__m128i *)dst + 2;
1294    }
1295}
1296
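/* Subtract: out = saturate(dst - src), per channel. */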
1297void rsdIntrinsicBlendSub_K(void *dst, const void *src, uint32_t count8) {
1298    __m128i in0, in1, out0, out1;
1299    uint32_t i;
1300
1301    for (i = 0; i < count8; ++i) {
1302        in0 = _mm_loadu_si128((const __m128i *)src);
1303        in1 = _mm_loadu_si128((const __m128i *)src + 1);
1304        out0 = _mm_loadu_si128((const __m128i *)dst);
1305        out1 = _mm_loadu_si128((const __m128i *)dst + 1);
1306
1307        out0 = _mm_subs_epu8(out0, in0);
1308        out1 = _mm_subs_epu8(out1, in1);
1309
1310        _mm_storeu_si128((__m128i *)dst, out0);
1311        _mm_storeu_si128((__m128i *)dst + 1, out1);
1312
1313        src = (const __m128i *)src + 2;
1314        dst = (__m128i *)dst + 2;
1315    }
1316}
1317