1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21*  ihevcd_frm_cvt_x86_intr.c
22*
23* @brief
24*  Platform specific intrinsic implementation of certain functions
25*
26* @author
27*  Ittiam
28* @par List of Functions:
29*  - ihevcd_itrans_recon_dc
30*  - ihevcd_fmt_conv_420sp_to_420p
31*
32* @remarks
33*  None
34*
35*******************************************************************************
36*/
37#include "string.h"
38#include "ihevc_typedefs.h"
39#include "ihevc_defs.h"
40#include "ihevc_macros.h"
41#include "ihevc_platform_macros.h"
42#include "ihevcd_function_selector.h"
43#include <string.h>
44#include <immintrin.h>
45
46
47void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
48                                         UWORD8 *pu1_uv_src,
49                                         UWORD8 *pu1_y_dst,
50                                         UWORD8 *pu1_u_dst,
51                                         UWORD8 *pu1_v_dst,
52                                         WORD32 wd,
53                                         WORD32 ht,
54                                         WORD32 src_y_strd,
55                                         WORD32 src_uv_strd,
56                                         WORD32 dst_y_strd,
57                                         WORD32 dst_uv_strd,
58                                         WORD32 is_u_first,
59                                         WORD32 disable_luma_copy)
60{
61    UWORD8 *pu1_src, *pu1_dst;
62    UWORD8 *pu1_u_src, *pu1_v_src;
63    WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows;
64    WORD32 i, j;
65
66    cols = 0;
67    pu1_u_src = (UWORD8 *)pu1_uv_src;
68    pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
69    if(0 == disable_luma_copy)
70    {
71        /* copy luma */
72        pu1_src = (UWORD8 *)pu1_y_src;
73        pu1_dst = (UWORD8 *)pu1_y_dst;
74
75        num_rows = ht;
76        num_cols = wd;
77
78        src_strd = src_y_strd;
79        dst_strd = dst_y_strd;
80        for(i = 0; i < num_rows; i++)
81        {
82            memcpy(pu1_dst, pu1_src, num_cols);
83            pu1_dst += dst_strd;
84            pu1_src += src_strd;
85        }
86    }
87
88    /* de-interleave U and V and copy to destination */
89    if(!is_u_first)
90    {
91        UWORD8 *temp = pu1_u_dst;
92        pu1_u_dst = pu1_v_dst;
93        pu1_v_dst = temp;
94
95        pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
96        pu1_v_src = (UWORD8 *)pu1_uv_src;
97    }
98
99    {
100        __m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b;
101        __m128i temp0_8x16b, temp1_8x16b, alt_first_mask;
102
103        UWORD8 FIRST_ALT_SHUFFLE[16] = {
104            0x00, 0x02, 0x04, 0x06,
105            0x08, 0x0A, 0x0C, 0x0E,
106            0x01, 0x03, 0x05, 0x07,
107            0x09, 0x0B, 0x0D, 0x0F };
108
109        PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0)
110        PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0)
111        PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0)
112        PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0)
113        PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0)
114        PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0)
115        PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0)
116        PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0)
117
118        num_rows = ht >> 1;
119        num_cols = wd >> 1;
120
121        src_strd = src_uv_strd;
122        dst_strd = dst_uv_strd;
123
124        alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]);
125
126        if(num_cols > 15)
127        {
128            cols = num_cols >> 4;
129
130            for(i = 0; i < (num_rows >> 2); i++)
131            {
132                UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
133
134                PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0)
135                PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0)
136                PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0)
137                PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0)
138
139                pu1_uv_src_temp = pu1_uv_src;
140                pu1_u_dst_temp =  pu1_u_dst;
141                pu1_v_dst_temp =  pu1_v_dst;
142
143                for(j = 0; j < cols; j++)
144                {
145
146                    /**** Row 0 ***/
147                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
148                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
149
150                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
151                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
152
153                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
154                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
155
156                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
157                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
158
159                    /**** Row 1 ***/
160                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd)));
161                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16));
162
163                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
164                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
165
166                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
167                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
168
169                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b);
170                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b);
171
172                    /**** Row 2 ***/
173                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd)));
174                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16));
175
176                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
177                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
178
179                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
180                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
181
182                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b);
183                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b);
184
185                    /**** Row 3 ***/
186                    src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd)));
187                    src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16));
188
189                    temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
190                    temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
191
192                    src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
193                    src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
194
195                    _mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b);
196                    _mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b);
197
198                    pu1_u_dst_temp += 16;
199                    pu1_v_dst_temp += 16;
200                    pu1_uv_src_temp += 32;
201                }
202
203                pu1_u_dst += 4 * dst_strd;
204                pu1_v_dst += 4 * dst_strd;
205                pu1_uv_src += 4 * src_strd;
206                //pu1_v_src += src_strd;
207            }
208            rows = num_rows & 0x3;
209            if(rows)
210            {
211                for(i = 0; i < rows; i++)
212                {
213                    UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
214
215                    pu1_uv_src_temp = pu1_uv_src;
216                    pu1_u_dst_temp =  pu1_u_dst;
217                    pu1_v_dst_temp =  pu1_v_dst;
218
219                    for(j = 0; j < cols; j++)
220                    {
221
222                        src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
223                        src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
224
225                        temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
226                        temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
227
228                        src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
229                        src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
230
231                        _mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
232                        _mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
233
234                        pu1_u_dst_temp += 16;
235                        pu1_v_dst_temp += 16;
236                        pu1_uv_src_temp += 32;
237                    }
238
239                    pu1_u_dst += dst_strd;
240                    pu1_v_dst += dst_strd;
241                    pu1_uv_src += src_strd;
242                }
243            }
244            pu1_u_dst -= (num_rows * dst_strd);
245            pu1_v_dst -= (num_rows * dst_strd);
246            num_cols &= 0x0F;
247        }
248        if(num_cols)
249        {
250            pu1_u_dst += (cols << 4);
251            pu1_v_dst += (cols << 4);
252            pu1_u_src += 2 * (cols << 4);
253            pu1_v_src += 2 * (cols << 4);
254            for(i = 0; i < num_rows; i++)
255            {
256                for(j = 0; j < num_cols; j++)
257                {
258                    pu1_u_dst[j] = pu1_u_src[j * 2];
259                    pu1_v_dst[j] = pu1_v_src[j * 2];
260                }
261
262                pu1_u_dst += dst_strd;
263                pu1_v_dst += dst_strd;
264                pu1_u_src += src_strd;
265                pu1_v_src += src_strd;
266            }
267        }
268    }
269    return;
270}
271