1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21*  ihevcd_it_rec_dc_x86_intr.c
22*
23* @brief
24*  Platform specific intrinsic implementation of certain functions
25*
26* @author
27*  Ittiam
* @par List of Functions:
*  - ihevcd_itrans_recon_dc_luma_sse42
*  - ihevcd_itrans_recon_dc_chroma_sse42
31*
32* @remarks
33*  None
34*
35*******************************************************************************
36*/
37
#include <string.h>

#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <immintrin.h>
45
46
47void ihevcd_itrans_recon_dc_luma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
48                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
49{
50    __m128i m_temp_reg_0;
51    __m128i m_temp_reg_1;
52    __m128i m_temp_reg_2;
53    __m128i m_temp_reg_3;
54    __m128i m_temp_reg_4;
55    __m128i m_temp_reg_5;
56    __m128i m_temp_reg_6;
57    __m128i m_temp_reg_7;
58    __m128i m_temp_reg_8;
59    __m128i m_temp_reg_9;
60    __m128i m_temp_reg_10;
61    __m128i m_temp_reg_11;
62    __m128i m_temp_reg_12;
63    __m128i m_temp_reg_13;
64    __m128i m_temp_reg_14;
65    __m128i m_temp_reg_15;
66    __m128i m_temp_reg_20, zero_8x16b;
67    __m128i *pi4_dst = (__m128i *)pu1_dst;
68
69
70    //WORD32 row,col;
71    WORD32 add, shift;
72    WORD32 dc_value, quant_out;
73    WORD32 trans_size;
74
75
76
77
78    trans_size = (1 << log2_trans_size);
79
80    quant_out = i2_coeff_value;
81
82    shift = IT_SHIFT_STAGE_1;
83    add = 1 << (shift - 1);
84    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
85    shift = IT_SHIFT_STAGE_2;
86    add = 1 << (shift - 1);
87    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
88
89    /*Replicate the DC value within 16 bits in 128 bit register*/
90    m_temp_reg_20 = _mm_set1_epi16(dc_value);
91    zero_8x16b = _mm_setzero_si128();
92
93    if(trans_size == 4)
94    {
95        WORD32 *pi4_dst = (WORD32 *)pu1_dst;
96
97        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
98        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
99        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
100        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
101
102        m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_0, m_temp_reg_1);
103        m_temp_reg_5 = _mm_unpacklo_epi32(m_temp_reg_2, m_temp_reg_3);
104
105        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
106        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
107
108        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
109        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
110
111        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
112
113
114        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_8);
115        m_temp_reg_1 = _mm_srli_si128(m_temp_reg_8, 4);
116        m_temp_reg_2 = _mm_srli_si128(m_temp_reg_8, 8);
117        m_temp_reg_3 = _mm_srli_si128(m_temp_reg_8, 12);
118        pu1_dst += dst_strd;
119        pi4_dst = (WORD32 *)(pu1_dst);
120
121        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1);
122        pu1_dst += dst_strd;
123        pi4_dst = (WORD32 *)(pu1_dst);
124
125        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2);
126        pu1_dst += dst_strd;
127        pi4_dst = (WORD32 *)(pu1_dst);
128
129        *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3);
130    }
131    else
132    {
133        WORD32 i, j;
134
135        for(i = 1; i <= trans_size; i += 4)
136        {
137            for(j = 1; j <= trans_size; j += 8)
138            {
139
140                m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
141                m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
142                m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
143                m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
144
145
146                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_0, zero_8x16b);
147                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_1, zero_8x16b);
148                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_2, zero_8x16b);
149                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_3, zero_8x16b);
150
151                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
152                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
153                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
154                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
155
156                pi4_dst = (__m128i *)(pu1_dst);
157
158                m_temp_reg_12 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
159                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
160
161                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
162
163                m_temp_reg_13 = _mm_srli_si128(m_temp_reg_12, 8);
164                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
165
166                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
167
168                m_temp_reg_14 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
169                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
170
171                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
172
173                m_temp_reg_15 = _mm_srli_si128(m_temp_reg_14, 8);
174                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
175
176                pu1_pred += 8;
177                pu1_dst += 8;
178            }
179            pu1_pred += 4 * pred_strd - trans_size;
180            pu1_dst += 4 * dst_strd - trans_size;
181        }
182    }
183
184
185}
186
187void ihevcd_itrans_recon_dc_chroma_sse42(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
188                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
189{
190    __m128i m_temp_reg_0;
191    __m128i m_temp_reg_1;
192    __m128i m_temp_reg_2;
193    __m128i m_temp_reg_3;
194    __m128i m_temp_reg_4;
195    __m128i m_temp_reg_5;
196    __m128i m_temp_reg_6;
197    __m128i m_temp_reg_7;
198    __m128i m_temp_reg_8;
199    __m128i m_temp_reg_9;
200    __m128i m_temp_reg_10;
201    __m128i m_temp_reg_11;
202    __m128i m_temp_reg_12;
203    __m128i m_temp_reg_13;
204    __m128i m_temp_reg_14;
205    __m128i m_temp_reg_15;
206    __m128i m_temp_reg_20, zero_8x16b;
207    __m128i *pi4_dst = (__m128i *)pu1_dst;
208
209
210    //WORD32 row,col;
211    WORD32 add, shift;
212    WORD32 dc_value, quant_out;
213    WORD32 trans_size;
214
215
216    WORD32 shuffle_mask_4x4 = 0x06040200;
217    WORD32 unchanged_mask_4x4 = 0x07050301;
218    LWORD64 shuffle_mask = 0x0E0C0A0806040200LL;
219    LWORD64 unchanged_mask = 0x0F0D0B0907050301LL;
220
221    trans_size = (1 << log2_trans_size);
222
223    quant_out = i2_coeff_value;
224
225    shift = IT_SHIFT_STAGE_1;
226    add = 1 << (shift - 1);
227    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
228    shift = IT_SHIFT_STAGE_2;
229    add = 1 << (shift - 1);
230    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);
231
232    /*Replicate the DC value within 16 bits in 128 bit register*/
233    m_temp_reg_20 = _mm_set1_epi16(dc_value);
234    zero_8x16b = _mm_setzero_si128();
235
236    if(trans_size == 4)
237    {
238        __m128i chroma_shuffle_mask_16x8b;
239        __m128i chroma_unchanged_mask_16x8b;
240        chroma_shuffle_mask_16x8b = _mm_cvtsi32_si128(shuffle_mask_4x4);
241        chroma_unchanged_mask_16x8b = _mm_cvtsi32_si128(unchanged_mask_4x4);
242
243        /*Load the prediction data*/
244        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)(pu1_pred));
245        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
246        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
247        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));
248
249        m_temp_reg_10  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
250        m_temp_reg_11  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
251        m_temp_reg_12  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
252        m_temp_reg_13  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
253
254        m_temp_reg_14 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
255        m_temp_reg_15 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
256
257        m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_14, zero_8x16b);
258        m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_15, zero_8x16b);
259
260        m_temp_reg_6 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
261        m_temp_reg_7 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
262
263        /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
264        m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_dst);
265        m_temp_reg_1 = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
266        m_temp_reg_2 = _mm_loadl_epi64((__m128i *)(pu1_dst + 2 * dst_strd));
267        m_temp_reg_3 = _mm_loadl_epi64((__m128i *)(pu1_dst + 3 * dst_strd));
268
269        m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
270        m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
271        m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
272        m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
273
274
275        m_temp_reg_8 = _mm_packus_epi16(m_temp_reg_6, m_temp_reg_7);
276        m_temp_reg_9 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_0);
277        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
278        m_temp_reg_10 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_1);
279        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
280        m_temp_reg_11 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_2);
281        m_temp_reg_8 = _mm_srli_si128(m_temp_reg_8, 4);
282        m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_8, m_temp_reg_3);
283
284        /*Store the result in the destination*/
285        _mm_storel_epi64(pi4_dst, m_temp_reg_9);
286        pu1_dst += dst_strd;
287        pi4_dst = (__m128i *)(pu1_dst);
288
289
290        _mm_storel_epi64(pi4_dst, m_temp_reg_10);
291        pu1_dst += dst_strd;
292        pi4_dst = (__m128i *)(pu1_dst);
293
294        _mm_storel_epi64(pi4_dst, m_temp_reg_11);
295        pu1_dst += dst_strd;
296        pi4_dst = (__m128i *)(pu1_dst);
297
298        _mm_storel_epi64(pi4_dst, m_temp_reg_12);
299    }
300    else
301    {
302        WORD32 i, j;
303        __m128i chroma_shuffle_mask_16x8b;
304        __m128i chroma_unchanged_mask_16x8b;
305        chroma_shuffle_mask_16x8b = _mm_loadl_epi64((__m128i *)(&shuffle_mask));
306        chroma_unchanged_mask_16x8b =
307                        _mm_loadl_epi64((__m128i *)(&unchanged_mask));
308
309        for(i = 0; i < trans_size; i += 4)
310        {
311            for(j = 0; j < trans_size; j += 8)
312            {
313
314                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_pred);
315                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_pred + pred_strd));
316                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_pred + 2 * pred_strd));
317                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_pred + 3 * pred_strd));
318
319                /*Retain only one chroma component*/
320                m_temp_reg_4  = _mm_shuffle_epi8(m_temp_reg_0, chroma_shuffle_mask_16x8b);
321                m_temp_reg_5  = _mm_shuffle_epi8(m_temp_reg_1, chroma_shuffle_mask_16x8b);
322                m_temp_reg_6  = _mm_shuffle_epi8(m_temp_reg_2, chroma_shuffle_mask_16x8b);
323                m_temp_reg_7  = _mm_shuffle_epi8(m_temp_reg_3, chroma_shuffle_mask_16x8b);
324
325                m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, zero_8x16b);
326                m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, zero_8x16b);
327                m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, zero_8x16b);
328                m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, zero_8x16b);
329
330                m_temp_reg_8 = _mm_add_epi16(m_temp_reg_4, m_temp_reg_20);
331                m_temp_reg_9 = _mm_add_epi16(m_temp_reg_5, m_temp_reg_20);
332                m_temp_reg_10 = _mm_add_epi16(m_temp_reg_6, m_temp_reg_20);
333                m_temp_reg_11 = _mm_add_epi16(m_temp_reg_7, m_temp_reg_20);
334
335
336                /*Load the recon data to make sure that 'v' is not corrupted when 'u' is called and vice versa*/
337                m_temp_reg_0 = _mm_loadu_si128((__m128i *)pu1_dst);
338                m_temp_reg_1 = _mm_loadu_si128((__m128i *)(pu1_dst + dst_strd));
339                m_temp_reg_2 = _mm_loadu_si128((__m128i *)(pu1_dst + 2 * dst_strd));
340                m_temp_reg_3 = _mm_loadu_si128((__m128i *)(pu1_dst + 3 * dst_strd));
341
342                m_temp_reg_0  = _mm_shuffle_epi8(m_temp_reg_0, chroma_unchanged_mask_16x8b);
343                m_temp_reg_1  = _mm_shuffle_epi8(m_temp_reg_1, chroma_unchanged_mask_16x8b);
344                m_temp_reg_2  = _mm_shuffle_epi8(m_temp_reg_2, chroma_unchanged_mask_16x8b);
345                m_temp_reg_3  = _mm_shuffle_epi8(m_temp_reg_3, chroma_unchanged_mask_16x8b);
346
347                m_temp_reg_4 = _mm_packus_epi16(m_temp_reg_8, m_temp_reg_9);
348                m_temp_reg_5 = _mm_packus_epi16(m_temp_reg_10, m_temp_reg_11);
349
350                m_temp_reg_12 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_0);
351                m_temp_reg_4 = _mm_srli_si128(m_temp_reg_4, 8);
352                m_temp_reg_13 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_1);
353
354                m_temp_reg_14 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_2);
355                m_temp_reg_5 = _mm_srli_si128(m_temp_reg_5, 8);
356                m_temp_reg_15 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_3);
357
358                /*Store the result in the destination*/
359                pi4_dst = (__m128i *)(pu1_dst);
360
361                _mm_storel_epi64(pi4_dst, m_temp_reg_12);
362                m_temp_reg_8 = _mm_srli_si128(m_temp_reg_12, 8);
363
364                pi4_dst = (__m128i *)(pu1_dst + 8);
365                _mm_storel_epi64(pi4_dst, m_temp_reg_8);
366
367                pi4_dst = (__m128i *)(pu1_dst + dst_strd);
368
369                _mm_storel_epi64(pi4_dst, m_temp_reg_13);
370                m_temp_reg_9 = _mm_srli_si128(m_temp_reg_13, 8);
371
372                pi4_dst = (__m128i *)(pu1_dst + dst_strd + 8);
373                _mm_storel_epi64(pi4_dst, m_temp_reg_9);
374
375                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd);
376
377                _mm_storel_epi64(pi4_dst, m_temp_reg_14);
378                m_temp_reg_10 = _mm_srli_si128(m_temp_reg_14, 8);
379
380                pi4_dst = (__m128i *)(pu1_dst + 2 * dst_strd + 8);
381                _mm_storel_epi64(pi4_dst, m_temp_reg_10);
382
383                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd);
384
385                _mm_storel_epi64(pi4_dst, m_temp_reg_15);
386                m_temp_reg_11 = _mm_srli_si128(m_temp_reg_15, 8);
387
388                pi4_dst = (__m128i *)(pu1_dst + 3 * dst_strd + 8);
389                _mm_storel_epi64(pi4_dst, m_temp_reg_11);
390
391                pu1_pred += 16;
392                pu1_dst += 16;
393            }
394
395            pu1_pred += 4 * pred_strd - 2 * trans_size;
396            pu1_dst += 4 * dst_strd - 2 * trans_size;
397        }
398    }
399
400
401}
402