1/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21*  ihevc_chroma_intra_pred_filters_x86_intr.c
22*
23* @brief
24*  Contains function Definition for intra prediction  interpolation filters
25*
26*
27* @author
28*  Ittiam
29*
30* @par List of Functions:
31*  ihevc_intra_pred_chroma_planar_sse42()
32*
33*  ihevc_intra_pred_chroma_dc_sse42()
34*
35* @remarks
36*  None
37*
38*******************************************************************************
39*/
40
41
42/*****************************************************************************/
43/* File Includes                                                             */
44/*****************************************************************************/
45
46#include "ihevc_typedefs.h"
47#include "ihevc_macros.h"
48#include "ihevc_func_selector.h"
49#include "ihevc_platform_macros.h"
50#include "ihevc_intra_pred.h"
51#include "ihevc_chroma_intra_pred.h"
52#include "ihevc_common_tables.h"
53#include "ihevc_tables_x86_intr.h"
54
55#include <mmintrin.h>
56#include <xmmintrin.h>
57#include <emmintrin.h>
58#include <smmintrin.h>
59#include <immintrin.h>
60
61
62/****************************************************************************/
63/* Constant Macros                                                          */
64/****************************************************************************/
65#define MAX_CU_SIZE 64
66#define BIT_DEPTH 8
67#define T32_4NT 128
68#define T16_4NT 64
69#define T16C_4NT 64
70#define T8C_4NT 32
71/****************************************************************************/
72/* Function Macros                                                          */
73/****************************************************************************/
74
75#define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x)
76
77/* tables to shuffle 8-bit values */
78
79/*****************************************************************************/
80/* Function Definition                                                      */
81/*****************************************************************************/
82
83
84
85/**
86*******************************************************************************
87*
88* @brief
89*  Planar Intraprediction with reference neighboring samples location
90* pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
91* to section 8.4.4.2.4 in the standard
92*
93* @par Description:
94*
95*
96* @param[in] pu1_src
97*  UWORD8 pointer to the source
98*
99* @param[in] pu1_dst
100*  UWORD8 pointer to the destination
101*
102* @param[in] src_strd
103*  integer source stride
104*
105* @param[in] dst_strd
106*  integer destination stride
107*
108* @param[in] nt
109*  integer Transform Block size
110*
111* @param[in] mode
112*  integer intraprediction mode
113*
114* @returns
115*
116* @remarks
117*  None
118*
119*******************************************************************************
120*/
121
122void ihevc_intra_pred_chroma_planar_sse42(UWORD8 *pu1_ref,
123                                          WORD32 src_strd,
124                                          UWORD8 *pu1_dst,
125                                          WORD32 dst_strd,
126                                          WORD32 nt,
127                                          WORD32 mode)
128{
129
130    WORD32 row, col;
131    WORD32 log2nt = 5;
132    WORD32 two_nt, three_nt;
133
134    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
135    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
136    UNUSED(src_strd);
137    UNUSED(mode);
138
139    switch(nt)
140    {
141        case 16:
142            log2nt = 4;
143            break;
144        case 8:
145            log2nt = 3;
146            break;
147        case 4:
148            log2nt = 2;
149            break;
150        default:
151            break;
152    }
153    two_nt = 2 * nt;
154    three_nt = 3 * nt;
155
156    /* Planar filtering */
157
158/* setting vallues in  registera*/
159
160//  pu1_ref[2*(two_nt - 1 - row)]
161//  pu1_ref[2 * (three_nt + 1)]
162//  pu1_ref[2 * (two_nt + 1) + col]
163//  pu1_ref[2 * (nt - 1)]
164
165    const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
166                                      pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
167                                      pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
168
169    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
170                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
171
172    const_temp4_4x32b = _mm_set1_epi16(nt - 1);
173    const_temp6_4x32b = _mm_set1_epi16(nt);
174    const_temp7_4x32b = _mm_set1_epi16(4);
175
176    zero_8x16b = _mm_set1_epi32(0);
177
178    if(nt % 4 == 0)
179    {
180        const_temp7_4x32b = _mm_set1_epi16(4);
181
182        for(row = 0; row < nt; row++)
183        {
184            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
185            __m128i res_temp3_8x16b;
186
187            const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
188                                               pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
189                                               pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
190
191            const_temp3_4x32b  = _mm_set1_epi16((row + 1));
192            row_8x16b = _mm_set1_epi16((nt - 1 - row));
193
194            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
195            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
196
197            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
198
199            /*(row + 1) * pu1_ref[nt - 1]*/
200            res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
201
202            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
203            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
204
205            for(col = 0; col < 2 * nt; col += 8)
206            {
207                __m128i src_temp_8x16b;
208
209                /* loding 8bit 16 pixles*/
210                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
211
212                src_temp_8x16b =  _mm_cvtepu8_epi16(src_temp_8x16b); /* row=0*/
213
214                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
215                res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
216
217                /*(col + 1) * pu1_ref[three_nt + 1]*/
218                res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
219
220                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
221                res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
222
223                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
224                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
225                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
226
227                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
228                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
229
230                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
231
232                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
233                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
234            } /* inner loop ends here */
235        }
236    }
237}
238
239/**
240*******************************************************************************
241*
242* @brief
243*  Intraprediction for DC mode with reference neighboring  samples location
244* pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
245* to section 8.4.4.2.5 in the standard
246*
247* @par Description:
248*
249*
250* @param[in] pu1_src
251*  UWORD8 pointer to the source
252*
253* @param[in] pu1_dst
254*  UWORD8 pointer to the destination
255*
256* @param[in] src_strd
257*  integer source stride
258*
259* @param[in] dst_strd
260*  integer destination stride
261*
262* @param[in] nt
263*  integer Transform Block size (Chroma)
264*
265* @param[in] mode
266*  integer intraprediction mode
267*
268* @returns
269*
270* @remarks
271*  None
272*
273*******************************************************************************
274*/
275
276void ihevc_intra_pred_chroma_dc_sse42(UWORD8 *pu1_ref,
277                                      WORD32 src_strd,
278                                      UWORD8 *pu1_dst,
279                                      WORD32 dst_strd,
280                                      WORD32 nt,
281                                      WORD32 mode)
282{
283
284    WORD32 acc_dc_u, acc_dc_v;
285    WORD32 dc_val_u, dc_val_v;
286    WORD32 row;
287    WORD32 log2nt = 5;
288    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
289    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
290    __m128i m_zero = _mm_set1_epi32(0);
291    UNUSED(src_strd);
292    UNUSED(mode);
293
294    switch(nt)
295    {
296        case 32:
297            log2nt = 5;
298            break;
299        case 16:
300            log2nt = 4;
301            break;
302        case 8:
303            log2nt = 3;
304            break;
305        case 4:
306            log2nt = 2;
307            break;
308        default:
309            break;
310    }
311
312    acc_dc_u = 0;
313    acc_dc_v = 0;
314
315    /* Calculate DC value for the transform block */
316
317    m_mask = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
318
319    if(nt == 16)
320    {
321        __m128i temp_sad;
322
323        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
324        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
325        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
326        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
327
328        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
329        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
330        src_temp9 =  _mm_cvtepu8_epi16(src_temp7);
331        src_temp10 =  _mm_cvtepu8_epi16(src_temp8);
332
333        src_temp3 = _mm_srli_si128(src_temp3, 8);
334        src_temp4 = _mm_srli_si128(src_temp4, 8);
335        src_temp7 = _mm_srli_si128(src_temp7, 8);
336        src_temp8 = _mm_srli_si128(src_temp8, 8);
337
338        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
339        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
340        src_temp7 =  _mm_cvtepu8_epi16(src_temp7);
341        src_temp8 =  _mm_cvtepu8_epi16(src_temp8);
342
343        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
344        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
345        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
346        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
347
348        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
349        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
350
351        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
352        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
353        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
354        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
355
356        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
357        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
358        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
359        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
360    }
361
362    else if(nt == 8)
363    {
364        __m128i temp_sad;
365        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
366        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
367
368        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
369        src_temp6 =  _mm_cvtepu8_epi16(src_temp4);
370
371        src_temp3 = _mm_srli_si128(src_temp3, 8);
372        src_temp4 = _mm_srli_si128(src_temp4, 8);
373
374        src_temp3 =  _mm_cvtepu8_epi16(src_temp3);
375        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
376
377        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
378        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
379
380        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
381        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
382        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
383        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
384
385        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
386        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
387        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
388        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
389    }
390
391    else if(nt == 4)
392    {
393        __m128i temp_sad;
394        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
395
396        src_temp5 =  _mm_cvtepu8_epi16(src_temp3);
397        src_temp4 = _mm_srli_si128(src_temp3, 8);
398        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
399
400        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
401
402        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
403        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
404        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
405
406        src_temp4 = _mm_cvtepi16_epi32(src_temp4);
407        temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
408        acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
409        acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
410    }
411
412
413    acc_dc_u += pu1_ref[6 * nt];
414    acc_dc_v += pu1_ref[6 * nt + 1];
415
416    acc_dc_u -= pu1_ref[4 * nt];
417    acc_dc_v -= pu1_ref[4 * nt + 1];
418
419    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
420    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
421
422    dc_val_u = dc_val_u | (dc_val_v << 8);
423
424    /* Fill the remaining rows with DC value*/
425
426    if(nt == 4)
427    {
428        src_temp1 = _mm_set1_epi16(dc_val_u);
429
430        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
431        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
432        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
433        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
434        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
435
436    }
437    else if(nt == 8)
438    {
439        src_temp1 = _mm_set1_epi16(dc_val_u);
440
441        /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
442        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
443        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
444        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
445        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
446
447        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
448        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
449        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
450        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
451
452    }
453
454    else /* nt == 16 */
455    {
456
457        src_temp1 = _mm_set1_epi16(dc_val_u);
458
459        for(row = 0; row < nt; row += 8)
460        {
461            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
462            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
463            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
464            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
465            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
466            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
467            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
468            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
469            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
470
471            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
472            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
473            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
474            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
475            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
476            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
477            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
478            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
479
480            pu1_dst += 8 * dst_strd;
481        }
482
483
484    }
485
486}
487