/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
100a39d0a697ff3603e8c100300fda363658e10b23James Zern
110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h"
120a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/inv_txfm_sse2.h"
140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/transpose_sse2.h"
150a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/txfm_common_sse2.h"
160a39d0a697ff3603e8c100300fda363658e10b23James Zern
170a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest,
180a39d0a697ff3603e8c100300fda363658e10b23James Zern                                       int stride, int bd) {
190a39d0a697ff3603e8c100300fda363658e10b23James Zern  tran_low_t out[16 * 16];
200a39d0a697ff3603e8c100300fda363658e10b23James Zern  tran_low_t *outptr = out;
210a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i, j, test;
220a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i inptr[32];
230a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i min_input, max_input, temp1, temp2, sign_bits;
240a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i zero = _mm_set1_epi16(0);
250a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i rounding = _mm_set1_epi16(32);
260a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i max = _mm_set1_epi16(3155);
270a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i min = _mm_set1_epi16(-3155);
280a39d0a697ff3603e8c100300fda363658e10b23James Zern  int optimised_cols = 0;
290a39d0a697ff3603e8c100300fda363658e10b23James Zern
300a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Load input into __m128i & pack to 16 bits
310a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 0; i < 16; i++) {
320a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
330a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
340a39d0a697ff3603e8c100300fda363658e10b23James Zern    inptr[i] = _mm_packs_epi32(temp1, temp2);
350a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
360a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
370a39d0a697ff3603e8c100300fda363658e10b23James Zern    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
380a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
390a39d0a697ff3603e8c100300fda363658e10b23James Zern
400a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Find the min & max for the row transform
410a39d0a697ff3603e8c100300fda363658e10b23James Zern  max_input = _mm_max_epi16(inptr[0], inptr[1]);
420a39d0a697ff3603e8c100300fda363658e10b23James Zern  min_input = _mm_min_epi16(inptr[0], inptr[1]);
430a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 2; i < 32; i++) {
440a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_max_epi16(max_input, inptr[i]);
450a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_min_epi16(min_input, inptr[i]);
460a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
470a39d0a697ff3603e8c100300fda363658e10b23James Zern  max_input = _mm_cmpgt_epi16(max_input, max);
480a39d0a697ff3603e8c100300fda363658e10b23James Zern  min_input = _mm_cmplt_epi16(min_input, min);
490a39d0a697ff3603e8c100300fda363658e10b23James Zern  temp1 = _mm_or_si128(max_input, min_input);
500a39d0a697ff3603e8c100300fda363658e10b23James Zern  test = _mm_movemask_epi8(temp1);
510a39d0a697ff3603e8c100300fda363658e10b23James Zern
520a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (!test) {
530a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Do the row transform
540a39d0a697ff3603e8c100300fda363658e10b23James Zern    idct16_sse2(inptr, inptr + 16);
550a39d0a697ff3603e8c100300fda363658e10b23James Zern
560a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Find the min & max for the column transform
570a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_max_epi16(inptr[0], inptr[1]);
580a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_min_epi16(inptr[0], inptr[1]);
590a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 2; i < 32; i++) {
600a39d0a697ff3603e8c100300fda363658e10b23James Zern      max_input = _mm_max_epi16(max_input, inptr[i]);
610a39d0a697ff3603e8c100300fda363658e10b23James Zern      min_input = _mm_min_epi16(min_input, inptr[i]);
620a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
630a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_cmpgt_epi16(max_input, max);
640a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_cmplt_epi16(min_input, min);
650a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_or_si128(max_input, min_input);
660a39d0a697ff3603e8c100300fda363658e10b23James Zern    test = _mm_movemask_epi8(temp1);
670a39d0a697ff3603e8c100300fda363658e10b23James Zern
680a39d0a697ff3603e8c100300fda363658e10b23James Zern    if (test) {
690a39d0a697ff3603e8c100300fda363658e10b23James Zern      array_transpose_16x16(inptr, inptr + 16);
700a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (i = 0; i < 16; i++) {
710a39d0a697ff3603e8c100300fda363658e10b23James Zern        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
720a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
730a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
740a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
750a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
760a39d0a697ff3603e8c100300fda363658e10b23James Zern        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
770a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
780a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
790a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
800a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
810a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
820a39d0a697ff3603e8c100300fda363658e10b23James Zern    } else {
830a39d0a697ff3603e8c100300fda363658e10b23James Zern      // Set to use the optimised transform for the column
840a39d0a697ff3603e8c100300fda363658e10b23James Zern      optimised_cols = 1;
850a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
860a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
870a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Run the un-optimised row transform
880a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 16; ++i) {
890a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_highbd_idct16_c(input, outptr, bd);
900a39d0a697ff3603e8c100300fda363658e10b23James Zern      input += 16;
910a39d0a697ff3603e8c100300fda363658e10b23James Zern      outptr += 16;
920a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
930a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
940a39d0a697ff3603e8c100300fda363658e10b23James Zern
950a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (optimised_cols) {
960a39d0a697ff3603e8c100300fda363658e10b23James Zern    idct16_sse2(inptr, inptr + 16);
970a39d0a697ff3603e8c100300fda363658e10b23James Zern
980a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Final round & shift and Reconstruction and Store
990a39d0a697ff3603e8c100300fda363658e10b23James Zern    {
1000a39d0a697ff3603e8c100300fda363658e10b23James Zern      __m128i d[2];
1010a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (i = 0; i < 16; i++) {
1020a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i] = _mm_add_epi16(inptr[i], rounding);
1030a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
1040a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
1050a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
1060a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i] = _mm_srai_epi16(inptr[i], 6);
1070a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
1080a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
1090a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
1100a39d0a697ff3603e8c100300fda363658e10b23James Zern        // Store
1110a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
1120a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
1130a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
1140a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
1150a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
1160a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Run the un-optimised column transform
1170a39d0a697ff3603e8c100300fda363658e10b23James Zern    tran_low_t temp_in[16], temp_out[16];
1180a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 16; ++i) {
1190a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
1200a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_highbd_idct16_c(temp_in, temp_out, bd);
1210a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (j = 0; j < 16; ++j) {
1220a39d0a697ff3603e8c100300fda363658e10b23James Zern        dest[j * stride + i] = highbd_clip_pixel_add(
1230a39d0a697ff3603e8c100300fda363658e10b23James Zern            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
1240a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
1250a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
1260a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
1270a39d0a697ff3603e8c100300fda363658e10b23James Zern}
1280a39d0a697ff3603e8c100300fda363658e10b23James Zern
1290a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest,
1300a39d0a697ff3603e8c100300fda363658e10b23James Zern                                      int stride, int bd) {
1310a39d0a697ff3603e8c100300fda363658e10b23James Zern  tran_low_t out[16 * 16] = { 0 };
1320a39d0a697ff3603e8c100300fda363658e10b23James Zern  tran_low_t *outptr = out;
1330a39d0a697ff3603e8c100300fda363658e10b23James Zern  int i, j, test;
1340a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i inptr[32];
1350a39d0a697ff3603e8c100300fda363658e10b23James Zern  __m128i min_input, max_input, temp1, temp2, sign_bits;
1360a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i zero = _mm_set1_epi16(0);
1370a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i rounding = _mm_set1_epi16(32);
1380a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i max = _mm_set1_epi16(3155);
1390a39d0a697ff3603e8c100300fda363658e10b23James Zern  const __m128i min = _mm_set1_epi16(-3155);
1400a39d0a697ff3603e8c100300fda363658e10b23James Zern  int optimised_cols = 0;
1410a39d0a697ff3603e8c100300fda363658e10b23James Zern
1420a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Load input into __m128i & pack to 16 bits
1430a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 0; i < 16; i++) {
1440a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i));
1450a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4));
1460a39d0a697ff3603e8c100300fda363658e10b23James Zern    inptr[i] = _mm_packs_epi32(temp1, temp2);
1470a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8));
1480a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12));
1490a39d0a697ff3603e8c100300fda363658e10b23James Zern    inptr[i + 16] = _mm_packs_epi32(temp1, temp2);
1500a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
1510a39d0a697ff3603e8c100300fda363658e10b23James Zern
1520a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Find the min & max for the row transform
1530a39d0a697ff3603e8c100300fda363658e10b23James Zern  // Since all non-zero dct coefficients are in upper-left 4x4 area,
1540a39d0a697ff3603e8c100300fda363658e10b23James Zern  // we only need to consider first 4 rows here.
1550a39d0a697ff3603e8c100300fda363658e10b23James Zern  max_input = _mm_max_epi16(inptr[0], inptr[1]);
1560a39d0a697ff3603e8c100300fda363658e10b23James Zern  min_input = _mm_min_epi16(inptr[0], inptr[1]);
1570a39d0a697ff3603e8c100300fda363658e10b23James Zern  for (i = 2; i < 4; i++) {
1580a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_max_epi16(max_input, inptr[i]);
1590a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_min_epi16(min_input, inptr[i]);
1600a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
1610a39d0a697ff3603e8c100300fda363658e10b23James Zern  max_input = _mm_cmpgt_epi16(max_input, max);
1620a39d0a697ff3603e8c100300fda363658e10b23James Zern  min_input = _mm_cmplt_epi16(min_input, min);
1630a39d0a697ff3603e8c100300fda363658e10b23James Zern  temp1 = _mm_or_si128(max_input, min_input);
1640a39d0a697ff3603e8c100300fda363658e10b23James Zern  test = _mm_movemask_epi8(temp1);
1650a39d0a697ff3603e8c100300fda363658e10b23James Zern
1660a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (!test) {
1670a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Do the row transform (N.B. This transposes inptr)
1680a39d0a697ff3603e8c100300fda363658e10b23James Zern    idct16_sse2(inptr, inptr + 16);
1690a39d0a697ff3603e8c100300fda363658e10b23James Zern
1700a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Find the min & max for the column transform
1710a39d0a697ff3603e8c100300fda363658e10b23James Zern    // N.B. Only first 4 cols contain non-zero coeffs
1720a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_max_epi16(inptr[0], inptr[1]);
1730a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_min_epi16(inptr[0], inptr[1]);
1740a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 2; i < 16; i++) {
1750a39d0a697ff3603e8c100300fda363658e10b23James Zern      max_input = _mm_max_epi16(max_input, inptr[i]);
1760a39d0a697ff3603e8c100300fda363658e10b23James Zern      min_input = _mm_min_epi16(min_input, inptr[i]);
1770a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
1780a39d0a697ff3603e8c100300fda363658e10b23James Zern    max_input = _mm_cmpgt_epi16(max_input, max);
1790a39d0a697ff3603e8c100300fda363658e10b23James Zern    min_input = _mm_cmplt_epi16(min_input, min);
1800a39d0a697ff3603e8c100300fda363658e10b23James Zern    temp1 = _mm_or_si128(max_input, min_input);
1810a39d0a697ff3603e8c100300fda363658e10b23James Zern    test = _mm_movemask_epi8(temp1);
1820a39d0a697ff3603e8c100300fda363658e10b23James Zern
1830a39d0a697ff3603e8c100300fda363658e10b23James Zern    if (test) {
1840a39d0a697ff3603e8c100300fda363658e10b23James Zern      // Use fact only first 4 rows contain non-zero coeffs
1850a39d0a697ff3603e8c100300fda363658e10b23James Zern      array_transpose_8x8(inptr, inptr);
1860a39d0a697ff3603e8c100300fda363658e10b23James Zern      array_transpose_8x8(inptr + 8, inptr + 16);
1870a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (i = 0; i < 4; i++) {
1880a39d0a697ff3603e8c100300fda363658e10b23James Zern        sign_bits = _mm_cmplt_epi16(inptr[i], zero);
1890a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits);
1900a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits);
1910a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1);
1920a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2);
1930a39d0a697ff3603e8c100300fda363658e10b23James Zern        sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero);
1940a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits);
1950a39d0a697ff3603e8c100300fda363658e10b23James Zern        temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits);
1960a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1);
1970a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2);
1980a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
1990a39d0a697ff3603e8c100300fda363658e10b23James Zern    } else {
2000a39d0a697ff3603e8c100300fda363658e10b23James Zern      // Set to use the optimised transform for the column
2010a39d0a697ff3603e8c100300fda363658e10b23James Zern      optimised_cols = 1;
2020a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
2030a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
2040a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Run the un-optimised row transform
2050a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 4; ++i) {
2060a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_highbd_idct16_c(input, outptr, bd);
2070a39d0a697ff3603e8c100300fda363658e10b23James Zern      input += 16;
2080a39d0a697ff3603e8c100300fda363658e10b23James Zern      outptr += 16;
2090a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
2100a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
2110a39d0a697ff3603e8c100300fda363658e10b23James Zern
2120a39d0a697ff3603e8c100300fda363658e10b23James Zern  if (optimised_cols) {
2130a39d0a697ff3603e8c100300fda363658e10b23James Zern    idct16_sse2(inptr, inptr + 16);
2140a39d0a697ff3603e8c100300fda363658e10b23James Zern
2150a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Final round & shift and Reconstruction and Store
2160a39d0a697ff3603e8c100300fda363658e10b23James Zern    {
2170a39d0a697ff3603e8c100300fda363658e10b23James Zern      __m128i d[2];
2180a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (i = 0; i < 16; i++) {
2190a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i] = _mm_add_epi16(inptr[i], rounding);
2200a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding);
2210a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i));
2220a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8));
2230a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i] = _mm_srai_epi16(inptr[i], 6);
2240a39d0a697ff3603e8c100300fda363658e10b23James Zern        inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6);
2250a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd);
2260a39d0a697ff3603e8c100300fda363658e10b23James Zern        d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd);
2270a39d0a697ff3603e8c100300fda363658e10b23James Zern        // Store
2280a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]);
2290a39d0a697ff3603e8c100300fda363658e10b23James Zern        _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]);
2300a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
2310a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
2320a39d0a697ff3603e8c100300fda363658e10b23James Zern  } else {
2330a39d0a697ff3603e8c100300fda363658e10b23James Zern    // Run the un-optimised column transform
2340a39d0a697ff3603e8c100300fda363658e10b23James Zern    tran_low_t temp_in[16], temp_out[16];
2350a39d0a697ff3603e8c100300fda363658e10b23James Zern    for (i = 0; i < 16; ++i) {
2360a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
2370a39d0a697ff3603e8c100300fda363658e10b23James Zern      vpx_highbd_idct16_c(temp_in, temp_out, bd);
2380a39d0a697ff3603e8c100300fda363658e10b23James Zern      for (j = 0; j < 16; ++j) {
2390a39d0a697ff3603e8c100300fda363658e10b23James Zern        dest[j * stride + i] = highbd_clip_pixel_add(
2400a39d0a697ff3603e8c100300fda363658e10b23James Zern            dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2410a39d0a697ff3603e8c100300fda363658e10b23James Zern      }
2420a39d0a697ff3603e8c100300fda363658e10b23James Zern    }
2430a39d0a697ff3603e8c100300fda363658e10b23James Zern  }
2440a39d0a697ff3603e8c100300fda363658e10b23James Zern}
2450a39d0a697ff3603e8c100300fda363658e10b23James Zern
2460a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
2470a39d0a697ff3603e8c100300fda363658e10b23James Zern                                     int stride, int bd) {
2480a39d0a697ff3603e8c100300fda363658e10b23James Zern  highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
2490a39d0a697ff3603e8c100300fda363658e10b23James Zern}
250