10a39d0a697ff3603e8c100300fda363658e10b23James Zern/* 20a39d0a697ff3603e8c100300fda363658e10b23James Zern * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 30a39d0a697ff3603e8c100300fda363658e10b23James Zern * 40a39d0a697ff3603e8c100300fda363658e10b23James Zern * Use of this source code is governed by a BSD-style license 50a39d0a697ff3603e8c100300fda363658e10b23James Zern * that can be found in the LICENSE file in the root of the source 60a39d0a697ff3603e8c100300fda363658e10b23James Zern * tree. An additional intellectual property rights grant can be found 70a39d0a697ff3603e8c100300fda363658e10b23James Zern * in the file PATENTS. All contributing project authors may 80a39d0a697ff3603e8c100300fda363658e10b23James Zern * be found in the AUTHORS file in the root of the source tree. 90a39d0a697ff3603e8c100300fda363658e10b23James Zern */ 100a39d0a697ff3603e8c100300fda363658e10b23James Zern 110a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "./vpx_dsp_rtcd.h" 120a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" 130a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/inv_txfm_sse2.h" 140a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/transpose_sse2.h" 150a39d0a697ff3603e8c100300fda363658e10b23James Zern#include "vpx_dsp/x86/txfm_common_sse2.h" 160a39d0a697ff3603e8c100300fda363658e10b23James Zern 170a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, 180a39d0a697ff3603e8c100300fda363658e10b23James Zern int stride, int bd) { 190a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t out[16 * 16]; 200a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t *outptr = out; 210a39d0a697ff3603e8c100300fda363658e10b23James Zern int i, j, test; 220a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i inptr[32]; 230a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i min_input, max_input, temp1, temp2, sign_bits; 240a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i zero = _mm_set1_epi16(0); 250a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i rounding = _mm_set1_epi16(32); 260a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i max = _mm_set1_epi16(3155); 270a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i min = _mm_set1_epi16(-3155); 280a39d0a697ff3603e8c100300fda363658e10b23James Zern int optimised_cols = 0; 290a39d0a697ff3603e8c100300fda363658e10b23James Zern 300a39d0a697ff3603e8c100300fda363658e10b23James Zern // Load input into __m128i & pack to 16 bits 310a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; i++) { 320a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); 330a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); 340a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_packs_epi32(temp1, temp2); 350a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); 360a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); 370a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 380a39d0a697ff3603e8c100300fda363658e10b23James Zern } 390a39d0a697ff3603e8c100300fda363658e10b23James Zern 400a39d0a697ff3603e8c100300fda363658e10b23James Zern // Find the min & max for the row transform 410a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(inptr[0], inptr[1]); 420a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(inptr[0], inptr[1]); 430a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i < 32; i++) { 440a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(max_input, inptr[i]); 450a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(min_input, inptr[i]); 460a39d0a697ff3603e8c100300fda363658e10b23James Zern } 470a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_cmpgt_epi16(max_input, max); 480a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_cmplt_epi16(min_input, min); 490a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_or_si128(max_input, min_input); 500a39d0a697ff3603e8c100300fda363658e10b23James Zern test = _mm_movemask_epi8(temp1); 510a39d0a697ff3603e8c100300fda363658e10b23James Zern 520a39d0a697ff3603e8c100300fda363658e10b23James Zern if (!test) { 530a39d0a697ff3603e8c100300fda363658e10b23James Zern // Do the row transform 540a39d0a697ff3603e8c100300fda363658e10b23James Zern idct16_sse2(inptr, inptr + 16); 550a39d0a697ff3603e8c100300fda363658e10b23James Zern 560a39d0a697ff3603e8c100300fda363658e10b23James Zern // Find the min & max for the column transform 570a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(inptr[0], inptr[1]); 580a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(inptr[0], inptr[1]); 590a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i < 32; i++) { 600a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(max_input, inptr[i]); 610a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(min_input, inptr[i]); 620a39d0a697ff3603e8c100300fda363658e10b23James Zern } 630a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_cmpgt_epi16(max_input, max); 640a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_cmplt_epi16(min_input, min); 650a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_or_si128(max_input, min_input); 660a39d0a697ff3603e8c100300fda363658e10b23James Zern test = _mm_movemask_epi8(temp1); 670a39d0a697ff3603e8c100300fda363658e10b23James Zern 680a39d0a697ff3603e8c100300fda363658e10b23James Zern if (test) { 690a39d0a697ff3603e8c100300fda363658e10b23James Zern array_transpose_16x16(inptr, inptr + 16); 700a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; i++) { 710a39d0a697ff3603e8c100300fda363658e10b23James Zern sign_bits = _mm_cmplt_epi16(inptr[i], zero); 720a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); 730a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); 740a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); 750a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); 760a39d0a697ff3603e8c100300fda363658e10b23James Zern sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); 770a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); 780a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); 790a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 800a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 810a39d0a697ff3603e8c100300fda363658e10b23James Zern } 820a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 830a39d0a697ff3603e8c100300fda363658e10b23James Zern // Set to use the optimised transform for the column 840a39d0a697ff3603e8c100300fda363658e10b23James Zern optimised_cols = 1; 850a39d0a697ff3603e8c100300fda363658e10b23James Zern } 860a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 870a39d0a697ff3603e8c100300fda363658e10b23James Zern // Run the un-optimised row transform 880a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; ++i) { 890a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_idct16_c(input, outptr, bd); 900a39d0a697ff3603e8c100300fda363658e10b23James Zern input += 16; 910a39d0a697ff3603e8c100300fda363658e10b23James Zern outptr += 16; 920a39d0a697ff3603e8c100300fda363658e10b23James Zern } 930a39d0a697ff3603e8c100300fda363658e10b23James Zern } 940a39d0a697ff3603e8c100300fda363658e10b23James Zern 950a39d0a697ff3603e8c100300fda363658e10b23James Zern if (optimised_cols) { 960a39d0a697ff3603e8c100300fda363658e10b23James Zern idct16_sse2(inptr, inptr + 16); 970a39d0a697ff3603e8c100300fda363658e10b23James Zern 980a39d0a697ff3603e8c100300fda363658e10b23James Zern // Final round & shift and Reconstruction and Store 990a39d0a697ff3603e8c100300fda363658e10b23James Zern { 1000a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i d[2]; 1010a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; i++) { 1020a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_add_epi16(inptr[i], rounding); 1030a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); 1040a39d0a697ff3603e8c100300fda363658e10b23James Zern d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); 1050a39d0a697ff3603e8c100300fda363658e10b23James Zern d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); 1060a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_srai_epi16(inptr[i], 6); 1070a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); 1080a39d0a697ff3603e8c100300fda363658e10b23James Zern d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); 1090a39d0a697ff3603e8c100300fda363658e10b23James Zern d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); 1100a39d0a697ff3603e8c100300fda363658e10b23James Zern // Store 1110a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); 1120a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); 1130a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1140a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1150a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 1160a39d0a697ff3603e8c100300fda363658e10b23James Zern // Run the un-optimised column transform 1170a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t temp_in[16], temp_out[16]; 1180a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; ++i) { 1190a39d0a697ff3603e8c100300fda363658e10b23James Zern for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 1200a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_idct16_c(temp_in, temp_out, bd); 1210a39d0a697ff3603e8c100300fda363658e10b23James Zern for (j = 0; j < 16; ++j) { 1220a39d0a697ff3603e8c100300fda363658e10b23James Zern dest[j * stride + i] = highbd_clip_pixel_add( 1230a39d0a697ff3603e8c100300fda363658e10b23James Zern dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 1240a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1250a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1260a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1270a39d0a697ff3603e8c100300fda363658e10b23James Zern} 1280a39d0a697ff3603e8c100300fda363658e10b23James Zern 1290a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, 1300a39d0a697ff3603e8c100300fda363658e10b23James Zern int stride, int bd) { 1310a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t out[16 * 16] = { 0 }; 1320a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t *outptr = out; 1330a39d0a697ff3603e8c100300fda363658e10b23James Zern int i, j, test; 1340a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i inptr[32]; 1350a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i min_input, max_input, temp1, temp2, sign_bits; 1360a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i zero = _mm_set1_epi16(0); 1370a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i rounding = _mm_set1_epi16(32); 1380a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i max = _mm_set1_epi16(3155); 1390a39d0a697ff3603e8c100300fda363658e10b23James Zern const __m128i min = _mm_set1_epi16(-3155); 1400a39d0a697ff3603e8c100300fda363658e10b23James Zern int optimised_cols = 0; 1410a39d0a697ff3603e8c100300fda363658e10b23James Zern 1420a39d0a697ff3603e8c100300fda363658e10b23James Zern // Load input into __m128i & pack to 16 bits 1430a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; i++) { 1440a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); 1450a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); 1460a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_packs_epi32(temp1, temp2); 1470a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); 1480a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); 1490a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_packs_epi32(temp1, temp2); 1500a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1510a39d0a697ff3603e8c100300fda363658e10b23James Zern 1520a39d0a697ff3603e8c100300fda363658e10b23James Zern // Find the min & max for the row transform 1530a39d0a697ff3603e8c100300fda363658e10b23James Zern // Since all non-zero dct coefficients are in upper-left 4x4 area, 1540a39d0a697ff3603e8c100300fda363658e10b23James Zern // we only need to consider first 4 rows here. 1550a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(inptr[0], inptr[1]); 1560a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(inptr[0], inptr[1]); 1570a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i < 4; i++) { 1580a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(max_input, inptr[i]); 1590a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(min_input, inptr[i]); 1600a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1610a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_cmpgt_epi16(max_input, max); 1620a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_cmplt_epi16(min_input, min); 1630a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_or_si128(max_input, min_input); 1640a39d0a697ff3603e8c100300fda363658e10b23James Zern test = _mm_movemask_epi8(temp1); 1650a39d0a697ff3603e8c100300fda363658e10b23James Zern 1660a39d0a697ff3603e8c100300fda363658e10b23James Zern if (!test) { 1670a39d0a697ff3603e8c100300fda363658e10b23James Zern // Do the row transform (N.B. This transposes inptr) 1680a39d0a697ff3603e8c100300fda363658e10b23James Zern idct16_sse2(inptr, inptr + 16); 1690a39d0a697ff3603e8c100300fda363658e10b23James Zern 1700a39d0a697ff3603e8c100300fda363658e10b23James Zern // Find the min & max for the column transform 1710a39d0a697ff3603e8c100300fda363658e10b23James Zern // N.B. Only first 4 cols contain non-zero coeffs 1720a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(inptr[0], inptr[1]); 1730a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(inptr[0], inptr[1]); 1740a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 2; i < 16; i++) { 1750a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_max_epi16(max_input, inptr[i]); 1760a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_min_epi16(min_input, inptr[i]); 1770a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1780a39d0a697ff3603e8c100300fda363658e10b23James Zern max_input = _mm_cmpgt_epi16(max_input, max); 1790a39d0a697ff3603e8c100300fda363658e10b23James Zern min_input = _mm_cmplt_epi16(min_input, min); 1800a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_or_si128(max_input, min_input); 1810a39d0a697ff3603e8c100300fda363658e10b23James Zern test = _mm_movemask_epi8(temp1); 1820a39d0a697ff3603e8c100300fda363658e10b23James Zern 1830a39d0a697ff3603e8c100300fda363658e10b23James Zern if (test) { 1840a39d0a697ff3603e8c100300fda363658e10b23James Zern // Use fact only first 4 rows contain non-zero coeffs 1850a39d0a697ff3603e8c100300fda363658e10b23James Zern array_transpose_8x8(inptr, inptr); 1860a39d0a697ff3603e8c100300fda363658e10b23James Zern array_transpose_8x8(inptr + 8, inptr + 16); 1870a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 4; i++) { 1880a39d0a697ff3603e8c100300fda363658e10b23James Zern sign_bits = _mm_cmplt_epi16(inptr[i], zero); 1890a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); 1900a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); 1910a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); 1920a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); 1930a39d0a697ff3603e8c100300fda363658e10b23James Zern sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); 1940a39d0a697ff3603e8c100300fda363658e10b23James Zern temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); 1950a39d0a697ff3603e8c100300fda363658e10b23James Zern temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); 1960a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); 1970a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); 1980a39d0a697ff3603e8c100300fda363658e10b23James Zern } 1990a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 2000a39d0a697ff3603e8c100300fda363658e10b23James Zern // Set to use the optimised transform for the column 2010a39d0a697ff3603e8c100300fda363658e10b23James Zern optimised_cols = 1; 2020a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2030a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 2040a39d0a697ff3603e8c100300fda363658e10b23James Zern // Run the un-optimised row transform 2050a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 4; ++i) { 2060a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_idct16_c(input, outptr, bd); 2070a39d0a697ff3603e8c100300fda363658e10b23James Zern input += 16; 2080a39d0a697ff3603e8c100300fda363658e10b23James Zern outptr += 16; 2090a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2100a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2110a39d0a697ff3603e8c100300fda363658e10b23James Zern 2120a39d0a697ff3603e8c100300fda363658e10b23James Zern if (optimised_cols) { 2130a39d0a697ff3603e8c100300fda363658e10b23James Zern idct16_sse2(inptr, inptr + 16); 2140a39d0a697ff3603e8c100300fda363658e10b23James Zern 2150a39d0a697ff3603e8c100300fda363658e10b23James Zern // Final round & shift and Reconstruction and Store 2160a39d0a697ff3603e8c100300fda363658e10b23James Zern { 2170a39d0a697ff3603e8c100300fda363658e10b23James Zern __m128i d[2]; 2180a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; i++) { 2190a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_add_epi16(inptr[i], rounding); 2200a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); 2210a39d0a697ff3603e8c100300fda363658e10b23James Zern d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); 2220a39d0a697ff3603e8c100300fda363658e10b23James Zern d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); 2230a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i] = _mm_srai_epi16(inptr[i], 6); 2240a39d0a697ff3603e8c100300fda363658e10b23James Zern inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); 2250a39d0a697ff3603e8c100300fda363658e10b23James Zern d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); 2260a39d0a697ff3603e8c100300fda363658e10b23James Zern d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); 2270a39d0a697ff3603e8c100300fda363658e10b23James Zern // Store 2280a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); 2290a39d0a697ff3603e8c100300fda363658e10b23James Zern _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); 2300a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2310a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2320a39d0a697ff3603e8c100300fda363658e10b23James Zern } else { 2330a39d0a697ff3603e8c100300fda363658e10b23James Zern // Run the un-optimised column transform 2340a39d0a697ff3603e8c100300fda363658e10b23James Zern tran_low_t temp_in[16], temp_out[16]; 2350a39d0a697ff3603e8c100300fda363658e10b23James Zern for (i = 0; i < 16; ++i) { 2360a39d0a697ff3603e8c100300fda363658e10b23James Zern for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 2370a39d0a697ff3603e8c100300fda363658e10b23James Zern vpx_highbd_idct16_c(temp_in, temp_out, bd); 2380a39d0a697ff3603e8c100300fda363658e10b23James Zern for (j = 0; j < 16; ++j) { 2390a39d0a697ff3603e8c100300fda363658e10b23James Zern dest[j * stride + i] = highbd_clip_pixel_add( 2400a39d0a697ff3603e8c100300fda363658e10b23James Zern dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2410a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2420a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2430a39d0a697ff3603e8c100300fda363658e10b23James Zern } 2440a39d0a697ff3603e8c100300fda363658e10b23James Zern} 2450a39d0a697ff3603e8c100300fda363658e10b23James Zern 2460a39d0a697ff3603e8c100300fda363658e10b23James Zernvoid vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, 2470a39d0a697ff3603e8c100300fda363658e10b23James Zern int stride, int bd) { 2480a39d0a697ff3603e8c100300fda363658e10b23James Zern highbd_idct_1_add_kernel(input, dest, stride, bd, 16); 2490a39d0a697ff3603e8c100300fda363658e10b23James Zern} 250