1/* 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "./vp9_rtcd.h" 12#include "vpx_dsp/x86/inv_txfm_sse2.h" 13#include "vpx_dsp/x86/txfm_common_sse2.h" 14#include "vpx_ports/mem.h" 15 16void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, 17 int tx_type) { 18 __m128i in[2]; 19 const __m128i eight = _mm_set1_epi16(8); 20 21 in[0] = load_input_data(input); 22 in[1] = load_input_data(input + 8); 23 24 switch (tx_type) { 25 case 0: // DCT_DCT 26 idct4_sse2(in); 27 idct4_sse2(in); 28 break; 29 case 1: // ADST_DCT 30 idct4_sse2(in); 31 iadst4_sse2(in); 32 break; 33 case 2: // DCT_ADST 34 iadst4_sse2(in); 35 idct4_sse2(in); 36 break; 37 case 3: // ADST_ADST 38 iadst4_sse2(in); 39 iadst4_sse2(in); 40 break; 41 default: assert(0); break; 42 } 43 44 // Final round and shift 45 in[0] = _mm_add_epi16(in[0], eight); 46 in[1] = _mm_add_epi16(in[1], eight); 47 48 in[0] = _mm_srai_epi16(in[0], 4); 49 in[1] = _mm_srai_epi16(in[1], 4); 50 51 recon_and_store4x4_sse2(in, dest, stride); 52} 53 54void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, 55 int tx_type) { 56 __m128i in[8]; 57 const __m128i zero = _mm_setzero_si128(); 58 const __m128i final_rounding = _mm_set1_epi16(1 << 4); 59 60 // load input data 61 in[0] = load_input_data(input); 62 in[1] = load_input_data(input + 8 * 1); 63 in[2] = load_input_data(input + 8 * 2); 64 in[3] = load_input_data(input + 8 * 3); 65 in[4] = load_input_data(input + 8 * 4); 66 in[5] = load_input_data(input + 8 * 5); 67 in[6] = load_input_data(input + 8 * 6); 68 in[7] = load_input_data(input + 8 * 7); 69 70 switch (tx_type) { 71 case 0: // DCT_DCT 72 idct8_sse2(in); 73 idct8_sse2(in); 74 break; 75 case 1: // ADST_DCT 76 idct8_sse2(in); 77 iadst8_sse2(in); 78 break; 79 case 2: // DCT_ADST 80 iadst8_sse2(in); 81 idct8_sse2(in); 82 break; 83 case 3: // ADST_ADST 84 iadst8_sse2(in); 85 iadst8_sse2(in); 86 break; 87 default: assert(0); break; 88 } 89 90 // Final rounding and shift 91 in[0] = _mm_adds_epi16(in[0], final_rounding); 92 in[1] = _mm_adds_epi16(in[1], final_rounding); 93 in[2] = _mm_adds_epi16(in[2], final_rounding); 94 in[3] = _mm_adds_epi16(in[3], final_rounding); 95 in[4] = _mm_adds_epi16(in[4], final_rounding); 96 in[5] = _mm_adds_epi16(in[5], final_rounding); 97 in[6] = _mm_adds_epi16(in[6], final_rounding); 98 in[7] = _mm_adds_epi16(in[7], final_rounding); 99 100 in[0] = _mm_srai_epi16(in[0], 5); 101 in[1] = _mm_srai_epi16(in[1], 5); 102 in[2] = _mm_srai_epi16(in[2], 5); 103 in[3] = _mm_srai_epi16(in[3], 5); 104 in[4] = _mm_srai_epi16(in[4], 5); 105 in[5] = _mm_srai_epi16(in[5], 5); 106 in[6] = _mm_srai_epi16(in[6], 5); 107 in[7] = _mm_srai_epi16(in[7], 5); 108 109 RECON_AND_STORE(dest + 0 * stride, in[0]); 110 RECON_AND_STORE(dest + 1 * stride, in[1]); 111 RECON_AND_STORE(dest + 2 * stride, in[2]); 112 RECON_AND_STORE(dest + 3 * stride, in[3]); 113 RECON_AND_STORE(dest + 4 * stride, in[4]); 114 RECON_AND_STORE(dest + 5 * stride, in[5]); 115 RECON_AND_STORE(dest + 6 * stride, in[6]); 116 RECON_AND_STORE(dest + 7 * stride, in[7]); 117} 118 119void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, 120 int stride, int tx_type) { 121 __m128i in0[16], in1[16]; 122 123 load_buffer_8x16(input, in0); 124 input += 8; 125 load_buffer_8x16(input, in1); 126 127 switch (tx_type) { 128 case 0: // DCT_DCT 129 idct16_sse2(in0, in1); 130 idct16_sse2(in0, in1); 131 break; 132 case 1: // ADST_DCT 133 idct16_sse2(in0, in1); 134 iadst16_sse2(in0, in1); 135 break; 136 case 2: // DCT_ADST 137 iadst16_sse2(in0, in1); 138 idct16_sse2(in0, in1); 139 break; 140 case 3: // ADST_ADST 141 iadst16_sse2(in0, in1); 142 iadst16_sse2(in0, in1); 143 break; 144 default: assert(0); break; 145 } 146 147 write_buffer_8x16(dest, in0, stride); 148 dest += 8; 149 write_buffer_8x16(dest, in1, stride); 150} 151