1f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh/* 2f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * Copyright (C) 2010-2011 Intel Corporation 3f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * 4f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * Licensed under the Apache License, Version 2.0 (the "License"); 5f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * you may not use this file except in compliance with the License. 6f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * You may obtain a copy of the License at 7f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * 8f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * http://www.apache.org/licenses/LICENSE-2.0 9f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * 10f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * Unless required by applicable law or agreed to in writing, software 11f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * distributed under the License is distributed on an "AS IS" BASIS, 12f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * See the License for the specific language governing permissions and 14f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * limitations under the License. 15f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh */ 16f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 17f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define JPEG_INTERNALS 18f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jinclude.h" 19f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jpeglib.h" 20f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jdct.h" /* Private declarations for DCT subsystem */ 21f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 22f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#ifdef ANDROID_INTELSSE2_IDCT 23f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include <emmintrin.h> 24f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 25f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#if DCTSIZE != 8 26f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */ 27f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#endif 28f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 29f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define BITS_INV_ACC 4 30f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define SHIFT_INV_ROW 12 31f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define SHIFT_INV_COL 5 32f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehconst short RND_INV_ROW = 2048; 33f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehconst short RND_INV_COL = 16; 34f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehconst short RND_INV_CORR = 15; 35f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 36f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_one_corr[8] = {1,1,1,1,1,1,1,1}; 37f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_round_inv_row[8] = {2048,0,2048,0,2048,0,2048,0}; 38f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_round_inv_col[8] = {16,16,16,16,16,16,16,16}; 39f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] = {15,15,15,15,15,15,15,15}; 40f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 41f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036}; 42f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146}; 43f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746}; 44f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195}; 45f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 46f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) jpeg_adjust[8] = {128, 128, 128, 128, 128, 128, 128, 128}; 47f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 48f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh// Table for rows 0,4 49f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = { 50f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh16384, 21407, 16384, 8867, 51f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh16384, -8867, 16384, -21407, 52f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh16384, 8867, -16384, -21407, 53f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh-16384, 21407, 16384, -8867, 54f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh22725, 19266, 19266, -4520, 55f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh12873, -22725, 4520, -12873, 56f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh12873, 4520, -22725, -12873, 57f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh4520, 19266, 19266, -22725 58f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh}; 59f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 60f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh// Table for rows 1,7 61f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = { 62f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh22725, 29692, 22725, 12299, 63f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh22725, -12299, 22725, -29692, 64f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh22725, 12299, -22725, -29692, 65f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh-22725, 29692, 22725, -12299, 66f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh31521, 26722, 26722, -6270, 67f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh17855, -31521, 6270, -17855, 68f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh17855, 6270, -31521, -17855, 69f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh6270, 26722, 26722, -31521 70f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh}; 71f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 72f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh// Table for rows 2,6 73f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = { 74f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh21407, 27969, 21407, 11585, 75f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh21407, -11585, 21407, -27969, 76f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh21407, 11585, -21407, -27969, 77f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh-21407, 27969, 21407, -11585, 78f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh29692, 25172, 25172, -5906, 79f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh16819, -29692, 5906, -16819, 80f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh16819, 5906, -29692, -16819, 81f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh5906, 25172, 25172, -29692 82f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh}; 83f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 84f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh// Table for rows 3,5 85f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehstatic const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = { 86f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh19266, 25172, 19266, 10426, 87f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh19266, -10426, 19266, -25172, 88f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh19266, 10426, -19266, -25172, 89f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh-19266, 25172, 19266, -10426, 90f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh26722, 22654, 22654, -5315, 91f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh15137, -26722, 5315, -15137, 92f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh15137, 5315, -26722, -15137, 93f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh5315, 22654, 22654, -26722 94f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh}; 95f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 96f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 97f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh/* 98f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * Perform dequantization and inverse DCT on one block of coefficients by SSE. 99f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh */ 100f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 101f897702251443b531b2ded93df71ffd87fbae076Andrew HsiehGLOBAL(void) 102f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehjpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr, 103f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh JCOEFPTR coef_block, 104f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh JSAMPARRAY output_buf, JDIMENSION output_col) 105f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh{ 106f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7; 107f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh int ctr; 108f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh JSAMPROW outptrTemp; 109f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh JSAMPLE *range_limit = IDCT_range_limit(cinfo); 110f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2]; 111f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2]; 112f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2]; 113f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i x0, x1, x2, x3, x4, x5, x6, x7; 114f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i* tg3, *tg1, *tg2, *cos4; 115f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65; 116f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i t0, t1, t2, t3, t4, t5, t6, t7; 117f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh __m128i temp, temp2; 118f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh short * wsptr; 119f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh unsigned char * outptr; 120f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 121f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define iDCT_8_2ROWs(table1, table2) \ 122f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_shufflelo_epi16(row0, 0xD8); /*x7, x6, x5, x4, x3, x1, x2, x0*/ \ 123f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_shufflelo_epi16(row2, 0xD8); \ 124f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp1 = _mm_shuffle_epi32(row0, 0); /*x2, x0, x2, x0, x2, x0, x2, x0*/ \ 125f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp5 = _mm_shuffle_epi32(row2, 0); \ 126f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh \ 127f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp3 = _mm_shuffle_epi32(row0, 0x55); /*x3, x1, x3, x1, x3, x1, x3, x1*/ \ 128f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp7 = _mm_shuffle_epi32(row2, 0x55); \ 129f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_shufflehi_epi16(row0, 0xD8); /*x7, x5, x6, x4, x3, x1, x2, x0*/ \ 130f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_shufflehi_epi16(row2, 0xD8); \ 131f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh \ 132f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp1 = _mm_madd_epi16(tmp1, * ( __m128i*)table1); /*x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0*/ \ 133f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp5 = _mm_madd_epi16(tmp5, * ( __m128i*)table2); \ 134f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh \ 135f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = _mm_shuffle_epi32(row0, 0xAA); /*x6, x4, x6, x4, x6, x4, x6, x4*/ \ 136f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = _mm_shuffle_epi32(row2, 0xAA); \ 137f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_shuffle_epi32(row0, 0xFF); /*x7, x5, x7, x5, x7, x5, x7, x5*/ \ 138f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_shuffle_epi32(row2, 0xFF); \ 139f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 140f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp3 = _mm_madd_epi16(tmp3, * ( __m128i*)(table1+16)); /*x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16*/ \ 141f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp7 = _mm_madd_epi16(tmp7, * ( __m128i*)(table2+16) ); \ 142f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_madd_epi16(row0, * ( __m128i*)(table1+24)); /*x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18*/ \ 143f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_madd_epi16(row2, * ( __m128i*)(table2+24) ); \ 144f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = _mm_madd_epi16(tmp2, * ( __m128i*)(table1+8) ); /*x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2*/ \ 145f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = _mm_madd_epi16(tmp6, * ( __m128i*)(table2+8) ); \ 146f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh \ 147f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp1 = _mm_add_epi32(tmp1, * ( __m128i*)M128_round_inv_row); \ 148f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp5 = _mm_add_epi32(tmp5, * ( __m128i*)M128_round_inv_row); \ 149f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_add_epi32(row0, tmp3); /*b3, b2, b1, b0*/ \ 150f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_add_epi32(row2, tmp7); \ 151f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp1 = _mm_add_epi32(tmp1, tmp2); /*a3, a2, a1, a0*/ \ 152f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp5 = _mm_add_epi32(tmp5, tmp6); \ 153f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh \ 154f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = tmp1; \ 155f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = tmp5; \ 156f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = _mm_sub_epi32(tmp2, row0); /*for row0. y4= a3-b3, y5=a2-b2, y6=a1-b1, y7=a0-b0 */ \ 157f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = _mm_sub_epi32(tmp6, row2); \ 158f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_add_epi32(row0, tmp1); /*y3=a3+b3,y2=a2+b2,y1=a1+b1,y0=a0+b0*/ \ 159f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_add_epi32(row2, tmp5); \ 160f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW); \ 161f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW); \ 162f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW); \ 163f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW); \ 164f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp2 = _mm_shuffle_epi32(tmp2, 0x1B); /*y7, y6, y5, y4*/ \ 165f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tmp6 = _mm_shuffle_epi32(tmp6, 0x1B); \ 166f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_packs_epi32(row0, tmp2); /*row0 = y7,y6,y5,y4,y3,y2,y1,y0*/ \ 167f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_packs_epi32(row2, tmp6); /*row2 = y7,...y0*/ 168f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 169f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 170f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define iDCT_8_COL() \ 171f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x3 = _mm_load_si128(( __m128i*)(wsptr+24));\ 172f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x1 = _mm_load_si128(( __m128i*)(wsptr+8));\ 173f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x5 = row0;\ 174f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x7 = row2;\ 175f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 176f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tg3 = ( __m128i*)(M128_tg_3_16);\ 177f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tg1 = ( __m128i*)(M128_tg_1_16);\ 178f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tg2 = ( __m128i*)(M128_tg_2_16);\ 179f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh cos4 =(__m128i*)(M128_cos_4_16);\ 180f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 181f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_mulhi_epi16(x5, *tg3); /*row5*tg3*/ \ 182f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_mulhi_epi16(x3, *tg3);\ 183f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(temp, x5); /*coef adjustment*/ \ 184f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(temp2, x3);\ 185f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm765 = _mm_adds_epi16(temp, x3);\ 186f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm465 = _mm_subs_epi16(x5, temp2);\ 187f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 188f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_mulhi_epi16(x7, *tg1); /*row7*tg1*/ \ 189f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_mulhi_epi16(x1, *tg1);\ 190f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tp765 = _mm_adds_epi16(temp, x1);\ 191f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tp465 = _mm_subs_epi16(temp2, x7); /*row1*tg1 - row7*/ \ 192f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 193f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t7 = _mm_adds_epi16(tp765, tm765);\ 194f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t7 = _mm_adds_epi16(t7, *( __m128i*)M128_one_corr);\ 195f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tp65 = _mm_subs_epi16(tp765, tm765);\ 196f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t4 = _mm_adds_epi16(tp465, tm465);\ 197f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm65 = _mm_subs_epi16(tp465, tm465);\ 198f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm65 = _mm_adds_epi16(tm65, *( __m128i*)M128_one_corr);\ 199f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 200f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x0 = _mm_load_si128(( __m128i*)(wsptr));\ 201f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x4 = _mm_load_si128(( __m128i*)(wsptr+32));\ 202f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x2 = _mm_load_si128(( __m128i*)(wsptr+16));\ 203f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh x6 = _mm_load_si128(( __m128i*)(wsptr+48));\ 204f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 205f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh /*t6 = ( tp65 + tm65 ) * cos_4_16;*/ \ 206f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(tp65, tm65);\ 207f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_subs_epi16(tp65, tm65);\ 208f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t6 = _mm_mulhi_epi16(temp, *cos4);\ 209f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t5 = _mm_mulhi_epi16(temp2, *cos4);\ 210f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t6 = _mm_adds_epi16(t6, temp);\ 211f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t6 = _mm_or_si128(t6, *( __m128i*)M128_one_corr);\ 212f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t5 = _mm_adds_epi16(t5, temp2);\ 213f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t5 = _mm_or_si128(t5, *( __m128i*)M128_one_corr);\ 214f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 215f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tp03 = _mm_adds_epi16(x0, x4);\ 216f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tp12 = _mm_subs_epi16(x0, x4);\ 217f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 218f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_mulhi_epi16(x6, *tg2);\ 219f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_mulhi_epi16(x2, *tg2);\ 220f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm03 = _mm_adds_epi16(temp, x2);\ 221f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh tm12 = _mm_subs_epi16(temp2, x6);\ 222f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 223f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t0 = _mm_adds_epi16(tp03, tm03);\ 224f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t0 = _mm_adds_epi16(t0, *( __m128i*)M128_round_inv_col);\ 225f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t3 = _mm_subs_epi16(tp03, tm03);\ 226f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t3 = _mm_adds_epi16(t3, *( __m128i*)M128_round_inv_corr);\ 227f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t1 = _mm_adds_epi16(tp12, tm12);\ 228f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t1 = _mm_adds_epi16(t1, *( __m128i*)M128_round_inv_col);\ 229f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t2 = _mm_subs_epi16(tp12, tm12);\ 230f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh t2 = _mm_adds_epi16(t2, *( __m128i*)M128_round_inv_corr);\ 231f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 232f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(t0, t7); /*y0*/ \ 233f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(t1, t6); /*y1*/ \ 234f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ 235f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ 236f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust); /*Add 128 for jpeg decoding*/ \ 237f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ 238f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 239f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_packus_epi16(temp, temp2);\ 240f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128(( __m128i*)(outptr), temp); /*store y0, y1*/ \ 241f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 242f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(t2, t5);\ 243f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(t3, t4);\ 244f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ 245f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ 246f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ 247f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ 248f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 249f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_packus_epi16(temp, temp2);\ 250f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128(( __m128i*)(outptr+16), temp); /*store y2, y3*/ \ 251f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 252f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_subs_epi16(t3, t4);\ 253f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_subs_epi16(t2, t5);\ 254f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ 255f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ 256f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ 257f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ 258f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 259f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_packus_epi16(temp, temp2);\ 260f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128(( __m128i*)(outptr+32), temp); /*store y4, y5*/ \ 261f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 262f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_subs_epi16(t1, t6);\ 263f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_subs_epi16(t0, t7);\ 264f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\ 265f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\ 266f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\ 267f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\ 268f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\ 269f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh temp = _mm_packus_epi16(temp, temp2);\ 270f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128(( __m128i*)(outptr+48), temp); /*store y6, y7*/ 271f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 272f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 273f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh /*Memcpy to do 16byte alignment. */ 274f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE)); 275f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE)); 276f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 277f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh wsptr = (short *)workspaceSSE; 278f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh outptr = (unsigned char*)workspaceSSE; 279f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 280f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh // row 0 and row 2 281f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_load_si128((__m128i const*)(coef_blockSSE)); 282f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2)); 283f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_mullo_epi16( row0, *(__m128i const*)quantptrSSE ); 284f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_mullo_epi16( row2, *(__m128i const*)(quantptrSSE+8*2) ); 285f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 286f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26); 287f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 288f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr), row0); 289f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr+8*2), row2); 290f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 291f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh // row 4 and row 6 292f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4)); 293f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6)); 294f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4) ); 295f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6) ); 296f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 297f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26); 298f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 299f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr+32), row0); 300f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr+48), row2); 301f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 302f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh // row 3 and row 1 303f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3)); 304f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1)); 305f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+24) ); 306f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8) ); 307f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 308f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17); 309f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 310f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr+24), row0); 311f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh _mm_store_si128((__m128i*)(wsptr+8), row2); 312f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 313f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh // row 5 and row 7 314f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5)); 315f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7)); 316f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+40) ); 317f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+56)); 318f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 319f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh iDCT_8_2ROWs( M128_tab_i_35, M128_tab_i_17); 320f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 321f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh iDCT_8_COL(); 322f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 323f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh for(ctr = 0; ctr < DCTSIZE; ctr++) 324f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh { 325f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh outptrTemp = output_buf[ctr] + output_col; 326f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh memcpy(outptrTemp, outptr, DCTSIZE); 327f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh outptr += DCTSIZE; /* advance pointer to next row */ 328f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh } 329f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh 330f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh return; 331f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh} 332f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#endif /* ANDROID_INTELSSE2_IDCT */ 333