/*
 * Copyright (C) 2010-2011 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
17f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define JPEG_INTERNALS
18f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jinclude.h"
19f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jpeglib.h"
20f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include "jdct.h"		/* Private declarations for DCT subsystem */
21f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
22f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#ifdef ANDROID_INTELSSE2_IDCT
23f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#include <emmintrin.h>
24f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
25f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#if DCTSIZE != 8
26f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
27f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#endif
28f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * Fixed-point parameters of the SSE2 inverse DCT.
 *
 * SHIFT_INV_ROW is the arithmetic right shift applied to the 32-bit sums at
 * the end of the row pass; SHIFT_INV_COL is the shift applied to the 16-bit
 * sums at the end of the column pass.  BITS_INV_ACC is not referenced
 * anywhere else in this file.
 */
#define BITS_INV_ACC 4
#define SHIFT_INV_ROW 12
#define SHIFT_INV_COL 5

/*
 * Scalar rounding terms: half an output LSB for each shift above, and the
 * column value minus one.  The transform itself reads the vector copies
 * (M128_round_inv_*); these scalars are not referenced in this file.  They
 * keep external linkage because other translation units are not visible
 * from here.
 */
const short RND_INV_ROW = 2048;		/* 1 << (SHIFT_INV_ROW - 1) */
const short RND_INV_COL = 16;		/* 1 << (SHIFT_INV_COL - 1) */
const short RND_INV_CORR = 15;		/* RND_INV_COL - 1 */
35f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * 16-byte-aligned vector constants, each read as one __m128i.
 */

/* +1 per 16-bit lane; added or OR'd in as a correction in the column pass. */
static const short __attribute__ ((aligned(16))) M128_one_corr[8] = {1,1,1,1,1,1,1,1};
/*
 * Row-pass rounding term.  It is added with a 32-bit lane add, so each
 * (2048, 0) pair of shorts forms one little-endian 32-bit 2048.
 */
static const short __attribute__ ((aligned(16))) M128_round_inv_row[8] = {2048,0,2048,0,2048,0,2048,0};
/* Column-pass rounding terms, one per 16-bit lane. */
static const short __attribute__ ((aligned(16))) M128_round_inv_col[8] = {16,16,16,16,16,16,16,16};
static const short __attribute__ ((aligned(16))) M128_round_inv_corr[8] = {15,15,15,15,15,15,15,15};

/*
 * Column-pass multipliers for _mm_mulhi_epi16, i.e. fractions scaled by
 * 65536: tan(pi/16), tan(2*pi/16), tan(3*pi/16) and cos(4*pi/16).
 * The last two are >= 0.5 and do not fit a signed 16-bit lane, so they are
 * stored as (value - 1.0); the column pass adds the multiplicand back after
 * the multiply to compensate.
 */
static const short __attribute__ ((aligned(16))) M128_tg_1_16[8] = {13036, 13036, 13036, 13036, 13036, 13036, 13036, 13036};
static const short __attribute__ ((aligned(16))) M128_tg_2_16[8] = {27146, 27146, 27146, 27146, 27146, 27146, 27146, 27146};
static const short __attribute__ ((aligned(16))) M128_tg_3_16[8] = {-21746, -21746, -21746, -21746, -21746, -21746, -21746, -21746};
static const short __attribute__ ((aligned(16))) M128_cos_4_16[8] = {-19195, -19195, -19195, -19195, -19195, -19195, -19195, -19195};

/* Level shift added to every output sample before packing to bytes. */
static const short __attribute__ ((aligned(16))) jpeg_adjust[8] = {128, 128, 128, 128, 128, 128, 128, 128};
47f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * Row-pass weights for input rows 0 and 4.
 *
 * Four groups of eight 16-bit weights.  The row macro multiplies each group
 * (via _mm_madd_epi16) with one broadcast pair of row inputs:
 *   [ 0.. 7] with (x0, x2)      [ 8..15] with (x4, x6)
 *   [16..23] with (x1, x3)      [24..31] with (x5, x7)
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_04[32] = {
16384, 21407, 16384, 8867,
16384, -8867, 16384, -21407,
16384, 8867, -16384, -21407,
-16384, 21407, 16384, -8867,
22725, 19266, 19266, -4520,
12873, -22725, 4520, -12873,
12873, 4520, -22725, -12873,
4520, 19266, 19266, -22725
};
59f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * Row-pass weights for input rows 1 and 7.
 *
 * Four groups of eight 16-bit weights.  The row macro multiplies each group
 * (via _mm_madd_epi16) with one broadcast pair of row inputs:
 *   [ 0.. 7] with (x0, x2)      [ 8..15] with (x4, x6)
 *   [16..23] with (x1, x3)      [24..31] with (x5, x7)
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_17[32] = {
22725, 29692, 22725, 12299,
22725, -12299, 22725, -29692,
22725, 12299, -22725, -29692,
-22725, 29692, 22725, -12299,
31521, 26722, 26722, -6270,
17855, -31521, 6270, -17855,
17855, 6270, -31521, -17855,
6270, 26722, 26722, -31521
};
71f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * Row-pass weights for input rows 2 and 6.
 *
 * Four groups of eight 16-bit weights.  The row macro multiplies each group
 * (via _mm_madd_epi16) with one broadcast pair of row inputs:
 *   [ 0.. 7] with (x0, x2)      [ 8..15] with (x4, x6)
 *   [16..23] with (x1, x3)      [24..31] with (x5, x7)
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_26[32] = {
21407, 27969, 21407, 11585,
21407, -11585, 21407, -27969,
21407, 11585, -21407, -27969,
-21407, 27969, 21407, -11585,
29692, 25172, 25172, -5906,
16819, -29692, 5906, -16819,
16819, 5906, -29692, -16819,
5906, 25172, 25172, -29692
};
83f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
/*
 * Row-pass weights for input rows 3 and 5.
 *
 * Four groups of eight 16-bit weights.  The row macro multiplies each group
 * (via _mm_madd_epi16) with one broadcast pair of row inputs:
 *   [ 0.. 7] with (x0, x2)      [ 8..15] with (x4, x6)
 *   [16..23] with (x1, x3)      [24..31] with (x5, x7)
 */
static const short __attribute__ ((aligned(16))) M128_tab_i_35[32] = {
19266, 25172, 19266, 10426,
19266, -10426, 19266, -25172,
19266, 10426, -19266, -25172,
-19266, 25172, 19266, -10426,
26722, 22654, 22654, -5315,
15137, -26722, 5315, -15137,
15137, 5315, -26722, -15137,
5315, 22654, 22654, -26722
};
95f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
96f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
97f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh/*
98f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh * Perform dequantization and inverse DCT on one block of coefficients by SSE.
99f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh */
100f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
101f897702251443b531b2ded93df71ffd87fbae076Andrew HsiehGLOBAL(void)
102f897702251443b531b2ded93df71ffd87fbae076Andrew Hsiehjpeg_idct_intelsse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
103f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh		 JCOEFPTR coef_block,
104f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh		 JSAMPARRAY output_buf, JDIMENSION output_col)
105f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh{
106f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i row0, tmp1, tmp2, tmp3, row2, tmp5, tmp6, tmp7;
107f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  int ctr;
108f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  JSAMPROW  outptrTemp;
109f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  JSAMPLE *range_limit = IDCT_range_limit(cinfo);
110f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  short __attribute__((aligned(16))) quantptrSSE[DCTSIZE2];
111f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  short __attribute__((aligned(16))) workspaceSSE[DCTSIZE2];
112f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  short __attribute__((aligned(16))) coef_blockSSE[DCTSIZE2];
113f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
114f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i* tg3, *tg1, *tg2, *cos4;
115f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i tm765, tp765, tm465, tp465, tp03, tm03, tp12, tm12, tp65, tm65;
116f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
117f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  __m128i temp, temp2;
118f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  short * wsptr;
119f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  unsigned char * outptr;
120f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
121f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define iDCT_8_2ROWs(table1, table2)   \
122f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_shufflelo_epi16(row0, 0xD8); /*x7, x6, x5, x4, x3, x1, x2, x0*/    \
123f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_shufflelo_epi16(row2, 0xD8);   \
124f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp1 = _mm_shuffle_epi32(row0, 0);      /*x2, x0, x2, x0, x2, x0, x2, x0*/    \
125f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp5 = _mm_shuffle_epi32(row2, 0);        \
126f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh                                                                                  \
127f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp3 = _mm_shuffle_epi32(row0, 0x55);   /*x3, x1, x3, x1, x3, x1, x3, x1*/    \
128f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp7 = _mm_shuffle_epi32(row2, 0x55);     \
129f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_shufflehi_epi16(row0, 0xD8); /*x7, x5, x6, x4, x3, x1, x2, x0*/    \
130f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_shufflehi_epi16(row2, 0xD8);   \
131f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh						\
132f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp1 = _mm_madd_epi16(tmp1, * ( __m128i*)table1);      /*x2*w13+x0*w12, x2*w9+x0*w8, x2*w5+x0*w4, x2*w1+x0*w0*/   \
133f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp5 = _mm_madd_epi16(tmp5, * ( __m128i*)table2);       \
134f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh						\
135f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 =  _mm_shuffle_epi32(row0, 0xAA);  /*x6, x4, x6, x4, x6, x4, x6, x4*/    \
136f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = _mm_shuffle_epi32(row2, 0xAA);     \
137f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_shuffle_epi32(row0, 0xFF);   /*x7, x5, x7, x5, x7, x5, x7, x5*/    \
138f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_shuffle_epi32(row2, 0xFF);     \
139f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
140f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp3 = _mm_madd_epi16(tmp3, * ( __m128i*)(table1+16)); /*x3*w29+x1*w28, x3*w25+x1*w24, x3*w21+x1*w20, x3*w17+x1*w16*/  \
141f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp7 = _mm_madd_epi16(tmp7, * ( __m128i*)(table2+16) ); \
142f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_madd_epi16(row0, * ( __m128i*)(table1+24)); /*x7*w31+x5*w30, x7*w27+x5*w26, x7*w23+x5*w22, x7*w19+x5*w18*/  \
143f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_madd_epi16(row2, * ( __m128i*)(table2+24) ); \
144f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 = _mm_madd_epi16(tmp2, * ( __m128i*)(table1+8) ); /*x6*w15+x4*w14, x6*w11+x4*w10, x6*w7+x4*w6, x6*w3+x4*w2*/  \
145f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = _mm_madd_epi16(tmp6, * ( __m128i*)(table2+8) );  \
146f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh                                                             \
147f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp1 = _mm_add_epi32(tmp1, * ( __m128i*)M128_round_inv_row);       \
148f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp5 = _mm_add_epi32(tmp5, * ( __m128i*)M128_round_inv_row);      \
149f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_add_epi32(row0, tmp3);    /*b3, b2, b1, b0*/  \
150f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_add_epi32(row2, tmp7);                       \
151f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp1 = _mm_add_epi32(tmp1, tmp2);    /*a3, a2, a1, a0*/  \
152f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp5 = _mm_add_epi32(tmp5, tmp6);                       \
153f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh                                                             \
154f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 = tmp1;  \
155f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = tmp5;  \
156f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 = _mm_sub_epi32(tmp2, row0); /*for row0. y4= a3-b3, y5=a2-b2, y6=a1-b1, y7=a0-b0 */   \
157f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = _mm_sub_epi32(tmp6, row2);  \
158f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_add_epi32(row0, tmp1); /*y3=a3+b3,y2=a2+b2,y1=a1+b1,y0=a0+b0*/   \
159f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_add_epi32(row2, tmp5);  \
160f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 = _mm_srai_epi32(tmp2, SHIFT_INV_ROW);             \
161f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = _mm_srai_epi32(tmp6, SHIFT_INV_ROW);  \
162f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_srai_epi32(row0, SHIFT_INV_ROW);             \
163f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_srai_epi32(row2, SHIFT_INV_ROW);  \
164f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp2 = _mm_shuffle_epi32(tmp2, 0x1B); /*y7, y6, y5, y4*/   \
165f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tmp6 = _mm_shuffle_epi32(tmp6, 0x1B);  \
166f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row0 = _mm_packs_epi32(row0, tmp2); /*row0 = y7,y6,y5,y4,y3,y2,y1,y0*/  \
167f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    row2 = _mm_packs_epi32(row2, tmp6);  /*row2 = y7,...y0*/
168f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
169f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
170f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#define iDCT_8_COL()  \
171f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x3 = _mm_load_si128(( __m128i*)(wsptr+24));\
172f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x1 = _mm_load_si128(( __m128i*)(wsptr+8));\
173f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x5 = row0;\
174f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x7 = row2;\
175f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
176f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tg3 = ( __m128i*)(M128_tg_3_16);\
177f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tg1 = ( __m128i*)(M128_tg_1_16);\
178f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tg2 = ( __m128i*)(M128_tg_2_16);\
179f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    cos4 =(__m128i*)(M128_cos_4_16);\
180f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
181f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_mulhi_epi16(x5, *tg3);  /*row5*tg3*/ \
182f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_mulhi_epi16(x3, *tg3);\
183f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(temp, x5); /*coef adjustment*/ \
184f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(temp2, x3);\
185f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm765 = _mm_adds_epi16(temp, x3);\
186f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm465 = _mm_subs_epi16(x5, temp2);\
187f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
188f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_mulhi_epi16(x7, *tg1);  /*row7*tg1*/ \
189f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_mulhi_epi16(x1, *tg1);\
190f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tp765 = _mm_adds_epi16(temp, x1);\
191f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tp465 = _mm_subs_epi16(temp2, x7);  /*row1*tg1 - row7*/ \
192f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
193f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t7 = _mm_adds_epi16(tp765, tm765);\
194f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t7 = _mm_adds_epi16(t7, *( __m128i*)M128_one_corr);\
195f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tp65 = _mm_subs_epi16(tp765, tm765);\
196f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t4 =  _mm_adds_epi16(tp465, tm465);\
197f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm65 = _mm_subs_epi16(tp465, tm465);\
198f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm65 = _mm_adds_epi16(tm65, *( __m128i*)M128_one_corr);\
199f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
200f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x0 = _mm_load_si128(( __m128i*)(wsptr));\
201f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x4 = _mm_load_si128(( __m128i*)(wsptr+32));\
202f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x2 = _mm_load_si128(( __m128i*)(wsptr+16));\
203f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    x6 = _mm_load_si128(( __m128i*)(wsptr+48));\
204f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
205f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    /*t6 = ( tp65 + tm65 ) * cos_4_16;*/ \
206f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(tp65, tm65);\
207f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_subs_epi16(tp65, tm65);\
208f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t6 = _mm_mulhi_epi16(temp, *cos4);\
209f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t5 = _mm_mulhi_epi16(temp2, *cos4);\
210f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t6 = _mm_adds_epi16(t6, temp);\
211f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t6 = _mm_or_si128(t6, *( __m128i*)M128_one_corr);\
212f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t5 = _mm_adds_epi16(t5, temp2);\
213f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t5 = _mm_or_si128(t5, *( __m128i*)M128_one_corr);\
214f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
215f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tp03 = _mm_adds_epi16(x0, x4);\
216f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tp12 = _mm_subs_epi16(x0, x4);\
217f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
218f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_mulhi_epi16(x6, *tg2);\
219f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_mulhi_epi16(x2, *tg2);\
220f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm03 = _mm_adds_epi16(temp, x2);\
221f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    tm12 = _mm_subs_epi16(temp2, x6);\
222f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
223f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t0 = _mm_adds_epi16(tp03, tm03);\
224f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t0 = _mm_adds_epi16(t0, *( __m128i*)M128_round_inv_col);\
225f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t3 = _mm_subs_epi16(tp03, tm03);\
226f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t3 = _mm_adds_epi16(t3, *( __m128i*)M128_round_inv_corr);\
227f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t1 = _mm_adds_epi16(tp12, tm12);\
228f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t1 = _mm_adds_epi16(t1, *( __m128i*)M128_round_inv_col);\
229f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t2 = _mm_subs_epi16(tp12, tm12);\
230f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    t2 = _mm_adds_epi16(t2, *( __m128i*)M128_round_inv_corr);\
231f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
232f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(t0, t7);   /*y0*/ \
233f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(t1, t6);  /*y1*/ \
234f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
235f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
236f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust); /*Add 128 for jpeg decoding*/ \
237f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
238f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
239f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_packus_epi16(temp, temp2);\
240f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    _mm_store_si128(( __m128i*)(outptr), temp);  /*store y0, y1*/ \
241f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
242f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(t2, t5);\
243f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(t3, t4);\
244f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
245f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
246f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
247f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
248f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
249f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_packus_epi16(temp, temp2);\
250f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    _mm_store_si128(( __m128i*)(outptr+16), temp);  /*store y2, y3*/ \
251f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
252f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_subs_epi16(t3, t4);\
253f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_subs_epi16(t2, t5);\
254f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
255f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
256f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
257f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
258f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
259f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_packus_epi16(temp, temp2);\
260f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    _mm_store_si128(( __m128i*)(outptr+32), temp);  /*store y4, y5*/ \
261f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
262f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_subs_epi16(t1, t6);\
263f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_subs_epi16(t0, t7);\
264f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_srai_epi16(temp, SHIFT_INV_COL);\
265f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_srai_epi16(temp2, SHIFT_INV_COL);\
266f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_adds_epi16(temp, *( __m128i*)jpeg_adjust);\
267f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp2 = _mm_adds_epi16(temp2, *( __m128i*)jpeg_adjust);\
268f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh\
269f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    temp = _mm_packus_epi16(temp, temp2);\
270f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    _mm_store_si128(( __m128i*)(outptr+48), temp);  /*store y6, y7*/
271f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
272f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
273f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  /*Memcpy to do 16byte alignment. */
274f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  memcpy((char*)quantptrSSE, (char*)compptr->dct_table, sizeof(quantptrSSE));
275f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  memcpy((char*)coef_blockSSE, (char*)coef_block, sizeof(coef_blockSSE));
276f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
277f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  wsptr = (short *)workspaceSSE;
278f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  outptr = (unsigned char*)workspaceSSE;
279f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
280f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  // row 0 and row 2
281f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE));
282f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*2));
283f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_mullo_epi16( row0, *(__m128i const*)quantptrSSE );
284f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_mullo_epi16( row2, *(__m128i const*)(quantptrSSE+8*2) );
285f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
286f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);
287f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
288f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr), row0);
289f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr+8*2), row2);
290f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
291f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  // row 4 and row 6
292f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*4));
293f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*6));
294f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+8*4) );
295f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8*6) );
296f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
297f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  iDCT_8_2ROWs(M128_tab_i_04, M128_tab_i_26);
298f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
299f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr+32), row0);
300f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr+48), row2);
301f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
302f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  // row 3 and row 1
303f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*3));
304f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*1));
305f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+24) );
306f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+8) );
307f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
308f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  iDCT_8_2ROWs(M128_tab_i_35, M128_tab_i_17);
309f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
310f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr+24), row0);
311f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  _mm_store_si128((__m128i*)(wsptr+8), row2);
312f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
313f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  // row 5 and row 7
314f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*5));
315f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_load_si128((__m128i const*)(coef_blockSSE+8*7));
316f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row0 = _mm_mullo_epi16(row0, *(__m128i const*)(quantptrSSE+40) );
317f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  row2 = _mm_mullo_epi16(row2, *(__m128i const*)(quantptrSSE+56));
318f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
319f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  iDCT_8_2ROWs( M128_tab_i_35, M128_tab_i_17);
320f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
321f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  iDCT_8_COL();
322f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
323f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  for(ctr = 0; ctr < DCTSIZE; ctr++)
324f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  {
325f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    outptrTemp = output_buf[ctr] + output_col;
326f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    memcpy(outptrTemp, outptr, DCTSIZE);
327f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh    outptr += DCTSIZE;   /* advance pointer to next row */
328f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  }
329f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh
330f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh  return;
331f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh}
332f897702251443b531b2ded93df71ffd87fbae076Andrew Hsieh#endif /* ANDROID_INTELSSE2_IDCT */
333