1/* 2 * jidctflt.c 3 * 4 * Copyright (C) 1994-1998, Thomas G. Lane. 5 * This file is part of the Independent JPEG Group's software. 6 * 7 * The authors make NO WARRANTY or representation, either express or implied, 8 * with respect to this software, its quality, accuracy, merchantability, or 9 * fitness for a particular purpose. This software is provided "AS IS", and you, 10 * its user, assume the entire risk as to its quality and accuracy. 11 * 12 * This software is copyright (C) 1991-1998, Thomas G. Lane. 13 * All Rights Reserved except as specified below. 14 * 15 * Permission is hereby granted to use, copy, modify, and distribute this 16 * software (or portions thereof) for any purpose, without fee, subject to these 17 * conditions: 18 * (1) If any part of the source code for this software is distributed, then this 19 * README file must be included, with this copyright and no-warranty notice 20 * unaltered; and any additions, deletions, or changes to the original files 21 * must be clearly indicated in accompanying documentation. 22 * (2) If only executable code is distributed, then the accompanying 23 * documentation must state that "this software is based in part on the work of 24 * the Independent JPEG Group". 25 * (3) Permission for use of this software is granted only if the user accepts 26 * full responsibility for any undesirable consequences; the authors accept 27 * NO LIABILITY for damages of any kind. 28 * 29 * These conditions apply to any software derived from or based on the IJG code, 30 * not just to the unmodified library. If you use our work, you ought to 31 * acknowledge us. 32 * 33 * Permission is NOT granted for the use of any IJG author's name or company name 34 * in advertising or publicity relating to this software or products derived from 35 * it. This software may be referred to only as "the Independent JPEG Group's 36 * software". 37 * 38 * We specifically permit and encourage the use of this software as the basis of 39 * commercial products, provided that all warranty or liability claims are 40 * assumed by the product vendor. 41 * 42 * 43 * This file contains a floating-point implementation of the 44 * inverse DCT (Discrete Cosine Transform). In the IJG code, this routine 45 * must also perform dequantization of the input coefficients. 46 * 47 * This implementation should be more accurate than either of the integer 48 * IDCT implementations. However, it may not give the same results on all 49 * machines because of differences in roundoff behavior. Speed will depend 50 * on the hardware's floating point capacity. 51 * 52 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT 53 * on each row (or vice versa, but it's more convenient to emit a row at 54 * a time). Direct algorithms are also available, but they are much more 55 * complex and seem not to be any faster when reduced to code. 56 * 57 * This implementation is based on Arai, Agui, and Nakajima's algorithm for 58 * scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in 59 * Japanese, but the algorithm is described in the Pennebaker & Mitchell 60 * JPEG textbook (see REFERENCES section in file README). The following code 61 * is based directly on figure 4-8 in P&M. 62 * While an 8-point DCT cannot be done in less than 11 multiplies, it is 63 * possible to arrange the computation so that many of the multiplies are 64 * simple scalings of the final outputs. These multiplies can then be 65 * folded into the multiplications or divisions by the JPEG quantization 66 * table entries. The AA&N method leaves only 5 multiplies and 29 adds 67 * to be done in the DCT itself. 68 * The primary disadvantage of this method is that with a fixed-point 69 * implementation, accuracy is lost due to imprecise representation of the 70 * scaled quantization values. However, that problem does not arise if 71 * we use floating point arithmetic. 72 */ 73 74#include <stdint.h> 75#include "tinyjpeg-internal.h" 76 77#define FAST_FLOAT float 78#define DCTSIZE 8 79#define DCTSIZE2 (DCTSIZE*DCTSIZE) 80 81#define DEQUANTIZE(coef,quantval) (((FAST_FLOAT) (coef)) * (quantval)) 82 83#if 1 && defined(__GNUC__) && (defined(__i686__) || defined(__x86_64__)) 84 85static inline unsigned char descale_and_clamp(int x, int shift) 86{ 87 __asm__ ( 88 "add %3,%1\n" 89 "\tsar %2,%1\n" 90 "\tsub $-128,%1\n" 91 "\tcmovl %5,%1\n" /* Use the sub to compare to 0 */ 92 "\tcmpl %4,%1\n" 93 "\tcmovg %4,%1\n" 94 : "=r"(x) 95 : "0"(x), "Ir"(shift), "ir"(1UL<<(shift-1)), "r" (0xff), "r" (0) 96 ); 97 return x; 98} 99 100#else 101static inline unsigned char descale_and_clamp(int x, int shift) 102{ 103 x += (1UL<<(shift-1)); 104 if (x<0) 105 x = (x >> shift) | ((~(0UL)) << (32-(shift))); 106 else 107 x >>= shift; 108 x += 128; 109 if (x>255) 110 return 255; 111 else if (x<0) 112 return 0; 113 else 114 return x; 115} 116#endif 117 118/* 119 * Perform dequantization and inverse DCT on one block of coefficients. 120 */ 121 122void 123tinyjpeg_idct_float (struct component *compptr, uint8_t *output_buf, int stride) 124{ 125 FAST_FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 126 FAST_FLOAT tmp10, tmp11, tmp12, tmp13; 127 FAST_FLOAT z5, z10, z11, z12, z13; 128 int16_t *inptr; 129 FAST_FLOAT *quantptr; 130 FAST_FLOAT *wsptr; 131 uint8_t *outptr; 132 int ctr; 133 FAST_FLOAT workspace[DCTSIZE2]; /* buffers data between passes */ 134 135 /* Pass 1: process columns from input, store into work array. */ 136 137 inptr = compptr->DCT; 138 quantptr = compptr->Q_table; 139 wsptr = workspace; 140 for (ctr = DCTSIZE; ctr > 0; ctr--) { 141 /* Due to quantization, we will usually find that many of the input 142 * coefficients are zero, especially the AC terms. We can exploit this 143 * by short-circuiting the IDCT calculation for any column in which all 144 * the AC terms are zero. In that case each output is equal to the 145 * DC coefficient (with scale factor as needed). 146 * With typical images and quantization tables, half or more of the 147 * column DCT calculations can be simplified this way. 148 */ 149 150 if (inptr[DCTSIZE*1] == 0 && inptr[DCTSIZE*2] == 0 && 151 inptr[DCTSIZE*3] == 0 && inptr[DCTSIZE*4] == 0 && 152 inptr[DCTSIZE*5] == 0 && inptr[DCTSIZE*6] == 0 && 153 inptr[DCTSIZE*7] == 0) { 154 /* AC terms all zero */ 155 FAST_FLOAT dcval = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); 156 157 wsptr[DCTSIZE*0] = dcval; 158 wsptr[DCTSIZE*1] = dcval; 159 wsptr[DCTSIZE*2] = dcval; 160 wsptr[DCTSIZE*3] = dcval; 161 wsptr[DCTSIZE*4] = dcval; 162 wsptr[DCTSIZE*5] = dcval; 163 wsptr[DCTSIZE*6] = dcval; 164 wsptr[DCTSIZE*7] = dcval; 165 166 inptr++; /* advance pointers to next column */ 167 quantptr++; 168 wsptr++; 169 continue; 170 } 171 172 /* Even part */ 173 174 tmp0 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]); 175 tmp1 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]); 176 tmp2 = DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]); 177 tmp3 = DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]); 178 179 tmp10 = tmp0 + tmp2; /* phase 3 */ 180 tmp11 = tmp0 - tmp2; 181 182 tmp13 = tmp1 + tmp3; /* phases 5-3 */ 183 tmp12 = (tmp1 - tmp3) * ((FAST_FLOAT) 1.414213562) - tmp13; /* 2*c4 */ 184 185 tmp0 = tmp10 + tmp13; /* phase 2 */ 186 tmp3 = tmp10 - tmp13; 187 tmp1 = tmp11 + tmp12; 188 tmp2 = tmp11 - tmp12; 189 190 /* Odd part */ 191 192 tmp4 = DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]); 193 tmp5 = DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]); 194 tmp6 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]); 195 tmp7 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]); 196 197 z13 = tmp6 + tmp5; /* phase 6 */ 198 z10 = tmp6 - tmp5; 199 z11 = tmp4 + tmp7; 200 z12 = tmp4 - tmp7; 201 202 tmp7 = z11 + z13; /* phase 5 */ 203 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); /* 2*c4 */ 204 205 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ 206 tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ 207 tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ 208 209 tmp6 = tmp12 - tmp7; /* phase 2 */ 210 tmp5 = tmp11 - tmp6; 211 tmp4 = tmp10 + tmp5; 212 213 wsptr[DCTSIZE*0] = tmp0 + tmp7; 214 wsptr[DCTSIZE*7] = tmp0 - tmp7; 215 wsptr[DCTSIZE*1] = tmp1 + tmp6; 216 wsptr[DCTSIZE*6] = tmp1 - tmp6; 217 wsptr[DCTSIZE*2] = tmp2 + tmp5; 218 wsptr[DCTSIZE*5] = tmp2 - tmp5; 219 wsptr[DCTSIZE*4] = tmp3 + tmp4; 220 wsptr[DCTSIZE*3] = tmp3 - tmp4; 221 222 inptr++; /* advance pointers to next column */ 223 quantptr++; 224 wsptr++; 225 } 226 227 /* Pass 2: process rows from work array, store into output array. */ 228 /* Note that we must descale the results by a factor of 8 == 2**3. */ 229 230 wsptr = workspace; 231 outptr = output_buf; 232 for (ctr = 0; ctr < DCTSIZE; ctr++) { 233 /* Rows of zeroes can be exploited in the same way as we did with columns. 234 * However, the column calculation has created many nonzero AC terms, so 235 * the simplification applies less often (typically 5% to 10% of the time). 236 * And testing floats for zero is relatively expensive, so we don't bother. 237 */ 238 239 /* Even part */ 240 241 tmp10 = wsptr[0] + wsptr[4]; 242 tmp11 = wsptr[0] - wsptr[4]; 243 244 tmp13 = wsptr[2] + wsptr[6]; 245 tmp12 = (wsptr[2] - wsptr[6]) * ((FAST_FLOAT) 1.414213562) - tmp13; 246 247 tmp0 = tmp10 + tmp13; 248 tmp3 = tmp10 - tmp13; 249 tmp1 = tmp11 + tmp12; 250 tmp2 = tmp11 - tmp12; 251 252 /* Odd part */ 253 254 z13 = wsptr[5] + wsptr[3]; 255 z10 = wsptr[5] - wsptr[3]; 256 z11 = wsptr[1] + wsptr[7]; 257 z12 = wsptr[1] - wsptr[7]; 258 259 tmp7 = z11 + z13; 260 tmp11 = (z11 - z13) * ((FAST_FLOAT) 1.414213562); 261 262 z5 = (z10 + z12) * ((FAST_FLOAT) 1.847759065); /* 2*c2 */ 263 tmp10 = ((FAST_FLOAT) 1.082392200) * z12 - z5; /* 2*(c2-c6) */ 264 tmp12 = ((FAST_FLOAT) -2.613125930) * z10 + z5; /* -2*(c2+c6) */ 265 266 tmp6 = tmp12 - tmp7; 267 tmp5 = tmp11 - tmp6; 268 tmp4 = tmp10 + tmp5; 269 270 /* Final output stage: scale down by a factor of 8 and range-limit */ 271 272 outptr[0] = descale_and_clamp((int)(tmp0 + tmp7), 3); 273 outptr[7] = descale_and_clamp((int)(tmp0 - tmp7), 3); 274 outptr[1] = descale_and_clamp((int)(tmp1 + tmp6), 3); 275 outptr[6] = descale_and_clamp((int)(tmp1 - tmp6), 3); 276 outptr[2] = descale_and_clamp((int)(tmp2 + tmp5), 3); 277 outptr[5] = descale_and_clamp((int)(tmp2 - tmp5), 3); 278 outptr[4] = descale_and_clamp((int)(tmp3 + tmp4), 3); 279 outptr[3] = descale_and_clamp((int)(tmp3 - tmp4), 3); 280 281 282 wsptr += DCTSIZE; /* advance pointer to next row */ 283 outptr += stride; 284 } 285} 286