/* jidctfst-altivec.c, revision 6eb7d3798b5a79347c62825fc4c16f7ce673bdd0 */
/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* FAST INTEGER INVERSE DCT
 *
 * This is similar to the SSE2 implementation, except that we left-shift the
 * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
 * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
 *   the elements in arg3 + the most significant 17 bits of
 *     (the elements in arg1 * the elements in arg2).
 */

32#include "jsimd_altivec.h"
33
34
35#define F_1_082 277              /* FIX(1.082392200) */
36#define F_1_414 362              /* FIX(1.414213562) */
37#define F_1_847 473              /* FIX(1.847759065) */
38#define F_2_613 669              /* FIX(2.613125930) */
39#define F_1_613 (F_2_613 - 256)  /* FIX(2.613125930) - FIX(1) */
40
41#define CONST_BITS 8
42#define PASS1_BITS 2
43#define PRE_MULTIPLY_SCALE_BITS 2
44#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
45
46
47#define DO_IDCT(in)  \
48{  \
49  /* Even part */  \
50  \
51  tmp10 = vec_add(in##0, in##4);  \
52  tmp11 = vec_sub(in##0, in##4);  \
53  tmp13 = vec_add(in##2, in##6);  \
54  \
55  tmp12 = vec_sub(in##2, in##6);  \
56  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
57  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  \
58  tmp12 = vec_sub(tmp12, tmp13);  \
59  \
60  tmp0 = vec_add(tmp10, tmp13);  \
61  tmp3 = vec_sub(tmp10, tmp13);  \
62  tmp1 = vec_add(tmp11, tmp12);  \
63  tmp2 = vec_sub(tmp11, tmp12);  \
64  \
65  /* Odd part */  \
66  \
67  z13 = vec_add(in##5, in##3);  \
68  z10 = vec_sub(in##5, in##3);  \
69  z10s = vec_sl(z10, pre_multiply_scale_bits);  \
70  z11 = vec_add(in##1, in##7);  \
71  z12s = vec_sub(in##1, in##7);  \
72  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
73  \
74  tmp11 = vec_sub(z11, z13);  \
75  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
76  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  \
77  \
78  tmp7 = vec_add(z11, z13);  \
79  \
80  /* To avoid overflow...  \
81   *  \
82   * (Original)  \
83   * tmp12 = -2.613125930 * z10 + z5;  \
84   *  \
85   * (This implementation)  \
86   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
87   *       = -1.613125930 * z10 - z10 + z5;  \
88   */  \
89  \
90  z5 = vec_add(z10s, z12s);  \
91  z5 = vec_madds(z5, pw_F1847, pw_zero);  \
92  \
93  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  \
94  tmp10 = vec_sub(tmp10, z5);  \
95  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
96  tmp12 = vec_sub(tmp12, z10);  \
97  \
98  tmp6 = vec_sub(tmp12, tmp7);  \
99  tmp5 = vec_sub(tmp11, tmp6);  \
100  tmp4 = vec_add(tmp10, tmp5);  \
101  \
102  out0 = vec_add(tmp0, tmp7);  \
103  out1 = vec_add(tmp1, tmp6);  \
104  out2 = vec_add(tmp2, tmp5);  \
105  out3 = vec_sub(tmp3, tmp4);  \
106  out4 = vec_add(tmp3, tmp4);  \
107  out5 = vec_sub(tmp2, tmp5);  \
108  out6 = vec_sub(tmp1, tmp6);  \
109  out7 = vec_sub(tmp0, tmp7);  \
110}
111
112
113void
114jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
115                          JSAMPARRAY output_buf, JDIMENSION output_col)
116{
117  short *dct_table = (short *)dct_table_;
118  int *outptr;
119
120  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
121    col0, col1, col2, col3, col4, col5, col6, col7,
122    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
123    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
124    z5, z10, z10s, z11, z12s, z13,
125    out0, out1, out2, out3, out4, out5, out6, out7;
126  __vector signed char outb;
127
128  /* Constants */
129  __vector short pw_zero = { __8X(0) },
130    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
131    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
132    pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
133    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
134  __vector unsigned short
135    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
136    pass1_bits3 = { __8X(PASS1_BITS + 3) };
137  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
138
139  /* Pass 1: process columns */
140
141  col0 = vec_ld(0, coef_block);
142  col1 = vec_ld(16, coef_block);
143  col2 = vec_ld(32, coef_block);
144  col3 = vec_ld(48, coef_block);
145  col4 = vec_ld(64, coef_block);
146  col5 = vec_ld(80, coef_block);
147  col6 = vec_ld(96, coef_block);
148  col7 = vec_ld(112, coef_block);
149
150  tmp1 = vec_or(col1, col2);
151  tmp2 = vec_or(col3, col4);
152  tmp1 = vec_or(tmp1, tmp2);
153  tmp3 = vec_or(col5, col6);
154  tmp3 = vec_or(tmp3, col7);
155  tmp1 = vec_or(tmp1, tmp3);
156
157  quant0 = vec_ld(0, dct_table);
158  col0 = vec_mladd(col0, quant0, pw_zero);
159
160  if (vec_all_eq(tmp1, pw_zero)) {
161    /* AC terms all zero */
162
163    row0 = vec_splat(col0, 0);
164    row1 = vec_splat(col0, 1);
165    row2 = vec_splat(col0, 2);
166    row3 = vec_splat(col0, 3);
167    row4 = vec_splat(col0, 4);
168    row5 = vec_splat(col0, 5);
169    row6 = vec_splat(col0, 6);
170    row7 = vec_splat(col0, 7);
171
172  } else {
173
174    quant1 = vec_ld(16, dct_table);
175    quant2 = vec_ld(32, dct_table);
176    quant3 = vec_ld(48, dct_table);
177    quant4 = vec_ld(64, dct_table);
178    quant5 = vec_ld(80, dct_table);
179    quant6 = vec_ld(96, dct_table);
180    quant7 = vec_ld(112, dct_table);
181
182    col1 = vec_mladd(col1, quant1, pw_zero);
183    col2 = vec_mladd(col2, quant2, pw_zero);
184    col3 = vec_mladd(col3, quant3, pw_zero);
185    col4 = vec_mladd(col4, quant4, pw_zero);
186    col5 = vec_mladd(col5, quant5, pw_zero);
187    col6 = vec_mladd(col6, quant6, pw_zero);
188    col7 = vec_mladd(col7, quant7, pw_zero);
189
190    DO_IDCT(col);
191
192    TRANSPOSE(out, row);
193  }
194
195  /* Pass 2: process rows */
196
197  DO_IDCT(row);
198
199  out0 = vec_sra(out0, pass1_bits3);
200  out1 = vec_sra(out1, pass1_bits3);
201  out2 = vec_sra(out2, pass1_bits3);
202  out3 = vec_sra(out3, pass1_bits3);
203  out4 = vec_sra(out4, pass1_bits3);
204  out5 = vec_sra(out5, pass1_bits3);
205  out6 = vec_sra(out6, pass1_bits3);
206  out7 = vec_sra(out7, pass1_bits3);
207
208  TRANSPOSE(out, col);
209
210  outb = vec_packs(col0, col0);
211  outb = vec_add(outb, pb_centerjsamp);
212  outptr = (int *)(output_buf[0] + output_col);
213  vec_ste((__vector int)outb, 0, outptr);
214  vec_ste((__vector int)outb, 4, outptr);
215
216  outb = vec_packs(col1, col1);
217  outb = vec_add(outb, pb_centerjsamp);
218  outptr = (int *)(output_buf[1] + output_col);
219  vec_ste((__vector int)outb, 0, outptr);
220  vec_ste((__vector int)outb, 4, outptr);
221
222  outb = vec_packs(col2, col2);
223  outb = vec_add(outb, pb_centerjsamp);
224  outptr = (int *)(output_buf[2] + output_col);
225  vec_ste((__vector int)outb, 0, outptr);
226  vec_ste((__vector int)outb, 4, outptr);
227
228  outb = vec_packs(col3, col3);
229  outb = vec_add(outb, pb_centerjsamp);
230  outptr = (int *)(output_buf[3] + output_col);
231  vec_ste((__vector int)outb, 0, outptr);
232  vec_ste((__vector int)outb, 4, outptr);
233
234  outb = vec_packs(col4, col4);
235  outb = vec_add(outb, pb_centerjsamp);
236  outptr = (int *)(output_buf[4] + output_col);
237  vec_ste((__vector int)outb, 0, outptr);
238  vec_ste((__vector int)outb, 4, outptr);
239
240  outb = vec_packs(col5, col5);
241  outb = vec_add(outb, pb_centerjsamp);
242  outptr = (int *)(output_buf[5] + output_col);
243  vec_ste((__vector int)outb, 0, outptr);
244  vec_ste((__vector int)outb, 4, outptr);
245
246  outb = vec_packs(col6, col6);
247  outb = vec_add(outb, pb_centerjsamp);
248  outptr = (int *)(output_buf[6] + output_col);
249  vec_ste((__vector int)outb, 0, outptr);
250  vec_ste((__vector int)outb, 4, outptr);
251
252  outb = vec_packs(col7, col7);
253  outb = vec_add(outb, pb_centerjsamp);
254  outptr = (int *)(output_buf[7] + output_col);
255  vec_ste((__vector int)outb, 0, outptr);
256  vec_ste((__vector int)outb, 4, outptr);
257}