1/*
2 * AltiVec optimizations for libjpeg-turbo
3 *
4 * Copyright (C) 2014, D. R. Commander.  All Rights Reserved.
5 *
6 * This software is provided 'as-is', without any express or implied
7 * warranty.  In no event will the authors be held liable for any damages
8 * arising from the use of this software.
9 *
10 * Permission is granted to anyone to use this software for any purpose,
11 * including commercial applications, and to alter it and redistribute it
12 * freely, subject to the following restrictions:
13 *
14 * 1. The origin of this software must not be misrepresented; you must not
15 *    claim that you wrote the original software. If you use this software
16 *    in a product, an acknowledgment in the product documentation would be
17 *    appreciated but is not required.
18 * 2. Altered source versions must be plainly marked as such, and must not be
19 *    misrepresented as being the original software.
20 * 3. This notice may not be removed or altered from any source distribution.
21 */
22
23/* SLOW INTEGER FORWARD DCT */
24
25#include "jsimd_altivec.h"
26
27
/*
 * Fixed-point configuration.
 *
 * CONST_BITS is the number of fraction bits carried by the F_* multipliers
 * below.  PASS1_BITS is the number of extra fraction bits kept in the
 * intermediate results between pass 1 and pass 2.
 */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Right-shift applied to the multiplied outputs of pass 1 and pass 2. */
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

/*
 * DCT multipliers.  Each value is the real constant quoted beside it,
 * multiplied by 2^CONST_BITS and rounded to the nearest integer.
 */
#define F_0_298  2446           /* FIX(0.298631336) */
#define F_0_390  3196           /* FIX(0.390180644) */
#define F_0_541  4433           /* FIX(0.541196100) */
#define F_0_765  6270           /* FIX(0.765366865) */
#define F_0_899  7373           /* FIX(0.899976223) */
#define F_1_175  9633           /* FIX(1.175875602) */
#define F_1_501  12299          /* FIX(1.501321110) */
#define F_1_847  15137          /* FIX(1.847759065) */
#define F_1_961  16069          /* FIX(1.961570560) */
#define F_2_053  16819          /* FIX(2.053119869) */
#define F_2_562  20995          /* FIX(2.562915447) */
#define F_3_072  25172          /* FIX(3.072711026) */
45
46
/*
 * Shared tail of both DCT passes: computes out1, out2, out3, out5, out6 and
 * out7 from tmp4..tmp7, tmp12 and tmp13.  (out0 and out4 are produced by the
 * DO_FDCT_PASS1 / DO_FDCT_PASS2 wrappers.)
 *
 * PASS is 1 or 2 and is token-pasted to pick pd_descale_p<PASS> (the
 * rounding term fed to vec_msums as its accumulator) and descale_p<PASS>
 * (the right-shift count).
 *
 * All variables and pw_* / pd_* constants named here must be declared by
 * the function that expands the macro.
 */
#define DO_FDCT_COMMON(PASS)  \
do {  \
  /* ---- Even part: outputs 2 and 6 ----  \
   *  \
   * Scalar reference:  \
   *   z1 = (tmp12 + tmp13) * 0.541196100;  \
   *   data2 = z1 + tmp13 * 0.765366865;  \
   *   data6 = z1 + tmp12 * -1.847759065;  \
   *  \
   * Rearranged so that each output is one two-term multiply-sum:  \
   *   data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;  \
   *   data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);  \
   */  \
  \
  /* Interleave tmp13/tmp12 so that vec_msums sees (tmp13, tmp12) pairs. */  \
  tmp1312l = vec_mergeh(tmp13, tmp12);  \
  tmp1312h = vec_mergel(tmp13, tmp12);  \
  \
  /* Multiply-sum with the rounding term as accumulator, then descale. */  \
  out2l = vec_sra(vec_msums(tmp1312l, pw_f130_f054, pd_descale_p##PASS),  \
                  descale_p##PASS);  \
  out2h = vec_sra(vec_msums(tmp1312h, pw_f130_f054, pd_descale_p##PASS),  \
                  descale_p##PASS);  \
  out2 = vec_pack(out2l, out2h);  \
  \
  out6l = vec_sra(vec_msums(tmp1312l, pw_f054_mf130, pd_descale_p##PASS),  \
                  descale_p##PASS);  \
  out6h = vec_sra(vec_msums(tmp1312h, pw_f054_mf130, pd_descale_p##PASS),  \
                  descale_p##PASS);  \
  out6 = vec_pack(out6l, out6h);  \
  \
  /* ---- Odd part: outputs 1, 3, 5 and 7 ----  \
   *  \
   * Scalar reference:  \
   *   z3 = tmp4 + tmp6;  z4 = tmp5 + tmp7;  \
   *   z5 = (z3 + z4) * 1.175875602;  \
   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
   *   z3 += z5;  z4 += z5;  \
   *  \
   * Rearranged (right-hand sides use the z3/z4 sums from the first line):  \
   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
   */  \
  \
  z3 = vec_add(tmp4, tmp6);  \
  z4 = vec_add(tmp5, tmp7);  \
  \
  z34l = vec_mergeh(z3, z4);  \
  z34h = vec_mergel(z3, z4);  \
  \
  /* The rounding term is folded in here, so z3l..z4h can be used directly  \
   * as the accumulators of the four output multiply-sums below.  \
   */  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_descale_p##PASS);  \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_descale_p##PASS);  \
  z4l = vec_msums(z34l, pw_f117_f078, pd_descale_p##PASS);  \
  z4h = vec_msums(z34h, pw_f117_f078, pd_descale_p##PASS);  \
  \
  /* Scalar reference:  \
   *   z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;  \
   *   tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;  \
   *   tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;  \
   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
   *   data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;  \
   *   data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;  \
   *  \
   * Rearranged (right-hand sides use the incoming tmp4..tmp7):  \
   *   tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;  \
   *   tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;  \
   *   tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);  \
   *   tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);  \
   *   data7 = tmp4 + z3;  data5 = tmp5 + z4;  \
   *   data3 = tmp6 + z3;  data1 = tmp7 + z4;  \
   */  \
  \
  tmp47l = vec_mergeh(tmp4, tmp7);  \
  tmp47h = vec_mergel(tmp4, tmp7);  \
  tmp56l = vec_mergeh(tmp5, tmp6);  \
  tmp56h = vec_mergel(tmp5, tmp6);  \
  \
  /* Outputs built from the (tmp4, tmp7) pairs */  \
  out7l = vec_sra(vec_msums(tmp47l, pw_mf060_mf089, z3l), descale_p##PASS);  \
  out7h = vec_sra(vec_msums(tmp47h, pw_mf060_mf089, z3h), descale_p##PASS);  \
  out7 = vec_pack(out7l, out7h);  \
  \
  out1l = vec_sra(vec_msums(tmp47l, pw_mf089_f060, z4l), descale_p##PASS);  \
  out1h = vec_sra(vec_msums(tmp47h, pw_mf089_f060, z4h), descale_p##PASS);  \
  out1 = vec_pack(out1l, out1h);  \
  \
  /* Outputs built from the (tmp5, tmp6) pairs */  \
  out5l = vec_sra(vec_msums(tmp56l, pw_mf050_mf256, z4l), descale_p##PASS);  \
  out5h = vec_sra(vec_msums(tmp56h, pw_mf050_mf256, z4h), descale_p##PASS);  \
  out5 = vec_pack(out5l, out5h);  \
  \
  out3l = vec_sra(vec_msums(tmp56l, pw_mf256_f050, z3l), descale_p##PASS);  \
  out3h = vec_sra(vec_msums(tmp56h, pw_mf256_f050, z3h), descale_p##PASS);  \
  out3 = vec_pack(out3l, out3h);  \
} while (0)
147
/*
 * First DCT pass.  Reads tmp0..tmp7 and produces out0..out7.
 *
 * out0 and out4 are plain sums/differences, so they are shifted left by
 * PASS1_BITS here; the remaining outputs come from DO_FDCT_COMMON(1), which
 * descales by DESCALE_P1 (= CONST_BITS - PASS1_BITS).
 */
#define DO_FDCT_PASS1()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  \
  out0 = vec_sl(vec_add(tmp10, tmp11), pass1_bits);  \
  out4 = vec_sl(vec_sub(tmp10, tmp11), pass1_bits);  \
  \
  /* Outputs 1, 2, 3, 5, 6 and 7 */  \
  DO_FDCT_COMMON(1);  \
} while (0)
164
/*
 * Second DCT pass.  Reads tmp0..tmp7 and produces out0..out7.
 *
 * out0 and out4 have the rounding term pw_descale_p2x added and are then
 * shifted right (arithmetically) by PASS1_BITS; the remaining outputs come
 * from DO_FDCT_COMMON(2), which descales by DESCALE_P2
 * (= CONST_BITS + PASS1_BITS).
 */
#define DO_FDCT_PASS2()  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(tmp0, tmp3);  \
  tmp11 = vec_add(tmp1, tmp2);  \
  tmp12 = vec_sub(tmp1, tmp2);  \
  tmp13 = vec_sub(tmp0, tmp3);  \
  \
  out0 = vec_sra(vec_add(vec_add(tmp10, tmp11), pw_descale_p2x),  \
                 pass1_bits);  \
  out4 = vec_sra(vec_add(vec_sub(tmp10, tmp11), pw_descale_p2x),  \
                 pass1_bits);  \
  \
  /* Outputs 1, 2, 3, 5, 6 and 7 */  \
  DO_FDCT_COMMON(2);  \
} while (0)
183
184
185void
186jsimd_fdct_islow_altivec (DCTELEM *data)
187{
188  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
189    col0, col1, col2, col3, col4, col5, col6, col7,
190    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
191    tmp47l, tmp47h, tmp56l, tmp56h, tmp1312l, tmp1312h,
192    z3, z4, z34l, z34h,
193    out0, out1, out2, out3, out4, out5, out6, out7;
194  __vector int z3l, z3h, z4l, z4h,
195    out1l, out1h, out2l, out2h, out3l, out3h, out5l, out5h, out6l, out6h,
196    out7l, out7h;
197
198  /* Constants */
199  __vector short
200    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
201    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
202    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
203    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
204    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
205    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
206    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
207    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) },
208    pw_descale_p2x = { __8X(1 << (PASS1_BITS - 1)) };
209  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
210  __vector int pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
211    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
212  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
213    descale_p2 = { __4X(DESCALE_P2) };
214
215  /* Pass 1: process rows */
216
217  row0 = vec_ld(0, data);
218  row1 = vec_ld(16, data);
219  row2 = vec_ld(32, data);
220  row3 = vec_ld(48, data);
221  row4 = vec_ld(64, data);
222  row5 = vec_ld(80, data);
223  row6 = vec_ld(96, data);
224  row7 = vec_ld(112, data);
225
226  TRANSPOSE(row, col);
227
228  tmp0 = vec_add(col0, col7);
229  tmp7 = vec_sub(col0, col7);
230  tmp1 = vec_add(col1, col6);
231  tmp6 = vec_sub(col1, col6);
232  tmp2 = vec_add(col2, col5);
233  tmp5 = vec_sub(col2, col5);
234  tmp3 = vec_add(col3, col4);
235  tmp4 = vec_sub(col3, col4);
236
237  DO_FDCT_PASS1();
238
239  /* Pass 2: process columns */
240
241  TRANSPOSE(out, row);
242
243  tmp0 = vec_add(row0, row7);
244  tmp7 = vec_sub(row0, row7);
245  tmp1 = vec_add(row1, row6);
246  tmp6 = vec_sub(row1, row6);
247  tmp2 = vec_add(row2, row5);
248  tmp5 = vec_sub(row2, row5);
249  tmp3 = vec_add(row3, row4);
250  tmp4 = vec_sub(row3, row4);
251
252  DO_FDCT_PASS2();
253
254  vec_st(out0, 0, data);
255  vec_st(out1, 16, data);
256  vec_st(out2, 32, data);
257  vec_st(out3, 48, data);
258  vec_st(out4, 64, data);
259  vec_st(out5, 80, data);
260  vec_st(out6, 96, data);
261  vec_st(out7, 112, data);
262}
263