/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */

#include "jsimd_altivec.h"

/* NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result).
 */
#if __BIG_ENDIAN__

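/* vec_ld() ignores the low four bits of the address, so an offset load still
 * fetches the 16-byte block that contains all eight samples.  vec_lvsl()
 * generates the permute control vector that rotates them into place.
 */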
#define LOAD_ROW(row) {  \
  elemptr = sample_data[row] + start_col;  \
  in##row = vec_ld(0, elemptr);  \
  if ((size_t)elemptr & 15)  \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
}

#else

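/* On little-endian machines, VSX loads (vec_vsx_ld()) handle unaligned
 * addresses in hardware, so no permutation is needed.
 */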
#define LOAD_ROW(row) {  \
  elemptr = sample_data[row] + start_col;  \
  in##row = vec_vsx_ld(0, elemptr);  \
}

#endif


void
jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
                        DCTELEM *workspace)
{
  JSAMPROW elemptr;

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short out0, out1, out2, out3, out4, out5, out6, out7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  __vector unsigned char pb_zero = { __16X(0) };

  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

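  /* Zero-extend the 8-bit samples to 16 bits.  VEC_UNPACKHU() (defined in
   * jsimd_altivec.h) merges each input vector with pb_zero to widen the
   * unsigned bytes into halfwords.
   */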
  out0 = (__vector short)VEC_UNPACKHU(in0);
  out1 = (__vector short)VEC_UNPACKHU(in1);
  out2 = (__vector short)VEC_UNPACKHU(in2);
  out3 = (__vector short)VEC_UNPACKHU(in3);
  out4 = (__vector short)VEC_UNPACKHU(in4);
  out5 = (__vector short)VEC_UNPACKHU(in5);
  out6 = (__vector short)VEC_UNPACKHU(in6);
  out7 = (__vector short)VEC_UNPACKHU(in7);

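  /* Level-shift the samples from unsigned [0, 255] to signed [-128, 127]
   * (CENTERJSAMPLE == 128), as the forward DCT expects.
   */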
  out0 = vec_sub(out0, pw_centerjsamp);
  out1 = vec_sub(out1, pw_centerjsamp);
  out2 = vec_sub(out2, pw_centerjsamp);
  out3 = vec_sub(out3, pw_centerjsamp);
  out4 = vec_sub(out4, pw_centerjsamp);
  out5 = vec_sub(out5, pw_centerjsamp);
  out6 = vec_sub(out6, pw_centerjsamp);
  out7 = vec_sub(out7, pw_centerjsamp);

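  /* vec_st() silently truncates unaligned addresses, so workspace must be
   * (and is assumed to be) 16-byte-aligned.
   */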
  vec_st(out0, 0, workspace);
  vec_st(out1, 16, workspace);
  vec_st(out2, 32, workspace);
  vec_st(out3, 48, workspace);
  vec_st(out4, 64, workspace);
  vec_st(out5, 80, workspace);
  vec_st(out6, 96, workspace);
  vec_st(out7, 112, workspace);
}


#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
   We basically need an unsigned equivalent of vec_madds(). */

#define MULTIPLY(vs0, vs1, out) {  \
  tmpe = vec_mule((__vector unsigned short)vs0,  \
                  (__vector unsigned short)vs1);  \
  tmpo = vec_mulo((__vector unsigned short)vs0,  \
                  (__vector unsigned short)vs1);  \
  out = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
                                 (__vector unsigned short)tmpo,  \
                                 shift_pack_index);  \
}
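
/* vec_mule()/vec_mulo() compute the full 32-bit products of the even/odd
 * halfword pairs, and vec_perm() re-interleaves the upper 16 bits of each
 * product, so MULTIPLY(vs0, vs1, out) computes, per halfword element,
 * out = ((unsigned)vs0 * (unsigned)vs1) >> 16.
 */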

void
jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
                        DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
    corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
    recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
    scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;

  /* Constants */
  __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
#else
  __vector unsigned char shift_pack_index =
    {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
#endif
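  /* shift_pack_index gathers the most significant halfword of each 32-bit
   * product in MULTIPLY(); that halfword occupies bytes 0-1 of each word on
   * big-endian machines and bytes 2-3 on little-endian machines.
   */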

  row0 = vec_ld(0, workspace);
  row1 = vec_ld(16, workspace);
  row2 = vec_ld(32, workspace);
  row3 = vec_ld(48, workspace);
  row4 = vec_ld(64, workspace);
  row5 = vec_ld(80, workspace);
  row6 = vec_ld(96, workspace);
  row7 = vec_ld(112, workspace);

  /* Branch-less absolute value:  rowNs = (rowN < 0) ? -1 : 0, and
   * (rowN ^ rowNs) - rowNs == |rowN|.  The sign masks are retained so that
   * the signs can be restored after quantization. */
  row0s = vec_sra(row0, pw_word_bit_m1);
  row1s = vec_sra(row1, pw_word_bit_m1);
  row2s = vec_sra(row2, pw_word_bit_m1);
  row3s = vec_sra(row3, pw_word_bit_m1);
  row4s = vec_sra(row4, pw_word_bit_m1);
  row5s = vec_sra(row5, pw_word_bit_m1);
  row6s = vec_sra(row6, pw_word_bit_m1);
  row7s = vec_sra(row7, pw_word_bit_m1);
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

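  /* divisors points to a 4 * DCTSIZE2 array: the fixed-point reciprocals,
   * then the pre-multiply correction (rounding) terms, then the post-multiply
   * scale factors, then the shift counts (see compute_reciprocal() in
   * jcdctmgr.c).  vec_ld() offsets are specified in bytes, so the correction
   * terms start at byte offset DCTSIZE2 * 2.
   */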
  corr0 = vec_ld(DCTSIZE2 * 2, divisors);
  corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
  corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
  corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
  corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
  corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
  corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
  corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);

  row0 = vec_add(row0, corr0);
  row1 = vec_add(row1, corr1);
  row2 = vec_add(row2, corr2);
  row3 = vec_add(row3, corr3);
  row4 = vec_add(row4, corr4);
  row5 = vec_add(row5, corr5);
  row6 = vec_add(row6, corr6);
  row7 = vec_add(row7, corr7);

  recip0 = vec_ld(0, divisors);
  recip1 = vec_ld(16, divisors);
  recip2 = vec_ld(32, divisors);
  recip3 = vec_ld(48, divisors);
  recip4 = vec_ld(64, divisors);
  recip5 = vec_ld(80, divisors);
  recip6 = vec_ld(96, divisors);
  recip7 = vec_ld(112, divisors);

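  /* Multiply-high by the reciprocals:
   * rowN = (|rowN| + corrN) * recipN >> 16
   */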
  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  scale0 = vec_ld(DCTSIZE2 * 4, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);

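  /* A second multiply-high by the power-of-two scale factors performs the
   * remainder of the right shift, completing the fixed-point division by the
   * quantization step.
   */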
  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

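  /* Re-apply the original signs:  (rowN ^ rowNs) - rowNs negates the
   * elements whose sign mask is all 1's and leaves the others unchanged.
   */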
  row0 = vec_xor(row0, row0s);
  row1 = vec_xor(row1, row1s);
  row2 = vec_xor(row2, row2s);
  row3 = vec_xor(row3, row3s);
  row4 = vec_xor(row4, row4s);
  row5 = vec_xor(row5, row5s);
  row6 = vec_xor(row6, row6s);
  row7 = vec_xor(row7, row7s);
  row0 = vec_sub(row0, row0s);
  row1 = vec_sub(row1, row1s);
  row2 = vec_sub(row2, row2s);
  row3 = vec_sub(row3, row3s);
  row4 = vec_sub(row4, row4s);
  row5 = vec_sub(row5, row5s);
  row6 = vec_sub(row6, row6s);
  row7 = vec_sub(row7, row7s);

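  /* coef_block is assumed to be 16-byte-aligned, as vec_st() requires. */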
  vec_st(row0, 0, coef_block);
  vec_st(row1, 16, coef_block);
  vec_st(row2, 32, coef_block);
  vec_st(row3, 48, coef_block);
  vec_st(row4, 64, coef_block);
  vec_st(row5, 80, coef_block);
  vec_st(row6, 96, coef_block);
  vec_st(row7, 112, coef_block);
}