jsimd_x86_64.c revision ba82ddf6ca0270d65e3b8fb56b2d3116cf423709
1/*
2 * jsimd_x86_64.c
3 *
4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 * Copyright 2009 D. R. Commander
6 *
7 * Based on the x86 SIMD extension for IJG JPEG library,
8 * Copyright (C) 1999-2006, MIYASAKA Masaru.
9 *
10 * This file contains the interface between the "normal" portions
11 * of the library and the SIMD implementations when running on a
12 * x86_64 architecture.
13 */
14
15#define JPEG_INTERNALS
16#include "../jinclude.h"
17#include "../jpeglib.h"
18#include "../jsimd.h"
19#include "../jdct.h"
20#include "../jsimddct.h"
21#include "simd/jsimd.h"
22
/*
 * In the PIC cases, we have no guarantee that constants will keep
 * their alignment. These macros allow us to verify it at runtime.
 *
 * IS_ALIGNED(ptr, order) is non-zero when the address held in ptr is a
 * multiple of 2^order bytes.  The address is converted to size_t (rather
 * than unsigned) so that the pointer-to-integer cast does not narrow a
 * 64-bit pointer, and both arguments are parenthesised so that arbitrary
 * expressions can be passed safely.
 */
#define IS_ALIGNED(ptr, order) \
  ((((size_t)(ptr)) & ((((size_t)1) << (order)) - 1)) == 0)

#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
30
31GLOBAL(int)
32jsimd_can_rgb_ycc (void)
33{
34  /* The code is optimised for these values only */
35  if (BITS_IN_JSAMPLE != 8)
36    return 0;
37  if (sizeof(JDIMENSION) != 4)
38    return 0;
39  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
40    return 0;
41
42  if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2))
43    return 0;
44
45  return 1;
46}
47
48GLOBAL(int)
49jsimd_can_ycc_rgb (void)
50{
51  /* The code is optimised for these values only */
52  if (BITS_IN_JSAMPLE != 8)
53    return 0;
54  if (sizeof(JDIMENSION) != 4)
55    return 0;
56  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
57    return 0;
58
59  if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2))
60    return 0;
61
62  return 1;
63}
64
65GLOBAL(void)
66jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
67                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
68                       JDIMENSION output_row, int num_rows)
69{
70  void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
71
72  switch(cinfo->in_color_space)
73  {
74    case JCS_EXT_RGB:
75      sse2fct=jsimd_extrgb_ycc_convert_sse2;
76      break;
77    case JCS_EXT_RGBX:
78      sse2fct=jsimd_extrgbx_ycc_convert_sse2;
79      break;
80    case JCS_EXT_BGR:
81      sse2fct=jsimd_extbgr_ycc_convert_sse2;
82      break;
83    case JCS_EXT_BGRX:
84      sse2fct=jsimd_extbgrx_ycc_convert_sse2;
85      break;
86    case JCS_EXT_XBGR:
87      sse2fct=jsimd_extxbgr_ycc_convert_sse2;
88      break;
89    case JCS_EXT_XRGB:
90      sse2fct=jsimd_extxrgb_ycc_convert_sse2;
91      break;
92    default:
93      sse2fct=jsimd_rgb_ycc_convert_sse2;
94      break;
95  }
96
97  sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
98}
99
100GLOBAL(void)
101jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
102                       JSAMPIMAGE input_buf, JDIMENSION input_row,
103                       JSAMPARRAY output_buf, int num_rows)
104{
105  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
106
107  switch(cinfo->out_color_space)
108  {
109    case JCS_EXT_RGB:
110      sse2fct=jsimd_ycc_extrgb_convert_sse2;
111      break;
112    case JCS_EXT_RGBX:
113      sse2fct=jsimd_ycc_extrgbx_convert_sse2;
114      break;
115    case JCS_EXT_BGR:
116      sse2fct=jsimd_ycc_extbgr_convert_sse2;
117      break;
118    case JCS_EXT_BGRX:
119      sse2fct=jsimd_ycc_extbgrx_convert_sse2;
120      break;
121    case JCS_EXT_XBGR:
122      sse2fct=jsimd_ycc_extxbgr_convert_sse2;
123      break;
124    case JCS_EXT_XRGB:
125      sse2fct=jsimd_ycc_extxrgb_convert_sse2;
126      break;
127    default:
128      sse2fct=jsimd_ycc_rgb_convert_sse2;
129      break;
130  }
131
132  sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
133}
134
135GLOBAL(int)
136jsimd_can_h2v2_downsample (void)
137{
138  /* The code is optimised for these values only */
139  if (BITS_IN_JSAMPLE != 8)
140    return 0;
141  if (sizeof(JDIMENSION) != 4)
142    return 0;
143
144  return 1;
145}
146
147GLOBAL(int)
148jsimd_can_h2v1_downsample (void)
149{
150  /* The code is optimised for these values only */
151  if (BITS_IN_JSAMPLE != 8)
152    return 0;
153  if (sizeof(JDIMENSION) != 4)
154    return 0;
155
156  return 1;
157}
158
159GLOBAL(void)
160jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
161                       JSAMPARRAY input_data, JSAMPARRAY output_data)
162{
163  jsimd_h2v2_downsample_sse2(cinfo->image_width,
164                             cinfo->max_v_samp_factor,
165                             compptr->v_samp_factor,
166                             compptr->width_in_blocks,
167                             input_data, output_data);
168}
169
170GLOBAL(void)
171jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
172                       JSAMPARRAY input_data, JSAMPARRAY output_data)
173{
174  jsimd_h2v1_downsample_sse2(cinfo->image_width,
175                             cinfo->max_v_samp_factor,
176                             compptr->v_samp_factor,
177                             compptr->width_in_blocks,
178                             input_data, output_data);
179}
180
181GLOBAL(int)
182jsimd_can_h2v2_upsample (void)
183{
184  /* The code is optimised for these values only */
185  if (BITS_IN_JSAMPLE != 8)
186    return 0;
187  if (sizeof(JDIMENSION) != 4)
188    return 0;
189
190  return 1;
191}
192
193GLOBAL(int)
194jsimd_can_h2v1_upsample (void)
195{
196  /* The code is optimised for these values only */
197  if (BITS_IN_JSAMPLE != 8)
198    return 0;
199  if (sizeof(JDIMENSION) != 4)
200    return 0;
201
202  return 1;
203}
204
205GLOBAL(void)
206jsimd_h2v2_upsample (j_decompress_ptr cinfo,
207                     jpeg_component_info * compptr,
208                     JSAMPARRAY input_data,
209                     JSAMPARRAY * output_data_ptr)
210{
211  jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
212                           cinfo->output_width,
213                           input_data, output_data_ptr);
214}
215
216GLOBAL(void)
217jsimd_h2v1_upsample (j_decompress_ptr cinfo,
218                     jpeg_component_info * compptr,
219                     JSAMPARRAY input_data,
220                     JSAMPARRAY * output_data_ptr)
221{
222  jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
223                           cinfo->output_width,
224                           input_data, output_data_ptr);
225}
226
227GLOBAL(int)
228jsimd_can_h2v2_fancy_upsample (void)
229{
230  /* The code is optimised for these values only */
231  if (BITS_IN_JSAMPLE != 8)
232    return 0;
233  if (sizeof(JDIMENSION) != 4)
234    return 0;
235
236  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
237    return 0;
238
239  return 1;
240}
241
242GLOBAL(int)
243jsimd_can_h2v1_fancy_upsample (void)
244{
245  /* The code is optimised for these values only */
246  if (BITS_IN_JSAMPLE != 8)
247    return 0;
248  if (sizeof(JDIMENSION) != 4)
249    return 0;
250
251  if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
252    return 0;
253
254  return 1;
255}
256
257GLOBAL(void)
258jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
259                           jpeg_component_info * compptr,
260                           JSAMPARRAY input_data,
261                           JSAMPARRAY * output_data_ptr)
262{
263  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
264                                 compptr->downsampled_width,
265                                 input_data, output_data_ptr);
266}
267
268GLOBAL(void)
269jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
270                           jpeg_component_info * compptr,
271                           JSAMPARRAY input_data,
272                           JSAMPARRAY * output_data_ptr)
273{
274  jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor,
275                                 compptr->downsampled_width,
276                                 input_data, output_data_ptr);
277}
278
279GLOBAL(int)
280jsimd_can_h2v2_merged_upsample (void)
281{
282  /* The code is optimised for these values only */
283  if (BITS_IN_JSAMPLE != 8)
284    return 0;
285  if (sizeof(JDIMENSION) != 4)
286    return 0;
287
288  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
289    return 0;
290
291  return 1;
292}
293
294GLOBAL(int)
295jsimd_can_h2v1_merged_upsample (void)
296{
297  /* The code is optimised for these values only */
298  if (BITS_IN_JSAMPLE != 8)
299    return 0;
300  if (sizeof(JDIMENSION) != 4)
301    return 0;
302
303  if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
304    return 0;
305
306  return 1;
307}
308
309GLOBAL(void)
310jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
311                            JSAMPIMAGE input_buf,
312                            JDIMENSION in_row_group_ctr,
313                            JSAMPARRAY output_buf)
314{
315  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
316
317  switch(cinfo->out_color_space)
318  {
319    case JCS_EXT_RGB:
320      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
321      break;
322    case JCS_EXT_RGBX:
323      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
324      break;
325    case JCS_EXT_BGR:
326      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
327      break;
328    case JCS_EXT_BGRX:
329      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
330      break;
331    case JCS_EXT_XBGR:
332      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
333      break;
334    case JCS_EXT_XRGB:
335      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
336      break;
337    default:
338      sse2fct=jsimd_h2v2_merged_upsample_sse2;
339      break;
340  }
341
342  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
343}
344
345GLOBAL(void)
346jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
347                            JSAMPIMAGE input_buf,
348                            JDIMENSION in_row_group_ctr,
349                            JSAMPARRAY output_buf)
350{
351  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
352
353  switch(cinfo->out_color_space)
354  {
355    case JCS_EXT_RGB:
356      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
357      break;
358    case JCS_EXT_RGBX:
359      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
360      break;
361    case JCS_EXT_BGR:
362      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
363      break;
364    case JCS_EXT_BGRX:
365      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
366      break;
367    case JCS_EXT_XBGR:
368      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
369      break;
370    case JCS_EXT_XRGB:
371      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
372      break;
373    default:
374      sse2fct=jsimd_h2v1_merged_upsample_sse2;
375      break;
376  }
377
378  sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
379}
380
381GLOBAL(int)
382jsimd_can_convsamp (void)
383{
384  /* The code is optimised for these values only */
385  if (DCTSIZE != 8)
386    return 0;
387  if (BITS_IN_JSAMPLE != 8)
388    return 0;
389  if (sizeof(JDIMENSION) != 4)
390    return 0;
391  if (sizeof(DCTELEM) != 2)
392    return 0;
393
394  return 1;
395}
396
397GLOBAL(int)
398jsimd_can_convsamp_float (void)
399{
400  /* The code is optimised for these values only */
401  if (DCTSIZE != 8)
402    return 0;
403  if (BITS_IN_JSAMPLE != 8)
404    return 0;
405  if (sizeof(JDIMENSION) != 4)
406    return 0;
407  if (sizeof(FAST_FLOAT) != 4)
408    return 0;
409
410  return 1;
411}
412
413GLOBAL(void)
414jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
415                DCTELEM * workspace)
416{
417  jsimd_convsamp_sse2(sample_data, start_col, workspace);
418}
419
420GLOBAL(void)
421jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
422                      FAST_FLOAT * workspace)
423{
424  jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
425}
426
427GLOBAL(int)
428jsimd_can_fdct_islow (void)
429{
430  /* The code is optimised for these values only */
431  if (DCTSIZE != 8)
432    return 0;
433  if (sizeof(DCTELEM) != 2)
434    return 0;
435
436  if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
437    return 0;
438
439  return 1;
440}
441
442GLOBAL(int)
443jsimd_can_fdct_ifast (void)
444{
445  /* The code is optimised for these values only */
446  if (DCTSIZE != 8)
447    return 0;
448  if (sizeof(DCTELEM) != 2)
449    return 0;
450
451  if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
452    return 0;
453
454  return 1;
455}
456
457GLOBAL(int)
458jsimd_can_fdct_float (void)
459{
460  /* The code is optimised for these values only */
461  if (DCTSIZE != 8)
462    return 0;
463  if (sizeof(FAST_FLOAT) != 4)
464    return 0;
465
466  if (!IS_ALIGNED_SSE(jconst_fdct_float_sse))
467    return 0;
468
469  return 1;
470}
471
472GLOBAL(void)
473jsimd_fdct_islow (DCTELEM * data)
474{
475  jsimd_fdct_islow_sse2(data);
476}
477
478GLOBAL(void)
479jsimd_fdct_ifast (DCTELEM * data)
480{
481  jsimd_fdct_ifast_sse2(data);
482}
483
484GLOBAL(void)
485jsimd_fdct_float (FAST_FLOAT * data)
486{
487  jsimd_fdct_float_sse(data);
488}
489
490GLOBAL(int)
491jsimd_can_quantize (void)
492{
493  /* The code is optimised for these values only */
494  if (DCTSIZE != 8)
495    return 0;
496  if (sizeof(JCOEF) != 2)
497    return 0;
498  if (sizeof(DCTELEM) != 2)
499    return 0;
500
501  return 1;
502}
503
504GLOBAL(int)
505jsimd_can_quantize_float (void)
506{
507  /* The code is optimised for these values only */
508  if (DCTSIZE != 8)
509    return 0;
510  if (sizeof(JCOEF) != 2)
511    return 0;
512  if (sizeof(FAST_FLOAT) != 4)
513    return 0;
514
515  return 1;
516}
517
518GLOBAL(void)
519jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
520                DCTELEM * workspace)
521{
522  jsimd_quantize_sse2(coef_block, divisors, workspace);
523}
524
525GLOBAL(void)
526jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
527                      FAST_FLOAT * workspace)
528{
529  jsimd_quantize_float_sse2(coef_block, divisors, workspace);
530}
531
532GLOBAL(int)
533jsimd_can_idct_2x2 (void)
534{
535  /* The code is optimised for these values only */
536  if (DCTSIZE != 8)
537    return 0;
538  if (sizeof(JCOEF) != 2)
539    return 0;
540  if (BITS_IN_JSAMPLE != 8)
541    return 0;
542  if (sizeof(JDIMENSION) != 4)
543    return 0;
544  if (sizeof(ISLOW_MULT_TYPE) != 2)
545    return 0;
546
547  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
548    return 0;
549
550  return 1;
551}
552
553GLOBAL(int)
554jsimd_can_idct_4x4 (void)
555{
556  /* The code is optimised for these values only */
557  if (DCTSIZE != 8)
558    return 0;
559  if (sizeof(JCOEF) != 2)
560    return 0;
561  if (BITS_IN_JSAMPLE != 8)
562    return 0;
563  if (sizeof(JDIMENSION) != 4)
564    return 0;
565  if (sizeof(ISLOW_MULT_TYPE) != 2)
566    return 0;
567
568  if (!IS_ALIGNED_SSE(jconst_idct_red_sse2))
569    return 0;
570
571  return 1;
572}
573
574GLOBAL(void)
575jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
576                JCOEFPTR coef_block, JSAMPARRAY output_buf,
577                JDIMENSION output_col)
578{
579  jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col);
580}
581
582GLOBAL(void)
583jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
584                JCOEFPTR coef_block, JSAMPARRAY output_buf,
585                JDIMENSION output_col)
586{
587  jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col);
588}
589
590GLOBAL(int)
591jsimd_can_idct_islow (void)
592{
593  /* The code is optimised for these values only */
594  if (DCTSIZE != 8)
595    return 0;
596  if (sizeof(JCOEF) != 2)
597    return 0;
598  if (BITS_IN_JSAMPLE != 8)
599    return 0;
600  if (sizeof(JDIMENSION) != 4)
601    return 0;
602  if (sizeof(ISLOW_MULT_TYPE) != 2)
603    return 0;
604
605  if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2))
606    return 0;
607
608  return 1;
609}
610
611GLOBAL(int)
612jsimd_can_idct_ifast (void)
613{
614  /* The code is optimised for these values only */
615  if (DCTSIZE != 8)
616    return 0;
617  if (sizeof(JCOEF) != 2)
618    return 0;
619  if (BITS_IN_JSAMPLE != 8)
620    return 0;
621  if (sizeof(JDIMENSION) != 4)
622    return 0;
623  if (sizeof(IFAST_MULT_TYPE) != 2)
624    return 0;
625  if (IFAST_SCALE_BITS != 2)
626    return 0;
627
628  if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2))
629    return 0;
630
631  return 1;
632}
633
634GLOBAL(int)
635jsimd_can_idct_float (void)
636{
637  if (DCTSIZE != 8)
638    return 0;
639  if (sizeof(JCOEF) != 2)
640    return 0;
641  if (BITS_IN_JSAMPLE != 8)
642    return 0;
643  if (sizeof(JDIMENSION) != 4)
644    return 0;
645  if (sizeof(FAST_FLOAT) != 4)
646    return 0;
647  if (sizeof(FLOAT_MULT_TYPE) != 4)
648    return 0;
649
650  if (!IS_ALIGNED_SSE(jconst_idct_float_sse2))
651    return 0;
652
653  return 1;
654}
655
656GLOBAL(void)
657jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
658                JCOEFPTR coef_block, JSAMPARRAY output_buf,
659                JDIMENSION output_col)
660{
661  jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col);
662}
663
664GLOBAL(void)
665jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
666                JCOEFPTR coef_block, JSAMPARRAY output_buf,
667                JDIMENSION output_col)
668{
669  jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col);
670}
671
672GLOBAL(void)
673jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
674                JCOEFPTR coef_block, JSAMPARRAY output_buf,
675                JDIMENSION output_col)
676{
677  jsimd_idct_float_sse2(compptr->dct_table, coef_block,
678                        output_buf, output_col);
679}
680
681