1/*
2 * jsimd_arm64.c
3 *
4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander.
6 * Copyright (C) 2015-2016, Matthieu Darbois.
7 *
8 * Based on the x86 SIMD extension for IJG JPEG library,
9 * Copyright (C) 1999-2006, MIYASAKA Masaru.
10 * For conditions of distribution and use, see copyright notice in jsimdext.inc
11 *
12 * This file contains the interface between the "normal" portions
13 * of the library and the SIMD implementations when running on a
14 * 64-bit ARM architecture.
15 */
16
17#define JPEG_INTERNALS
18#include "../jinclude.h"
19#include "../jpeglib.h"
20#include "../jsimd.h"
21#include "../jdct.h"
22#include "../jsimddct.h"
23#include "jsimd.h"
24
25#include <stdio.h>
26#include <string.h>
27#include <ctype.h>
28
/*
 * Bit flags kept in simd_features.  Each one records that the matching
 * instruction (ld3, st3, tbl) is considered fast on the running CPU; when a
 * flag is cleared, the dispatchers in this file pick the "_slowld3",
 * "_slowst3" or "_slowtbl" routine instead.
 */
#define JSIMD_FASTLD3 0x1
#define JSIMD_FASTST3 0x2
#define JSIMD_FASTTBL 0x4

/* All bits set means "not probed yet"; init_simd() replaces it on first use */
static unsigned int simd_support = ~0U;
/* Non-zero while the SIMD Huffman encoder may be offered to callers */
static unsigned int simd_huffman = 1U;
/* Every instruction is assumed fast until detection or the user says no */
static unsigned int simd_features =
  JSIMD_FASTLD3 | JSIMD_FASTST3 | JSIMD_FASTTBL;
37
38#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
39
/* Largest line buffer (1 MiB) that is tried when reading /proc/cpuinfo */
#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1 << 20)
41
/*
 * Test whether one line of /proc/cpuinfo assigns a given value to a field.
 *
 * buffer: NUL-terminated line to inspect (not modified).
 * field:  field name that must appear at the very start of the line.
 * value:  word to look for in the remainder of the line.
 *
 * Returns 1 if the line starts with 'field' and 'value' occurs after it as a
 * whitespace-delimited word; returns 0 otherwise, including when 'value' is
 * the empty string.
 */
LOCAL(int)
check_cpuinfo (char *buffer, const char *field, char *value)
{
  size_t field_len, value_len;
  char *start, *search, *p;

  if (*value == 0)
    return 0;

  field_len = strlen(field);
  if (strncmp(buffer, field, field_len) != 0)
    return 0;

  /* <ctype.h> functions need an unsigned char value; a negative plain char
     would be undefined behaviour. */
  start = buffer + field_len;
  while (isspace((unsigned char)*start))
    start++;

  /* Check if 'value' is present in the buffer as a separate word.  'start'
     stays fixed so that the "preceded by whitespace" test is applied to
     every candidate; resuming the search by moving the start pointer itself
     would eventually make a mid-word match look like the first word. */
  value_len = strlen(value);
  for (search = start; (p = strstr(search, value)) != NULL; search = p + 1) {
    if (p > start && !isspace((unsigned char)p[-1]))
      continue;                 /* glued to the end of a longer word */
    if (p[value_len] != 0 && !isspace((unsigned char)p[value_len]))
      continue;                 /* only a prefix of a longer word */
    return 1;
  }
  return 0;
}
69
70LOCAL(int)
71parse_proc_cpuinfo (int bufsize)
72{
73  char *buffer = (char *)malloc(bufsize);
74  FILE *fd;
75
76  if (!buffer)
77    return 0;
78
79  fd = fopen("/proc/cpuinfo", "r");
80  if (fd) {
81    while (fgets(buffer, bufsize, fd)) {
82      if (!strchr(buffer, '\n') && !feof(fd)) {
83        /* "impossible" happened - insufficient size of the buffer! */
84        fclose(fd);
85        free(buffer);
86        return 0;
87      }
88      if (check_cpuinfo(buffer, "CPU part", "0xd03") ||
89          check_cpuinfo(buffer, "CPU part", "0xd07"))
90        /* The Cortex-A53 has a slow tbl implementation.  We can gain a few
91           percent speedup by disabling the use of that instruction.  The
92           speedup on Cortex-A57 is more subtle but still measurable. */
93        simd_features &= ~JSIMD_FASTTBL;
94      else if (check_cpuinfo(buffer, "CPU part", "0x0a1"))
95        /* The SIMD version of Huffman encoding is slower than the C version on
96           Cavium ThunderX.  Also, ld3 and st3 are abyssmally slow on that
97           CPU. */
98        simd_huffman = simd_features = 0;
99    }
100    fclose(fd);
101  }
102  free(buffer);
103  return 1;
104}
105
106#endif
107
108/*
109 * Check what SIMD accelerations are supported.
110 *
111 * FIXME: This code is racy under a multi-threaded environment.
112 */
113
114/*
115 * ARMv8 architectures support NEON extensions by default.
116 * It is no longer optional as it was with ARMv7.
117 */
118
119
120LOCAL(void)
121init_simd (void)
122{
123  char *env = NULL;
124#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
125  int bufsize = 1024; /* an initial guess for the line buffer size limit */
126#endif
127
128  if (simd_support != ~0U)
129    return;
130
131  simd_support = 0;
132
133  simd_support |= JSIMD_ARM_NEON;
134#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__)
135  while (!parse_proc_cpuinfo(bufsize)) {
136    bufsize *= 2;
137    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
138      break;
139  }
140#endif
141
142  /* Force different settings through environment variables */
143  env = getenv("JSIMD_FORCENEON");
144  if ((env != NULL) && (strcmp(env, "1") == 0))
145    simd_support = JSIMD_ARM_NEON;
146  env = getenv("JSIMD_FORCENONE");
147  if ((env != NULL) && (strcmp(env, "1") == 0))
148    simd_support = 0;
149  env = getenv("JSIMD_NOHUFFENC");
150  if ((env != NULL) && (strcmp(env, "1") == 0))
151    simd_huffman = 0;
152  env = getenv("JSIMD_FASTLD3");
153  if ((env != NULL) && (strcmp(env, "1") == 0))
154    simd_features |= JSIMD_FASTLD3;
155  if ((env != NULL) && (strcmp(env, "0") == 0))
156    simd_features &= ~JSIMD_FASTLD3;
157  env = getenv("JSIMD_FASTST3");
158  if ((env != NULL) && (strcmp(env, "1") == 0))
159    simd_features |= JSIMD_FASTST3;
160  if ((env != NULL) && (strcmp(env, "0") == 0))
161    simd_features &= ~JSIMD_FASTST3;
162}
163
164GLOBAL(int)
165jsimd_can_rgb_ycc (void)
166{
167  init_simd();
168
169  /* The code is optimised for these values only */
170  if (BITS_IN_JSAMPLE != 8)
171    return 0;
172  if (sizeof(JDIMENSION) != 4)
173    return 0;
174  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
175    return 0;
176
177  if (simd_support & JSIMD_ARM_NEON)
178    return 1;
179
180  return 0;
181}
182
183GLOBAL(int)
184jsimd_can_rgb_gray (void)
185{
186  init_simd();
187
188  return 0;
189}
190
191GLOBAL(int)
192jsimd_can_ycc_rgb (void)
193{
194  init_simd();
195
196  /* The code is optimised for these values only */
197  if (BITS_IN_JSAMPLE != 8)
198    return 0;
199  if (sizeof(JDIMENSION) != 4)
200    return 0;
201  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
202    return 0;
203
204  if (simd_support & JSIMD_ARM_NEON)
205    return 1;
206
207  return 0;
208}
209
210GLOBAL(int)
211jsimd_can_ycc_rgb565 (void)
212{
213  init_simd();
214
215  /* The code is optimised for these values only */
216  if (BITS_IN_JSAMPLE != 8)
217    return 0;
218  if (sizeof(JDIMENSION) != 4)
219    return 0;
220
221  if (simd_support & JSIMD_ARM_NEON)
222    return 1;
223
224  return 0;
225}
226
227GLOBAL(void)
228jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
229                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
230                       JDIMENSION output_row, int num_rows)
231{
232  void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
233
234  switch(cinfo->in_color_space) {
235    case JCS_EXT_RGB:
236      if (simd_features & JSIMD_FASTLD3)
237        neonfct=jsimd_extrgb_ycc_convert_neon;
238      else
239        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
240      break;
241    case JCS_EXT_RGBX:
242    case JCS_EXT_RGBA:
243      neonfct=jsimd_extrgbx_ycc_convert_neon;
244      break;
245    case JCS_EXT_BGR:
246      if (simd_features & JSIMD_FASTLD3)
247        neonfct=jsimd_extbgr_ycc_convert_neon;
248      else
249        neonfct=jsimd_extbgr_ycc_convert_neon_slowld3;
250      break;
251    case JCS_EXT_BGRX:
252    case JCS_EXT_BGRA:
253      neonfct=jsimd_extbgrx_ycc_convert_neon;
254      break;
255    case JCS_EXT_XBGR:
256    case JCS_EXT_ABGR:
257      neonfct=jsimd_extxbgr_ycc_convert_neon;
258      break;
259    case JCS_EXT_XRGB:
260    case JCS_EXT_ARGB:
261      neonfct=jsimd_extxrgb_ycc_convert_neon;
262      break;
263    default:
264      if (simd_features & JSIMD_FASTLD3)
265        neonfct=jsimd_extrgb_ycc_convert_neon;
266      else
267        neonfct=jsimd_extrgb_ycc_convert_neon_slowld3;
268      break;
269  }
270
271  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
272}
273
274GLOBAL(void)
275jsimd_rgb_gray_convert (j_compress_ptr cinfo,
276                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
277                        JDIMENSION output_row, int num_rows)
278{
279}
280
281GLOBAL(void)
282jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
283                       JSAMPIMAGE input_buf, JDIMENSION input_row,
284                       JSAMPARRAY output_buf, int num_rows)
285{
286  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
287
288  switch(cinfo->out_color_space) {
289    case JCS_EXT_RGB:
290      if (simd_features & JSIMD_FASTST3)
291        neonfct=jsimd_ycc_extrgb_convert_neon;
292      else
293        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
294      break;
295    case JCS_EXT_RGBX:
296    case JCS_EXT_RGBA:
297      neonfct=jsimd_ycc_extrgbx_convert_neon;
298      break;
299    case JCS_EXT_BGR:
300      if (simd_features & JSIMD_FASTST3)
301        neonfct=jsimd_ycc_extbgr_convert_neon;
302      else
303        neonfct=jsimd_ycc_extbgr_convert_neon_slowst3;
304      break;
305    case JCS_EXT_BGRX:
306    case JCS_EXT_BGRA:
307      neonfct=jsimd_ycc_extbgrx_convert_neon;
308      break;
309    case JCS_EXT_XBGR:
310    case JCS_EXT_ABGR:
311      neonfct=jsimd_ycc_extxbgr_convert_neon;
312      break;
313    case JCS_EXT_XRGB:
314    case JCS_EXT_ARGB:
315      neonfct=jsimd_ycc_extxrgb_convert_neon;
316      break;
317    default:
318      if (simd_features & JSIMD_FASTST3)
319        neonfct=jsimd_ycc_extrgb_convert_neon;
320      else
321        neonfct=jsimd_ycc_extrgb_convert_neon_slowst3;
322      break;
323  }
324
325  neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
326}
327
328GLOBAL(void)
329jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
330                          JSAMPIMAGE input_buf, JDIMENSION input_row,
331                          JSAMPARRAY output_buf, int num_rows)
332{
333  jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
334                                output_buf, num_rows);
335}
336
337GLOBAL(int)
338jsimd_can_h2v2_downsample (void)
339{
340  init_simd();
341
342  /* The code is optimised for these values only */
343  if (BITS_IN_JSAMPLE != 8)
344    return 0;
345  if (DCTSIZE != 8)
346    return 0;
347  if (sizeof(JDIMENSION) != 4)
348    return 0;
349
350  if (simd_support & JSIMD_ARM_NEON)
351    return 1;
352
353  return 0;
354}
355
356GLOBAL(int)
357jsimd_can_h2v1_downsample (void)
358{
359  init_simd();
360
361  /* The code is optimised for these values only */
362  if (BITS_IN_JSAMPLE != 8)
363    return 0;
364  if (DCTSIZE != 8)
365    return 0;
366  if (sizeof(JDIMENSION) != 4)
367    return 0;
368
369  if (simd_support & JSIMD_ARM_NEON)
370    return 1;
371
372  return 0;
373}
374
375GLOBAL(void)
376jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
377                       JSAMPARRAY input_data, JSAMPARRAY output_data)
378{
379  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
380                             compptr->v_samp_factor, compptr->width_in_blocks,
381                             input_data, output_data);
382}
383
384GLOBAL(void)
385jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr,
386                       JSAMPARRAY input_data, JSAMPARRAY output_data)
387{
388  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
389                             compptr->v_samp_factor, compptr->width_in_blocks,
390                             input_data, output_data);
391}
392
393GLOBAL(int)
394jsimd_can_h2v2_upsample (void)
395{
396  init_simd();
397
398  return 0;
399}
400
401GLOBAL(int)
402jsimd_can_h2v1_upsample (void)
403{
404  init_simd();
405
406  return 0;
407}
408
409GLOBAL(void)
410jsimd_h2v2_upsample (j_decompress_ptr cinfo,
411                     jpeg_component_info *compptr,
412                     JSAMPARRAY input_data,
413                     JSAMPARRAY *output_data_ptr)
414{
415}
416
417GLOBAL(void)
418jsimd_h2v1_upsample (j_decompress_ptr cinfo,
419                     jpeg_component_info *compptr,
420                     JSAMPARRAY input_data,
421                     JSAMPARRAY *output_data_ptr)
422{
423}
424
425GLOBAL(int)
426jsimd_can_h2v2_fancy_upsample (void)
427{
428  init_simd();
429
430  return 0;
431}
432
433GLOBAL(int)
434jsimd_can_h2v1_fancy_upsample (void)
435{
436  init_simd();
437
438  return 0;
439}
440
441GLOBAL(void)
442jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
443                           jpeg_component_info *compptr,
444                           JSAMPARRAY input_data,
445                           JSAMPARRAY *output_data_ptr)
446{
447}
448
449GLOBAL(void)
450jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
451                           jpeg_component_info *compptr,
452                           JSAMPARRAY input_data,
453                           JSAMPARRAY *output_data_ptr)
454{
455}
456
457GLOBAL(int)
458jsimd_can_h2v2_merged_upsample (void)
459{
460  init_simd();
461
462  return 0;
463}
464
465GLOBAL(int)
466jsimd_can_h2v1_merged_upsample (void)
467{
468  init_simd();
469
470  return 0;
471}
472
473GLOBAL(void)
474jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
475                            JSAMPIMAGE input_buf,
476                            JDIMENSION in_row_group_ctr,
477                            JSAMPARRAY output_buf)
478{
479}
480
481GLOBAL(void)
482jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
483                            JSAMPIMAGE input_buf,
484                            JDIMENSION in_row_group_ctr,
485                            JSAMPARRAY output_buf)
486{
487}
488
489GLOBAL(int)
490jsimd_can_convsamp (void)
491{
492  init_simd();
493
494  /* The code is optimised for these values only */
495  if (DCTSIZE != 8)
496    return 0;
497  if (BITS_IN_JSAMPLE != 8)
498    return 0;
499  if (sizeof(JDIMENSION) != 4)
500    return 0;
501  if (sizeof(DCTELEM) != 2)
502    return 0;
503
504  if (simd_support & JSIMD_ARM_NEON)
505    return 1;
506
507  return 0;
508}
509
510GLOBAL(int)
511jsimd_can_convsamp_float (void)
512{
513  init_simd();
514
515  return 0;
516}
517
518GLOBAL(void)
519jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
520                DCTELEM *workspace)
521{
522  jsimd_convsamp_neon(sample_data, start_col, workspace);
523}
524
525GLOBAL(void)
526jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
527                      FAST_FLOAT *workspace)
528{
529}
530
531GLOBAL(int)
532jsimd_can_fdct_islow (void)
533{
534  init_simd();
535
536  /* The code is optimised for these values only */
537  if (DCTSIZE != 8)
538    return 0;
539  if (sizeof(DCTELEM) != 2)
540    return 0;
541
542  if (simd_support & JSIMD_ARM_NEON)
543    return 1;
544
545  return 0;
546}
547
548GLOBAL(int)
549jsimd_can_fdct_ifast (void)
550{
551  init_simd();
552
553  /* The code is optimised for these values only */
554  if (DCTSIZE != 8)
555    return 0;
556  if (sizeof(DCTELEM) != 2)
557    return 0;
558
559  if (simd_support & JSIMD_ARM_NEON)
560    return 1;
561
562  return 0;
563}
564
565GLOBAL(int)
566jsimd_can_fdct_float (void)
567{
568  init_simd();
569
570  return 0;
571}
572
573GLOBAL(void)
574jsimd_fdct_islow (DCTELEM *data)
575{
576  jsimd_fdct_islow_neon(data);
577}
578
579GLOBAL(void)
580jsimd_fdct_ifast (DCTELEM *data)
581{
582  jsimd_fdct_ifast_neon(data);
583}
584
585GLOBAL(void)
586jsimd_fdct_float (FAST_FLOAT *data)
587{
588}
589
590GLOBAL(int)
591jsimd_can_quantize (void)
592{
593  init_simd();
594
595  /* The code is optimised for these values only */
596  if (DCTSIZE != 8)
597    return 0;
598  if (sizeof(JCOEF) != 2)
599    return 0;
600  if (sizeof(DCTELEM) != 2)
601    return 0;
602
603  if (simd_support & JSIMD_ARM_NEON)
604    return 1;
605
606  return 0;
607}
608
609GLOBAL(int)
610jsimd_can_quantize_float (void)
611{
612  init_simd();
613
614  return 0;
615}
616
617GLOBAL(void)
618jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors,
619                DCTELEM *workspace)
620{
621  jsimd_quantize_neon(coef_block, divisors, workspace);
622}
623
624GLOBAL(void)
625jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors,
626                      FAST_FLOAT *workspace)
627{
628}
629
630GLOBAL(int)
631jsimd_can_idct_2x2 (void)
632{
633  init_simd();
634
635  /* The code is optimised for these values only */
636  if (DCTSIZE != 8)
637    return 0;
638  if (sizeof(JCOEF) != 2)
639    return 0;
640  if (BITS_IN_JSAMPLE != 8)
641    return 0;
642  if (sizeof(JDIMENSION) != 4)
643    return 0;
644  if (sizeof(ISLOW_MULT_TYPE) != 2)
645    return 0;
646
647  if (simd_support & JSIMD_ARM_NEON)
648    return 1;
649
650  return 0;
651}
652
653GLOBAL(int)
654jsimd_can_idct_4x4 (void)
655{
656  init_simd();
657
658  /* The code is optimised for these values only */
659  if (DCTSIZE != 8)
660    return 0;
661  if (sizeof(JCOEF) != 2)
662    return 0;
663  if (BITS_IN_JSAMPLE != 8)
664    return 0;
665  if (sizeof(JDIMENSION) != 4)
666    return 0;
667  if (sizeof(ISLOW_MULT_TYPE) != 2)
668    return 0;
669
670  if (simd_support & JSIMD_ARM_NEON)
671    return 1;
672
673  return 0;
674}
675
676GLOBAL(void)
677jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
678                JCOEFPTR coef_block, JSAMPARRAY output_buf,
679                JDIMENSION output_col)
680{
681  jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
682                      output_col);
683}
684
685GLOBAL(void)
686jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr,
687                JCOEFPTR coef_block, JSAMPARRAY output_buf,
688                JDIMENSION output_col)
689{
690  jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
691                      output_col);
692}
693
694GLOBAL(int)
695jsimd_can_idct_islow (void)
696{
697  init_simd();
698
699  /* The code is optimised for these values only */
700  if (DCTSIZE != 8)
701    return 0;
702  if (sizeof(JCOEF) != 2)
703    return 0;
704  if (BITS_IN_JSAMPLE != 8)
705    return 0;
706  if (sizeof(JDIMENSION) != 4)
707    return 0;
708  if (sizeof(ISLOW_MULT_TYPE) != 2)
709    return 0;
710
711  if (simd_support & JSIMD_ARM_NEON)
712    return 1;
713
714  return 0;
715}
716
717GLOBAL(int)
718jsimd_can_idct_ifast (void)
719{
720  init_simd();
721
722  /* The code is optimised for these values only */
723  if (DCTSIZE != 8)
724    return 0;
725  if (sizeof(JCOEF) != 2)
726    return 0;
727  if (BITS_IN_JSAMPLE != 8)
728    return 0;
729  if (sizeof(JDIMENSION) != 4)
730    return 0;
731  if (sizeof(IFAST_MULT_TYPE) != 2)
732    return 0;
733  if (IFAST_SCALE_BITS != 2)
734    return 0;
735
736  if (simd_support & JSIMD_ARM_NEON)
737    return 1;
738
739  return 0;
740}
741
742GLOBAL(int)
743jsimd_can_idct_float (void)
744{
745  init_simd();
746
747  return 0;
748}
749
750GLOBAL(void)
751jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr,
752                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
753                  JDIMENSION output_col)
754{
755  jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
756                        output_col);
757}
758
759GLOBAL(void)
760jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr,
761                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
762                  JDIMENSION output_col)
763{
764  jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
765                        output_col);
766}
767
768GLOBAL(void)
769jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr,
770                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
771                  JDIMENSION output_col)
772{
773}
774
775GLOBAL(int)
776jsimd_can_huff_encode_one_block (void)
777{
778  init_simd();
779
780  if (DCTSIZE != 8)
781    return 0;
782  if (sizeof(JCOEF) != 2)
783    return 0;
784
785  if (simd_support & JSIMD_ARM_NEON && simd_huffman)
786    return 1;
787
788  return 0;
789}
790
791GLOBAL(JOCTET*)
792jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block,
793                             int last_dc_val, c_derived_tbl *dctbl,
794                             c_derived_tbl *actbl)
795{
796  if (simd_features & JSIMD_FASTTBL)
797    return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
798                                            dctbl, actbl);
799  else
800    return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
801                                                    last_dc_val, dctbl, actbl);
802}
803