1/* 2 * jsimd_arm64.c 3 * 4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 * Copyright (C) 2009-2011, 2013-2014, 2016, D. R. Commander. 6 * Copyright (C) 2015-2016, Matthieu Darbois. 7 * 8 * Based on the x86 SIMD extension for IJG JPEG library, 9 * Copyright (C) 1999-2006, MIYASAKA Masaru. 10 * For conditions of distribution and use, see copyright notice in jsimdext.inc 11 * 12 * This file contains the interface between the "normal" portions 13 * of the library and the SIMD implementations when running on a 14 * 64-bit ARM architecture. 15 */ 16 17#define JPEG_INTERNALS 18#include "../jinclude.h" 19#include "../jpeglib.h" 20#include "../jsimd.h" 21#include "../jdct.h" 22#include "../jsimddct.h" 23#include "jsimd.h" 24 25#include <stdio.h> 26#include <string.h> 27#include <ctype.h> 28 29#define JSIMD_FASTLD3 1 30#define JSIMD_FASTST3 2 31#define JSIMD_FASTTBL 4 32 33static unsigned int simd_support = ~0; 34static unsigned int simd_huffman = 1; 35static unsigned int simd_features = JSIMD_FASTLD3 | JSIMD_FASTST3 | 36 JSIMD_FASTTBL; 37 38#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) 39 40#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT (1024 * 1024) 41 42LOCAL(int) 43check_cpuinfo (char *buffer, const char *field, char *value) 44{ 45 char *p; 46 if (*value == 0) 47 return 0; 48 if (strncmp(buffer, field, strlen(field)) != 0) 49 return 0; 50 buffer += strlen(field); 51 while (isspace(*buffer)) 52 buffer++; 53 54 /* Check if 'value' is present in the buffer as a separate word */ 55 while ((p = strstr(buffer, value))) { 56 if (p > buffer && !isspace(*(p - 1))) { 57 buffer++; 58 continue; 59 } 60 p += strlen(value); 61 if (*p != 0 && !isspace(*p)) { 62 buffer++; 63 continue; 64 } 65 return 1; 66 } 67 return 0; 68} 69 70LOCAL(int) 71parse_proc_cpuinfo (int bufsize) 72{ 73 char *buffer = (char *)malloc(bufsize); 74 FILE *fd; 75 76 if (!buffer) 77 return 0; 78 79 fd = fopen("/proc/cpuinfo", "r"); 80 if (fd) { 81 while (fgets(buffer, bufsize, fd)) { 82 if (!strchr(buffer, '\n') && !feof(fd)) { 83 /* "impossible" happened - insufficient size of the buffer! */ 84 fclose(fd); 85 free(buffer); 86 return 0; 87 } 88 if (check_cpuinfo(buffer, "CPU part", "0xd03") || 89 check_cpuinfo(buffer, "CPU part", "0xd07")) 90 /* The Cortex-A53 has a slow tbl implementation. We can gain a few 91 percent speedup by disabling the use of that instruction. The 92 speedup on Cortex-A57 is more subtle but still measurable. */ 93 simd_features &= ~JSIMD_FASTTBL; 94 else if (check_cpuinfo(buffer, "CPU part", "0x0a1")) 95 /* The SIMD version of Huffman encoding is slower than the C version on 96 Cavium ThunderX. Also, ld3 and st3 are abyssmally slow on that 97 CPU. */ 98 simd_huffman = simd_features = 0; 99 } 100 fclose(fd); 101 } 102 free(buffer); 103 return 1; 104} 105 106#endif 107 108/* 109 * Check what SIMD accelerations are supported. 110 * 111 * FIXME: This code is racy under a multi-threaded environment. 112 */ 113 114/* 115 * ARMv8 architectures support NEON extensions by default. 116 * It is no longer optional as it was with ARMv7. 117 */ 118 119 120LOCAL(void) 121init_simd (void) 122{ 123 char *env = NULL; 124#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) 125 int bufsize = 1024; /* an initial guess for the line buffer size limit */ 126#endif 127 128 if (simd_support != ~0U) 129 return; 130 131 simd_support = 0; 132 133 simd_support |= JSIMD_ARM_NEON; 134#if defined(__linux__) || defined(ANDROID) || defined(__ANDROID__) 135 while (!parse_proc_cpuinfo(bufsize)) { 136 bufsize *= 2; 137 if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT) 138 break; 139 } 140#endif 141 142 /* Force different settings through environment variables */ 143 env = getenv("JSIMD_FORCENEON"); 144 if ((env != NULL) && (strcmp(env, "1") == 0)) 145 simd_support = JSIMD_ARM_NEON; 146 env = getenv("JSIMD_FORCENONE"); 147 if ((env != NULL) && (strcmp(env, "1") == 0)) 148 simd_support = 0; 149 env = getenv("JSIMD_NOHUFFENC"); 150 if ((env != NULL) && (strcmp(env, "1") == 0)) 151 simd_huffman = 0; 152 env = getenv("JSIMD_FASTLD3"); 153 if ((env != NULL) && (strcmp(env, "1") == 0)) 154 simd_features |= JSIMD_FASTLD3; 155 if ((env != NULL) && (strcmp(env, "0") == 0)) 156 simd_features &= ~JSIMD_FASTLD3; 157 env = getenv("JSIMD_FASTST3"); 158 if ((env != NULL) && (strcmp(env, "1") == 0)) 159 simd_features |= JSIMD_FASTST3; 160 if ((env != NULL) && (strcmp(env, "0") == 0)) 161 simd_features &= ~JSIMD_FASTST3; 162} 163 164GLOBAL(int) 165jsimd_can_rgb_ycc (void) 166{ 167 init_simd(); 168 169 /* The code is optimised for these values only */ 170 if (BITS_IN_JSAMPLE != 8) 171 return 0; 172 if (sizeof(JDIMENSION) != 4) 173 return 0; 174 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) 175 return 0; 176 177 if (simd_support & JSIMD_ARM_NEON) 178 return 1; 179 180 return 0; 181} 182 183GLOBAL(int) 184jsimd_can_rgb_gray (void) 185{ 186 init_simd(); 187 188 return 0; 189} 190 191GLOBAL(int) 192jsimd_can_ycc_rgb (void) 193{ 194 init_simd(); 195 196 /* The code is optimised for these values only */ 197 if (BITS_IN_JSAMPLE != 8) 198 return 0; 199 if (sizeof(JDIMENSION) != 4) 200 return 0; 201 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) 202 return 0; 203 204 if (simd_support & JSIMD_ARM_NEON) 205 return 1; 206 207 return 0; 208} 209 210GLOBAL(int) 211jsimd_can_ycc_rgb565 (void) 212{ 213 init_simd(); 214 215 /* The code is optimised for these values only */ 216 if (BITS_IN_JSAMPLE != 8) 217 return 0; 218 if (sizeof(JDIMENSION) != 4) 219 return 0; 220 221 if (simd_support & JSIMD_ARM_NEON) 222 return 1; 223 224 return 0; 225} 226 227GLOBAL(void) 228jsimd_rgb_ycc_convert (j_compress_ptr cinfo, 229 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, 230 JDIMENSION output_row, int num_rows) 231{ 232 void (*neonfct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); 233 234 switch(cinfo->in_color_space) { 235 case JCS_EXT_RGB: 236 if (simd_features & JSIMD_FASTLD3) 237 neonfct=jsimd_extrgb_ycc_convert_neon; 238 else 239 neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; 240 break; 241 case JCS_EXT_RGBX: 242 case JCS_EXT_RGBA: 243 neonfct=jsimd_extrgbx_ycc_convert_neon; 244 break; 245 case JCS_EXT_BGR: 246 if (simd_features & JSIMD_FASTLD3) 247 neonfct=jsimd_extbgr_ycc_convert_neon; 248 else 249 neonfct=jsimd_extbgr_ycc_convert_neon_slowld3; 250 break; 251 case JCS_EXT_BGRX: 252 case JCS_EXT_BGRA: 253 neonfct=jsimd_extbgrx_ycc_convert_neon; 254 break; 255 case JCS_EXT_XBGR: 256 case JCS_EXT_ABGR: 257 neonfct=jsimd_extxbgr_ycc_convert_neon; 258 break; 259 case JCS_EXT_XRGB: 260 case JCS_EXT_ARGB: 261 neonfct=jsimd_extxrgb_ycc_convert_neon; 262 break; 263 default: 264 if (simd_features & JSIMD_FASTLD3) 265 neonfct=jsimd_extrgb_ycc_convert_neon; 266 else 267 neonfct=jsimd_extrgb_ycc_convert_neon_slowld3; 268 break; 269 } 270 271 neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); 272} 273 274GLOBAL(void) 275jsimd_rgb_gray_convert (j_compress_ptr cinfo, 276 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, 277 JDIMENSION output_row, int num_rows) 278{ 279} 280 281GLOBAL(void) 282jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, 283 JSAMPIMAGE input_buf, JDIMENSION input_row, 284 JSAMPARRAY output_buf, int num_rows) 285{ 286 void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); 287 288 switch(cinfo->out_color_space) { 289 case JCS_EXT_RGB: 290 if (simd_features & JSIMD_FASTST3) 291 neonfct=jsimd_ycc_extrgb_convert_neon; 292 else 293 neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; 294 break; 295 case JCS_EXT_RGBX: 296 case JCS_EXT_RGBA: 297 neonfct=jsimd_ycc_extrgbx_convert_neon; 298 break; 299 case JCS_EXT_BGR: 300 if (simd_features & JSIMD_FASTST3) 301 neonfct=jsimd_ycc_extbgr_convert_neon; 302 else 303 neonfct=jsimd_ycc_extbgr_convert_neon_slowst3; 304 break; 305 case JCS_EXT_BGRX: 306 case JCS_EXT_BGRA: 307 neonfct=jsimd_ycc_extbgrx_convert_neon; 308 break; 309 case JCS_EXT_XBGR: 310 case JCS_EXT_ABGR: 311 neonfct=jsimd_ycc_extxbgr_convert_neon; 312 break; 313 case JCS_EXT_XRGB: 314 case JCS_EXT_ARGB: 315 neonfct=jsimd_ycc_extxrgb_convert_neon; 316 break; 317 default: 318 if (simd_features & JSIMD_FASTST3) 319 neonfct=jsimd_ycc_extrgb_convert_neon; 320 else 321 neonfct=jsimd_ycc_extrgb_convert_neon_slowst3; 322 break; 323 } 324 325 neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); 326} 327 328GLOBAL(void) 329jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo, 330 JSAMPIMAGE input_buf, JDIMENSION input_row, 331 JSAMPARRAY output_buf, int num_rows) 332{ 333 jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row, 334 output_buf, num_rows); 335} 336 337GLOBAL(int) 338jsimd_can_h2v2_downsample (void) 339{ 340 init_simd(); 341 342 /* The code is optimised for these values only */ 343 if (BITS_IN_JSAMPLE != 8) 344 return 0; 345 if (DCTSIZE != 8) 346 return 0; 347 if (sizeof(JDIMENSION) != 4) 348 return 0; 349 350 if (simd_support & JSIMD_ARM_NEON) 351 return 1; 352 353 return 0; 354} 355 356GLOBAL(int) 357jsimd_can_h2v1_downsample (void) 358{ 359 init_simd(); 360 361 /* The code is optimised for these values only */ 362 if (BITS_IN_JSAMPLE != 8) 363 return 0; 364 if (DCTSIZE != 8) 365 return 0; 366 if (sizeof(JDIMENSION) != 4) 367 return 0; 368 369 if (simd_support & JSIMD_ARM_NEON) 370 return 1; 371 372 return 0; 373} 374 375GLOBAL(void) 376jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, 377 JSAMPARRAY input_data, JSAMPARRAY output_data) 378{ 379 jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, 380 compptr->v_samp_factor, compptr->width_in_blocks, 381 input_data, output_data); 382} 383 384GLOBAL(void) 385jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info *compptr, 386 JSAMPARRAY input_data, JSAMPARRAY output_data) 387{ 388 jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor, 389 compptr->v_samp_factor, compptr->width_in_blocks, 390 input_data, output_data); 391} 392 393GLOBAL(int) 394jsimd_can_h2v2_upsample (void) 395{ 396 init_simd(); 397 398 return 0; 399} 400 401GLOBAL(int) 402jsimd_can_h2v1_upsample (void) 403{ 404 init_simd(); 405 406 return 0; 407} 408 409GLOBAL(void) 410jsimd_h2v2_upsample (j_decompress_ptr cinfo, 411 jpeg_component_info *compptr, 412 JSAMPARRAY input_data, 413 JSAMPARRAY *output_data_ptr) 414{ 415} 416 417GLOBAL(void) 418jsimd_h2v1_upsample (j_decompress_ptr cinfo, 419 jpeg_component_info *compptr, 420 JSAMPARRAY input_data, 421 JSAMPARRAY *output_data_ptr) 422{ 423} 424 425GLOBAL(int) 426jsimd_can_h2v2_fancy_upsample (void) 427{ 428 init_simd(); 429 430 return 0; 431} 432 433GLOBAL(int) 434jsimd_can_h2v1_fancy_upsample (void) 435{ 436 init_simd(); 437 438 return 0; 439} 440 441GLOBAL(void) 442jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, 443 jpeg_component_info *compptr, 444 JSAMPARRAY input_data, 445 JSAMPARRAY *output_data_ptr) 446{ 447} 448 449GLOBAL(void) 450jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, 451 jpeg_component_info *compptr, 452 JSAMPARRAY input_data, 453 JSAMPARRAY *output_data_ptr) 454{ 455} 456 457GLOBAL(int) 458jsimd_can_h2v2_merged_upsample (void) 459{ 460 init_simd(); 461 462 return 0; 463} 464 465GLOBAL(int) 466jsimd_can_h2v1_merged_upsample (void) 467{ 468 init_simd(); 469 470 return 0; 471} 472 473GLOBAL(void) 474jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, 475 JSAMPIMAGE input_buf, 476 JDIMENSION in_row_group_ctr, 477 JSAMPARRAY output_buf) 478{ 479} 480 481GLOBAL(void) 482jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, 483 JSAMPIMAGE input_buf, 484 JDIMENSION in_row_group_ctr, 485 JSAMPARRAY output_buf) 486{ 487} 488 489GLOBAL(int) 490jsimd_can_convsamp (void) 491{ 492 init_simd(); 493 494 /* The code is optimised for these values only */ 495 if (DCTSIZE != 8) 496 return 0; 497 if (BITS_IN_JSAMPLE != 8) 498 return 0; 499 if (sizeof(JDIMENSION) != 4) 500 return 0; 501 if (sizeof(DCTELEM) != 2) 502 return 0; 503 504 if (simd_support & JSIMD_ARM_NEON) 505 return 1; 506 507 return 0; 508} 509 510GLOBAL(int) 511jsimd_can_convsamp_float (void) 512{ 513 init_simd(); 514 515 return 0; 516} 517 518GLOBAL(void) 519jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, 520 DCTELEM *workspace) 521{ 522 jsimd_convsamp_neon(sample_data, start_col, workspace); 523} 524 525GLOBAL(void) 526jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, 527 FAST_FLOAT *workspace) 528{ 529} 530 531GLOBAL(int) 532jsimd_can_fdct_islow (void) 533{ 534 init_simd(); 535 536 /* The code is optimised for these values only */ 537 if (DCTSIZE != 8) 538 return 0; 539 if (sizeof(DCTELEM) != 2) 540 return 0; 541 542 if (simd_support & JSIMD_ARM_NEON) 543 return 1; 544 545 return 0; 546} 547 548GLOBAL(int) 549jsimd_can_fdct_ifast (void) 550{ 551 init_simd(); 552 553 /* The code is optimised for these values only */ 554 if (DCTSIZE != 8) 555 return 0; 556 if (sizeof(DCTELEM) != 2) 557 return 0; 558 559 if (simd_support & JSIMD_ARM_NEON) 560 return 1; 561 562 return 0; 563} 564 565GLOBAL(int) 566jsimd_can_fdct_float (void) 567{ 568 init_simd(); 569 570 return 0; 571} 572 573GLOBAL(void) 574jsimd_fdct_islow (DCTELEM *data) 575{ 576 jsimd_fdct_islow_neon(data); 577} 578 579GLOBAL(void) 580jsimd_fdct_ifast (DCTELEM *data) 581{ 582 jsimd_fdct_ifast_neon(data); 583} 584 585GLOBAL(void) 586jsimd_fdct_float (FAST_FLOAT *data) 587{ 588} 589 590GLOBAL(int) 591jsimd_can_quantize (void) 592{ 593 init_simd(); 594 595 /* The code is optimised for these values only */ 596 if (DCTSIZE != 8) 597 return 0; 598 if (sizeof(JCOEF) != 2) 599 return 0; 600 if (sizeof(DCTELEM) != 2) 601 return 0; 602 603 if (simd_support & JSIMD_ARM_NEON) 604 return 1; 605 606 return 0; 607} 608 609GLOBAL(int) 610jsimd_can_quantize_float (void) 611{ 612 init_simd(); 613 614 return 0; 615} 616 617GLOBAL(void) 618jsimd_quantize (JCOEFPTR coef_block, DCTELEM *divisors, 619 DCTELEM *workspace) 620{ 621 jsimd_quantize_neon(coef_block, divisors, workspace); 622} 623 624GLOBAL(void) 625jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT *divisors, 626 FAST_FLOAT *workspace) 627{ 628} 629 630GLOBAL(int) 631jsimd_can_idct_2x2 (void) 632{ 633 init_simd(); 634 635 /* The code is optimised for these values only */ 636 if (DCTSIZE != 8) 637 return 0; 638 if (sizeof(JCOEF) != 2) 639 return 0; 640 if (BITS_IN_JSAMPLE != 8) 641 return 0; 642 if (sizeof(JDIMENSION) != 4) 643 return 0; 644 if (sizeof(ISLOW_MULT_TYPE) != 2) 645 return 0; 646 647 if (simd_support & JSIMD_ARM_NEON) 648 return 1; 649 650 return 0; 651} 652 653GLOBAL(int) 654jsimd_can_idct_4x4 (void) 655{ 656 init_simd(); 657 658 /* The code is optimised for these values only */ 659 if (DCTSIZE != 8) 660 return 0; 661 if (sizeof(JCOEF) != 2) 662 return 0; 663 if (BITS_IN_JSAMPLE != 8) 664 return 0; 665 if (sizeof(JDIMENSION) != 4) 666 return 0; 667 if (sizeof(ISLOW_MULT_TYPE) != 2) 668 return 0; 669 670 if (simd_support & JSIMD_ARM_NEON) 671 return 1; 672 673 return 0; 674} 675 676GLOBAL(void) 677jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info *compptr, 678 JCOEFPTR coef_block, JSAMPARRAY output_buf, 679 JDIMENSION output_col) 680{ 681 jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf, 682 output_col); 683} 684 685GLOBAL(void) 686jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info *compptr, 687 JCOEFPTR coef_block, JSAMPARRAY output_buf, 688 JDIMENSION output_col) 689{ 690 jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf, 691 output_col); 692} 693 694GLOBAL(int) 695jsimd_can_idct_islow (void) 696{ 697 init_simd(); 698 699 /* The code is optimised for these values only */ 700 if (DCTSIZE != 8) 701 return 0; 702 if (sizeof(JCOEF) != 2) 703 return 0; 704 if (BITS_IN_JSAMPLE != 8) 705 return 0; 706 if (sizeof(JDIMENSION) != 4) 707 return 0; 708 if (sizeof(ISLOW_MULT_TYPE) != 2) 709 return 0; 710 711 if (simd_support & JSIMD_ARM_NEON) 712 return 1; 713 714 return 0; 715} 716 717GLOBAL(int) 718jsimd_can_idct_ifast (void) 719{ 720 init_simd(); 721 722 /* The code is optimised for these values only */ 723 if (DCTSIZE != 8) 724 return 0; 725 if (sizeof(JCOEF) != 2) 726 return 0; 727 if (BITS_IN_JSAMPLE != 8) 728 return 0; 729 if (sizeof(JDIMENSION) != 4) 730 return 0; 731 if (sizeof(IFAST_MULT_TYPE) != 2) 732 return 0; 733 if (IFAST_SCALE_BITS != 2) 734 return 0; 735 736 if (simd_support & JSIMD_ARM_NEON) 737 return 1; 738 739 return 0; 740} 741 742GLOBAL(int) 743jsimd_can_idct_float (void) 744{ 745 init_simd(); 746 747 return 0; 748} 749 750GLOBAL(void) 751jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info *compptr, 752 JCOEFPTR coef_block, JSAMPARRAY output_buf, 753 JDIMENSION output_col) 754{ 755 jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf, 756 output_col); 757} 758 759GLOBAL(void) 760jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info *compptr, 761 JCOEFPTR coef_block, JSAMPARRAY output_buf, 762 JDIMENSION output_col) 763{ 764 jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf, 765 output_col); 766} 767 768GLOBAL(void) 769jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info *compptr, 770 JCOEFPTR coef_block, JSAMPARRAY output_buf, 771 JDIMENSION output_col) 772{ 773} 774 775GLOBAL(int) 776jsimd_can_huff_encode_one_block (void) 777{ 778 init_simd(); 779 780 if (DCTSIZE != 8) 781 return 0; 782 if (sizeof(JCOEF) != 2) 783 return 0; 784 785 if (simd_support & JSIMD_ARM_NEON && simd_huffman) 786 return 1; 787 788 return 0; 789} 790 791GLOBAL(JOCTET*) 792jsimd_huff_encode_one_block (void *state, JOCTET *buffer, JCOEFPTR block, 793 int last_dc_val, c_derived_tbl *dctbl, 794 c_derived_tbl *actbl) 795{ 796 if (simd_features & JSIMD_FASTTBL) 797 return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val, 798 dctbl, actbl); 799 else 800 return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block, 801 last_dc_val, dctbl, actbl); 802} 803