jsimd_x86_64.c revision ba82ddf6ca0270d65e3b8fb56b2d3116cf423709
1/* 2 * jsimd_x86_64.c 3 * 4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 5 * Copyright 2009 D. R. Commander 6 * 7 * Based on the x86 SIMD extension for IJG JPEG library, 8 * Copyright (C) 1999-2006, MIYASAKA Masaru. 9 * 10 * This file contains the interface between the "normal" portions 11 * of the library and the SIMD implementations when running on a 12 * x86_64 architecture. 13 */ 14 15#define JPEG_INTERNALS 16#include "../jinclude.h" 17#include "../jpeglib.h" 18#include "../jsimd.h" 19#include "../jdct.h" 20#include "../jsimddct.h" 21#include "simd/jsimd.h" 22 23/* 24 * In the PIC cases, we have no guarantee that constants will keep 25 * their alignment. This macro allows us to verify it at runtime. 26 */ 27#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0) 28 29#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ 30 31GLOBAL(int) 32jsimd_can_rgb_ycc (void) 33{ 34 /* The code is optimised for these values only */ 35 if (BITS_IN_JSAMPLE != 8) 36 return 0; 37 if (sizeof(JDIMENSION) != 4) 38 return 0; 39 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) 40 return 0; 41 42 if (!IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) 43 return 0; 44 45 return 1; 46} 47 48GLOBAL(int) 49jsimd_can_ycc_rgb (void) 50{ 51 /* The code is optimised for these values only */ 52 if (BITS_IN_JSAMPLE != 8) 53 return 0; 54 if (sizeof(JDIMENSION) != 4) 55 return 0; 56 if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) 57 return 0; 58 59 if (!IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) 60 return 0; 61 62 return 1; 63} 64 65GLOBAL(void) 66jsimd_rgb_ycc_convert (j_compress_ptr cinfo, 67 JSAMPARRAY input_buf, JSAMPIMAGE output_buf, 68 JDIMENSION output_row, int num_rows) 69{ 70 void (*sse2fct)(JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int); 71 72 switch(cinfo->in_color_space) 73 { 74 case JCS_EXT_RGB: 75 sse2fct=jsimd_extrgb_ycc_convert_sse2; 76 break; 77 case JCS_EXT_RGBX: 78 sse2fct=jsimd_extrgbx_ycc_convert_sse2; 79 break; 80 case JCS_EXT_BGR: 81 sse2fct=jsimd_extbgr_ycc_convert_sse2; 82 break; 83 case JCS_EXT_BGRX: 84 sse2fct=jsimd_extbgrx_ycc_convert_sse2; 85 break; 86 case JCS_EXT_XBGR: 87 sse2fct=jsimd_extxbgr_ycc_convert_sse2; 88 break; 89 case JCS_EXT_XRGB: 90 sse2fct=jsimd_extxrgb_ycc_convert_sse2; 91 break; 92 default: 93 sse2fct=jsimd_rgb_ycc_convert_sse2; 94 break; 95 } 96 97 sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows); 98} 99 100GLOBAL(void) 101jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, 102 JSAMPIMAGE input_buf, JDIMENSION input_row, 103 JSAMPARRAY output_buf, int num_rows) 104{ 105 void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int); 106 107 switch(cinfo->out_color_space) 108 { 109 case JCS_EXT_RGB: 110 sse2fct=jsimd_ycc_extrgb_convert_sse2; 111 break; 112 case JCS_EXT_RGBX: 113 sse2fct=jsimd_ycc_extrgbx_convert_sse2; 114 break; 115 case JCS_EXT_BGR: 116 sse2fct=jsimd_ycc_extbgr_convert_sse2; 117 break; 118 case JCS_EXT_BGRX: 119 sse2fct=jsimd_ycc_extbgrx_convert_sse2; 120 break; 121 case JCS_EXT_XBGR: 122 sse2fct=jsimd_ycc_extxbgr_convert_sse2; 123 break; 124 case JCS_EXT_XRGB: 125 sse2fct=jsimd_ycc_extxrgb_convert_sse2; 126 break; 127 default: 128 sse2fct=jsimd_ycc_rgb_convert_sse2; 129 break; 130 } 131 132 sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows); 133} 134 135GLOBAL(int) 136jsimd_can_h2v2_downsample (void) 137{ 138 /* The code is optimised for these values only */ 139 if (BITS_IN_JSAMPLE != 8) 140 return 0; 141 if (sizeof(JDIMENSION) != 4) 142 return 0; 143 144 return 1; 145} 146 147GLOBAL(int) 148jsimd_can_h2v1_downsample (void) 149{ 150 /* The code is optimised for these values only */ 151 if (BITS_IN_JSAMPLE != 8) 152 return 0; 153 if (sizeof(JDIMENSION) != 4) 154 return 0; 155 156 return 1; 157} 158 159GLOBAL(void) 160jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, 161 JSAMPARRAY input_data, JSAMPARRAY output_data) 162{ 163 jsimd_h2v2_downsample_sse2(cinfo->image_width, 164 cinfo->max_v_samp_factor, 165 compptr->v_samp_factor, 166 compptr->width_in_blocks, 167 input_data, output_data); 168} 169 170GLOBAL(void) 171jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, 172 JSAMPARRAY input_data, JSAMPARRAY output_data) 173{ 174 jsimd_h2v1_downsample_sse2(cinfo->image_width, 175 cinfo->max_v_samp_factor, 176 compptr->v_samp_factor, 177 compptr->width_in_blocks, 178 input_data, output_data); 179} 180 181GLOBAL(int) 182jsimd_can_h2v2_upsample (void) 183{ 184 /* The code is optimised for these values only */ 185 if (BITS_IN_JSAMPLE != 8) 186 return 0; 187 if (sizeof(JDIMENSION) != 4) 188 return 0; 189 190 return 1; 191} 192 193GLOBAL(int) 194jsimd_can_h2v1_upsample (void) 195{ 196 /* The code is optimised for these values only */ 197 if (BITS_IN_JSAMPLE != 8) 198 return 0; 199 if (sizeof(JDIMENSION) != 4) 200 return 0; 201 202 return 1; 203} 204 205GLOBAL(void) 206jsimd_h2v2_upsample (j_decompress_ptr cinfo, 207 jpeg_component_info * compptr, 208 JSAMPARRAY input_data, 209 JSAMPARRAY * output_data_ptr) 210{ 211 jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor, 212 cinfo->output_width, 213 input_data, output_data_ptr); 214} 215 216GLOBAL(void) 217jsimd_h2v1_upsample (j_decompress_ptr cinfo, 218 jpeg_component_info * compptr, 219 JSAMPARRAY input_data, 220 JSAMPARRAY * output_data_ptr) 221{ 222 jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor, 223 cinfo->output_width, 224 input_data, output_data_ptr); 225} 226 227GLOBAL(int) 228jsimd_can_h2v2_fancy_upsample (void) 229{ 230 /* The code is optimised for these values only */ 231 if (BITS_IN_JSAMPLE != 8) 232 return 0; 233 if (sizeof(JDIMENSION) != 4) 234 return 0; 235 236 if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) 237 return 0; 238 239 return 1; 240} 241 242GLOBAL(int) 243jsimd_can_h2v1_fancy_upsample (void) 244{ 245 /* The code is optimised for these values only */ 246 if (BITS_IN_JSAMPLE != 8) 247 return 0; 248 if (sizeof(JDIMENSION) != 4) 249 return 0; 250 251 if (!IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) 252 return 0; 253 254 return 1; 255} 256 257GLOBAL(void) 258jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo, 259 jpeg_component_info * compptr, 260 JSAMPARRAY input_data, 261 JSAMPARRAY * output_data_ptr) 262{ 263 jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, 264 compptr->downsampled_width, 265 input_data, output_data_ptr); 266} 267 268GLOBAL(void) 269jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo, 270 jpeg_component_info * compptr, 271 JSAMPARRAY input_data, 272 JSAMPARRAY * output_data_ptr) 273{ 274 jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, 275 compptr->downsampled_width, 276 input_data, output_data_ptr); 277} 278 279GLOBAL(int) 280jsimd_can_h2v2_merged_upsample (void) 281{ 282 /* The code is optimised for these values only */ 283 if (BITS_IN_JSAMPLE != 8) 284 return 0; 285 if (sizeof(JDIMENSION) != 4) 286 return 0; 287 288 if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) 289 return 0; 290 291 return 1; 292} 293 294GLOBAL(int) 295jsimd_can_h2v1_merged_upsample (void) 296{ 297 /* The code is optimised for these values only */ 298 if (BITS_IN_JSAMPLE != 8) 299 return 0; 300 if (sizeof(JDIMENSION) != 4) 301 return 0; 302 303 if (!IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) 304 return 0; 305 306 return 1; 307} 308 309GLOBAL(void) 310jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, 311 JSAMPIMAGE input_buf, 312 JDIMENSION in_row_group_ctr, 313 JSAMPARRAY output_buf) 314{ 315 void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); 316 317 switch(cinfo->out_color_space) 318 { 319 case JCS_EXT_RGB: 320 sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2; 321 break; 322 case JCS_EXT_RGBX: 323 sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2; 324 break; 325 case JCS_EXT_BGR: 326 sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2; 327 break; 328 case JCS_EXT_BGRX: 329 sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2; 330 break; 331 case JCS_EXT_XBGR: 332 sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2; 333 break; 334 case JCS_EXT_XRGB: 335 sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2; 336 break; 337 default: 338 sse2fct=jsimd_h2v2_merged_upsample_sse2; 339 break; 340 } 341 342 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); 343} 344 345GLOBAL(void) 346jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, 347 JSAMPIMAGE input_buf, 348 JDIMENSION in_row_group_ctr, 349 JSAMPARRAY output_buf) 350{ 351 void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY); 352 353 switch(cinfo->out_color_space) 354 { 355 case JCS_EXT_RGB: 356 sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2; 357 break; 358 case JCS_EXT_RGBX: 359 sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2; 360 break; 361 case JCS_EXT_BGR: 362 sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2; 363 break; 364 case JCS_EXT_BGRX: 365 sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2; 366 break; 367 case JCS_EXT_XBGR: 368 sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2; 369 break; 370 case JCS_EXT_XRGB: 371 sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2; 372 break; 373 default: 374 sse2fct=jsimd_h2v1_merged_upsample_sse2; 375 break; 376 } 377 378 sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf); 379} 380 381GLOBAL(int) 382jsimd_can_convsamp (void) 383{ 384 /* The code is optimised for these values only */ 385 if (DCTSIZE != 8) 386 return 0; 387 if (BITS_IN_JSAMPLE != 8) 388 return 0; 389 if (sizeof(JDIMENSION) != 4) 390 return 0; 391 if (sizeof(DCTELEM) != 2) 392 return 0; 393 394 return 1; 395} 396 397GLOBAL(int) 398jsimd_can_convsamp_float (void) 399{ 400 /* The code is optimised for these values only */ 401 if (DCTSIZE != 8) 402 return 0; 403 if (BITS_IN_JSAMPLE != 8) 404 return 0; 405 if (sizeof(JDIMENSION) != 4) 406 return 0; 407 if (sizeof(FAST_FLOAT) != 4) 408 return 0; 409 410 return 1; 411} 412 413GLOBAL(void) 414jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, 415 DCTELEM * workspace) 416{ 417 jsimd_convsamp_sse2(sample_data, start_col, workspace); 418} 419 420GLOBAL(void) 421jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, 422 FAST_FLOAT * workspace) 423{ 424 jsimd_convsamp_float_sse2(sample_data, start_col, workspace); 425} 426 427GLOBAL(int) 428jsimd_can_fdct_islow (void) 429{ 430 /* The code is optimised for these values only */ 431 if (DCTSIZE != 8) 432 return 0; 433 if (sizeof(DCTELEM) != 2) 434 return 0; 435 436 if (!IS_ALIGNED_SSE(jconst_fdct_islow_sse2)) 437 return 0; 438 439 return 1; 440} 441 442GLOBAL(int) 443jsimd_can_fdct_ifast (void) 444{ 445 /* The code is optimised for these values only */ 446 if (DCTSIZE != 8) 447 return 0; 448 if (sizeof(DCTELEM) != 2) 449 return 0; 450 451 if (!IS_ALIGNED_SSE(jconst_fdct_ifast_sse2)) 452 return 0; 453 454 return 1; 455} 456 457GLOBAL(int) 458jsimd_can_fdct_float (void) 459{ 460 /* The code is optimised for these values only */ 461 if (DCTSIZE != 8) 462 return 0; 463 if (sizeof(FAST_FLOAT) != 4) 464 return 0; 465 466 if (!IS_ALIGNED_SSE(jconst_fdct_float_sse)) 467 return 0; 468 469 return 1; 470} 471 472GLOBAL(void) 473jsimd_fdct_islow (DCTELEM * data) 474{ 475 jsimd_fdct_islow_sse2(data); 476} 477 478GLOBAL(void) 479jsimd_fdct_ifast (DCTELEM * data) 480{ 481 jsimd_fdct_ifast_sse2(data); 482} 483 484GLOBAL(void) 485jsimd_fdct_float (FAST_FLOAT * data) 486{ 487 jsimd_fdct_float_sse(data); 488} 489 490GLOBAL(int) 491jsimd_can_quantize (void) 492{ 493 /* The code is optimised for these values only */ 494 if (DCTSIZE != 8) 495 return 0; 496 if (sizeof(JCOEF) != 2) 497 return 0; 498 if (sizeof(DCTELEM) != 2) 499 return 0; 500 501 return 1; 502} 503 504GLOBAL(int) 505jsimd_can_quantize_float (void) 506{ 507 /* The code is optimised for these values only */ 508 if (DCTSIZE != 8) 509 return 0; 510 if (sizeof(JCOEF) != 2) 511 return 0; 512 if (sizeof(FAST_FLOAT) != 4) 513 return 0; 514 515 return 1; 516} 517 518GLOBAL(void) 519jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors, 520 DCTELEM * workspace) 521{ 522 jsimd_quantize_sse2(coef_block, divisors, workspace); 523} 524 525GLOBAL(void) 526jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors, 527 FAST_FLOAT * workspace) 528{ 529 jsimd_quantize_float_sse2(coef_block, divisors, workspace); 530} 531 532GLOBAL(int) 533jsimd_can_idct_2x2 (void) 534{ 535 /* The code is optimised for these values only */ 536 if (DCTSIZE != 8) 537 return 0; 538 if (sizeof(JCOEF) != 2) 539 return 0; 540 if (BITS_IN_JSAMPLE != 8) 541 return 0; 542 if (sizeof(JDIMENSION) != 4) 543 return 0; 544 if (sizeof(ISLOW_MULT_TYPE) != 2) 545 return 0; 546 547 if (!IS_ALIGNED_SSE(jconst_idct_red_sse2)) 548 return 0; 549 550 return 1; 551} 552 553GLOBAL(int) 554jsimd_can_idct_4x4 (void) 555{ 556 /* The code is optimised for these values only */ 557 if (DCTSIZE != 8) 558 return 0; 559 if (sizeof(JCOEF) != 2) 560 return 0; 561 if (BITS_IN_JSAMPLE != 8) 562 return 0; 563 if (sizeof(JDIMENSION) != 4) 564 return 0; 565 if (sizeof(ISLOW_MULT_TYPE) != 2) 566 return 0; 567 568 if (!IS_ALIGNED_SSE(jconst_idct_red_sse2)) 569 return 0; 570 571 return 1; 572} 573 574GLOBAL(void) 575jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, 576 JCOEFPTR coef_block, JSAMPARRAY output_buf, 577 JDIMENSION output_col) 578{ 579 jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); 580} 581 582GLOBAL(void) 583jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, 584 JCOEFPTR coef_block, JSAMPARRAY output_buf, 585 JDIMENSION output_col) 586{ 587 jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); 588} 589 590GLOBAL(int) 591jsimd_can_idct_islow (void) 592{ 593 /* The code is optimised for these values only */ 594 if (DCTSIZE != 8) 595 return 0; 596 if (sizeof(JCOEF) != 2) 597 return 0; 598 if (BITS_IN_JSAMPLE != 8) 599 return 0; 600 if (sizeof(JDIMENSION) != 4) 601 return 0; 602 if (sizeof(ISLOW_MULT_TYPE) != 2) 603 return 0; 604 605 if (!IS_ALIGNED_SSE(jconst_idct_islow_sse2)) 606 return 0; 607 608 return 1; 609} 610 611GLOBAL(int) 612jsimd_can_idct_ifast (void) 613{ 614 /* The code is optimised for these values only */ 615 if (DCTSIZE != 8) 616 return 0; 617 if (sizeof(JCOEF) != 2) 618 return 0; 619 if (BITS_IN_JSAMPLE != 8) 620 return 0; 621 if (sizeof(JDIMENSION) != 4) 622 return 0; 623 if (sizeof(IFAST_MULT_TYPE) != 2) 624 return 0; 625 if (IFAST_SCALE_BITS != 2) 626 return 0; 627 628 if (!IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) 629 return 0; 630 631 return 1; 632} 633 634GLOBAL(int) 635jsimd_can_idct_float (void) 636{ 637 if (DCTSIZE != 8) 638 return 0; 639 if (sizeof(JCOEF) != 2) 640 return 0; 641 if (BITS_IN_JSAMPLE != 8) 642 return 0; 643 if (sizeof(JDIMENSION) != 4) 644 return 0; 645 if (sizeof(FAST_FLOAT) != 4) 646 return 0; 647 if (sizeof(FLOAT_MULT_TYPE) != 4) 648 return 0; 649 650 if (!IS_ALIGNED_SSE(jconst_idct_float_sse2)) 651 return 0; 652 653 return 1; 654} 655 656GLOBAL(void) 657jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, 658 JCOEFPTR coef_block, JSAMPARRAY output_buf, 659 JDIMENSION output_col) 660{ 661 jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col); 662} 663 664GLOBAL(void) 665jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, 666 JCOEFPTR coef_block, JSAMPARRAY output_buf, 667 JDIMENSION output_col) 668{ 669 jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col); 670} 671 672GLOBAL(void) 673jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, 674 JCOEFPTR coef_block, JSAMPARRAY output_buf, 675 JDIMENSION output_col) 676{ 677 jsimd_idct_float_sse2(compptr->dct_table, coef_block, 678 output_buf, output_col); 679} 680 681