1/* 2 * The copyright in this software is being made available under the 2-clauses 3 * BSD License, included below. This software may be subject to other third 4 * party and contributor rights, including patent rights, and no such rights 5 * are granted under this license. 6 * 7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium 8 * Copyright (c) 2002-2014, Professor Benoit Macq 9 * Copyright (c) 2001-2003, David Janssens 10 * Copyright (c) 2002-2003, Yannick Verschueren 11 * Copyright (c) 2003-2007, Francois-Olivier Devaux 12 * Copyright (c) 2003-2014, Antonin Descampe 13 * Copyright (c) 2005, Herve Drolon, FreeImage Team 14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net> 15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com> 16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com> 17 * All rights reserved. 18 * 19 * Redistribution and use in source and binary forms, with or without 20 * modification, are permitted provided that the following conditions 21 * are met: 22 * 1. Redistributions of source code must retain the above copyright 23 * notice, this list of conditions and the following disclaimer. 24 * 2. Redistributions in binary form must reproduce the above copyright 25 * notice, this list of conditions and the following disclaimer in the 26 * documentation and/or other materials provided with the distribution. 27 * 28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' 29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 38 * POSSIBILITY OF SUCH DAMAGE. 39 */ 40 41#include <assert.h> 42 43#define OPJ_SKIP_POISON 44#include "opj_includes.h" 45 46#ifdef __SSE__ 47#include <xmmintrin.h> 48#endif 49#ifdef __SSE2__ 50#include <emmintrin.h> 51#endif 52#ifdef __SSSE3__ 53#include <tmmintrin.h> 54#endif 55#ifdef __AVX2__ 56#include <immintrin.h> 57#endif 58 59#if defined(__GNUC__) 60#pragma GCC poison malloc calloc realloc free 61#endif 62 63/** @defgroup DWT DWT - Implementation of a discrete wavelet transform */ 64/*@{*/ 65 66#ifdef __AVX2__ 67/** Number of int32 values in a AVX2 register */ 68#define VREG_INT_COUNT 8 69#else 70/** Number of int32 values in a SSE2 register */ 71#define VREG_INT_COUNT 4 72#endif 73 74/** Number of columns that we can process in parallel in the vertical pass */ 75#define PARALLEL_COLS_53 (2*VREG_INT_COUNT) 76 77/** @name Local data structures */ 78/*@{*/ 79 80typedef struct dwt_local { 81 OPJ_INT32* mem; 82 OPJ_SIZE_T mem_count; 83 OPJ_INT32 dn; /* number of elements in high pass band */ 84 OPJ_INT32 sn; /* number of elements in low pass band */ 85 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ 86} opj_dwt_t; 87 88typedef union { 89 OPJ_FLOAT32 f[4]; 90} opj_v4_t; 91 92typedef struct v4dwt_local { 93 opj_v4_t* wavelet ; 94 OPJ_INT32 dn ; /* number of elements in high pass band */ 95 OPJ_INT32 sn ; /* number of elements in low pass band */ 96 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ 97 OPJ_UINT32 win_l_x0; /* start coord in low pass band */ 98 OPJ_UINT32 win_l_x1; /* end coord in low pass band */ 99 OPJ_UINT32 win_h_x0; /* start coord in high pass band */ 100 OPJ_UINT32 win_h_x1; /* end coord in high pass band */ 101} opj_v4dwt_t ; 102 103static const OPJ_FLOAT32 opj_dwt_alpha = 1.586134342f; /* 12994 */ 104static const OPJ_FLOAT32 opj_dwt_beta = 0.052980118f; /* 434 */ 105static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /* -7233 */ 106static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /* -3633 */ 107 108static const OPJ_FLOAT32 opj_K = 1.230174105f; /* 10078 */ 109static const OPJ_FLOAT32 opj_c13318 = 1.625732422f; 110 111/*@}*/ 112 113/** 114Virtual function type for wavelet transform in 1-D 115*/ 116typedef void (*DWT1DFN)(const opj_dwt_t* v); 117 118/** @name Local static functions */ 119/*@{*/ 120 121/** 122Forward lazy transform (horizontal) 123*/ 124static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 125 OPJ_INT32 sn, OPJ_INT32 cas); 126/** 127Forward lazy transform (vertical) 128*/ 129static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 130 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas); 131/** 132Forward 5-3 wavelet transform in 1-D 133*/ 134static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 135 OPJ_INT32 sn, OPJ_INT32 cas); 136/** 137Forward 9-7 wavelet transform in 1-D 138*/ 139static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count, 140 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas); 141 142 143/** 144Explicit calculation of the Quantization Stepsizes 145*/ 146static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, 147 opj_stepsize_t *bandno_stepsize); 148/** 149Inverse wavelet transform in 2-D. 150*/ 151static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, 152 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i); 153 154static OPJ_BOOL opj_dwt_decode_partial_tile( 155 opj_tcd_tilecomp_t* tilec, 156 OPJ_UINT32 numres); 157 158static OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec, 159 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32)); 160 161static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, 162 OPJ_UINT32 i); 163 164/* <summary> */ 165/* Inverse 9-7 wavelet transform in 1-D. */ 166/* </summary> */ 167static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt); 168 169static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, 170 OPJ_FLOAT32* OPJ_RESTRICT a, 171 OPJ_UINT32 width, 172 OPJ_UINT32 remaining_height); 173 174static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 175 OPJ_FLOAT32* OPJ_RESTRICT a, 176 OPJ_UINT32 width, 177 OPJ_UINT32 nb_elts_read); 178 179#ifdef __SSE__ 180static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, 181 OPJ_UINT32 start, 182 OPJ_UINT32 end, 183 const __m128 c); 184 185static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, 186 OPJ_UINT32 start, 187 OPJ_UINT32 end, 188 OPJ_UINT32 m, __m128 c); 189 190#else 191static void opj_v4dwt_decode_step1(opj_v4_t* w, 192 OPJ_UINT32 start, 193 OPJ_UINT32 end, 194 const OPJ_FLOAT32 c); 195 196static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, 197 OPJ_UINT32 start, 198 OPJ_UINT32 end, 199 OPJ_UINT32 m, 200 OPJ_FLOAT32 c); 201 202#endif 203 204/*@}*/ 205 206/*@}*/ 207 208#define IDX_S(i) (i)*2 209#define IDX_D(i) 1 + (i)* 2 210#define UNDERFLOW_SN(i) ((i) >= sn&&sn>0) 211#define UNDERFLOW_DN(i) ((i) >= dn&&dn>0) 212#define OVERFLOW_S(i) (IDX_S(i) >= a_count) 213#define OVERFLOW_D(i) (IDX_D(i) >= a_count) 214 215#define OPJ_S(i) a[IDX_S(i)] 216#define OPJ_D(i) a[IDX_D(i)] 217#define OPJ_S_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_SN(i) ? OPJ_S(sn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i))) 218#define OPJ_D_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_DN(i) ? OPJ_D(dn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i))) 219/* new */ 220#define OPJ_SS_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_DN(i) ? OPJ_S(dn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i))) 221#define OPJ_DD_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_SN(i) ? OPJ_D(sn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i))) 222 223/* <summary> */ 224/* This table contains the norms of the 5-3 wavelets for different bands. */ 225/* </summary> */ 226/* FIXME! the array should really be extended up to 33 resolution levels */ 227/* See https://github.com/uclouvain/openjpeg/issues/493 */ 228static const OPJ_FLOAT64 opj_dwt_norms[4][10] = { 229 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3}, 230 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, 231 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, 232 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93} 233}; 234 235/* <summary> */ 236/* This table contains the norms of the 9-7 wavelets for different bands. */ 237/* </summary> */ 238/* FIXME! the array should really be extended up to 33 resolution levels */ 239/* See https://github.com/uclouvain/openjpeg/issues/493 */ 240static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { 241 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9}, 242 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, 243 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, 244 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2} 245}; 246 247/* 248========================================================== 249 local functions 250========================================================== 251*/ 252 253/* <summary> */ 254/* Forward lazy transform (horizontal). */ 255/* </summary> */ 256static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 257 OPJ_INT32 sn, OPJ_INT32 cas) 258{ 259 OPJ_INT32 i; 260 OPJ_INT32 * l_dest = b; 261 OPJ_INT32 * l_src = a + cas; 262 263 for (i = 0; i < sn; ++i) { 264 *l_dest++ = *l_src; 265 l_src += 2; 266 } 267 268 l_dest = b + sn; 269 l_src = a + 1 - cas; 270 271 for (i = 0; i < dn; ++i) { 272 *l_dest++ = *l_src; 273 l_src += 2; 274 } 275} 276 277/* <summary> */ 278/* Forward lazy transform (vertical). */ 279/* </summary> */ 280static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn, 281 OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas) 282{ 283 OPJ_INT32 i = sn; 284 OPJ_INT32 * l_dest = b; 285 OPJ_INT32 * l_src = a + cas; 286 287 while (i--) { 288 *l_dest = *l_src; 289 l_dest += x; 290 l_src += 2; 291 } /* b[i*x]=a[2*i+cas]; */ 292 293 l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x; 294 l_src = a + 1 - cas; 295 296 i = dn; 297 while (i--) { 298 *l_dest = *l_src; 299 l_dest += x; 300 l_src += 2; 301 } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/ 302} 303 304#ifdef STANDARD_SLOW_VERSION 305/* <summary> */ 306/* Inverse lazy transform (horizontal). */ 307/* </summary> */ 308static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) 309{ 310 OPJ_INT32 *ai = a; 311 OPJ_INT32 *bi = h->mem + h->cas; 312 OPJ_INT32 i = h->sn; 313 while (i--) { 314 *bi = *(ai++); 315 bi += 2; 316 } 317 ai = a + h->sn; 318 bi = h->mem + 1 - h->cas; 319 i = h->dn ; 320 while (i--) { 321 *bi = *(ai++); 322 bi += 2; 323 } 324} 325 326/* <summary> */ 327/* Inverse lazy transform (vertical). */ 328/* </summary> */ 329static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) 330{ 331 OPJ_INT32 *ai = a; 332 OPJ_INT32 *bi = v->mem + v->cas; 333 OPJ_INT32 i = v->sn; 334 while (i--) { 335 *bi = *ai; 336 bi += 2; 337 ai += x; 338 } 339 ai = a + (v->sn * (OPJ_SIZE_T)x); 340 bi = v->mem + 1 - v->cas; 341 i = v->dn ; 342 while (i--) { 343 *bi = *ai; 344 bi += 2; 345 ai += x; 346 } 347} 348 349#endif /* STANDARD_SLOW_VERSION */ 350 351/* <summary> */ 352/* Forward 5-3 wavelet transform in 1-D. */ 353/* </summary> */ 354static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 355 OPJ_INT32 sn, OPJ_INT32 cas) 356{ 357 OPJ_INT32 i; 358 359 if (!cas) { 360 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 361 for (i = 0; i < dn; i++) { 362 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 363 } 364 for (i = 0; i < sn; i++) { 365 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 366 } 367 } 368 } else { 369 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 370 OPJ_S(0) *= 2; 371 } else { 372 for (i = 0; i < dn; i++) { 373 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 374 } 375 for (i = 0; i < sn; i++) { 376 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 377 } 378 } 379 } 380} 381 382#ifdef STANDARD_SLOW_VERSION 383/* <summary> */ 384/* Inverse 5-3 wavelet transform in 1-D. */ 385/* </summary> */ 386static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn, 387 OPJ_INT32 sn, OPJ_INT32 cas) 388{ 389 OPJ_INT32 i; 390 391 if (!cas) { 392 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 393 for (i = 0; i < sn; i++) { 394 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 395 } 396 for (i = 0; i < dn; i++) { 397 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 398 } 399 } 400 } else { 401 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 402 OPJ_S(0) /= 2; 403 } else { 404 for (i = 0; i < sn; i++) { 405 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 406 } 407 for (i = 0; i < dn; i++) { 408 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 409 } 410 } 411 } 412} 413 414static void opj_dwt_decode_1(const opj_dwt_t *v) 415{ 416 opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas); 417} 418 419#endif /* STANDARD_SLOW_VERSION */ 420 421#if !defined(STANDARD_SLOW_VERSION) 422static void opj_idwt53_h_cas0(OPJ_INT32* tmp, 423 const OPJ_INT32 sn, 424 const OPJ_INT32 len, 425 OPJ_INT32* tiledp) 426{ 427 OPJ_INT32 i, j; 428 const OPJ_INT32* in_even = &tiledp[0]; 429 const OPJ_INT32* in_odd = &tiledp[sn]; 430 431#ifdef TWO_PASS_VERSION 432 /* For documentation purpose: performs lifting in two iterations, */ 433 /* but without explicit interleaving */ 434 435 assert(len > 1); 436 437 /* Even */ 438 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1); 439 for (i = 2, j = 0; i <= len - 2; i += 2, j++) { 440 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2); 441 } 442 if (len & 1) { /* if len is odd */ 443 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1); 444 } 445 446 /* Odd */ 447 for (i = 1, j = 0; i < len - 1; i += 2, j++) { 448 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1); 449 } 450 if (!(len & 1)) { /* if len is even */ 451 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2]; 452 } 453#else 454 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; 455 456 assert(len > 1); 457 458 /* Improved version of the TWO_PASS_VERSION: */ 459 /* Performs lifting in one single iteration. Saves memory */ 460 /* accesses and explicit interleaving. */ 461 s1n = in_even[0]; 462 d1n = in_odd[0]; 463 s0n = s1n - ((d1n + 1) >> 1); 464 465 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { 466 d1c = d1n; 467 s0c = s0n; 468 469 s1n = in_even[j]; 470 d1n = in_odd[j]; 471 472 s0n = s1n - ((d1c + d1n + 2) >> 2); 473 474 tmp[i ] = s0c; 475 tmp[i + 1] = d1c + ((s0c + s0n) >> 1); 476 } 477 478 tmp[i] = s0n; 479 480 if (len & 1) { 481 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1); 482 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); 483 } else { 484 tmp[len - 1] = d1n + s0n; 485 } 486#endif 487 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 488} 489 490static void opj_idwt53_h_cas1(OPJ_INT32* tmp, 491 const OPJ_INT32 sn, 492 const OPJ_INT32 len, 493 OPJ_INT32* tiledp) 494{ 495 OPJ_INT32 i, j; 496 const OPJ_INT32* in_even = &tiledp[sn]; 497 const OPJ_INT32* in_odd = &tiledp[0]; 498 499#ifdef TWO_PASS_VERSION 500 /* For documentation purpose: performs lifting in two iterations, */ 501 /* but without explicit interleaving */ 502 503 assert(len > 2); 504 505 /* Odd */ 506 for (i = 1, j = 0; i < len - 1; i += 2, j++) { 507 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2); 508 } 509 if (!(len & 1)) { 510 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1); 511 } 512 513 /* Even */ 514 tmp[0] = in_even[0] + tmp[1]; 515 for (i = 2, j = 1; i < len - 1; i += 2, j++) { 516 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1); 517 } 518 if (len & 1) { 519 tmp[len - 1] = in_even[len / 2] + tmp[len - 2]; 520 } 521#else 522 OPJ_INT32 s1, s2, dc, dn; 523 524 assert(len > 2); 525 526 /* Improved version of the TWO_PASS_VERSION: */ 527 /* Performs lifting in one single iteration. Saves memory */ 528 /* accesses and explicit interleaving. */ 529 530 s1 = in_even[1]; 531 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); 532 tmp[0] = in_even[0] + dc; 533 534 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 535 536 s2 = in_even[j + 1]; 537 538 dn = in_odd[j] - ((s1 + s2 + 2) >> 2); 539 tmp[i ] = dc; 540 tmp[i + 1] = s1 + ((dn + dc) >> 1); 541 542 dc = dn; 543 s1 = s2; 544 } 545 546 tmp[i] = dc; 547 548 if (!(len & 1)) { 549 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1); 550 tmp[len - 2] = s1 + ((dn + dc) >> 1); 551 tmp[len - 1] = dn; 552 } else { 553 tmp[len - 1] = s1 + dc; 554 } 555#endif 556 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 557} 558 559 560#endif /* !defined(STANDARD_SLOW_VERSION) */ 561 562/* <summary> */ 563/* Inverse 5-3 wavelet transform in 1-D for one row. */ 564/* </summary> */ 565/* Performs interleave, inverse wavelet transform and copy back to buffer */ 566static void opj_idwt53_h(const opj_dwt_t *dwt, 567 OPJ_INT32* tiledp) 568{ 569#ifdef STANDARD_SLOW_VERSION 570 /* For documentation purpose */ 571 opj_dwt_interleave_h(dwt, tiledp); 572 opj_dwt_decode_1(dwt); 573 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32)); 574#else 575 const OPJ_INT32 sn = dwt->sn; 576 const OPJ_INT32 len = sn + dwt->dn; 577 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */ 578 if (len > 1) { 579 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp); 580 } else { 581 /* Unmodified value */ 582 } 583 } else { /* Left-most sample is on odd coordinate */ 584 if (len == 1) { 585 tiledp[0] /= 2; 586 } else if (len == 2) { 587 OPJ_INT32* out = dwt->mem; 588 const OPJ_INT32* in_even = &tiledp[sn]; 589 const OPJ_INT32* in_odd = &tiledp[0]; 590 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); 591 out[0] = in_even[0] + out[1]; 592 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32)); 593 } else if (len > 2) { 594 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp); 595 } 596 } 597#endif 598} 599 600#if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) 601 602/* Conveniency macros to improve the readabilty of the formulas */ 603#if __AVX2__ 604#define VREG __m256i 605#define LOAD_CST(x) _mm256_set1_epi32(x) 606#define LOAD(x) _mm256_load_si256((const VREG*)(x)) 607#define LOADU(x) _mm256_loadu_si256((const VREG*)(x)) 608#define STORE(x,y) _mm256_store_si256((VREG*)(x),(y)) 609#define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y)) 610#define ADD(x,y) _mm256_add_epi32((x),(y)) 611#define SUB(x,y) _mm256_sub_epi32((x),(y)) 612#define SAR(x,y) _mm256_srai_epi32((x),(y)) 613#else 614#define VREG __m128i 615#define LOAD_CST(x) _mm_set1_epi32(x) 616#define LOAD(x) _mm_load_si128((const VREG*)(x)) 617#define LOADU(x) _mm_loadu_si128((const VREG*)(x)) 618#define STORE(x,y) _mm_store_si128((VREG*)(x),(y)) 619#define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y)) 620#define ADD(x,y) _mm_add_epi32((x),(y)) 621#define SUB(x,y) _mm_sub_epi32((x),(y)) 622#define SAR(x,y) _mm_srai_epi32((x),(y)) 623#endif 624#define ADD3(x,y,z) ADD(ADD(x,y),z) 625 626static 627void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col, 628 const OPJ_INT32* tmp, 629 OPJ_INT32 len, 630 OPJ_SIZE_T stride) 631{ 632 OPJ_INT32 i; 633 for (i = 0; i < len; ++i) { 634 /* A memcpy(&tiledp_col[i * stride + 0], 635 &tmp[PARALLEL_COLS_53 * i + 0], 636 PARALLEL_COLS_53 * sizeof(OPJ_INT32)) 637 would do but would be a tiny bit slower. 638 We can take here advantage of our knowledge of alignment */ 639 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0], 640 LOAD(&tmp[PARALLEL_COLS_53 * i + 0])); 641 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT], 642 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT])); 643 } 644} 645 646/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or 647 * 16 in AVX2, when top-most pixel is on even coordinate */ 648static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2( 649 OPJ_INT32* tmp, 650 const OPJ_INT32 sn, 651 const OPJ_INT32 len, 652 OPJ_INT32* tiledp_col, 653 const OPJ_SIZE_T stride) 654{ 655 const OPJ_INT32* in_even = &tiledp_col[0]; 656 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 657 658 OPJ_INT32 i; 659 OPJ_SIZE_T j; 660 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0; 661 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1; 662 const VREG two = LOAD_CST(2); 663 664 assert(len > 1); 665#if __AVX2__ 666 assert(PARALLEL_COLS_53 == 16); 667 assert(VREG_INT_COUNT == 8); 668#else 669 assert(PARALLEL_COLS_53 == 8); 670 assert(VREG_INT_COUNT == 4); 671#endif 672 673 /* Note: loads of input even/odd values must be done in a unaligned */ 674 /* fashion. But stores in tmp can be done with aligned store, since */ 675 /* the temporary buffer is properly aligned */ 676 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); 677 678 s1n_0 = LOADU(in_even + 0); 679 s1n_1 = LOADU(in_even + VREG_INT_COUNT); 680 d1n_0 = LOADU(in_odd); 681 d1n_1 = LOADU(in_odd + VREG_INT_COUNT); 682 683 /* s0n = s1n - ((d1n + 1) >> 1); <==> */ 684 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */ 685 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); 686 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); 687 688 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { 689 d1c_0 = d1n_0; 690 s0c_0 = s0n_0; 691 d1c_1 = d1n_1; 692 s0c_1 = s0n_1; 693 694 s1n_0 = LOADU(in_even + j * stride); 695 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT); 696 d1n_0 = LOADU(in_odd + j * stride); 697 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT); 698 699 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/ 700 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2)); 701 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2)); 702 703 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0); 704 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1); 705 706 /* d1c + ((s0c + s0n) >> 1) */ 707 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, 708 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1))); 709 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, 710 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1))); 711 } 712 713 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0); 714 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1); 715 716 if (len & 1) { 717 VREG tmp_len_minus_1; 718 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride); 719 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ 720 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); 721 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1); 722 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ 723 STORE(tmp + PARALLEL_COLS_53 * (len - 2), 724 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1))); 725 726 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT); 727 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ 728 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); 729 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 730 tmp_len_minus_1); 731 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ 732 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, 733 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1))); 734 735 736 } else { 737 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, 738 ADD(d1n_0, s0n_0)); 739 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 740 ADD(d1n_1, s0n_1)); 741 } 742 743 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); 744} 745 746 747/** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or 748 * 16 in AVX2, when top-most pixel is on odd coordinate */ 749static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2( 750 OPJ_INT32* tmp, 751 const OPJ_INT32 sn, 752 const OPJ_INT32 len, 753 OPJ_INT32* tiledp_col, 754 const OPJ_SIZE_T stride) 755{ 756 OPJ_INT32 i; 757 OPJ_SIZE_T j; 758 759 VREG s1_0, s2_0, dc_0, dn_0; 760 VREG s1_1, s2_1, dc_1, dn_1; 761 const VREG two = LOAD_CST(2); 762 763 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 764 const OPJ_INT32* in_odd = &tiledp_col[0]; 765 766 assert(len > 2); 767#if __AVX2__ 768 assert(PARALLEL_COLS_53 == 16); 769 assert(VREG_INT_COUNT == 8); 770#else 771 assert(PARALLEL_COLS_53 == 8); 772 assert(VREG_INT_COUNT == 4); 773#endif 774 775 /* Note: loads of input even/odd values must be done in a unaligned */ 776 /* fashion. But stores in tmp can be done with aligned store, since */ 777 /* the temporary buffer is properly aligned */ 778 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); 779 780 s1_0 = LOADU(in_even + stride); 781 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ 782 dc_0 = SUB(LOADU(in_odd + 0), 783 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2)); 784 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0)); 785 786 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT); 787 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ 788 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT), 789 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2)); 790 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT, 791 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1)); 792 793 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 794 795 s2_0 = LOADU(in_even + (j + 1) * stride); 796 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT); 797 798 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */ 799 dn_0 = SUB(LOADU(in_odd + j * stride), 800 SAR(ADD3(s1_0, s2_0, two), 2)); 801 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT), 802 SAR(ADD3(s1_1, s2_1, two), 2)); 803 804 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); 805 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); 806 807 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */ 808 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, 809 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); 810 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, 811 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); 812 813 dc_0 = dn_0; 814 s1_0 = s2_0; 815 dc_1 = dn_1; 816 s1_1 = s2_1; 817 } 818 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); 819 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); 820 821 if (!(len & 1)) { 822 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */ 823 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride), 824 SAR(ADD3(s1_0, s1_0, two), 2)); 825 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT), 826 SAR(ADD3(s1_1, s1_1, two), 2)); 827 828 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */ 829 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0, 830 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); 831 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, 832 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); 833 834 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0); 835 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1); 836 } else { 837 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0)); 838 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, 839 ADD(s1_1, dc_1)); 840 } 841 842 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); 843} 844 845#undef VREG 846#undef LOAD_CST 847#undef LOADU 848#undef LOAD 849#undef STORE 850#undef STOREU 851#undef ADD 852#undef ADD3 853#undef SUB 854#undef SAR 855 856#endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */ 857 858#if !defined(STANDARD_SLOW_VERSION) 859/** Vertical inverse 5x3 wavelet transform for one column, when top-most 860 * pixel is on even coordinate */ 861static void opj_idwt3_v_cas0(OPJ_INT32* tmp, 862 const OPJ_INT32 sn, 863 const OPJ_INT32 len, 864 OPJ_INT32* tiledp_col, 865 const OPJ_SIZE_T stride) 866{ 867 OPJ_INT32 i, j; 868 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; 869 870 assert(len > 1); 871 872 /* Performs lifting in one single iteration. Saves memory */ 873 /* accesses and explicit interleaving. */ 874 875 s1n = tiledp_col[0]; 876 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride]; 877 s0n = s1n - ((d1n + 1) >> 1); 878 879 for (i = 0, j = 0; i < (len - 3); i += 2, j++) { 880 d1c = d1n; 881 s0c = s0n; 882 883 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride]; 884 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride]; 885 886 s0n = s1n - ((d1c + d1n + 2) >> 2); 887 888 tmp[i ] = s0c; 889 tmp[i + 1] = d1c + ((s0c + s0n) >> 1); 890 } 891 892 tmp[i] = s0n; 893 894 if (len & 1) { 895 tmp[len - 1] = 896 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] - 897 ((d1n + 1) >> 1); 898 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); 899 } else { 900 tmp[len - 1] = d1n + s0n; 901 } 902 903 for (i = 0; i < len; ++i) { 904 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; 905 } 906} 907 908 909/** Vertical inverse 5x3 wavelet transform for one column, when top-most 910 * pixel is on odd coordinate */ 911static void opj_idwt3_v_cas1(OPJ_INT32* tmp, 912 const OPJ_INT32 sn, 913 const OPJ_INT32 len, 914 OPJ_INT32* tiledp_col, 915 const OPJ_SIZE_T stride) 916{ 917 OPJ_INT32 i, j; 918 OPJ_INT32 s1, s2, dc, dn; 919 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 920 const OPJ_INT32* in_odd = &tiledp_col[0]; 921 922 assert(len > 2); 923 924 /* Performs lifting in one single iteration. Saves memory */ 925 /* accesses and explicit interleaving. */ 926 927 s1 = in_even[stride]; 928 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); 929 tmp[0] = in_even[0] + dc; 930 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { 931 932 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride]; 933 934 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2); 935 tmp[i ] = dc; 936 tmp[i + 1] = s1 + ((dn + dc) >> 1); 937 938 dc = dn; 939 s1 = s2; 940 } 941 tmp[i] = dc; 942 if (!(len & 1)) { 943 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1); 944 tmp[len - 2] = s1 + ((dn + dc) >> 1); 945 tmp[len - 1] = dn; 946 } else { 947 tmp[len - 1] = s1 + dc; 948 } 949 950 for (i = 0; i < len; ++i) { 951 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; 952 } 953} 954#endif /* !defined(STANDARD_SLOW_VERSION) */ 955 956/* <summary> */ 957/* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */ 958/* </summary> */ 959/* Performs interleave, inverse wavelet transform and copy back to buffer */ 960static void opj_idwt53_v(const opj_dwt_t *dwt, 961 OPJ_INT32* tiledp_col, 962 OPJ_SIZE_T stride, 963 OPJ_INT32 nb_cols) 964{ 965#ifdef STANDARD_SLOW_VERSION 966 /* For documentation purpose */ 967 OPJ_INT32 k, c; 968 for (c = 0; c < nb_cols; c ++) { 969 opj_dwt_interleave_v(dwt, tiledp_col + c, stride); 970 opj_dwt_decode_1(dwt); 971 for (k = 0; k < dwt->sn + dwt->dn; ++k) { 972 tiledp_col[c + k * stride] = dwt->mem[k]; 973 } 974 } 975#else 976 const OPJ_INT32 sn = dwt->sn; 977 const OPJ_INT32 len = sn + dwt->dn; 978 if (dwt->cas == 0) { 979 /* If len == 1, unmodified value */ 980 981#if (defined(__SSE2__) || defined(__AVX2__)) 982 if (len > 1 && nb_cols == PARALLEL_COLS_53) { 983 /* Same as below general case, except that thanks to SSE2/AVX2 */ 984 /* we can efficently process 8/16 columns in parallel */ 985 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); 986 return; 987 } 988#endif 989 if (len > 1) { 990 OPJ_INT32 c; 991 for (c = 0; c < nb_cols; c++, tiledp_col++) { 992 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride); 993 } 994 return; 995 } 996 } else { 997 if (len == 1) { 998 OPJ_INT32 c; 999 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1000 tiledp_col[0] /= 2; 1001 } 1002 return; 1003 } 1004 1005 if (len == 2) { 1006 OPJ_INT32 c; 1007 OPJ_INT32* out = dwt->mem; 1008 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1009 OPJ_INT32 i; 1010 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; 1011 const OPJ_INT32* in_odd = &tiledp_col[0]; 1012 1013 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); 1014 out[0] = in_even[0] + out[1]; 1015 1016 for (i = 0; i < len; ++i) { 1017 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i]; 1018 } 1019 } 1020 1021 return; 1022 } 1023 1024#if (defined(__SSE2__) || defined(__AVX2__)) 1025 if (len > 2 && nb_cols == PARALLEL_COLS_53) { 1026 /* Same as below general case, except that thanks to SSE2/AVX2 */ 1027 /* we can efficently process 8/16 columns in parallel */ 1028 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); 1029 return; 1030 } 1031#endif 1032 if (len > 2) { 1033 OPJ_INT32 c; 1034 for (c = 0; c < nb_cols; c++, tiledp_col++) { 1035 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride); 1036 } 1037 return; 1038 } 1039 } 1040#endif 1041} 1042 1043 1044/* <summary> */ 1045/* Forward 9-7 wavelet transform in 1-D. */ 1046/* </summary> */ 1047static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count, 1048 OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas) 1049{ 1050 OPJ_INT32 i; 1051 if (!cas) { 1052 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1053 for (i = 0; i < dn; i++) { 1054 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993); 1055 } 1056 for (i = 0; i < sn; i++) { 1057 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434); 1058 } 1059 for (i = 0; i < dn; i++) { 1060 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233); 1061 } 1062 for (i = 0; i < sn; i++) { 1063 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633); 1064 } 1065 for (i = 0; i < dn; i++) { 1066 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038); /*5038 */ 1067 } 1068 for (i = 0; i < sn; i++) { 1069 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659); /*6660 */ 1070 } 1071 } 1072 } else { 1073 if ((sn > 0) || (dn > 1)) { /* NEW : CASE ONE ELEMENT */ 1074 for (i = 0; i < dn; i++) { 1075 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993); 1076 } 1077 for (i = 0; i < sn; i++) { 1078 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434); 1079 } 1080 for (i = 0; i < dn; i++) { 1081 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233); 1082 } 1083 for (i = 0; i < sn; i++) { 1084 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633); 1085 } 1086 for (i = 0; i < dn; i++) { 1087 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038); /*5038 */ 1088 } 1089 for (i = 0; i < sn; i++) { 1090 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659); /*6660 */ 1091 } 1092 } 1093 } 1094} 1095 1096static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, 1097 opj_stepsize_t *bandno_stepsize) 1098{ 1099 OPJ_INT32 p, n; 1100 p = opj_int_floorlog2(stepsize) - 13; 1101 n = 11 - opj_int_floorlog2(stepsize); 1102 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff; 1103 bandno_stepsize->expn = numbps - p; 1104} 1105 1106/* 1107========================================================== 1108 DWT interface 1109========================================================== 1110*/ 1111 1112 1113/* <summary> */ 1114/* Forward 5-3 wavelet transform in 2-D. */ 1115/* </summary> */ 1116static INLINE OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec, 1117 void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32)) 1118{ 1119 OPJ_INT32 i, j, k; 1120 OPJ_INT32 *a = 00; 1121 OPJ_INT32 *aj = 00; 1122 OPJ_INT32 *bj = 00; 1123 OPJ_INT32 w, l; 1124 1125 OPJ_INT32 rw; /* width of the resolution level computed */ 1126 OPJ_INT32 rh; /* height of the resolution level computed */ 1127 OPJ_SIZE_T l_data_count; 1128 OPJ_SIZE_T l_data_size; 1129 1130 opj_tcd_resolution_t * l_cur_res = 0; 1131 opj_tcd_resolution_t * l_last_res = 0; 1132 1133 w = tilec->x1 - tilec->x0; 1134 l = (OPJ_INT32)tilec->numresolutions - 1; 1135 a = tilec->data; 1136 1137 l_cur_res = tilec->resolutions + l; 1138 l_last_res = l_cur_res - 1; 1139 1140 l_data_count = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); 1141 /* overflow check */ 1142 if (l_data_count > (SIZE_MAX / sizeof(OPJ_INT32))) { 1143 /* FIXME event manager error callback */ 1144 return OPJ_FALSE; 1145 } 1146 l_data_size = l_data_count * sizeof(OPJ_INT32); 1147 bj = (OPJ_INT32*)opj_malloc(l_data_size); 1148 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ 1149 /* in that case, so do not error out */ 1150 if (l_data_size != 0 && ! bj) { 1151 return OPJ_FALSE; 1152 } 1153 i = l; 1154 1155 while (i--) { 1156 OPJ_INT32 rw1; /* width of the resolution level once lower than computed one */ 1157 OPJ_INT32 rh1; /* height of the resolution level once lower than computed one */ 1158 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ 1159 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ 1160 OPJ_INT32 dn, sn; 1161 1162 rw = l_cur_res->x1 - l_cur_res->x0; 1163 rh = l_cur_res->y1 - l_cur_res->y0; 1164 rw1 = l_last_res->x1 - l_last_res->x0; 1165 rh1 = l_last_res->y1 - l_last_res->y0; 1166 1167 cas_row = l_cur_res->x0 & 1; 1168 cas_col = l_cur_res->y0 & 1; 1169 1170 sn = rh1; 1171 dn = rh - rh1; 1172 for (j = 0; j < rw; ++j) { 1173 aj = a + j; 1174 for (k = 0; k < rh; ++k) { 1175 bj[k] = aj[k * w]; 1176 } 1177 1178 (*p_function) (bj, l_data_count, dn, sn, cas_col); 1179 1180 opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col); 1181 } 1182 1183 sn = rw1; 1184 dn = rw - rw1; 1185 1186 for (j = 0; j < rh; j++) { 1187 aj = a + j * w; 1188 for (k = 0; k < rw; k++) { 1189 bj[k] = aj[k]; 1190 } 1191 (*p_function) (bj, l_data_count, dn, sn, cas_row); 1192 opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row); 1193 } 1194 1195 l_cur_res = l_last_res; 1196 1197 --l_last_res; 1198 } 1199 1200 opj_free(bj); 1201 return OPJ_TRUE; 1202} 1203 1204/* Forward 5-3 wavelet transform in 2-D. */ 1205/* </summary> */ 1206OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec) 1207{ 1208 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1); 1209} 1210 1211/* <summary> */ 1212/* Inverse 5-3 wavelet transform in 2-D. */ 1213/* </summary> */ 1214OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, 1215 OPJ_UINT32 numres) 1216{ 1217 if (p_tcd->whole_tile_decoding) { 1218 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres); 1219 } else { 1220 return opj_dwt_decode_partial_tile(tilec, numres); 1221 } 1222} 1223 1224 1225/* <summary> */ 1226/* Get gain of 5-3 wavelet transform. */ 1227/* </summary> */ 1228OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient) 1229{ 1230 if (orient == 0) { 1231 return 0; 1232 } 1233 if (orient == 1 || orient == 2) { 1234 return 1; 1235 } 1236 return 2; 1237} 1238 1239/* <summary> */ 1240/* Get norm of 5-3 wavelet. */ 1241/* </summary> */ 1242OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) 1243{ 1244 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ 1245 /* but the array should really be extended up to 33 resolution levels */ 1246 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 1247 if (orient == 0 && level >= 10) { 1248 level = 9; 1249 } else if (orient > 0 && level >= 9) { 1250 level = 8; 1251 } 1252 return opj_dwt_norms[orient][level]; 1253} 1254 1255/* <summary> */ 1256/* Forward 9-7 wavelet transform in 2-D. */ 1257/* </summary> */ 1258OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec) 1259{ 1260 return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real); 1261} 1262 1263/* <summary> */ 1264/* Get gain of 9-7 wavelet transform. */ 1265/* </summary> */ 1266OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient) 1267{ 1268 (void)orient; 1269 return 0; 1270} 1271 1272/* <summary> */ 1273/* Get norm of 9-7 wavelet. */ 1274/* </summary> */ 1275OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) 1276{ 1277 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ 1278 /* but the array should really be extended up to 33 resolution levels */ 1279 /* See https://github.com/uclouvain/openjpeg/issues/493 */ 1280 if (orient == 0 && level >= 10) { 1281 level = 9; 1282 } else if (orient > 0 && level >= 9) { 1283 level = 8; 1284 } 1285 return opj_dwt_norms_real[orient][level]; 1286} 1287 1288void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) 1289{ 1290 OPJ_UINT32 numbands, bandno; 1291 numbands = 3 * tccp->numresolutions - 2; 1292 for (bandno = 0; bandno < numbands; bandno++) { 1293 OPJ_FLOAT64 stepsize; 1294 OPJ_UINT32 resno, level, orient, gain; 1295 1296 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1); 1297 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1); 1298 level = tccp->numresolutions - 1 - resno; 1299 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || 1300 (orient == 2)) ? 1 : 2)); 1301 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { 1302 stepsize = 1.0; 1303 } else { 1304 OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level]; 1305 stepsize = (1 << (gain)) / norm; 1306 } 1307 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), 1308 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]); 1309 } 1310} 1311 1312/* <summary> */ 1313/* Determine maximum computed resolution level for inverse wavelet transform */ 1314/* </summary> */ 1315static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, 1316 OPJ_UINT32 i) 1317{ 1318 OPJ_UINT32 mr = 0; 1319 OPJ_UINT32 w; 1320 while (--i) { 1321 ++r; 1322 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) { 1323 mr = w ; 1324 } 1325 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) { 1326 mr = w ; 1327 } 1328 } 1329 return mr ; 1330} 1331 1332typedef struct { 1333 opj_dwt_t h; 1334 OPJ_UINT32 rw; 1335 OPJ_UINT32 w; 1336 OPJ_INT32 * OPJ_RESTRICT tiledp; 1337 OPJ_UINT32 min_j; 1338 OPJ_UINT32 max_j; 1339} opj_dwd_decode_h_job_t; 1340 1341static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) 1342{ 1343 OPJ_UINT32 j; 1344 opj_dwd_decode_h_job_t* job; 1345 (void)tls; 1346 1347 job = (opj_dwd_decode_h_job_t*)user_data; 1348 for (j = job->min_j; j < job->max_j; j++) { 1349 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); 1350 } 1351 1352 opj_aligned_free(job->h.mem); 1353 opj_free(job); 1354} 1355 1356typedef struct { 1357 opj_dwt_t v; 1358 OPJ_UINT32 rh; 1359 OPJ_UINT32 w; 1360 OPJ_INT32 * OPJ_RESTRICT tiledp; 1361 OPJ_UINT32 min_j; 1362 OPJ_UINT32 max_j; 1363} opj_dwd_decode_v_job_t; 1364 1365static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) 1366{ 1367 OPJ_UINT32 j; 1368 opj_dwd_decode_v_job_t* job; 1369 (void)tls; 1370 1371 job = (opj_dwd_decode_v_job_t*)user_data; 1372 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; 1373 j += PARALLEL_COLS_53) { 1374 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, 1375 PARALLEL_COLS_53); 1376 } 1377 if (j < job->max_j) 1378 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, 1379 (OPJ_INT32)(job->max_j - j)); 1380 1381 opj_aligned_free(job->v.mem); 1382 opj_free(job); 1383} 1384 1385 1386/* <summary> */ 1387/* Inverse wavelet transform in 2-D. */ 1388/* </summary> */ 1389static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, 1390 const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) 1391{ 1392 opj_dwt_t h; 1393 opj_dwt_t v; 1394 1395 opj_tcd_resolution_t* tr = tilec->resolutions; 1396 1397 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 1398 tr->x0); /* width of the resolution level computed */ 1399 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 1400 tr->y0); /* height of the resolution level computed */ 1401 1402 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - 1403 1].x1 - 1404 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); 1405 OPJ_SIZE_T h_mem_size; 1406 int num_threads; 1407 1408 if (numres == 1U) { 1409 return OPJ_TRUE; 1410 } 1411 num_threads = opj_thread_pool_get_thread_count(tp); 1412 h.mem_count = opj_dwt_max_resolution(tr, numres); 1413 /* overflow check */ 1414 if (h.mem_count > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) { 1415 /* FIXME event manager error callback */ 1416 return OPJ_FALSE; 1417 } 1418 /* We need PARALLEL_COLS_53 times the height of the array, */ 1419 /* since for the vertical pass */ 1420 /* we process PARALLEL_COLS_53 columns at a time */ 1421 h_mem_size = h.mem_count * PARALLEL_COLS_53 * sizeof(OPJ_INT32); 1422 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1423 if (! h.mem) { 1424 /* FIXME event manager error callback */ 1425 return OPJ_FALSE; 1426 } 1427 1428 v.mem_count = h.mem_count; 1429 v.mem = h.mem; 1430 1431 while (--numres) { 1432 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; 1433 OPJ_UINT32 j; 1434 1435 ++tr; 1436 h.sn = (OPJ_INT32)rw; 1437 v.sn = (OPJ_INT32)rh; 1438 1439 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 1440 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 1441 1442 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 1443 h.cas = tr->x0 % 2; 1444 1445 if (num_threads <= 1 || rh <= 1) { 1446 for (j = 0; j < rh; ++j) { 1447 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]); 1448 } 1449 } else { 1450 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; 1451 OPJ_UINT32 step_j; 1452 1453 if (rh < num_jobs) { 1454 num_jobs = rh; 1455 } 1456 step_j = (rh / num_jobs); 1457 1458 for (j = 0; j < num_jobs; j++) { 1459 opj_dwd_decode_h_job_t* job; 1460 1461 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t)); 1462 if (!job) { 1463 /* It would be nice to fallback to single thread case, but */ 1464 /* unfortunately some jobs may be launched and have modified */ 1465 /* tiledp, so it is not practical to recover from that error */ 1466 /* FIXME event manager error callback */ 1467 opj_thread_pool_wait_completion(tp, 0); 1468 opj_aligned_free(h.mem); 1469 return OPJ_FALSE; 1470 } 1471 job->h = h; 1472 job->rw = rw; 1473 job->w = w; 1474 job->tiledp = tiledp; 1475 job->min_j = j * step_j; 1476 job->max_j = (j + 1U) * step_j; /* this can overflow */ 1477 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ 1478 job->max_j = rh; 1479 } 1480 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1481 if (!job->h.mem) { 1482 /* FIXME event manager error callback */ 1483 opj_thread_pool_wait_completion(tp, 0); 1484 opj_free(job); 1485 opj_aligned_free(h.mem); 1486 return OPJ_FALSE; 1487 } 1488 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job); 1489 } 1490 opj_thread_pool_wait_completion(tp, 0); 1491 } 1492 1493 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 1494 v.cas = tr->y0 % 2; 1495 1496 if (num_threads <= 1 || rw <= 1) { 1497 for (j = 0; j + PARALLEL_COLS_53 <= rw; 1498 j += PARALLEL_COLS_53) { 1499 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53); 1500 } 1501 if (j < rw) { 1502 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j)); 1503 } 1504 } else { 1505 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; 1506 OPJ_UINT32 step_j; 1507 1508 if (rw < num_jobs) { 1509 num_jobs = rw; 1510 } 1511 step_j = (rw / num_jobs); 1512 1513 for (j = 0; j < num_jobs; j++) { 1514 opj_dwd_decode_v_job_t* job; 1515 1516 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t)); 1517 if (!job) { 1518 /* It would be nice to fallback to single thread case, but */ 1519 /* unfortunately some jobs may be launched and have modified */ 1520 /* tiledp, so it is not practical to recover from that error */ 1521 /* FIXME event manager error callback */ 1522 opj_thread_pool_wait_completion(tp, 0); 1523 opj_aligned_free(v.mem); 1524 return OPJ_FALSE; 1525 } 1526 job->v = v; 1527 job->rh = rh; 1528 job->w = w; 1529 job->tiledp = tiledp; 1530 job->min_j = j * step_j; 1531 job->max_j = (j + 1U) * step_j; /* this can overflow */ 1532 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ 1533 job->max_j = rw; 1534 } 1535 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1536 if (!job->v.mem) { 1537 /* FIXME event manager error callback */ 1538 opj_thread_pool_wait_completion(tp, 0); 1539 opj_free(job); 1540 opj_aligned_free(v.mem); 1541 return OPJ_FALSE; 1542 } 1543 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job); 1544 } 1545 opj_thread_pool_wait_completion(tp, 0); 1546 } 1547 } 1548 opj_aligned_free(h.mem); 1549 return OPJ_TRUE; 1550} 1551 1552static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest, 1553 OPJ_INT32 cas, 1554 opj_sparse_array_int32_t* sa, 1555 OPJ_UINT32 sa_line, 1556 OPJ_UINT32 sn, 1557 OPJ_UINT32 win_l_x0, 1558 OPJ_UINT32 win_l_x1, 1559 OPJ_UINT32 win_h_x0, 1560 OPJ_UINT32 win_h_x1) 1561{ 1562 OPJ_BOOL ret; 1563 ret = opj_sparse_array_int32_read(sa, 1564 win_l_x0, sa_line, 1565 win_l_x1, sa_line + 1, 1566 dest + cas + 2 * win_l_x0, 1567 2, 0, OPJ_TRUE); 1568 assert(ret); 1569 ret = opj_sparse_array_int32_read(sa, 1570 sn + win_h_x0, sa_line, 1571 sn + win_h_x1, sa_line + 1, 1572 dest + 1 - cas + 2 * win_h_x0, 1573 2, 0, OPJ_TRUE); 1574 assert(ret); 1575 OPJ_UNUSED(ret); 1576} 1577 1578 1579static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest, 1580 OPJ_INT32 cas, 1581 opj_sparse_array_int32_t* sa, 1582 OPJ_UINT32 sa_col, 1583 OPJ_UINT32 nb_cols, 1584 OPJ_UINT32 sn, 1585 OPJ_UINT32 win_l_y0, 1586 OPJ_UINT32 win_l_y1, 1587 OPJ_UINT32 win_h_y0, 1588 OPJ_UINT32 win_h_y1) 1589{ 1590 OPJ_BOOL ret; 1591 ret = opj_sparse_array_int32_read(sa, 1592 sa_col, win_l_y0, 1593 sa_col + nb_cols, win_l_y1, 1594 dest + cas * 4 + 2 * 4 * win_l_y0, 1595 1, 2 * 4, OPJ_TRUE); 1596 assert(ret); 1597 ret = opj_sparse_array_int32_read(sa, 1598 sa_col, sn + win_h_y0, 1599 sa_col + nb_cols, sn + win_h_y1, 1600 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0, 1601 1, 2 * 4, OPJ_TRUE); 1602 assert(ret); 1603 OPJ_UNUSED(ret); 1604} 1605 1606static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, 1607 OPJ_INT32 dn, OPJ_INT32 sn, 1608 OPJ_INT32 cas, 1609 OPJ_INT32 win_l_x0, 1610 OPJ_INT32 win_l_x1, 1611 OPJ_INT32 win_h_x0, 1612 OPJ_INT32 win_h_x1) 1613{ 1614 OPJ_INT32 i; 1615 1616 if (!cas) { 1617 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1618 1619 /* Naive version is : 1620 for (i = win_l_x0; i < i_max; i++) { 1621 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1622 } 1623 for (i = win_h_x0; i < win_h_x1; i++) { 1624 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1625 } 1626 but the compiler doesn't manage to unroll it to avoid bound 1627 checking in OPJ_S_ and OPJ_D_ macros 1628 */ 1629 1630 i = win_l_x0; 1631 if (i < win_l_x1) { 1632 OPJ_INT32 i_max; 1633 1634 /* Left-most case */ 1635 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1636 i ++; 1637 1638 i_max = win_l_x1; 1639 if (i_max > dn) { 1640 i_max = dn; 1641 } 1642 for (; i < i_max; i++) { 1643 /* No bound checking */ 1644 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; 1645 } 1646 for (; i < win_l_x1; i++) { 1647 /* Right-most case */ 1648 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1649 } 1650 } 1651 1652 i = win_h_x0; 1653 if (i < win_h_x1) { 1654 OPJ_INT32 i_max = win_h_x1; 1655 if (i_max >= sn) { 1656 i_max = sn - 1; 1657 } 1658 for (; i < i_max; i++) { 1659 /* No bound checking */ 1660 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1; 1661 } 1662 for (; i < win_h_x1; i++) { 1663 /* Right-most case */ 1664 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1665 } 1666 } 1667 } 1668 } else { 1669 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 1670 OPJ_S(0) /= 2; 1671 } else { 1672 for (i = win_l_x0; i < win_l_x1; i++) { 1673 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; 1674 } 1675 for (i = win_h_x0; i < win_h_x1; i++) { 1676 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; 1677 } 1678 } 1679 } 1680} 1681 1682#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off] 1683#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off] 1684#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off))) 1685#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off))) 1686#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off))) 1687#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off))) 1688 1689static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a, 1690 OPJ_UINT32 nb_cols, 1691 OPJ_INT32 dn, OPJ_INT32 sn, 1692 OPJ_INT32 cas, 1693 OPJ_INT32 win_l_x0, 1694 OPJ_INT32 win_l_x1, 1695 OPJ_INT32 win_h_x0, 1696 OPJ_INT32 win_h_x1) 1697{ 1698 OPJ_INT32 i; 1699 OPJ_UINT32 off; 1700 1701 (void)nb_cols; 1702 1703 if (!cas) { 1704 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ 1705 1706 /* Naive version is : 1707 for (i = win_l_x0; i < i_max; i++) { 1708 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; 1709 } 1710 for (i = win_h_x0; i < win_h_x1; i++) { 1711 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; 1712 } 1713 but the compiler doesn't manage to unroll it to avoid bound 1714 checking in OPJ_S_ and OPJ_D_ macros 1715 */ 1716 1717 i = win_l_x0; 1718 if (i < win_l_x1) { 1719 OPJ_INT32 i_max; 1720 1721 /* Left-most case */ 1722 for (off = 0; off < 4; off++) { 1723 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; 1724 } 1725 i ++; 1726 1727 i_max = win_l_x1; 1728 if (i_max > dn) { 1729 i_max = dn; 1730 } 1731 1732#ifdef __SSE2__ 1733 if (i + 1 < i_max) { 1734 const __m128i two = _mm_set1_epi32(2); 1735 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8)); 1736 for (; i + 1 < i_max; i += 2) { 1737 /* No bound checking */ 1738 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); 1739 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); 1740 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); 1741 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); 1742 S = _mm_sub_epi32(S, 1743 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2)); 1744 S1 = _mm_sub_epi32(S1, 1745 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2)); 1746 _mm_store_si128((__m128i*)(a + i * 8), S); 1747 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1); 1748 Dm1 = D1; 1749 } 1750 } 1751#endif 1752 1753 for (; i < i_max; i++) { 1754 /* No bound checking */ 1755 for (off = 0; off < 4; off++) { 1756 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2; 1757 } 1758 } 1759 for (; i < win_l_x1; i++) { 1760 /* Right-most case */ 1761 for (off = 0; off < 4; off++) { 1762 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; 1763 } 1764 } 1765 } 1766 1767 i = win_h_x0; 1768 if (i < win_h_x1) { 1769 OPJ_INT32 i_max = win_h_x1; 1770 if (i_max >= sn) { 1771 i_max = sn - 1; 1772 } 1773 1774#ifdef __SSE2__ 1775 if (i + 1 < i_max) { 1776 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); 1777 for (; i + 1 < i_max; i += 2) { 1778 /* No bound checking */ 1779 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); 1780 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); 1781 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); 1782 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8)); 1783 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1)); 1784 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1)); 1785 _mm_store_si128((__m128i*)(a + 4 + i * 8), D); 1786 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1); 1787 S = S2; 1788 } 1789 } 1790#endif 1791 1792 for (; i < i_max; i++) { 1793 /* No bound checking */ 1794 for (off = 0; off < 4; off++) { 1795 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1; 1796 } 1797 } 1798 for (; i < win_h_x1; i++) { 1799 /* Right-most case */ 1800 for (off = 0; off < 4; off++) { 1801 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1; 1802 } 1803 } 1804 } 1805 } 1806 } else { 1807 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ 1808 for (off = 0; off < 4; off++) { 1809 OPJ_S_off(0, off) /= 2; 1810 } 1811 } else { 1812 for (i = win_l_x0; i < win_l_x1; i++) { 1813 for (off = 0; off < 4; off++) { 1814 OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2; 1815 } 1816 } 1817 for (i = win_h_x0; i < win_h_x1; i++) { 1818 for (off = 0; off < 4; off++) { 1819 OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1; 1820 } 1821 } 1822 } 1823 } 1824} 1825 1826static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec, 1827 OPJ_UINT32 resno, 1828 OPJ_UINT32 bandno, 1829 OPJ_UINT32 tcx0, 1830 OPJ_UINT32 tcy0, 1831 OPJ_UINT32 tcx1, 1832 OPJ_UINT32 tcy1, 1833 OPJ_UINT32* tbx0, 1834 OPJ_UINT32* tby0, 1835 OPJ_UINT32* tbx1, 1836 OPJ_UINT32* tby1) 1837{ 1838 /* Compute number of decomposition for this band. See table F-1 */ 1839 OPJ_UINT32 nb = (resno == 0) ? 1840 tilec->numresolutions - 1 : 1841 tilec->numresolutions - resno; 1842 /* Map above tile-based coordinates to sub-band-based coordinates per */ 1843 /* equation B-15 of the standard */ 1844 OPJ_UINT32 x0b = bandno & 1; 1845 OPJ_UINT32 y0b = bandno >> 1; 1846 if (tbx0) { 1847 *tbx0 = (nb == 0) ? tcx0 : 1848 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 : 1849 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb); 1850 } 1851 if (tby0) { 1852 *tby0 = (nb == 0) ? tcy0 : 1853 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 : 1854 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb); 1855 } 1856 if (tbx1) { 1857 *tbx1 = (nb == 0) ? tcx1 : 1858 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 : 1859 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb); 1860 } 1861 if (tby1) { 1862 *tby1 = (nb == 0) ? tcy1 : 1863 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 : 1864 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb); 1865 } 1866} 1867 1868static void opj_dwt_segment_grow(OPJ_UINT32 filter_width, 1869 OPJ_UINT32 max_size, 1870 OPJ_UINT32* start, 1871 OPJ_UINT32* end) 1872{ 1873 *start = opj_uint_subs(*start, filter_width); 1874 *end = opj_uint_adds(*end, filter_width); 1875 *end = opj_uint_min(*end, max_size); 1876} 1877 1878 1879static opj_sparse_array_int32_t* opj_dwt_init_sparse_array( 1880 opj_tcd_tilecomp_t* tilec, 1881 OPJ_UINT32 numres) 1882{ 1883 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 1884 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0); 1885 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0); 1886 OPJ_UINT32 resno, bandno, precno, cblkno; 1887 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create( 1888 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64)); 1889 if (sa == NULL) { 1890 return NULL; 1891 } 1892 1893 for (resno = 0; resno < numres; ++resno) { 1894 opj_tcd_resolution_t* res = &tilec->resolutions[resno]; 1895 1896 for (bandno = 0; bandno < res->numbands; ++bandno) { 1897 opj_tcd_band_t* band = &res->bands[bandno]; 1898 1899 for (precno = 0; precno < res->pw * res->ph; ++precno) { 1900 opj_tcd_precinct_t* precinct = &band->precincts[precno]; 1901 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { 1902 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; 1903 if (cblk->decoded_data != NULL) { 1904 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0); 1905 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0); 1906 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); 1907 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); 1908 1909 if (band->bandno & 1) { 1910 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; 1911 x += (OPJ_UINT32)(pres->x1 - pres->x0); 1912 } 1913 if (band->bandno & 2) { 1914 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; 1915 y += (OPJ_UINT32)(pres->y1 - pres->y0); 1916 } 1917 1918 if (!opj_sparse_array_int32_write(sa, x, y, 1919 x + cblk_w, y + cblk_h, 1920 cblk->decoded_data, 1921 1, cblk_w, OPJ_TRUE)) { 1922 opj_sparse_array_int32_free(sa); 1923 return NULL; 1924 } 1925 } 1926 } 1927 } 1928 } 1929 } 1930 1931 return sa; 1932} 1933 1934 1935static OPJ_BOOL opj_dwt_decode_partial_tile( 1936 opj_tcd_tilecomp_t* tilec, 1937 OPJ_UINT32 numres) 1938{ 1939 opj_sparse_array_int32_t* sa; 1940 opj_dwt_t h; 1941 opj_dwt_t v; 1942 OPJ_UINT32 resno; 1943 /* This value matches the maximum left/right extension given in tables */ 1944 /* F.2 and F.3 of the standard. */ 1945 const OPJ_UINT32 filter_width = 2U; 1946 1947 opj_tcd_resolution_t* tr = tilec->resolutions; 1948 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 1949 1950 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 1951 tr->x0); /* width of the resolution level computed */ 1952 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 1953 tr->y0); /* height of the resolution level computed */ 1954 1955 OPJ_SIZE_T h_mem_size; 1956 1957 /* Compute the intersection of the area of interest, expressed in tile coordinates */ 1958 /* with the tile coordinates */ 1959 OPJ_UINT32 win_tcx0 = tilec->win_x0; 1960 OPJ_UINT32 win_tcy0 = tilec->win_y0; 1961 OPJ_UINT32 win_tcx1 = tilec->win_x1; 1962 OPJ_UINT32 win_tcy1 = tilec->win_y1; 1963 1964 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { 1965 return OPJ_TRUE; 1966 } 1967 1968 sa = opj_dwt_init_sparse_array(tilec, numres); 1969 if (sa == NULL) { 1970 return OPJ_FALSE; 1971 } 1972 1973 if (numres == 1U) { 1974 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 1975 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 1976 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 1977 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 1978 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 1979 tilec->data_win, 1980 1, tr_max->win_x1 - tr_max->win_x0, 1981 OPJ_TRUE); 1982 assert(ret); 1983 OPJ_UNUSED(ret); 1984 opj_sparse_array_int32_free(sa); 1985 return OPJ_TRUE; 1986 } 1987 h.mem_count = opj_dwt_max_resolution(tr, numres); 1988 /* overflow check */ 1989 /* in vertical pass, we process 4 columns at a time */ 1990 if (h.mem_count > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) { 1991 /* FIXME event manager error callback */ 1992 opj_sparse_array_int32_free(sa); 1993 return OPJ_FALSE; 1994 } 1995 1996 h_mem_size = h.mem_count * 4 * sizeof(OPJ_INT32); 1997 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); 1998 if (! h.mem) { 1999 /* FIXME event manager error callback */ 2000 opj_sparse_array_int32_free(sa); 2001 return OPJ_FALSE; 2002 } 2003 2004 v.mem_count = h.mem_count; 2005 v.mem = h.mem; 2006 2007 for (resno = 1; resno < numres; resno ++) { 2008 OPJ_UINT32 i, j; 2009 /* Window of interest subband-based coordinates */ 2010 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; 2011 OPJ_UINT32 win_hl_x0, win_hl_x1; 2012 OPJ_UINT32 win_lh_y0, win_lh_y1; 2013 /* Window of interest tile-resolution-based coordinates */ 2014 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; 2015 /* Tile-resolution subband-based coordinates */ 2016 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; 2017 2018 ++tr; 2019 2020 h.sn = (OPJ_INT32)rw; 2021 v.sn = (OPJ_INT32)rh; 2022 2023 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 2024 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 2025 2026 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2027 h.cas = tr->x0 % 2; 2028 2029 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2030 v.cas = tr->y0 % 2; 2031 2032 /* Get the subband coordinates for the window of interest */ 2033 /* LL band */ 2034 opj_dwt_get_band_coordinates(tilec, resno, 0, 2035 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2036 &win_ll_x0, &win_ll_y0, 2037 &win_ll_x1, &win_ll_y1); 2038 2039 /* HL band */ 2040 opj_dwt_get_band_coordinates(tilec, resno, 1, 2041 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2042 &win_hl_x0, NULL, &win_hl_x1, NULL); 2043 2044 /* LH band */ 2045 opj_dwt_get_band_coordinates(tilec, resno, 2, 2046 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2047 NULL, &win_lh_y0, NULL, &win_lh_y1); 2048 2049 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ 2050 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; 2051 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; 2052 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; 2053 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; 2054 2055 /* Substract the origin of the bands for this tile, to the subwindow */ 2056 /* of interest band coordinates, so as to get them relative to the */ 2057 /* tile */ 2058 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); 2059 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); 2060 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); 2061 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); 2062 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); 2063 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); 2064 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); 2065 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); 2066 2067 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); 2068 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); 2069 2070 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); 2071 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); 2072 2073 /* Compute the tile-resolution-based coordinates for the window of interest */ 2074 if (h.cas == 0) { 2075 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); 2076 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); 2077 } else { 2078 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); 2079 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); 2080 } 2081 2082 if (v.cas == 0) { 2083 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); 2084 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); 2085 } else { 2086 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); 2087 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); 2088 } 2089 2090 for (j = 0; j < rh; ++j) { 2091 if ((j >= win_ll_y0 && j < win_ll_y1) || 2092 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { 2093 2094 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */ 2095 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */ 2096 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */ 2097 /* This is less extreme than memsetting the whole buffer to 0 */ 2098 /* although we could potentially do better with better handling of edge conditions */ 2099 if (win_tr_x1 >= 1 && win_tr_x1 < rw) { 2100 h.mem[win_tr_x1 - 1] = 0; 2101 } 2102 if (win_tr_x1 < rw) { 2103 h.mem[win_tr_x1] = 0; 2104 } 2105 2106 opj_dwt_interleave_partial_h(h.mem, 2107 h.cas, 2108 sa, 2109 j, 2110 (OPJ_UINT32)h.sn, 2111 win_ll_x0, 2112 win_ll_x1, 2113 win_hl_x0, 2114 win_hl_x1); 2115 opj_dwt_decode_partial_1(h.mem, h.mem_count, h.dn, h.sn, h.cas, 2116 (OPJ_INT32)win_ll_x0, 2117 (OPJ_INT32)win_ll_x1, 2118 (OPJ_INT32)win_hl_x0, 2119 (OPJ_INT32)win_hl_x1); 2120 if (!opj_sparse_array_int32_write(sa, 2121 win_tr_x0, j, 2122 win_tr_x1, j + 1, 2123 h.mem + win_tr_x0, 2124 1, 0, OPJ_TRUE)) { 2125 /* FIXME event manager error callback */ 2126 opj_sparse_array_int32_free(sa); 2127 opj_aligned_free(h.mem); 2128 return OPJ_FALSE; 2129 } 2130 } 2131 } 2132 2133 for (i = win_tr_x0; i < win_tr_x1;) { 2134 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i); 2135 opj_dwt_interleave_partial_v(v.mem, 2136 v.cas, 2137 sa, 2138 i, 2139 nb_cols, 2140 (OPJ_UINT32)v.sn, 2141 win_ll_y0, 2142 win_ll_y1, 2143 win_lh_y0, 2144 win_lh_y1); 2145 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas, 2146 (OPJ_INT32)win_ll_y0, 2147 (OPJ_INT32)win_ll_y1, 2148 (OPJ_INT32)win_lh_y0, 2149 (OPJ_INT32)win_lh_y1); 2150 if (!opj_sparse_array_int32_write(sa, 2151 i, win_tr_y0, 2152 i + nb_cols, win_tr_y1, 2153 v.mem + 4 * win_tr_y0, 2154 1, 4, OPJ_TRUE)) { 2155 /* FIXME event manager error callback */ 2156 opj_sparse_array_int32_free(sa); 2157 opj_aligned_free(h.mem); 2158 return OPJ_FALSE; 2159 } 2160 2161 i += nb_cols; 2162 } 2163 } 2164 opj_aligned_free(h.mem); 2165 2166 { 2167 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2168 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2169 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2170 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2171 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2172 tilec->data_win, 2173 1, tr_max->win_x1 - tr_max->win_x0, 2174 OPJ_TRUE); 2175 assert(ret); 2176 OPJ_UNUSED(ret); 2177 } 2178 opj_sparse_array_int32_free(sa); 2179 return OPJ_TRUE; 2180} 2181 2182static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt, 2183 OPJ_FLOAT32* OPJ_RESTRICT a, 2184 OPJ_UINT32 width, 2185 OPJ_UINT32 remaining_height) 2186{ 2187 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas); 2188 OPJ_UINT32 i, k; 2189 OPJ_UINT32 x0 = dwt->win_l_x0; 2190 OPJ_UINT32 x1 = dwt->win_l_x1; 2191 2192 for (k = 0; k < 2; ++k) { 2193 if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 && 2194 ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) { 2195 /* Fast code path */ 2196 for (i = x0; i < x1; ++i) { 2197 OPJ_UINT32 j = i; 2198 bi[i * 8 ] = a[j]; 2199 j += width; 2200 bi[i * 8 + 1] = a[j]; 2201 j += width; 2202 bi[i * 8 + 2] = a[j]; 2203 j += width; 2204 bi[i * 8 + 3] = a[j]; 2205 } 2206 } else { 2207 /* Slow code path */ 2208 for (i = x0; i < x1; ++i) { 2209 OPJ_UINT32 j = i; 2210 bi[i * 8 ] = a[j]; 2211 j += width; 2212 if (remaining_height == 1) { 2213 continue; 2214 } 2215 bi[i * 8 + 1] = a[j]; 2216 j += width; 2217 if (remaining_height == 2) { 2218 continue; 2219 } 2220 bi[i * 8 + 2] = a[j]; 2221 j += width; 2222 if (remaining_height == 3) { 2223 continue; 2224 } 2225 bi[i * 8 + 3] = a[j]; /* This one*/ 2226 } 2227 } 2228 2229 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas); 2230 a += dwt->sn; 2231 x0 = dwt->win_h_x0; 2232 x1 = dwt->win_h_x1; 2233 } 2234} 2235 2236static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt, 2237 opj_sparse_array_int32_t* sa, 2238 OPJ_UINT32 sa_line, 2239 OPJ_UINT32 remaining_height) 2240{ 2241 OPJ_UINT32 i; 2242 for (i = 0; i < remaining_height; i++) { 2243 OPJ_BOOL ret; 2244 ret = opj_sparse_array_int32_read(sa, 2245 dwt->win_l_x0, sa_line + i, 2246 dwt->win_l_x1, sa_line + i + 1, 2247 /* Nasty cast from float* to int32* */ 2248 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, 2249 8, 0, OPJ_TRUE); 2250 assert(ret); 2251 ret = opj_sparse_array_int32_read(sa, 2252 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, 2253 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, 2254 /* Nasty cast from float* to int32* */ 2255 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, 2256 8, 0, OPJ_TRUE); 2257 assert(ret); 2258 OPJ_UNUSED(ret); 2259 } 2260} 2261 2262static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 2263 OPJ_FLOAT32* OPJ_RESTRICT a, 2264 OPJ_UINT32 width, 2265 OPJ_UINT32 nb_elts_read) 2266{ 2267 opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; 2268 OPJ_UINT32 i; 2269 2270 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { 2271 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], 2272 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); 2273 } 2274 2275 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width; 2276 bi = dwt->wavelet + 1 - dwt->cas; 2277 2278 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) { 2279 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], 2280 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); 2281 } 2282} 2283 2284static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt, 2285 opj_sparse_array_int32_t* sa, 2286 OPJ_UINT32 sa_col, 2287 OPJ_UINT32 nb_elts_read) 2288{ 2289 OPJ_BOOL ret; 2290 ret = opj_sparse_array_int32_read(sa, 2291 sa_col, dwt->win_l_x0, 2292 sa_col + nb_elts_read, dwt->win_l_x1, 2293 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), 2294 1, 8, OPJ_TRUE); 2295 assert(ret); 2296 ret = opj_sparse_array_int32_read(sa, 2297 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, 2298 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, 2299 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), 2300 1, 8, OPJ_TRUE); 2301 assert(ret); 2302 OPJ_UNUSED(ret); 2303} 2304 2305#ifdef __SSE__ 2306 2307static void opj_v4dwt_decode_step1_sse(opj_v4_t* w, 2308 OPJ_UINT32 start, 2309 OPJ_UINT32 end, 2310 const __m128 c) 2311{ 2312 __m128* OPJ_RESTRICT vw = (__m128*) w; 2313 OPJ_UINT32 i; 2314 /* 4x unrolled loop */ 2315 vw += 2 * start; 2316 for (i = start; i + 3 < end; i += 4, vw += 8) { 2317 __m128 xmm0 = _mm_mul_ps(vw[0], c); 2318 __m128 xmm2 = _mm_mul_ps(vw[2], c); 2319 __m128 xmm4 = _mm_mul_ps(vw[4], c); 2320 __m128 xmm6 = _mm_mul_ps(vw[6], c); 2321 vw[0] = xmm0; 2322 vw[2] = xmm2; 2323 vw[4] = xmm4; 2324 vw[6] = xmm6; 2325 } 2326 for (; i < end; ++i, vw += 2) { 2327 vw[0] = _mm_mul_ps(vw[0], c); 2328 } 2329} 2330 2331static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w, 2332 OPJ_UINT32 start, 2333 OPJ_UINT32 end, 2334 OPJ_UINT32 m, 2335 __m128 c) 2336{ 2337 __m128* OPJ_RESTRICT vl = (__m128*) l; 2338 __m128* OPJ_RESTRICT vw = (__m128*) w; 2339 OPJ_UINT32 i; 2340 OPJ_UINT32 imax = opj_uint_min(end, m); 2341 __m128 tmp1, tmp2, tmp3; 2342 if (start == 0) { 2343 tmp1 = vl[0]; 2344 } else { 2345 vw += start * 2; 2346 tmp1 = vw[-3]; 2347 } 2348 2349 i = start; 2350 2351 /* 4x loop unrolling */ 2352 for (; i + 3 < imax; i += 4) { 2353 __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9; 2354 tmp2 = vw[-1]; 2355 tmp3 = vw[ 0]; 2356 tmp4 = vw[ 1]; 2357 tmp5 = vw[ 2]; 2358 tmp6 = vw[ 3]; 2359 tmp7 = vw[ 4]; 2360 tmp8 = vw[ 5]; 2361 tmp9 = vw[ 6]; 2362 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); 2363 vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c)); 2364 vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c)); 2365 vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c)); 2366 tmp1 = tmp9; 2367 vw += 8; 2368 } 2369 2370 for (; i < imax; ++i) { 2371 tmp2 = vw[-1]; 2372 tmp3 = vw[ 0]; 2373 vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c)); 2374 tmp1 = tmp3; 2375 vw += 2; 2376 } 2377 if (m < end) { 2378 assert(m + 1 == end); 2379 c = _mm_add_ps(c, c); 2380 c = _mm_mul_ps(c, vw[-2]); 2381 vw[-1] = _mm_add_ps(vw[-1], c); 2382 } 2383} 2384 2385#else 2386 2387static void opj_v4dwt_decode_step1(opj_v4_t* w, 2388 OPJ_UINT32 start, 2389 OPJ_UINT32 end, 2390 const OPJ_FLOAT32 c) 2391{ 2392 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; 2393 OPJ_UINT32 i; 2394 for (i = start; i < end; ++i) { 2395 OPJ_FLOAT32 tmp1 = fw[i * 8 ]; 2396 OPJ_FLOAT32 tmp2 = fw[i * 8 + 1]; 2397 OPJ_FLOAT32 tmp3 = fw[i * 8 + 2]; 2398 OPJ_FLOAT32 tmp4 = fw[i * 8 + 3]; 2399 fw[i * 8 ] = tmp1 * c; 2400 fw[i * 8 + 1] = tmp2 * c; 2401 fw[i * 8 + 2] = tmp3 * c; 2402 fw[i * 8 + 3] = tmp4 * c; 2403 } 2404} 2405 2406static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w, 2407 OPJ_UINT32 start, 2408 OPJ_UINT32 end, 2409 OPJ_UINT32 m, 2410 OPJ_FLOAT32 c) 2411{ 2412 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; 2413 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; 2414 OPJ_UINT32 i; 2415 OPJ_UINT32 imax = opj_uint_min(end, m); 2416 if (start > 0) { 2417 fw += 8 * start; 2418 fl = fw - 8; 2419 } 2420 for (i = start; i < imax; ++i) { 2421 OPJ_FLOAT32 tmp1_1 = fl[0]; 2422 OPJ_FLOAT32 tmp1_2 = fl[1]; 2423 OPJ_FLOAT32 tmp1_3 = fl[2]; 2424 OPJ_FLOAT32 tmp1_4 = fl[3]; 2425 OPJ_FLOAT32 tmp2_1 = fw[-4]; 2426 OPJ_FLOAT32 tmp2_2 = fw[-3]; 2427 OPJ_FLOAT32 tmp2_3 = fw[-2]; 2428 OPJ_FLOAT32 tmp2_4 = fw[-1]; 2429 OPJ_FLOAT32 tmp3_1 = fw[0]; 2430 OPJ_FLOAT32 tmp3_2 = fw[1]; 2431 OPJ_FLOAT32 tmp3_3 = fw[2]; 2432 OPJ_FLOAT32 tmp3_4 = fw[3]; 2433 fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c); 2434 fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c); 2435 fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c); 2436 fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c); 2437 fl = fw; 2438 fw += 8; 2439 } 2440 if (m < end) { 2441 assert(m + 1 == end); 2442 c += c; 2443 fw[-4] = fw[-4] + fl[0] * c; 2444 fw[-3] = fw[-3] + fl[1] * c; 2445 fw[-2] = fw[-2] + fl[2] * c; 2446 fw[-1] = fw[-1] + fl[3] * c; 2447 } 2448} 2449 2450#endif 2451 2452/* <summary> */ 2453/* Inverse 9-7 wavelet transform in 1-D. */ 2454/* </summary> */ 2455static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt) 2456{ 2457 OPJ_INT32 a, b; 2458 if (dwt->cas == 0) { 2459 if (!((dwt->dn > 0) || (dwt->sn > 1))) { 2460 return; 2461 } 2462 a = 0; 2463 b = 1; 2464 } else { 2465 if (!((dwt->sn > 0) || (dwt->dn > 1))) { 2466 return; 2467 } 2468 a = 1; 2469 b = 0; 2470 } 2471#ifdef __SSE__ 2472 opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, 2473 _mm_set1_ps(opj_K)); 2474 opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, 2475 _mm_set1_ps(opj_c13318)); 2476 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, 2477 dwt->win_l_x0, dwt->win_l_x1, 2478 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2479 _mm_set1_ps(opj_dwt_delta)); 2480 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, 2481 dwt->win_h_x0, dwt->win_h_x1, 2482 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2483 _mm_set1_ps(opj_dwt_gamma)); 2484 opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, 2485 dwt->win_l_x0, dwt->win_l_x1, 2486 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2487 _mm_set1_ps(opj_dwt_beta)); 2488 opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, 2489 dwt->win_h_x0, dwt->win_h_x1, 2490 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2491 _mm_set1_ps(opj_dwt_alpha)); 2492#else 2493 opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, 2494 opj_K); 2495 opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, 2496 opj_c13318); 2497 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, 2498 dwt->win_l_x0, dwt->win_l_x1, 2499 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2500 opj_dwt_delta); 2501 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, 2502 dwt->win_h_x0, dwt->win_h_x1, 2503 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2504 opj_dwt_gamma); 2505 opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, 2506 dwt->win_l_x0, dwt->win_l_x1, 2507 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), 2508 opj_dwt_beta); 2509 opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, 2510 dwt->win_h_x0, dwt->win_h_x1, 2511 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), 2512 opj_dwt_alpha); 2513#endif 2514} 2515 2516 2517/* <summary> */ 2518/* Inverse 9-7 wavelet transform in 2-D. */ 2519/* </summary> */ 2520static 2521OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2522 OPJ_UINT32 numres) 2523{ 2524 opj_v4dwt_t h; 2525 opj_v4dwt_t v; 2526 2527 opj_tcd_resolution_t* res = tilec->resolutions; 2528 2529 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 - 2530 res->x0); /* width of the resolution level computed */ 2531 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 - 2532 res->y0); /* height of the resolution level computed */ 2533 2534 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - 2535 1].x1 - 2536 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); 2537 2538 OPJ_SIZE_T l_data_size; 2539 2540 l_data_size = opj_dwt_max_resolution(res, numres); 2541 /* overflow check */ 2542 if (l_data_size > (SIZE_MAX - 5U)) { 2543 /* FIXME event manager error callback */ 2544 return OPJ_FALSE; 2545 } 2546 l_data_size += 5U; 2547 /* overflow check */ 2548 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { 2549 /* FIXME event manager error callback */ 2550 return OPJ_FALSE; 2551 } 2552 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); 2553 if (!h.wavelet) { 2554 /* FIXME event manager error callback */ 2555 return OPJ_FALSE; 2556 } 2557 v.wavelet = h.wavelet; 2558 2559 while (--numres) { 2560 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data; 2561 OPJ_UINT32 j; 2562 2563 h.sn = (OPJ_INT32)rw; 2564 v.sn = (OPJ_INT32)rh; 2565 2566 ++res; 2567 2568 rw = (OPJ_UINT32)(res->x1 - 2569 res->x0); /* width of the resolution level computed */ 2570 rh = (OPJ_UINT32)(res->y1 - 2571 res->y0); /* height of the resolution level computed */ 2572 2573 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2574 h.cas = res->x0 % 2; 2575 2576 h.win_l_x0 = 0; 2577 h.win_l_x1 = (OPJ_UINT32)h.sn; 2578 h.win_h_x0 = 0; 2579 h.win_h_x1 = (OPJ_UINT32)h.dn; 2580 for (j = 0; j + 3 < rh; j += 4) { 2581 OPJ_UINT32 k; 2582 opj_v4dwt_interleave_h(&h, aj, w, rh - j); 2583 opj_v4dwt_decode(&h); 2584 2585 for (k = 0; k < rw; k++) { 2586 aj[k ] = h.wavelet[k].f[0]; 2587 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; 2588 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; 2589 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; 2590 } 2591 2592 aj += w * 4; 2593 } 2594 2595 if (j < rh) { 2596 OPJ_UINT32 k; 2597 opj_v4dwt_interleave_h(&h, aj, w, rh - j); 2598 opj_v4dwt_decode(&h); 2599 for (k = 0; k < rw; k++) { 2600 switch (rh - j) { 2601 case 3: 2602 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; 2603 /* FALLTHRU */ 2604 case 2: 2605 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; 2606 /* FALLTHRU */ 2607 case 1: 2608 aj[k] = h.wavelet[k].f[0]; 2609 } 2610 } 2611 } 2612 2613 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2614 v.cas = res->y0 % 2; 2615 v.win_l_x0 = 0; 2616 v.win_l_x1 = (OPJ_UINT32)v.sn; 2617 v.win_h_x0 = 0; 2618 v.win_h_x1 = (OPJ_UINT32)v.dn; 2619 2620 aj = (OPJ_FLOAT32*) tilec->data; 2621 for (j = rw; j > 3; j -= 4) { 2622 OPJ_UINT32 k; 2623 2624 opj_v4dwt_interleave_v(&v, aj, w, 4); 2625 opj_v4dwt_decode(&v); 2626 2627 for (k = 0; k < rh; ++k) { 2628 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32)); 2629 } 2630 aj += 4; 2631 } 2632 2633 if (rw & 0x03) { 2634 OPJ_UINT32 k; 2635 2636 j = rw & 0x03; 2637 2638 opj_v4dwt_interleave_v(&v, aj, w, j); 2639 opj_v4dwt_decode(&v); 2640 2641 for (k = 0; k < rh; ++k) { 2642 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 2643 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32)); 2644 } 2645 } 2646 } 2647 2648 opj_aligned_free(h.wavelet); 2649 return OPJ_TRUE; 2650} 2651 2652static 2653OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2654 OPJ_UINT32 numres) 2655{ 2656 opj_sparse_array_int32_t* sa; 2657 opj_v4dwt_t h; 2658 opj_v4dwt_t v; 2659 OPJ_UINT32 resno; 2660 /* This value matches the maximum left/right extension given in tables */ 2661 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ 2662 /* we currently use 3. */ 2663 const OPJ_UINT32 filter_width = 4U; 2664 2665 opj_tcd_resolution_t* tr = tilec->resolutions; 2666 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); 2667 2668 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - 2669 tr->x0); /* width of the resolution level computed */ 2670 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - 2671 tr->y0); /* height of the resolution level computed */ 2672 2673 OPJ_SIZE_T l_data_size; 2674 2675 /* Compute the intersection of the area of interest, expressed in tile coordinates */ 2676 /* with the tile coordinates */ 2677 OPJ_UINT32 win_tcx0 = tilec->win_x0; 2678 OPJ_UINT32 win_tcy0 = tilec->win_y0; 2679 OPJ_UINT32 win_tcx1 = tilec->win_x1; 2680 OPJ_UINT32 win_tcy1 = tilec->win_y1; 2681 2682 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { 2683 return OPJ_TRUE; 2684 } 2685 2686 sa = opj_dwt_init_sparse_array(tilec, numres); 2687 if (sa == NULL) { 2688 return OPJ_FALSE; 2689 } 2690 2691 if (numres == 1U) { 2692 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2693 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2694 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2695 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2696 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2697 tilec->data_win, 2698 1, tr_max->win_x1 - tr_max->win_x0, 2699 OPJ_TRUE); 2700 assert(ret); 2701 OPJ_UNUSED(ret); 2702 opj_sparse_array_int32_free(sa); 2703 return OPJ_TRUE; 2704 } 2705 2706 l_data_size = opj_dwt_max_resolution(tr, numres); 2707 /* overflow check */ 2708 if (l_data_size > (SIZE_MAX - 5U)) { 2709 /* FIXME event manager error callback */ 2710 return OPJ_FALSE; 2711 } 2712 l_data_size += 5U; 2713 /* overflow check */ 2714 if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) { 2715 /* FIXME event manager error callback */ 2716 return OPJ_FALSE; 2717 } 2718 h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t)); 2719 if (!h.wavelet) { 2720 /* FIXME event manager error callback */ 2721 return OPJ_FALSE; 2722 } 2723 v.wavelet = h.wavelet; 2724 2725 for (resno = 1; resno < numres; resno ++) { 2726 OPJ_UINT32 j; 2727 /* Window of interest subband-based coordinates */ 2728 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; 2729 OPJ_UINT32 win_hl_x0, win_hl_x1; 2730 OPJ_UINT32 win_lh_y0, win_lh_y1; 2731 /* Window of interest tile-resolution-based coordinates */ 2732 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; 2733 /* Tile-resolution subband-based coordinates */ 2734 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; 2735 2736 ++tr; 2737 2738 h.sn = (OPJ_INT32)rw; 2739 v.sn = (OPJ_INT32)rh; 2740 2741 rw = (OPJ_UINT32)(tr->x1 - tr->x0); 2742 rh = (OPJ_UINT32)(tr->y1 - tr->y0); 2743 2744 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); 2745 h.cas = tr->x0 % 2; 2746 2747 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); 2748 v.cas = tr->y0 % 2; 2749 2750 /* Get the subband coordinates for the window of interest */ 2751 /* LL band */ 2752 opj_dwt_get_band_coordinates(tilec, resno, 0, 2753 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2754 &win_ll_x0, &win_ll_y0, 2755 &win_ll_x1, &win_ll_y1); 2756 2757 /* HL band */ 2758 opj_dwt_get_band_coordinates(tilec, resno, 1, 2759 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2760 &win_hl_x0, NULL, &win_hl_x1, NULL); 2761 2762 /* LH band */ 2763 opj_dwt_get_band_coordinates(tilec, resno, 2, 2764 win_tcx0, win_tcy0, win_tcx1, win_tcy1, 2765 NULL, &win_lh_y0, NULL, &win_lh_y1); 2766 2767 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ 2768 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; 2769 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; 2770 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; 2771 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; 2772 2773 /* Substract the origin of the bands for this tile, to the subwindow */ 2774 /* of interest band coordinates, so as to get them relative to the */ 2775 /* tile */ 2776 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); 2777 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); 2778 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); 2779 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); 2780 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); 2781 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); 2782 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); 2783 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); 2784 2785 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); 2786 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); 2787 2788 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); 2789 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); 2790 2791 /* Compute the tile-resolution-based coordinates for the window of interest */ 2792 if (h.cas == 0) { 2793 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); 2794 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); 2795 } else { 2796 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); 2797 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); 2798 } 2799 2800 if (v.cas == 0) { 2801 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); 2802 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); 2803 } else { 2804 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); 2805 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); 2806 } 2807 2808 h.win_l_x0 = win_ll_x0; 2809 h.win_l_x1 = win_ll_x1; 2810 h.win_h_x0 = win_hl_x0; 2811 h.win_h_x1 = win_hl_x1; 2812 for (j = 0; j + 3 < rh; j += 4) { 2813 if ((j + 3 >= win_ll_y0 && j < win_ll_y1) || 2814 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && 2815 j < win_lh_y1 + (OPJ_UINT32)v.sn)) { 2816 opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j)); 2817 opj_v4dwt_decode(&h); 2818 if (!opj_sparse_array_int32_write(sa, 2819 win_tr_x0, j, 2820 win_tr_x1, j + 4, 2821 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], 2822 4, 1, OPJ_TRUE)) { 2823 /* FIXME event manager error callback */ 2824 opj_sparse_array_int32_free(sa); 2825 opj_aligned_free(h.wavelet); 2826 return OPJ_FALSE; 2827 } 2828 } 2829 } 2830 2831 if (j < rh && 2832 ((j + 3 >= win_ll_y0 && j < win_ll_y1) || 2833 (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn && 2834 j < win_lh_y1 + (OPJ_UINT32)v.sn))) { 2835 opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j); 2836 opj_v4dwt_decode(&h); 2837 if (!opj_sparse_array_int32_write(sa, 2838 win_tr_x0, j, 2839 win_tr_x1, rh, 2840 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], 2841 4, 1, OPJ_TRUE)) { 2842 /* FIXME event manager error callback */ 2843 opj_sparse_array_int32_free(sa); 2844 opj_aligned_free(h.wavelet); 2845 return OPJ_FALSE; 2846 } 2847 } 2848 2849 v.win_l_x0 = win_ll_y0; 2850 v.win_l_x1 = win_ll_y1; 2851 v.win_h_x0 = win_lh_y0; 2852 v.win_h_x1 = win_lh_y1; 2853 for (j = win_tr_x0; j < win_tr_x1; j += 4) { 2854 OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j); 2855 2856 opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts); 2857 opj_v4dwt_decode(&v); 2858 2859 if (!opj_sparse_array_int32_write(sa, 2860 j, win_tr_y0, 2861 j + nb_elts, win_tr_y1, 2862 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], 2863 1, 4, OPJ_TRUE)) { 2864 /* FIXME event manager error callback */ 2865 opj_sparse_array_int32_free(sa); 2866 opj_aligned_free(h.wavelet); 2867 return OPJ_FALSE; 2868 } 2869 } 2870 } 2871 2872 { 2873 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, 2874 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, 2875 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, 2876 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, 2877 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, 2878 tilec->data_win, 2879 1, tr_max->win_x1 - tr_max->win_x0, 2880 OPJ_TRUE); 2881 assert(ret); 2882 OPJ_UNUSED(ret); 2883 } 2884 opj_sparse_array_int32_free(sa); 2885 2886 opj_aligned_free(h.wavelet); 2887 return OPJ_TRUE; 2888} 2889 2890 2891OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, 2892 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, 2893 OPJ_UINT32 numres) 2894{ 2895 if (p_tcd->whole_tile_decoding) { 2896 return opj_dwt_decode_tile_97(tilec, numres); 2897 } else { 2898 return opj_dwt_decode_partial_97(tilec, numres); 2899 } 2900} 2901