1 /*
2  * The copyright in this software is being made available under the 2-clauses
3  * BSD License, included below. This software may be subject to other third
4  * party and contributor rights, including patent rights, and no such rights
5  * are granted under this license.
6  *
7  * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium
8  * Copyright (c) 2002-2014, Professor Benoit Macq
9  * Copyright (c) 2001-2003, David Janssens
10  * Copyright (c) 2002-2003, Yannick Verschueren
11  * Copyright (c) 2003-2007, Francois-Olivier Devaux
12  * Copyright (c) 2003-2014, Antonin Descampe
13  * Copyright (c) 2005, Herve Drolon, FreeImage Team
14  * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net>
15  * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com>
16  * Copyright (c) 2017, IntoPIX SA <support@intopix.com>
17  * All rights reserved.
18  *
19  * Redistribution and use in source and binary forms, with or without
20  * modification, are permitted provided that the following conditions
21  * are met:
22  * 1. Redistributions of source code must retain the above copyright
23  *    notice, this list of conditions and the following disclaimer.
24  * 2. Redistributions in binary form must reproduce the above copyright
25  *    notice, this list of conditions and the following disclaimer in the
26  *    documentation and/or other materials provided with the distribution.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS'
29  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38  * POSSIBILITY OF SUCH DAMAGE.
39  */
40 
41 #include <assert.h>
42 
43 #define OPJ_SKIP_POISON
44 #include "opj_includes.h"
45 
46 #ifdef __SSE__
47 #include <xmmintrin.h>
48 #endif
49 #ifdef __SSE2__
50 #include <emmintrin.h>
51 #endif
52 #ifdef __SSSE3__
53 #include <tmmintrin.h>
54 #endif
55 #ifdef __AVX2__
56 #include <immintrin.h>
57 #endif
58 
59 #if defined(__GNUC__)
60 #pragma GCC poison malloc calloc realloc free
61 #endif
62 
63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */
64 /*@{*/
65 
66 #ifdef __AVX2__
67 /** Number of int32 values in a AVX2 register */
68 #define VREG_INT_COUNT       8
69 #else
70 /** Number of int32 values in a SSE2 register */
71 #define VREG_INT_COUNT       4
72 #endif
73 
74 /** Number of columns that we can process in parallel in the vertical pass */
75 #define PARALLEL_COLS_53     (2*VREG_INT_COUNT)
76 
77 /** @name Local data structures */
78 /*@{*/
79 
80 typedef struct dwt_local {
81     OPJ_INT32* mem;
82     OPJ_SIZE_T mem_count;
83     OPJ_INT32 dn;   /* number of elements in high pass band */
84     OPJ_INT32 sn;   /* number of elements in low pass band */
85     OPJ_INT32 cas;  /* 0 = start on even coord, 1 = start on odd coord */
86 } opj_dwt_t;
87 
88 typedef union {
89     OPJ_FLOAT32 f[4];
90 } opj_v4_t;
91 
92 typedef struct v4dwt_local {
93     opj_v4_t*   wavelet ;
94     OPJ_INT32       dn ;  /* number of elements in high pass band */
95     OPJ_INT32       sn ;  /* number of elements in low pass band */
96     OPJ_INT32       cas ; /* 0 = start on even coord, 1 = start on odd coord */
97     OPJ_UINT32      win_l_x0; /* start coord in low pass band */
98     OPJ_UINT32      win_l_x1; /* end coord in low pass band */
99     OPJ_UINT32      win_h_x0; /* start coord in high pass band */
100     OPJ_UINT32      win_h_x1; /* end coord in high pass band */
101 } opj_v4dwt_t ;
102 
103 static const OPJ_FLOAT32 opj_dwt_alpha =  1.586134342f; /*  12994 */
104 static const OPJ_FLOAT32 opj_dwt_beta  =  0.052980118f; /*    434 */
105 static const OPJ_FLOAT32 opj_dwt_gamma = -0.882911075f; /*  -7233 */
106 static const OPJ_FLOAT32 opj_dwt_delta = -0.443506852f; /*  -3633 */
107 
108 static const OPJ_FLOAT32 opj_K      = 1.230174105f; /*  10078 */
109 static const OPJ_FLOAT32 opj_c13318 = 1.625732422f;
110 
111 /*@}*/
112 
113 /**
114 Virtual function type for wavelet transform in 1-D
115 */
116 typedef void (*DWT1DFN)(const opj_dwt_t* v);
117 
118 /** @name Local static functions */
119 /*@{*/
120 
121 /**
122 Forward lazy transform (horizontal)
123 */
124 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
125                                    OPJ_INT32 sn, OPJ_INT32 cas);
126 /**
127 Forward lazy transform (vertical)
128 */
129 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
130                                    OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas);
131 /**
132 Forward 5-3 wavelet transform in 1-D
133 */
134 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
135     OPJ_INT32 sn, OPJ_INT32 cas);
136 /**
137 Forward 9-7 wavelet transform in 1-D
138 */
139 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count,
140     OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas);
141 /**
142 Explicit calculation of the Quantization Stepsizes
143 */
144 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
145                                     opj_stepsize_t *bandno_stepsize);
146 /**
147 Inverse wavelet transform in 2-D.
148 */
149 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
150                                     const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i);
151 
152 static OPJ_BOOL opj_dwt_decode_partial_tile(
153     opj_tcd_tilecomp_t* tilec,
154     OPJ_UINT32 numres);
155 
156 static OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,
157         void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32));
158 
159 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
160         OPJ_UINT32 i);
161 
162 /* <summary>                             */
163 /* Inverse 9-7 wavelet transform in 1-D. */
164 /* </summary>                            */
165 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt);
166 
167 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
168                                    OPJ_FLOAT32* OPJ_RESTRICT a,
169                                    OPJ_UINT32 width,
170                                    OPJ_UINT32 remaining_height);
171 
172 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
173                                    OPJ_FLOAT32* OPJ_RESTRICT a,
174                                    OPJ_UINT32 width,
175                                    OPJ_UINT32 nb_elts_read);
176 
177 #ifdef __SSE__
178 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
179                                        OPJ_UINT32 start,
180                                        OPJ_UINT32 end,
181                                        const __m128 c);
182 
183 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
184                                        OPJ_UINT32 start,
185                                        OPJ_UINT32 end,
186                                        OPJ_UINT32 m, __m128 c);
187 
188 #else
189 static void opj_v4dwt_decode_step1(opj_v4_t* w,
190                                    OPJ_UINT32 start,
191                                    OPJ_UINT32 end,
192                                    const OPJ_FLOAT32 c);
193 
194 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
195                                    OPJ_UINT32 start,
196                                    OPJ_UINT32 end,
197                                    OPJ_UINT32 m,
198                                    OPJ_FLOAT32 c);
199 
200 #endif
201 
202 /*@}*/
203 
204 /*@}*/
205 
206 #define IDX_S(i) (i)*2
207 #define IDX_D(i) 1 + (i)* 2
208 #define UNDERFLOW_SN(i) ((i) >= sn&&sn>0)
209 #define UNDERFLOW_DN(i) ((i) >= dn&&dn>0)
210 #define OVERFLOW_S(i) (IDX_S(i) >= a_count)
211 #define OVERFLOW_D(i) (IDX_D(i) >= a_count)
212 
213 #define OPJ_S(i) a[IDX_S(i)]
214 #define OPJ_D(i) a[IDX_D(i)]
215 #define OPJ_S_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_SN(i) ? OPJ_S(sn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
216 #define OPJ_D_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_DN(i) ? OPJ_D(dn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
217 /* new */
218 #define OPJ_SS_(i) ((i)<0 ? OPJ_S(0) : (UNDERFLOW_DN(i) ? OPJ_S(dn - 1) : OVERFLOW_S(i) ? OPJ_S(i - 1) : OPJ_S(i)))
219 #define OPJ_DD_(i) ((i)<0 ? OPJ_D(0) : (UNDERFLOW_SN(i) ? OPJ_D(sn - 1) : OVERFLOW_D(i) ? OPJ_D(i - 1) : OPJ_D(i)))
220 
221 /* <summary>                                                              */
222 /* This table contains the norms of the 5-3 wavelets for different bands. */
223 /* </summary>                                                             */
224 /* FIXME! the array should really be extended up to 33 resolution levels */
225 /* See https://github.com/uclouvain/openjpeg/issues/493 */
226 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = {
227     {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3},
228     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
229     {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9},
230     {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93}
231 };
232 
233 /* <summary>                                                              */
234 /* This table contains the norms of the 9-7 wavelets for different bands. */
235 /* </summary>                                                             */
236 /* FIXME! the array should really be extended up to 33 resolution levels */
237 /* See https://github.com/uclouvain/openjpeg/issues/493 */
238 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = {
239     {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9},
240     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
241     {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0},
242     {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2}
243 };
244 
245 /*
246 ==========================================================
247    local functions
248 ==========================================================
249 */
250 
251 /* <summary>                             */
252 /* Forward lazy transform (horizontal).  */
253 /* </summary>                            */
opj_dwt_deinterleave_h(OPJ_INT32 * a,OPJ_INT32 * b,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)254 static void opj_dwt_deinterleave_h(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
255                                    OPJ_INT32 sn, OPJ_INT32 cas)
256 {
257     OPJ_INT32 i;
258     OPJ_INT32 * l_dest = b;
259     OPJ_INT32 * l_src = a + cas;
260 
261     for (i = 0; i < sn; ++i) {
262         *l_dest++ = *l_src;
263         l_src += 2;
264     }
265 
266     l_dest = b + sn;
267     l_src = a + 1 - cas;
268 
269     for (i = 0; i < dn; ++i)  {
270         *l_dest++ = *l_src;
271         l_src += 2;
272     }
273 }
274 
275 /* <summary>                             */
276 /* Forward lazy transform (vertical).    */
277 /* </summary>                            */
opj_dwt_deinterleave_v(OPJ_INT32 * a,OPJ_INT32 * b,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 x,OPJ_INT32 cas)278 static void opj_dwt_deinterleave_v(OPJ_INT32 *a, OPJ_INT32 *b, OPJ_INT32 dn,
279                                    OPJ_INT32 sn, OPJ_INT32 x, OPJ_INT32 cas)
280 {
281     OPJ_INT32 i = sn;
282     OPJ_INT32 * l_dest = b;
283     OPJ_INT32 * l_src = a + cas;
284 
285     while (i--) {
286         *l_dest = *l_src;
287         l_dest += x;
288         l_src += 2;
289     } /* b[i*x]=a[2*i+cas]; */
290 
291     l_dest = b + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)x;
292     l_src = a + 1 - cas;
293 
294     i = dn;
295     while (i--) {
296         *l_dest = *l_src;
297         l_dest += x;
298         l_src += 2;
299     } /*b[(sn+i)*x]=a[(2*i+1-cas)];*/
300 }
301 
302 #ifdef STANDARD_SLOW_VERSION
303 /* <summary>                             */
304 /* Inverse lazy transform (horizontal).  */
305 /* </summary>                            */
opj_dwt_interleave_h(const opj_dwt_t * h,OPJ_INT32 * a)306 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a)
307 {
308     OPJ_INT32 *ai = a;
309     OPJ_INT32 *bi = h->mem + h->cas;
310     OPJ_INT32  i    = h->sn;
311     while (i--) {
312         *bi = *(ai++);
313         bi += 2;
314     }
315     ai  = a + h->sn;
316     bi  = h->mem + 1 - h->cas;
317     i   = h->dn ;
318     while (i--) {
319         *bi = *(ai++);
320         bi += 2;
321     }
322 }
323 
324 /* <summary>                             */
325 /* Inverse lazy transform (vertical).    */
326 /* </summary>                            */
opj_dwt_interleave_v(const opj_dwt_t * v,OPJ_INT32 * a,OPJ_INT32 x)327 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x)
328 {
329     OPJ_INT32 *ai = a;
330     OPJ_INT32 *bi = v->mem + v->cas;
331     OPJ_INT32  i = v->sn;
332     while (i--) {
333         *bi = *ai;
334         bi += 2;
335         ai += x;
336     }
337     ai = a + (v->sn * (OPJ_SIZE_T)x);
338     bi = v->mem + 1 - v->cas;
339     i = v->dn ;
340     while (i--) {
341         *bi = *ai;
342         bi += 2;
343         ai += x;
344     }
345 }
346 
347 #endif /* STANDARD_SLOW_VERSION */
348 
349 /* <summary>                            */
350 /* Forward 5-3 wavelet transform in 1-D. */
351 /* </summary>                           */
opj_dwt_encode_1(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)352 static void opj_dwt_encode_1(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
353                              OPJ_INT32 sn, OPJ_INT32 cas)
354 {
355     OPJ_INT32 i;
356 
357     if (!cas) {
358         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
359             for (i = 0; i < dn; i++) {
360                 OPJ_D(i) -= (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
361             }
362             for (i = 0; i < sn; i++) {
363                 OPJ_S(i) += (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
364             }
365         }
366     } else {
367         if (!sn && dn == 1) {       /* NEW :  CASE ONE ELEMENT */
368             OPJ_S(0) *= 2;
369         } else {
370             for (i = 0; i < dn; i++) {
371                 OPJ_S(i) -= (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
372             }
373             for (i = 0; i < sn; i++) {
374                 OPJ_D(i) += (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
375             }
376         }
377     }
378 }
379 
380 #ifdef STANDARD_SLOW_VERSION
381 /* <summary>                            */
382 /* Inverse 5-3 wavelet transform in 1-D. */
383 /* </summary>                           */
opj_dwt_decode_1_(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)384 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_SIZE_T a_count, OPJ_INT32 dn,
385                               OPJ_INT32 sn, OPJ_INT32 cas)
386 {
387     OPJ_INT32 i;
388 
389     if (!cas) {
390         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
391             for (i = 0; i < sn; i++) {
392                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
393             }
394             for (i = 0; i < dn; i++) {
395                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
396             }
397         }
398     } else {
399         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
400             OPJ_S(0) /= 2;
401         } else {
402             for (i = 0; i < sn; i++) {
403                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
404             }
405             for (i = 0; i < dn; i++) {
406                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
407             }
408         }
409     }
410 }
411 
opj_dwt_decode_1(const opj_dwt_t * v)412 static void opj_dwt_decode_1(const opj_dwt_t *v)
413 {
414     opj_dwt_decode_1_(v->mem, v->mem_count, v->dn, v->sn, v->cas);
415 }
416 
417 #endif /* STANDARD_SLOW_VERSION */
418 
419 #if !defined(STANDARD_SLOW_VERSION)
opj_idwt53_h_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)420 static void  opj_idwt53_h_cas0(OPJ_INT32* tmp,
421                                const OPJ_INT32 sn,
422                                const OPJ_INT32 len,
423                                OPJ_INT32* tiledp)
424 {
425     OPJ_INT32 i, j;
426     const OPJ_INT32* in_even = &tiledp[0];
427     const OPJ_INT32* in_odd = &tiledp[sn];
428 
429 #ifdef TWO_PASS_VERSION
430     /* For documentation purpose: performs lifting in two iterations, */
431     /* but without explicit interleaving */
432 
433     assert(len > 1);
434 
435     /* Even */
436     tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1);
437     for (i = 2, j = 0; i <= len - 2; i += 2, j++) {
438         tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2);
439     }
440     if (len & 1) { /* if len is odd */
441         tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1);
442     }
443 
444     /* Odd */
445     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
446         tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1);
447     }
448     if (!(len & 1)) { /* if len is even */
449         tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2];
450     }
451 #else
452     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
453 
454     assert(len > 1);
455 
456     /* Improved version of the TWO_PASS_VERSION: */
457     /* Performs lifting in one single iteration. Saves memory */
458     /* accesses and explicit interleaving. */
459     s1n = in_even[0];
460     d1n = in_odd[0];
461     s0n = s1n - ((d1n + 1) >> 1);
462 
463     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
464         d1c = d1n;
465         s0c = s0n;
466 
467         s1n = in_even[j];
468         d1n = in_odd[j];
469 
470         s0n = s1n - ((d1c + d1n + 2) >> 2);
471 
472         tmp[i  ] = s0c;
473         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
474     }
475 
476     tmp[i] = s0n;
477 
478     if (len & 1) {
479         tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1);
480         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
481     } else {
482         tmp[len - 1] = d1n + s0n;
483     }
484 #endif
485     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
486 }
487 
opj_idwt53_h_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp)488 static void  opj_idwt53_h_cas1(OPJ_INT32* tmp,
489                                const OPJ_INT32 sn,
490                                const OPJ_INT32 len,
491                                OPJ_INT32* tiledp)
492 {
493     OPJ_INT32 i, j;
494     const OPJ_INT32* in_even = &tiledp[sn];
495     const OPJ_INT32* in_odd = &tiledp[0];
496 
497 #ifdef TWO_PASS_VERSION
498     /* For documentation purpose: performs lifting in two iterations, */
499     /* but without explicit interleaving */
500 
501     assert(len > 2);
502 
503     /* Odd */
504     for (i = 1, j = 0; i < len - 1; i += 2, j++) {
505         tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2);
506     }
507     if (!(len & 1)) {
508         tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1);
509     }
510 
511     /* Even */
512     tmp[0] = in_even[0] + tmp[1];
513     for (i = 2, j = 1; i < len - 1; i += 2, j++) {
514         tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1);
515     }
516     if (len & 1) {
517         tmp[len - 1] = in_even[len / 2] + tmp[len - 2];
518     }
519 #else
520     OPJ_INT32 s1, s2, dc, dn;
521 
522     assert(len > 2);
523 
524     /* Improved version of the TWO_PASS_VERSION: */
525     /* Performs lifting in one single iteration. Saves memory */
526     /* accesses and explicit interleaving. */
527 
528     s1 = in_even[1];
529     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
530     tmp[0] = in_even[0] + dc;
531 
532     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
533 
534         s2 = in_even[j + 1];
535 
536         dn = in_odd[j] - ((s1 + s2 + 2) >> 2);
537         tmp[i  ] = dc;
538         tmp[i + 1] = s1 + ((dn + dc) >> 1);
539 
540         dc = dn;
541         s1 = s2;
542     }
543 
544     tmp[i] = dc;
545 
546     if (!(len & 1)) {
547         dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1);
548         tmp[len - 2] = s1 + ((dn + dc) >> 1);
549         tmp[len - 1] = dn;
550     } else {
551         tmp[len - 1] = s1 + dc;
552     }
553 #endif
554     memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32));
555 }
556 
557 
558 #endif /* !defined(STANDARD_SLOW_VERSION) */
559 
560 /* <summary>                            */
561 /* Inverse 5-3 wavelet transform in 1-D for one row. */
562 /* </summary>                           */
563 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_h(const opj_dwt_t * dwt,OPJ_INT32 * tiledp)564 static void opj_idwt53_h(const opj_dwt_t *dwt,
565                          OPJ_INT32* tiledp)
566 {
567 #ifdef STANDARD_SLOW_VERSION
568     /* For documentation purpose */
569     opj_dwt_interleave_h(dwt, tiledp);
570     opj_dwt_decode_1(dwt);
571     memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32));
572 #else
573     const OPJ_INT32 sn = dwt->sn;
574     const OPJ_INT32 len = sn + dwt->dn;
575     if (dwt->cas == 0) { /* Left-most sample is on even coordinate */
576         if (len > 1) {
577             opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp);
578         } else {
579             /* Unmodified value */
580         }
581     } else { /* Left-most sample is on odd coordinate */
582         if (len == 1) {
583             tiledp[0] /= 2;
584         } else if (len == 2) {
585             OPJ_INT32* out = dwt->mem;
586             const OPJ_INT32* in_even = &tiledp[sn];
587             const OPJ_INT32* in_odd = &tiledp[0];
588             out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
589             out[0] = in_even[0] + out[1];
590             memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32));
591         } else if (len > 2) {
592             opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp);
593         }
594     }
595 #endif
596 }
597 
598 #if (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION)
599 
600 /* Conveniency macros to improve the readabilty of the formulas */
601 #if __AVX2__
602 #define VREG        __m256i
603 #define LOAD_CST(x) _mm256_set1_epi32(x)
604 #define LOAD(x)     _mm256_load_si256((const VREG*)(x))
605 #define LOADU(x)    _mm256_loadu_si256((const VREG*)(x))
606 #define STORE(x,y)  _mm256_store_si256((VREG*)(x),(y))
607 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y))
608 #define ADD(x,y)    _mm256_add_epi32((x),(y))
609 #define SUB(x,y)    _mm256_sub_epi32((x),(y))
610 #define SAR(x,y)    _mm256_srai_epi32((x),(y))
611 #else
612 #define VREG        __m128i
613 #define LOAD_CST(x) _mm_set1_epi32(x)
614 #define LOAD(x)     _mm_load_si128((const VREG*)(x))
615 #define LOADU(x)    _mm_loadu_si128((const VREG*)(x))
616 #define STORE(x,y)  _mm_store_si128((VREG*)(x),(y))
617 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y))
618 #define ADD(x,y)    _mm_add_epi32((x),(y))
619 #define SUB(x,y)    _mm_sub_epi32((x),(y))
620 #define SAR(x,y)    _mm_srai_epi32((x),(y))
621 #endif
622 #define ADD3(x,y,z) ADD(ADD(x,y),z)
623 
624 static
opj_idwt53_v_final_memcpy(OPJ_INT32 * tiledp_col,const OPJ_INT32 * tmp,OPJ_INT32 len,OPJ_SIZE_T stride)625 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col,
626                                const OPJ_INT32* tmp,
627                                OPJ_INT32 len,
628                                OPJ_SIZE_T stride)
629 {
630     OPJ_INT32 i;
631     for (i = 0; i < len; ++i) {
632         /* A memcpy(&tiledp_col[i * stride + 0],
633                     &tmp[PARALLEL_COLS_53 * i + 0],
634                     PARALLEL_COLS_53 * sizeof(OPJ_INT32))
635            would do but would be a tiny bit slower.
636            We can take here advantage of our knowledge of alignment */
637         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0],
638                LOAD(&tmp[PARALLEL_COLS_53 * i + 0]));
639         STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT],
640                LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT]));
641     }
642 }
643 
644 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
645  * 16 in AVX2, when top-most pixel is on even coordinate */
opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)646 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(
647     OPJ_INT32* tmp,
648     const OPJ_INT32 sn,
649     const OPJ_INT32 len,
650     OPJ_INT32* tiledp_col,
651     const OPJ_SIZE_T stride)
652 {
653     const OPJ_INT32* in_even = &tiledp_col[0];
654     const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride];
655 
656     OPJ_INT32 i;
657     OPJ_SIZE_T j;
658     VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0;
659     VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1;
660     const VREG two = LOAD_CST(2);
661 
662     assert(len > 1);
663 #if __AVX2__
664     assert(PARALLEL_COLS_53 == 16);
665     assert(VREG_INT_COUNT == 8);
666 #else
667     assert(PARALLEL_COLS_53 == 8);
668     assert(VREG_INT_COUNT == 4);
669 #endif
670 
671     /* Note: loads of input even/odd values must be done in a unaligned */
672     /* fashion. But stores in tmp can be done with aligned store, since */
673     /* the temporary buffer is properly aligned */
674     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
675 
676     s1n_0 = LOADU(in_even + 0);
677     s1n_1 = LOADU(in_even + VREG_INT_COUNT);
678     d1n_0 = LOADU(in_odd);
679     d1n_1 = LOADU(in_odd + VREG_INT_COUNT);
680 
681     /* s0n = s1n - ((d1n + 1) >> 1); <==> */
682     /* s0n = s1n - ((d1n + d1n + 2) >> 2); */
683     s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
684     s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
685 
686     for (i = 0, j = 1; i < (len - 3); i += 2, j++) {
687         d1c_0 = d1n_0;
688         s0c_0 = s0n_0;
689         d1c_1 = d1n_1;
690         s0c_1 = s0n_1;
691 
692         s1n_0 = LOADU(in_even + j * stride);
693         s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT);
694         d1n_0 = LOADU(in_odd + j * stride);
695         d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT);
696 
697         /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/
698         s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2));
699         s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2));
700 
701         STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0);
702         STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1);
703 
704         /* d1c + ((s0c + s0n) >> 1) */
705         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
706               ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1)));
707         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
708               ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1)));
709     }
710 
711     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0);
712     STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1);
713 
714     if (len & 1) {
715         VREG tmp_len_minus_1;
716         s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride);
717         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
718         tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2));
719         STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1);
720         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
721         STORE(tmp + PARALLEL_COLS_53 * (len - 2),
722               ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1)));
723 
724         s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT);
725         /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */
726         tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2));
727         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
728               tmp_len_minus_1);
729         /* d1n + ((s0n + tmp_len_minus_1) >> 1) */
730         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
731               ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1)));
732 
733 
734     } else {
735         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0,
736               ADD(d1n_0, s0n_0));
737         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
738               ADD(d1n_1, s0n_1));
739     }
740 
741     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
742 }
743 
744 
745 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or
746  * 16 in AVX2, when top-most pixel is on odd coordinate */
opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)747 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(
748     OPJ_INT32* tmp,
749     const OPJ_INT32 sn,
750     const OPJ_INT32 len,
751     OPJ_INT32* tiledp_col,
752     const OPJ_SIZE_T stride)
753 {
754     OPJ_INT32 i;
755     OPJ_SIZE_T j;
756 
757     VREG s1_0, s2_0, dc_0, dn_0;
758     VREG s1_1, s2_1, dc_1, dn_1;
759     const VREG two = LOAD_CST(2);
760 
761     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
762     const OPJ_INT32* in_odd = &tiledp_col[0];
763 
764     assert(len > 2);
765 #if __AVX2__
766     assert(PARALLEL_COLS_53 == 16);
767     assert(VREG_INT_COUNT == 8);
768 #else
769     assert(PARALLEL_COLS_53 == 8);
770     assert(VREG_INT_COUNT == 4);
771 #endif
772 
773     /* Note: loads of input even/odd values must be done in a unaligned */
774     /* fashion. But stores in tmp can be done with aligned store, since */
775     /* the temporary buffer is properly aligned */
776     assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0);
777 
778     s1_0 = LOADU(in_even + stride);
779     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
780     dc_0 = SUB(LOADU(in_odd + 0),
781                SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2));
782     STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0));
783 
784     s1_1 = LOADU(in_even + stride + VREG_INT_COUNT);
785     /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */
786     dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT),
787                SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2));
788     STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT,
789           ADD(LOADU(in_even + VREG_INT_COUNT), dc_1));
790 
791     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
792 
793         s2_0 = LOADU(in_even + (j + 1) * stride);
794         s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT);
795 
796         /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */
797         dn_0 = SUB(LOADU(in_odd + j * stride),
798                    SAR(ADD3(s1_0, s2_0, two), 2));
799         dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT),
800                    SAR(ADD3(s1_1, s2_1, two), 2));
801 
802         STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
803         STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
804 
805         /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */
806         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0,
807               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
808         STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT,
809               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
810 
811         dc_0 = dn_0;
812         s1_0 = s2_0;
813         dc_1 = dn_1;
814         s1_1 = s2_1;
815     }
816     STORE(tmp + PARALLEL_COLS_53 * i, dc_0);
817     STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1);
818 
819     if (!(len & 1)) {
820         /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */
821         dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride),
822                    SAR(ADD3(s1_0, s1_0, two), 2));
823         dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT),
824                    SAR(ADD3(s1_1, s1_1, two), 2));
825 
826         /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */
827         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0,
828               ADD(s1_0, SAR(ADD(dn_0, dc_0), 1)));
829         STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT,
830               ADD(s1_1, SAR(ADD(dn_1, dc_1), 1)));
831 
832         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0);
833         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1);
834     } else {
835         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0));
836         STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT,
837               ADD(s1_1, dc_1));
838     }
839 
840     opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride);
841 }
842 
843 #undef VREG
844 #undef LOAD_CST
845 #undef LOADU
846 #undef LOAD
847 #undef STORE
848 #undef STOREU
849 #undef ADD
850 #undef ADD3
851 #undef SUB
852 #undef SAR
853 
854 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */
855 
856 #if !defined(STANDARD_SLOW_VERSION)
857 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
858  * pixel is on even coordinate */
opj_idwt3_v_cas0(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)859 static void opj_idwt3_v_cas0(OPJ_INT32* tmp,
860                              const OPJ_INT32 sn,
861                              const OPJ_INT32 len,
862                              OPJ_INT32* tiledp_col,
863                              const OPJ_SIZE_T stride)
864 {
865     OPJ_INT32 i, j;
866     OPJ_INT32 d1c, d1n, s1n, s0c, s0n;
867 
868     assert(len > 1);
869 
870     /* Performs lifting in one single iteration. Saves memory */
871     /* accesses and explicit interleaving. */
872 
873     s1n = tiledp_col[0];
874     d1n = tiledp_col[(OPJ_SIZE_T)sn * stride];
875     s0n = s1n - ((d1n + 1) >> 1);
876 
877     for (i = 0, j = 0; i < (len - 3); i += 2, j++) {
878         d1c = d1n;
879         s0c = s0n;
880 
881         s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride];
882         d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride];
883 
884         s0n = s1n - ((d1c + d1n + 2) >> 2);
885 
886         tmp[i  ] = s0c;
887         tmp[i + 1] = d1c + ((s0c + s0n) >> 1);
888     }
889 
890     tmp[i] = s0n;
891 
892     if (len & 1) {
893         tmp[len - 1] =
894             tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] -
895             ((d1n + 1) >> 1);
896         tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1);
897     } else {
898         tmp[len - 1] = d1n + s0n;
899     }
900 
901     for (i = 0; i < len; ++i) {
902         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
903     }
904 }
905 
906 
907 /** Vertical inverse 5x3 wavelet transform for one column, when top-most
908  * pixel is on odd coordinate */
opj_idwt3_v_cas1(OPJ_INT32 * tmp,const OPJ_INT32 sn,const OPJ_INT32 len,OPJ_INT32 * tiledp_col,const OPJ_SIZE_T stride)909 static void opj_idwt3_v_cas1(OPJ_INT32* tmp,
910                              const OPJ_INT32 sn,
911                              const OPJ_INT32 len,
912                              OPJ_INT32* tiledp_col,
913                              const OPJ_SIZE_T stride)
914 {
915     OPJ_INT32 i, j;
916     OPJ_INT32 s1, s2, dc, dn;
917     const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
918     const OPJ_INT32* in_odd = &tiledp_col[0];
919 
920     assert(len > 2);
921 
922     /* Performs lifting in one single iteration. Saves memory */
923     /* accesses and explicit interleaving. */
924 
925     s1 = in_even[stride];
926     dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2);
927     tmp[0] = in_even[0] + dc;
928     for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) {
929 
930         s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride];
931 
932         dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2);
933         tmp[i  ] = dc;
934         tmp[i + 1] = s1 + ((dn + dc) >> 1);
935 
936         dc = dn;
937         s1 = s2;
938     }
939     tmp[i] = dc;
940     if (!(len & 1)) {
941         dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1);
942         tmp[len - 2] = s1 + ((dn + dc) >> 1);
943         tmp[len - 1] = dn;
944     } else {
945         tmp[len - 1] = s1 + dc;
946     }
947 
948     for (i = 0; i < len; ++i) {
949         tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i];
950     }
951 }
952 #endif /* !defined(STANDARD_SLOW_VERSION) */
953 
954 /* <summary>                            */
955 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */
956 /* </summary>                           */
957 /* Performs interleave, inverse wavelet transform and copy back to buffer */
opj_idwt53_v(const opj_dwt_t * dwt,OPJ_INT32 * tiledp_col,OPJ_SIZE_T stride,OPJ_INT32 nb_cols)958 static void opj_idwt53_v(const opj_dwt_t *dwt,
959                          OPJ_INT32* tiledp_col,
960                          OPJ_SIZE_T stride,
961                          OPJ_INT32 nb_cols)
962 {
963 #ifdef STANDARD_SLOW_VERSION
964     /* For documentation purpose */
965     OPJ_INT32 k, c;
966     for (c = 0; c < nb_cols; c ++) {
967         opj_dwt_interleave_v(dwt, tiledp_col + c, stride);
968         opj_dwt_decode_1(dwt);
969         for (k = 0; k < dwt->sn + dwt->dn; ++k) {
970             tiledp_col[c + k * stride] = dwt->mem[k];
971         }
972     }
973 #else
974     const OPJ_INT32 sn = dwt->sn;
975     const OPJ_INT32 len = sn + dwt->dn;
976     if (dwt->cas == 0) {
977         /* If len == 1, unmodified value */
978 
979 #if (defined(__SSE2__) || defined(__AVX2__))
980         if (len > 1 && nb_cols == PARALLEL_COLS_53) {
981             /* Same as below general case, except that thanks to SSE2/AVX2 */
982             /* we can efficiently process 8/16 columns in parallel */
983             opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
984             return;
985         }
986 #endif
987         if (len > 1) {
988             OPJ_INT32 c;
989             for (c = 0; c < nb_cols; c++, tiledp_col++) {
990                 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride);
991             }
992             return;
993         }
994     } else {
995         if (len == 1) {
996             OPJ_INT32 c;
997             for (c = 0; c < nb_cols; c++, tiledp_col++) {
998                 tiledp_col[0] /= 2;
999             }
1000             return;
1001         }
1002 
1003         if (len == 2) {
1004             OPJ_INT32 c;
1005             OPJ_INT32* out = dwt->mem;
1006             for (c = 0; c < nb_cols; c++, tiledp_col++) {
1007                 OPJ_INT32 i;
1008                 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride];
1009                 const OPJ_INT32* in_odd = &tiledp_col[0];
1010 
1011                 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1);
1012                 out[0] = in_even[0] + out[1];
1013 
1014                 for (i = 0; i < len; ++i) {
1015                     tiledp_col[(OPJ_SIZE_T)i * stride] = out[i];
1016                 }
1017             }
1018 
1019             return;
1020         }
1021 
1022 #if (defined(__SSE2__) || defined(__AVX2__))
1023         if (len > 2 && nb_cols == PARALLEL_COLS_53) {
1024             /* Same as below general case, except that thanks to SSE2/AVX2 */
1025             /* we can efficiently process 8/16 columns in parallel */
1026             opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride);
1027             return;
1028         }
1029 #endif
1030         if (len > 2) {
1031             OPJ_INT32 c;
1032             for (c = 0; c < nb_cols; c++, tiledp_col++) {
1033                 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride);
1034             }
1035             return;
1036         }
1037     }
1038 #endif
1039 }
1040 
1041 
1042 /* <summary>                             */
1043 /* Forward 9-7 wavelet transform in 1-D. */
1044 /* </summary>                            */
opj_dwt_encode_1_real(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas)1045 static void opj_dwt_encode_1_real(OPJ_INT32 *a, OPJ_SIZE_T a_count,
1046                                   OPJ_INT32 dn, OPJ_INT32 sn, OPJ_INT32 cas)
1047 {
1048     OPJ_INT32 i;
1049     if (!cas) {
1050         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
1051             for (i = 0; i < dn; i++) {
1052                 OPJ_D(i) -= opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 12993);
1053             }
1054             for (i = 0; i < sn; i++) {
1055                 OPJ_S(i) -= opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 434);
1056             }
1057             for (i = 0; i < dn; i++) {
1058                 OPJ_D(i) += opj_int_fix_mul(OPJ_S_(i) + OPJ_S_(i + 1), 7233);
1059             }
1060             for (i = 0; i < sn; i++) {
1061                 OPJ_S(i) += opj_int_fix_mul(OPJ_D_(i - 1) + OPJ_D_(i), 3633);
1062             }
1063             for (i = 0; i < dn; i++) {
1064                 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 5038);    /*5038 */
1065             }
1066             for (i = 0; i < sn; i++) {
1067                 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 6659);    /*6660 */
1068             }
1069         }
1070     } else {
1071         if ((sn > 0) || (dn > 1)) { /* NEW :  CASE ONE ELEMENT */
1072             for (i = 0; i < dn; i++) {
1073                 OPJ_S(i) -= opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 12993);
1074             }
1075             for (i = 0; i < sn; i++) {
1076                 OPJ_D(i) -= opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 434);
1077             }
1078             for (i = 0; i < dn; i++) {
1079                 OPJ_S(i) += opj_int_fix_mul(OPJ_DD_(i) + OPJ_DD_(i - 1), 7233);
1080             }
1081             for (i = 0; i < sn; i++) {
1082                 OPJ_D(i) += opj_int_fix_mul(OPJ_SS_(i) + OPJ_SS_(i + 1), 3633);
1083             }
1084             for (i = 0; i < dn; i++) {
1085                 OPJ_S(i) = opj_int_fix_mul(OPJ_S(i), 5038);    /*5038 */
1086             }
1087             for (i = 0; i < sn; i++) {
1088                 OPJ_D(i) = opj_int_fix_mul(OPJ_D(i), 6659);    /*6660 */
1089             }
1090         }
1091     }
1092 }
1093 
opj_dwt_encode_stepsize(OPJ_INT32 stepsize,OPJ_INT32 numbps,opj_stepsize_t * bandno_stepsize)1094 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps,
1095                                     opj_stepsize_t *bandno_stepsize)
1096 {
1097     OPJ_INT32 p, n;
1098     p = opj_int_floorlog2(stepsize) - 13;
1099     n = 11 - opj_int_floorlog2(stepsize);
1100     bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff;
1101     bandno_stepsize->expn = numbps - p;
1102 }
1103 
1104 /*
1105 ==========================================================
1106    DWT interface
1107 ==========================================================
1108 */
1109 
1110 
1111 /* <summary>                            */
1112 /* Forward 5-3 wavelet transform in 2-D. */
1113 /* </summary>                           */
opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,void (* p_function)(OPJ_INT32 *,OPJ_SIZE_T,OPJ_INT32,OPJ_INT32,OPJ_INT32))1114 static INLINE OPJ_BOOL opj_dwt_encode_procedure(const opj_tcd_tilecomp_t * tilec,
1115         void(*p_function)(OPJ_INT32 *, OPJ_SIZE_T, OPJ_INT32, OPJ_INT32, OPJ_INT32))
1116 {
1117     OPJ_INT32 i, j, k;
1118     OPJ_INT32 *a = 00;
1119     OPJ_INT32 *aj = 00;
1120     OPJ_INT32 *bj = 00;
1121     OPJ_INT32 w, l;
1122 
1123     OPJ_INT32 rw;           /* width of the resolution level computed   */
1124     OPJ_INT32 rh;           /* height of the resolution level computed  */
1125     OPJ_SIZE_T l_data_count;
1126     OPJ_SIZE_T l_data_size;
1127 
1128     opj_tcd_resolution_t * l_cur_res = 0;
1129     opj_tcd_resolution_t * l_last_res = 0;
1130 
1131     w = tilec->x1 - tilec->x0;
1132     l = (OPJ_INT32)tilec->numresolutions - 1;
1133     a = tilec->data;
1134 
1135     l_cur_res = tilec->resolutions + l;
1136     l_last_res = l_cur_res - 1;
1137 
1138     l_data_count = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions);
1139     /* overflow check */
1140     if (l_data_count > (SIZE_MAX / sizeof(OPJ_INT32))) {
1141         /* FIXME event manager error callback */
1142         return OPJ_FALSE;
1143     }
1144     l_data_size = l_data_count * sizeof(OPJ_INT32);
1145     bj = (OPJ_INT32*)opj_malloc(l_data_size);
1146     /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */
1147     /* in that case, so do not error out */
1148     if (l_data_size != 0 && ! bj) {
1149         return OPJ_FALSE;
1150     }
1151     i = l;
1152 
1153     while (i--) {
1154         OPJ_INT32 rw1;      /* width of the resolution level once lower than computed one                                       */
1155         OPJ_INT32 rh1;      /* height of the resolution level once lower than computed one                                      */
1156         OPJ_INT32 cas_col;  /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */
1157         OPJ_INT32 cas_row;  /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering   */
1158         OPJ_INT32 dn, sn;
1159 
1160         rw  = l_cur_res->x1 - l_cur_res->x0;
1161         rh  = l_cur_res->y1 - l_cur_res->y0;
1162         rw1 = l_last_res->x1 - l_last_res->x0;
1163         rh1 = l_last_res->y1 - l_last_res->y0;
1164 
1165         cas_row = l_cur_res->x0 & 1;
1166         cas_col = l_cur_res->y0 & 1;
1167 
1168         sn = rh1;
1169         dn = rh - rh1;
1170         for (j = 0; j < rw; ++j) {
1171             aj = a + j;
1172             for (k = 0; k < rh; ++k) {
1173                 bj[k] = aj[k * w];
1174             }
1175 
1176             (*p_function) (bj, l_data_count, dn, sn, cas_col);
1177 
1178             opj_dwt_deinterleave_v(bj, aj, dn, sn, w, cas_col);
1179         }
1180 
1181         sn = rw1;
1182         dn = rw - rw1;
1183 
1184         for (j = 0; j < rh; j++) {
1185             aj = a + j * w;
1186             for (k = 0; k < rw; k++) {
1187                 bj[k] = aj[k];
1188             }
1189             (*p_function) (bj, l_data_count, dn, sn, cas_row);
1190             opj_dwt_deinterleave_h(bj, aj, dn, sn, cas_row);
1191         }
1192 
1193         l_cur_res = l_last_res;
1194 
1195         --l_last_res;
1196     }
1197 
1198     opj_free(bj);
1199     return OPJ_TRUE;
1200 }
1201 
1202 /* Forward 5-3 wavelet transform in 2-D. */
1203 /* </summary>                           */
opj_dwt_encode(opj_tcd_tilecomp_t * tilec)1204 OPJ_BOOL opj_dwt_encode(opj_tcd_tilecomp_t * tilec)
1205 {
1206     return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1);
1207 }
1208 
1209 /* <summary>                            */
1210 /* Inverse 5-3 wavelet transform in 2-D. */
1211 /* </summary>                           */
opj_dwt_decode(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1212 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec,
1213                         OPJ_UINT32 numres)
1214 {
1215     if (p_tcd->whole_tile_decoding) {
1216         return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres);
1217     } else {
1218         return opj_dwt_decode_partial_tile(tilec, numres);
1219     }
1220 }
1221 
1222 
1223 /* <summary>                          */
1224 /* Get gain of 5-3 wavelet transform. */
1225 /* </summary>                         */
opj_dwt_getgain(OPJ_UINT32 orient)1226 OPJ_UINT32 opj_dwt_getgain(OPJ_UINT32 orient)
1227 {
1228     if (orient == 0) {
1229         return 0;
1230     }
1231     if (orient == 1 || orient == 2) {
1232         return 1;
1233     }
1234     return 2;
1235 }
1236 
1237 /* <summary>                */
1238 /* Get norm of 5-3 wavelet. */
1239 /* </summary>               */
opj_dwt_getnorm(OPJ_UINT32 level,OPJ_UINT32 orient)1240 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient)
1241 {
1242     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1243     /* but the array should really be extended up to 33 resolution levels */
1244     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1245     if (orient == 0 && level >= 10) {
1246         level = 9;
1247     } else if (orient > 0 && level >= 9) {
1248         level = 8;
1249     }
1250     return opj_dwt_norms[orient][level];
1251 }
1252 
1253 /* <summary>                             */
1254 /* Forward 9-7 wavelet transform in 2-D. */
1255 /* </summary>                            */
opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)1256 OPJ_BOOL opj_dwt_encode_real(opj_tcd_tilecomp_t * tilec)
1257 {
1258     return opj_dwt_encode_procedure(tilec, opj_dwt_encode_1_real);
1259 }
1260 
1261 /* <summary>                          */
1262 /* Get gain of 9-7 wavelet transform. */
1263 /* </summary>                         */
opj_dwt_getgain_real(OPJ_UINT32 orient)1264 OPJ_UINT32 opj_dwt_getgain_real(OPJ_UINT32 orient)
1265 {
1266     (void)orient;
1267     return 0;
1268 }
1269 
1270 /* <summary>                */
1271 /* Get norm of 9-7 wavelet. */
1272 /* </summary>               */
opj_dwt_getnorm_real(OPJ_UINT32 level,OPJ_UINT32 orient)1273 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient)
1274 {
1275     /* FIXME ! This is just a band-aid to avoid a buffer overflow */
1276     /* but the array should really be extended up to 33 resolution levels */
1277     /* See https://github.com/uclouvain/openjpeg/issues/493 */
1278     if (orient == 0 && level >= 10) {
1279         level = 9;
1280     } else if (orient > 0 && level >= 9) {
1281         level = 8;
1282     }
1283     return opj_dwt_norms_real[orient][level];
1284 }
1285 
opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp,OPJ_UINT32 prec)1286 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec)
1287 {
1288     OPJ_UINT32 numbands, bandno;
1289     numbands = 3 * tccp->numresolutions - 2;
1290     for (bandno = 0; bandno < numbands; bandno++) {
1291         OPJ_FLOAT64 stepsize;
1292         OPJ_UINT32 resno, level, orient, gain;
1293 
1294         resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1);
1295         orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1);
1296         level = tccp->numresolutions - 1 - resno;
1297         gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) ||
1298                                           (orient == 2)) ? 1 : 2));
1299         if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) {
1300             stepsize = 1.0;
1301         } else {
1302             OPJ_FLOAT64 norm = opj_dwt_norms_real[orient][level];
1303             stepsize = (1 << (gain)) / norm;
1304         }
1305         opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0),
1306                                 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]);
1307     }
1308 }
1309 
1310 /* <summary>                             */
1311 /* Determine maximum computed resolution level for inverse wavelet transform */
1312 /* </summary>                            */
opj_dwt_max_resolution(opj_tcd_resolution_t * OPJ_RESTRICT r,OPJ_UINT32 i)1313 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r,
1314         OPJ_UINT32 i)
1315 {
1316     OPJ_UINT32 mr   = 0;
1317     OPJ_UINT32 w;
1318     while (--i) {
1319         ++r;
1320         if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) {
1321             mr = w ;
1322         }
1323         if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) {
1324             mr = w ;
1325         }
1326     }
1327     return mr ;
1328 }
1329 
1330 typedef struct {
1331     opj_dwt_t h;
1332     OPJ_UINT32 rw;
1333     OPJ_UINT32 w;
1334     OPJ_INT32 * OPJ_RESTRICT tiledp;
1335     OPJ_UINT32 min_j;
1336     OPJ_UINT32 max_j;
1337 } opj_dwd_decode_h_job_t;
1338 
opj_dwt_decode_h_func(void * user_data,opj_tls_t * tls)1339 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls)
1340 {
1341     OPJ_UINT32 j;
1342     opj_dwd_decode_h_job_t* job;
1343     (void)tls;
1344 
1345     job = (opj_dwd_decode_h_job_t*)user_data;
1346     for (j = job->min_j; j < job->max_j; j++) {
1347         opj_idwt53_h(&job->h, &job->tiledp[j * job->w]);
1348     }
1349 
1350     opj_aligned_free(job->h.mem);
1351     opj_free(job);
1352 }
1353 
1354 typedef struct {
1355     opj_dwt_t v;
1356     OPJ_UINT32 rh;
1357     OPJ_UINT32 w;
1358     OPJ_INT32 * OPJ_RESTRICT tiledp;
1359     OPJ_UINT32 min_j;
1360     OPJ_UINT32 max_j;
1361 } opj_dwd_decode_v_job_t;
1362 
opj_dwt_decode_v_func(void * user_data,opj_tls_t * tls)1363 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls)
1364 {
1365     OPJ_UINT32 j;
1366     opj_dwd_decode_v_job_t* job;
1367     (void)tls;
1368 
1369     job = (opj_dwd_decode_v_job_t*)user_data;
1370     for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j;
1371             j += PARALLEL_COLS_53) {
1372         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1373                      PARALLEL_COLS_53);
1374     }
1375     if (j < job->max_j)
1376         opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w,
1377                      (OPJ_INT32)(job->max_j - j));
1378 
1379     opj_aligned_free(job->v.mem);
1380     opj_free(job);
1381 }
1382 
1383 
1384 /* <summary>                            */
1385 /* Inverse wavelet transform in 2-D.    */
1386 /* </summary>                           */
opj_dwt_decode_tile(opj_thread_pool_t * tp,const opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1387 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp,
1388         const opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres)
1389 {
1390     opj_dwt_t h;
1391     opj_dwt_t v;
1392 
1393     opj_tcd_resolution_t* tr = tilec->resolutions;
1394 
1395     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1396                                  tr->x0);  /* width of the resolution level computed */
1397     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1398                                  tr->y0);  /* height of the resolution level computed */
1399 
1400     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
1401                                                                1].x1 -
1402                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
1403     OPJ_SIZE_T h_mem_size;
1404     int num_threads;
1405 
1406     if (numres == 1U) {
1407         return OPJ_TRUE;
1408     }
1409     num_threads = opj_thread_pool_get_thread_count(tp);
1410     h.mem_count = opj_dwt_max_resolution(tr, numres);
1411     /* overflow check */
1412     if (h.mem_count > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) {
1413         /* FIXME event manager error callback */
1414         return OPJ_FALSE;
1415     }
1416     /* We need PARALLEL_COLS_53 times the height of the array, */
1417     /* since for the vertical pass */
1418     /* we process PARALLEL_COLS_53 columns at a time */
1419     h_mem_size = h.mem_count * PARALLEL_COLS_53 * sizeof(OPJ_INT32);
1420     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1421     if (! h.mem) {
1422         /* FIXME event manager error callback */
1423         return OPJ_FALSE;
1424     }
1425 
1426     v.mem_count = h.mem_count;
1427     v.mem = h.mem;
1428 
1429     while (--numres) {
1430         OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data;
1431         OPJ_UINT32 j;
1432 
1433         ++tr;
1434         h.sn = (OPJ_INT32)rw;
1435         v.sn = (OPJ_INT32)rh;
1436 
1437         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
1438         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
1439 
1440         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
1441         h.cas = tr->x0 % 2;
1442 
1443         if (num_threads <= 1 || rh <= 1) {
1444             for (j = 0; j < rh; ++j) {
1445                 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]);
1446             }
1447         } else {
1448             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1449             OPJ_UINT32 step_j;
1450 
1451             if (rh < num_jobs) {
1452                 num_jobs = rh;
1453             }
1454             step_j = (rh / num_jobs);
1455 
1456             for (j = 0; j < num_jobs; j++) {
1457                 opj_dwd_decode_h_job_t* job;
1458 
1459                 job = (opj_dwd_decode_h_job_t*) opj_malloc(sizeof(opj_dwd_decode_h_job_t));
1460                 if (!job) {
1461                     /* It would be nice to fallback to single thread case, but */
1462                     /* unfortunately some jobs may be launched and have modified */
1463                     /* tiledp, so it is not practical to recover from that error */
1464                     /* FIXME event manager error callback */
1465                     opj_thread_pool_wait_completion(tp, 0);
1466                     opj_aligned_free(h.mem);
1467                     return OPJ_FALSE;
1468                 }
1469                 job->h = h;
1470                 job->rw = rw;
1471                 job->w = w;
1472                 job->tiledp = tiledp;
1473                 job->min_j = j * step_j;
1474                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1475                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1476                     job->max_j = rh;
1477                 }
1478                 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1479                 if (!job->h.mem) {
1480                     /* FIXME event manager error callback */
1481                     opj_thread_pool_wait_completion(tp, 0);
1482                     opj_free(job);
1483                     opj_aligned_free(h.mem);
1484                     return OPJ_FALSE;
1485                 }
1486                 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job);
1487             }
1488             opj_thread_pool_wait_completion(tp, 0);
1489         }
1490 
1491         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
1492         v.cas = tr->y0 % 2;
1493 
1494         if (num_threads <= 1 || rw <= 1) {
1495             for (j = 0; j + PARALLEL_COLS_53 <= rw;
1496                     j += PARALLEL_COLS_53) {
1497                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53);
1498             }
1499             if (j < rw) {
1500                 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j));
1501             }
1502         } else {
1503             OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads;
1504             OPJ_UINT32 step_j;
1505 
1506             if (rw < num_jobs) {
1507                 num_jobs = rw;
1508             }
1509             step_j = (rw / num_jobs);
1510 
1511             for (j = 0; j < num_jobs; j++) {
1512                 opj_dwd_decode_v_job_t* job;
1513 
1514                 job = (opj_dwd_decode_v_job_t*) opj_malloc(sizeof(opj_dwd_decode_v_job_t));
1515                 if (!job) {
1516                     /* It would be nice to fallback to single thread case, but */
1517                     /* unfortunately some jobs may be launched and have modified */
1518                     /* tiledp, so it is not practical to recover from that error */
1519                     /* FIXME event manager error callback */
1520                     opj_thread_pool_wait_completion(tp, 0);
1521                     opj_aligned_free(v.mem);
1522                     return OPJ_FALSE;
1523                 }
1524                 job->v = v;
1525                 job->rh = rh;
1526                 job->w = w;
1527                 job->tiledp = tiledp;
1528                 job->min_j = j * step_j;
1529                 job->max_j = (j + 1U) * step_j; /* this can overflow */
1530                 if (j == (num_jobs - 1U)) {  /* this will take care of the overflow */
1531                     job->max_j = rw;
1532                 }
1533                 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1534                 if (!job->v.mem) {
1535                     /* FIXME event manager error callback */
1536                     opj_thread_pool_wait_completion(tp, 0);
1537                     opj_free(job);
1538                     opj_aligned_free(v.mem);
1539                     return OPJ_FALSE;
1540                 }
1541                 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job);
1542             }
1543             opj_thread_pool_wait_completion(tp, 0);
1544         }
1545     }
1546     opj_aligned_free(h.mem);
1547     return OPJ_TRUE;
1548 }
1549 
opj_dwt_interleave_partial_h(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 sn,OPJ_UINT32 win_l_x0,OPJ_UINT32 win_l_x1,OPJ_UINT32 win_h_x0,OPJ_UINT32 win_h_x1)1550 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest,
1551         OPJ_INT32 cas,
1552         opj_sparse_array_int32_t* sa,
1553         OPJ_UINT32 sa_line,
1554         OPJ_UINT32 sn,
1555         OPJ_UINT32 win_l_x0,
1556         OPJ_UINT32 win_l_x1,
1557         OPJ_UINT32 win_h_x0,
1558         OPJ_UINT32 win_h_x1)
1559 {
1560     OPJ_BOOL ret;
1561     ret = opj_sparse_array_int32_read(sa,
1562                                       win_l_x0, sa_line,
1563                                       win_l_x1, sa_line + 1,
1564                                       dest + cas + 2 * win_l_x0,
1565                                       2, 0, OPJ_TRUE);
1566     assert(ret);
1567     ret = opj_sparse_array_int32_read(sa,
1568                                       sn + win_h_x0, sa_line,
1569                                       sn + win_h_x1, sa_line + 1,
1570                                       dest + 1 - cas + 2 * win_h_x0,
1571                                       2, 0, OPJ_TRUE);
1572     assert(ret);
1573     OPJ_UNUSED(ret);
1574 }
1575 
1576 
opj_dwt_interleave_partial_v(OPJ_INT32 * dest,OPJ_INT32 cas,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_cols,OPJ_UINT32 sn,OPJ_UINT32 win_l_y0,OPJ_UINT32 win_l_y1,OPJ_UINT32 win_h_y0,OPJ_UINT32 win_h_y1)1577 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest,
1578         OPJ_INT32 cas,
1579         opj_sparse_array_int32_t* sa,
1580         OPJ_UINT32 sa_col,
1581         OPJ_UINT32 nb_cols,
1582         OPJ_UINT32 sn,
1583         OPJ_UINT32 win_l_y0,
1584         OPJ_UINT32 win_l_y1,
1585         OPJ_UINT32 win_h_y0,
1586         OPJ_UINT32 win_h_y1)
1587 {
1588     OPJ_BOOL ret;
1589     ret  = opj_sparse_array_int32_read(sa,
1590                                        sa_col, win_l_y0,
1591                                        sa_col + nb_cols, win_l_y1,
1592                                        dest + cas * 4 + 2 * 4 * win_l_y0,
1593                                        1, 2 * 4, OPJ_TRUE);
1594     assert(ret);
1595     ret = opj_sparse_array_int32_read(sa,
1596                                       sa_col, sn + win_h_y0,
1597                                       sa_col + nb_cols, sn + win_h_y1,
1598                                       dest + (1 - cas) * 4 + 2 * 4 * win_h_y0,
1599                                       1, 2 * 4, OPJ_TRUE);
1600     assert(ret);
1601     OPJ_UNUSED(ret);
1602 }
1603 
opj_dwt_decode_partial_1(OPJ_INT32 * a,OPJ_SIZE_T a_count,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)1604 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_SIZE_T a_count,
1605                                      OPJ_INT32 dn, OPJ_INT32 sn,
1606                                      OPJ_INT32 cas,
1607                                      OPJ_INT32 win_l_x0,
1608                                      OPJ_INT32 win_l_x1,
1609                                      OPJ_INT32 win_h_x0,
1610                                      OPJ_INT32 win_h_x1)
1611 {
1612     OPJ_INT32 i;
1613 
1614     if (!cas) {
1615         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
1616 
1617             /* Naive version is :
1618             for (i = win_l_x0; i < i_max; i++) {
1619                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1620             }
1621             for (i = win_h_x0; i < win_h_x1; i++) {
1622                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1623             }
1624             but the compiler doesn't manage to unroll it to avoid bound
1625             checking in OPJ_S_ and OPJ_D_ macros
1626             */
1627 
1628             i = win_l_x0;
1629             if (i < win_l_x1) {
1630                 OPJ_INT32 i_max;
1631 
1632                 /* Left-most case */
1633                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1634                 i ++;
1635 
1636                 i_max = win_l_x1;
1637                 if (i_max > dn) {
1638                     i_max = dn;
1639                 }
1640                 for (; i < i_max; i++) {
1641                     /* No bound checking */
1642                     OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2;
1643                 }
1644                 for (; i < win_l_x1; i++) {
1645                     /* Right-most case */
1646                     OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1647                 }
1648             }
1649 
1650             i = win_h_x0;
1651             if (i < win_h_x1) {
1652                 OPJ_INT32 i_max = win_h_x1;
1653                 if (i_max >= sn) {
1654                     i_max = sn - 1;
1655                 }
1656                 for (; i < i_max; i++) {
1657                     /* No bound checking */
1658                     OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1;
1659                 }
1660                 for (; i < win_h_x1; i++) {
1661                     /* Right-most case */
1662                     OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1663                 }
1664             }
1665         }
1666     } else {
1667         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
1668             OPJ_S(0) /= 2;
1669         } else {
1670             for (i = win_l_x0; i < win_l_x1; i++) {
1671                 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2;
1672             }
1673             for (i = win_h_x0; i < win_h_x1; i++) {
1674                 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1;
1675             }
1676         }
1677     }
1678 }
1679 
1680 #define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+off]
1681 #define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+off]
1682 #define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
1683 #define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
1684 #define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
1685 #define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
1686 
opj_dwt_decode_partial_1_parallel(OPJ_INT32 * a,OPJ_UINT32 nb_cols,OPJ_INT32 dn,OPJ_INT32 sn,OPJ_INT32 cas,OPJ_INT32 win_l_x0,OPJ_INT32 win_l_x1,OPJ_INT32 win_h_x0,OPJ_INT32 win_h_x1)1687 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a,
1688         OPJ_UINT32 nb_cols,
1689         OPJ_INT32 dn, OPJ_INT32 sn,
1690         OPJ_INT32 cas,
1691         OPJ_INT32 win_l_x0,
1692         OPJ_INT32 win_l_x1,
1693         OPJ_INT32 win_h_x0,
1694         OPJ_INT32 win_h_x1)
1695 {
1696     OPJ_INT32 i;
1697     OPJ_UINT32 off;
1698 
1699     (void)nb_cols;
1700 
1701     if (!cas) {
1702         if ((dn > 0) || (sn > 1)) { /* NEW :  CASE ONE ELEMENT */
1703 
1704             /* Naive version is :
1705             for (i = win_l_x0; i < i_max; i++) {
1706                 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2;
1707             }
1708             for (i = win_h_x0; i < win_h_x1; i++) {
1709                 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1;
1710             }
1711             but the compiler doesn't manage to unroll it to avoid bound
1712             checking in OPJ_S_ and OPJ_D_ macros
1713             */
1714 
1715             i = win_l_x0;
1716             if (i < win_l_x1) {
1717                 OPJ_INT32 i_max;
1718 
1719                 /* Left-most case */
1720                 for (off = 0; off < 4; off++) {
1721                     OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
1722                 }
1723                 i ++;
1724 
1725                 i_max = win_l_x1;
1726                 if (i_max > dn) {
1727                     i_max = dn;
1728                 }
1729 
1730 #ifdef __SSE2__
1731                 if (i + 1 < i_max) {
1732                     const __m128i two = _mm_set1_epi32(2);
1733                     __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8));
1734                     for (; i + 1 < i_max; i += 2) {
1735                         /* No bound checking */
1736                         __m128i S = _mm_load_si128((__m128i * const)(a + i * 8));
1737                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
1738                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
1739                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
1740                         S = _mm_sub_epi32(S,
1741                                           _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2));
1742                         S1 = _mm_sub_epi32(S1,
1743                                            _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2));
1744                         _mm_store_si128((__m128i*)(a + i * 8), S);
1745                         _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1);
1746                         Dm1 = D1;
1747                     }
1748                 }
1749 #endif
1750 
1751                 for (; i < i_max; i++) {
1752                     /* No bound checking */
1753                     for (off = 0; off < 4; off++) {
1754                         OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2;
1755                     }
1756                 }
1757                 for (; i < win_l_x1; i++) {
1758                     /* Right-most case */
1759                     for (off = 0; off < 4; off++) {
1760                         OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2;
1761                     }
1762                 }
1763             }
1764 
1765             i = win_h_x0;
1766             if (i < win_h_x1) {
1767                 OPJ_INT32 i_max = win_h_x1;
1768                 if (i_max >= sn) {
1769                     i_max = sn - 1;
1770                 }
1771 
1772 #ifdef __SSE2__
1773                 if (i + 1 < i_max) {
1774                     __m128i S =  _mm_load_si128((__m128i * const)(a + i * 8));
1775                     for (; i + 1 < i_max; i += 2) {
1776                         /* No bound checking */
1777                         __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8));
1778                         __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8));
1779                         __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8));
1780                         __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8));
1781                         D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1));
1782                         D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1));
1783                         _mm_store_si128((__m128i*)(a + 4 + i * 8), D);
1784                         _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1);
1785                         S = S2;
1786                     }
1787                 }
1788 #endif
1789 
1790                 for (; i < i_max; i++) {
1791                     /* No bound checking */
1792                     for (off = 0; off < 4; off++) {
1793                         OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1;
1794                     }
1795                 }
1796                 for (; i < win_h_x1; i++) {
1797                     /* Right-most case */
1798                     for (off = 0; off < 4; off++) {
1799                         OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1;
1800                     }
1801                 }
1802             }
1803         }
1804     } else {
1805         if (!sn  && dn == 1) {        /* NEW :  CASE ONE ELEMENT */
1806             for (off = 0; off < 4; off++) {
1807                 OPJ_S_off(0, off) /= 2;
1808             }
1809         } else {
1810             for (i = win_l_x0; i < win_l_x1; i++) {
1811                 for (off = 0; off < 4; off++) {
1812                     OPJ_D_off(i, off) -= (OPJ_SS__off(i, off) + OPJ_SS__off(i + 1, off) + 2) >> 2;
1813                 }
1814             }
1815             for (i = win_h_x0; i < win_h_x1; i++) {
1816                 for (off = 0; off < 4; off++) {
1817                     OPJ_S_off(i, off) += (OPJ_DD__off(i, off) + OPJ_DD__off(i - 1, off)) >> 1;
1818                 }
1819             }
1820         }
1821     }
1822 }
1823 
opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 resno,OPJ_UINT32 bandno,OPJ_UINT32 tcx0,OPJ_UINT32 tcy0,OPJ_UINT32 tcx1,OPJ_UINT32 tcy1,OPJ_UINT32 * tbx0,OPJ_UINT32 * tby0,OPJ_UINT32 * tbx1,OPJ_UINT32 * tby1)1824 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec,
1825         OPJ_UINT32 resno,
1826         OPJ_UINT32 bandno,
1827         OPJ_UINT32 tcx0,
1828         OPJ_UINT32 tcy0,
1829         OPJ_UINT32 tcx1,
1830         OPJ_UINT32 tcy1,
1831         OPJ_UINT32* tbx0,
1832         OPJ_UINT32* tby0,
1833         OPJ_UINT32* tbx1,
1834         OPJ_UINT32* tby1)
1835 {
1836     /* Compute number of decomposition for this band. See table F-1 */
1837     OPJ_UINT32 nb = (resno == 0) ?
1838                     tilec->numresolutions - 1 :
1839                     tilec->numresolutions - resno;
1840     /* Map above tile-based coordinates to sub-band-based coordinates per */
1841     /* equation B-15 of the standard */
1842     OPJ_UINT32 x0b = bandno & 1;
1843     OPJ_UINT32 y0b = bandno >> 1;
1844     if (tbx0) {
1845         *tbx0 = (nb == 0) ? tcx0 :
1846                 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 :
1847                 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb);
1848     }
1849     if (tby0) {
1850         *tby0 = (nb == 0) ? tcy0 :
1851                 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 :
1852                 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb);
1853     }
1854     if (tbx1) {
1855         *tbx1 = (nb == 0) ? tcx1 :
1856                 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 :
1857                 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb);
1858     }
1859     if (tby1) {
1860         *tby1 = (nb == 0) ? tcy1 :
1861                 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 :
1862                 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb);
1863     }
1864 }
1865 
opj_dwt_segment_grow(OPJ_UINT32 filter_width,OPJ_UINT32 max_size,OPJ_UINT32 * start,OPJ_UINT32 * end)1866 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width,
1867                                  OPJ_UINT32 max_size,
1868                                  OPJ_UINT32* start,
1869                                  OPJ_UINT32* end)
1870 {
1871     *start = opj_uint_subs(*start, filter_width);
1872     *end = opj_uint_adds(*end, filter_width);
1873     *end = opj_uint_min(*end, max_size);
1874 }
1875 
1876 
opj_dwt_init_sparse_array(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1877 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array(
1878     opj_tcd_tilecomp_t* tilec,
1879     OPJ_UINT32 numres)
1880 {
1881     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
1882     OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0);
1883     OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0);
1884     OPJ_UINT32 resno, bandno, precno, cblkno;
1885     opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create(
1886                                        w, h, opj_uint_min(w, 64), opj_uint_min(h, 64));
1887     if (sa == NULL) {
1888         return NULL;
1889     }
1890 
1891     for (resno = 0; resno < numres; ++resno) {
1892         opj_tcd_resolution_t* res = &tilec->resolutions[resno];
1893 
1894         for (bandno = 0; bandno < res->numbands; ++bandno) {
1895             opj_tcd_band_t* band = &res->bands[bandno];
1896 
1897             for (precno = 0; precno < res->pw * res->ph; ++precno) {
1898                 opj_tcd_precinct_t* precinct = &band->precincts[precno];
1899                 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) {
1900                     opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno];
1901                     if (cblk->decoded_data != NULL) {
1902                         OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0);
1903                         OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0);
1904                         OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0);
1905                         OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0);
1906 
1907                         if (band->bandno & 1) {
1908                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
1909                             x += (OPJ_UINT32)(pres->x1 - pres->x0);
1910                         }
1911                         if (band->bandno & 2) {
1912                             opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1];
1913                             y += (OPJ_UINT32)(pres->y1 - pres->y0);
1914                         }
1915 
1916                         if (!opj_sparse_array_int32_write(sa, x, y,
1917                                                           x + cblk_w, y + cblk_h,
1918                                                           cblk->decoded_data,
1919                                                           1, cblk_w, OPJ_TRUE)) {
1920                             opj_sparse_array_int32_free(sa);
1921                             return NULL;
1922                         }
1923                     }
1924                 }
1925             }
1926         }
1927     }
1928 
1929     return sa;
1930 }
1931 
1932 
opj_dwt_decode_partial_tile(opj_tcd_tilecomp_t * tilec,OPJ_UINT32 numres)1933 static OPJ_BOOL opj_dwt_decode_partial_tile(
1934     opj_tcd_tilecomp_t* tilec,
1935     OPJ_UINT32 numres)
1936 {
1937     opj_sparse_array_int32_t* sa;
1938     opj_dwt_t h;
1939     opj_dwt_t v;
1940     OPJ_UINT32 resno;
1941     /* This value matches the maximum left/right extension given in tables */
1942     /* F.2 and F.3 of the standard. */
1943     const OPJ_UINT32 filter_width = 2U;
1944 
1945     opj_tcd_resolution_t* tr = tilec->resolutions;
1946     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
1947 
1948     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
1949                                  tr->x0);  /* width of the resolution level computed */
1950     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
1951                                  tr->y0);  /* height of the resolution level computed */
1952 
1953     OPJ_SIZE_T h_mem_size;
1954 
1955     /* Compute the intersection of the area of interest, expressed in tile coordinates */
1956     /* with the tile coordinates */
1957     OPJ_UINT32 win_tcx0 = tilec->win_x0;
1958     OPJ_UINT32 win_tcy0 = tilec->win_y0;
1959     OPJ_UINT32 win_tcx1 = tilec->win_x1;
1960     OPJ_UINT32 win_tcy1 = tilec->win_y1;
1961 
1962     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
1963         return OPJ_TRUE;
1964     }
1965 
1966     sa = opj_dwt_init_sparse_array(tilec, numres);
1967     if (sa == NULL) {
1968         return OPJ_FALSE;
1969     }
1970 
1971     if (numres == 1U) {
1972         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
1973                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
1974                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
1975                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
1976                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
1977                        tilec->data_win,
1978                        1, tr_max->win_x1 - tr_max->win_x0,
1979                        OPJ_TRUE);
1980         assert(ret);
1981         OPJ_UNUSED(ret);
1982         opj_sparse_array_int32_free(sa);
1983         return OPJ_TRUE;
1984     }
1985     h.mem_count = opj_dwt_max_resolution(tr, numres);
1986     /* overflow check */
1987     /* in vertical pass, we process 4 columns at a time */
1988     if (h.mem_count > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) {
1989         /* FIXME event manager error callback */
1990         opj_sparse_array_int32_free(sa);
1991         return OPJ_FALSE;
1992     }
1993 
1994     h_mem_size = h.mem_count * 4 * sizeof(OPJ_INT32);
1995     h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size);
1996     if (! h.mem) {
1997         /* FIXME event manager error callback */
1998         opj_sparse_array_int32_free(sa);
1999         return OPJ_FALSE;
2000     }
2001 
2002     v.mem_count = h.mem_count;
2003     v.mem = h.mem;
2004 
2005     for (resno = 1; resno < numres; resno ++) {
2006         OPJ_UINT32 i, j;
2007         /* Window of interest subband-based coordinates */
2008         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2009         OPJ_UINT32 win_hl_x0, win_hl_x1;
2010         OPJ_UINT32 win_lh_y0, win_lh_y1;
2011         /* Window of interest tile-resolution-based coordinates */
2012         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2013         /* Tile-resolution subband-based coordinates */
2014         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2015 
2016         ++tr;
2017 
2018         h.sn = (OPJ_INT32)rw;
2019         v.sn = (OPJ_INT32)rh;
2020 
2021         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2022         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2023 
2024         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2025         h.cas = tr->x0 % 2;
2026 
2027         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2028         v.cas = tr->y0 % 2;
2029 
2030         /* Get the subband coordinates for the window of interest */
2031         /* LL band */
2032         opj_dwt_get_band_coordinates(tilec, resno, 0,
2033                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2034                                      &win_ll_x0, &win_ll_y0,
2035                                      &win_ll_x1, &win_ll_y1);
2036 
2037         /* HL band */
2038         opj_dwt_get_band_coordinates(tilec, resno, 1,
2039                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2040                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
2041 
2042         /* LH band */
2043         opj_dwt_get_band_coordinates(tilec, resno, 2,
2044                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2045                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
2046 
2047         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2048         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2049         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2050         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2051         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2052 
2053         /* Subtract the origin of the bands for this tile, to the subwindow */
2054         /* of interest band coordinates, so as to get them relative to the */
2055         /* tile */
2056         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2057         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2058         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2059         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2060         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2061         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2062         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2063         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2064 
2065         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2066         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2067 
2068         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2069         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2070 
2071         /* Compute the tile-resolution-based coordinates for the window of interest */
2072         if (h.cas == 0) {
2073             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2074             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2075         } else {
2076             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2077             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2078         }
2079 
2080         if (v.cas == 0) {
2081             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2082             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2083         } else {
2084             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2085             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2086         }
2087 
2088         for (j = 0; j < rh; ++j) {
2089             if ((j >= win_ll_y0 && j < win_ll_y1) ||
2090                     (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2091 
2092                 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */
2093                 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */
2094                 /* on opj_decompress -i  ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */
2095                 /* This is less extreme than memsetting the whole buffer to 0 */
2096                 /* although we could potentially do better with better handling of edge conditions */
2097                 if (win_tr_x1 >= 1 && win_tr_x1 < rw) {
2098                     h.mem[win_tr_x1 - 1] = 0;
2099                 }
2100                 if (win_tr_x1 < rw) {
2101                     h.mem[win_tr_x1] = 0;
2102                 }
2103 
2104                 opj_dwt_interleave_partial_h(h.mem,
2105                                              h.cas,
2106                                              sa,
2107                                              j,
2108                                              (OPJ_UINT32)h.sn,
2109                                              win_ll_x0,
2110                                              win_ll_x1,
2111                                              win_hl_x0,
2112                                              win_hl_x1);
2113                 opj_dwt_decode_partial_1(h.mem, h.mem_count, h.dn, h.sn, h.cas,
2114                                          (OPJ_INT32)win_ll_x0,
2115                                          (OPJ_INT32)win_ll_x1,
2116                                          (OPJ_INT32)win_hl_x0,
2117                                          (OPJ_INT32)win_hl_x1);
2118                 if (!opj_sparse_array_int32_write(sa,
2119                                                   win_tr_x0, j,
2120                                                   win_tr_x1, j + 1,
2121                                                   h.mem + win_tr_x0,
2122                                                   1, 0, OPJ_TRUE)) {
2123                     /* FIXME event manager error callback */
2124                     opj_sparse_array_int32_free(sa);
2125                     opj_aligned_free(h.mem);
2126                     return OPJ_FALSE;
2127                 }
2128             }
2129         }
2130 
2131         for (i = win_tr_x0; i < win_tr_x1;) {
2132             OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i);
2133             opj_dwt_interleave_partial_v(v.mem,
2134                                          v.cas,
2135                                          sa,
2136                                          i,
2137                                          nb_cols,
2138                                          (OPJ_UINT32)v.sn,
2139                                          win_ll_y0,
2140                                          win_ll_y1,
2141                                          win_lh_y0,
2142                                          win_lh_y1);
2143             opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas,
2144                                               (OPJ_INT32)win_ll_y0,
2145                                               (OPJ_INT32)win_ll_y1,
2146                                               (OPJ_INT32)win_lh_y0,
2147                                               (OPJ_INT32)win_lh_y1);
2148             if (!opj_sparse_array_int32_write(sa,
2149                                               i, win_tr_y0,
2150                                               i + nb_cols, win_tr_y1,
2151                                               v.mem + 4 * win_tr_y0,
2152                                               1, 4, OPJ_TRUE)) {
2153                 /* FIXME event manager error callback */
2154                 opj_sparse_array_int32_free(sa);
2155                 opj_aligned_free(h.mem);
2156                 return OPJ_FALSE;
2157             }
2158 
2159             i += nb_cols;
2160         }
2161     }
2162     opj_aligned_free(h.mem);
2163 
2164     {
2165         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2166                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2167                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2168                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2169                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2170                        tilec->data_win,
2171                        1, tr_max->win_x1 - tr_max->win_x0,
2172                        OPJ_TRUE);
2173         assert(ret);
2174         OPJ_UNUSED(ret);
2175     }
2176     opj_sparse_array_int32_free(sa);
2177     return OPJ_TRUE;
2178 }
2179 
opj_v4dwt_interleave_h(opj_v4dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 remaining_height)2180 static void opj_v4dwt_interleave_h(opj_v4dwt_t* OPJ_RESTRICT dwt,
2181                                    OPJ_FLOAT32* OPJ_RESTRICT a,
2182                                    OPJ_UINT32 width,
2183                                    OPJ_UINT32 remaining_height)
2184 {
2185     OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas);
2186     OPJ_UINT32 i, k;
2187     OPJ_UINT32 x0 = dwt->win_l_x0;
2188     OPJ_UINT32 x1 = dwt->win_l_x1;
2189 
2190     for (k = 0; k < 2; ++k) {
2191         if (remaining_height >= 4 && ((OPJ_SIZE_T) a & 0x0f) == 0 &&
2192                 ((OPJ_SIZE_T) bi & 0x0f) == 0 && (width & 0x0f) == 0) {
2193             /* Fast code path */
2194             for (i = x0; i < x1; ++i) {
2195                 OPJ_UINT32 j = i;
2196                 bi[i * 8    ] = a[j];
2197                 j += width;
2198                 bi[i * 8 + 1] = a[j];
2199                 j += width;
2200                 bi[i * 8 + 2] = a[j];
2201                 j += width;
2202                 bi[i * 8 + 3] = a[j];
2203             }
2204         } else {
2205             /* Slow code path */
2206             for (i = x0; i < x1; ++i) {
2207                 OPJ_UINT32 j = i;
2208                 bi[i * 8    ] = a[j];
2209                 j += width;
2210                 if (remaining_height == 1) {
2211                     continue;
2212                 }
2213                 bi[i * 8 + 1] = a[j];
2214                 j += width;
2215                 if (remaining_height == 2) {
2216                     continue;
2217                 }
2218                 bi[i * 8 + 2] = a[j];
2219                 j += width;
2220                 if (remaining_height == 3) {
2221                     continue;
2222                 }
2223                 bi[i * 8 + 3] = a[j]; /* This one*/
2224             }
2225         }
2226 
2227         bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas);
2228         a += dwt->sn;
2229         x0 = dwt->win_h_x0;
2230         x1 = dwt->win_h_x1;
2231     }
2232 }
2233 
opj_v4dwt_interleave_partial_h(opj_v4dwt_t * dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_line,OPJ_UINT32 remaining_height)2234 static void opj_v4dwt_interleave_partial_h(opj_v4dwt_t* dwt,
2235         opj_sparse_array_int32_t* sa,
2236         OPJ_UINT32 sa_line,
2237         OPJ_UINT32 remaining_height)
2238 {
2239     OPJ_UINT32 i;
2240     for (i = 0; i < remaining_height; i++) {
2241         OPJ_BOOL ret;
2242         ret = opj_sparse_array_int32_read(sa,
2243                                           dwt->win_l_x0, sa_line + i,
2244                                           dwt->win_l_x1, sa_line + i + 1,
2245                                           /* Nasty cast from float* to int32* */
2246                                           (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i,
2247                                           8, 0, OPJ_TRUE);
2248         assert(ret);
2249         ret = opj_sparse_array_int32_read(sa,
2250                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i,
2251                                           (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1,
2252                                           /* Nasty cast from float* to int32* */
2253                                           (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i,
2254                                           8, 0, OPJ_TRUE);
2255         assert(ret);
2256         OPJ_UNUSED(ret);
2257     }
2258 }
2259 
opj_v4dwt_interleave_v(opj_v4dwt_t * OPJ_RESTRICT dwt,OPJ_FLOAT32 * OPJ_RESTRICT a,OPJ_UINT32 width,OPJ_UINT32 nb_elts_read)2260 static void opj_v4dwt_interleave_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2261                                    OPJ_FLOAT32* OPJ_RESTRICT a,
2262                                    OPJ_UINT32 width,
2263                                    OPJ_UINT32 nb_elts_read)
2264 {
2265     opj_v4_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas;
2266     OPJ_UINT32 i;
2267 
2268     for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) {
2269         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2270                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2271     }
2272 
2273     a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width;
2274     bi = dwt->wavelet + 1 - dwt->cas;
2275 
2276     for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) {
2277         memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width],
2278                (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32));
2279     }
2280 }
2281 
opj_v4dwt_interleave_partial_v(opj_v4dwt_t * OPJ_RESTRICT dwt,opj_sparse_array_int32_t * sa,OPJ_UINT32 sa_col,OPJ_UINT32 nb_elts_read)2282 static void opj_v4dwt_interleave_partial_v(opj_v4dwt_t* OPJ_RESTRICT dwt,
2283         opj_sparse_array_int32_t* sa,
2284         OPJ_UINT32 sa_col,
2285         OPJ_UINT32 nb_elts_read)
2286 {
2287     OPJ_BOOL ret;
2288     ret = opj_sparse_array_int32_read(sa,
2289                                       sa_col, dwt->win_l_x0,
2290                                       sa_col + nb_elts_read, dwt->win_l_x1,
2291                                       (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0),
2292                                       1, 8, OPJ_TRUE);
2293     assert(ret);
2294     ret = opj_sparse_array_int32_read(sa,
2295                                       sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0,
2296                                       sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1,
2297                                       (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0),
2298                                       1, 8, OPJ_TRUE);
2299     assert(ret);
2300     OPJ_UNUSED(ret);
2301 }
2302 
2303 #ifdef __SSE__
2304 
opj_v4dwt_decode_step1_sse(opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const __m128 c)2305 static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
2306                                        OPJ_UINT32 start,
2307                                        OPJ_UINT32 end,
2308                                        const __m128 c)
2309 {
2310     __m128* OPJ_RESTRICT vw = (__m128*) w;
2311     OPJ_UINT32 i;
2312     /* 4x unrolled loop */
2313     vw += 2 * start;
2314     for (i = start; i + 3 < end; i += 4, vw += 8) {
2315         __m128 xmm0 = _mm_mul_ps(vw[0], c);
2316         __m128 xmm2 = _mm_mul_ps(vw[2], c);
2317         __m128 xmm4 = _mm_mul_ps(vw[4], c);
2318         __m128 xmm6 = _mm_mul_ps(vw[6], c);
2319         vw[0] = xmm0;
2320         vw[2] = xmm2;
2321         vw[4] = xmm4;
2322         vw[6] = xmm6;
2323     }
2324     for (; i < end; ++i, vw += 2) {
2325         vw[0] = _mm_mul_ps(vw[0], c);
2326     }
2327 }
2328 
opj_v4dwt_decode_step2_sse(opj_v4_t * l,opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,__m128 c)2329 static void opj_v4dwt_decode_step2_sse(opj_v4_t* l, opj_v4_t* w,
2330                                        OPJ_UINT32 start,
2331                                        OPJ_UINT32 end,
2332                                        OPJ_UINT32 m,
2333                                        __m128 c)
2334 {
2335     __m128* OPJ_RESTRICT vl = (__m128*) l;
2336     __m128* OPJ_RESTRICT vw = (__m128*) w;
2337     OPJ_UINT32 i;
2338     OPJ_UINT32 imax = opj_uint_min(end, m);
2339     __m128 tmp1, tmp2, tmp3;
2340     if (start == 0) {
2341         tmp1 = vl[0];
2342     } else {
2343         vw += start * 2;
2344         tmp1 = vw[-3];
2345     }
2346 
2347     i = start;
2348 
2349     /* 4x loop unrolling */
2350     for (; i + 3 < imax; i += 4) {
2351         __m128 tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
2352         tmp2 = vw[-1];
2353         tmp3 = vw[ 0];
2354         tmp4 = vw[ 1];
2355         tmp5 = vw[ 2];
2356         tmp6 = vw[ 3];
2357         tmp7 = vw[ 4];
2358         tmp8 = vw[ 5];
2359         tmp9 = vw[ 6];
2360         vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2361         vw[ 1] = _mm_add_ps(tmp4, _mm_mul_ps(_mm_add_ps(tmp3, tmp5), c));
2362         vw[ 3] = _mm_add_ps(tmp6, _mm_mul_ps(_mm_add_ps(tmp5, tmp7), c));
2363         vw[ 5] = _mm_add_ps(tmp8, _mm_mul_ps(_mm_add_ps(tmp7, tmp9), c));
2364         tmp1 = tmp9;
2365         vw += 8;
2366     }
2367 
2368     for (; i < imax; ++i) {
2369         tmp2 = vw[-1];
2370         tmp3 = vw[ 0];
2371         vw[-1] = _mm_add_ps(tmp2, _mm_mul_ps(_mm_add_ps(tmp1, tmp3), c));
2372         tmp1 = tmp3;
2373         vw += 2;
2374     }
2375     if (m < end) {
2376         assert(m + 1 == end);
2377         c = _mm_add_ps(c, c);
2378         c = _mm_mul_ps(c, vw[-2]);
2379         vw[-1] = _mm_add_ps(vw[-1], c);
2380     }
2381 }
2382 
2383 #else
2384 
opj_v4dwt_decode_step1(opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,const OPJ_FLOAT32 c)2385 static void opj_v4dwt_decode_step1(opj_v4_t* w,
2386                                    OPJ_UINT32 start,
2387                                    OPJ_UINT32 end,
2388                                    const OPJ_FLOAT32 c)
2389 {
2390     OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w;
2391     OPJ_UINT32 i;
2392     for (i = start; i < end; ++i) {
2393         OPJ_FLOAT32 tmp1 = fw[i * 8    ];
2394         OPJ_FLOAT32 tmp2 = fw[i * 8 + 1];
2395         OPJ_FLOAT32 tmp3 = fw[i * 8 + 2];
2396         OPJ_FLOAT32 tmp4 = fw[i * 8 + 3];
2397         fw[i * 8    ] = tmp1 * c;
2398         fw[i * 8 + 1] = tmp2 * c;
2399         fw[i * 8 + 2] = tmp3 * c;
2400         fw[i * 8 + 3] = tmp4 * c;
2401     }
2402 }
2403 
opj_v4dwt_decode_step2(opj_v4_t * l,opj_v4_t * w,OPJ_UINT32 start,OPJ_UINT32 end,OPJ_UINT32 m,OPJ_FLOAT32 c)2404 static void opj_v4dwt_decode_step2(opj_v4_t* l, opj_v4_t* w,
2405                                    OPJ_UINT32 start,
2406                                    OPJ_UINT32 end,
2407                                    OPJ_UINT32 m,
2408                                    OPJ_FLOAT32 c)
2409 {
2410     OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l;
2411     OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w;
2412     OPJ_UINT32 i;
2413     OPJ_UINT32 imax = opj_uint_min(end, m);
2414     if (start > 0) {
2415         fw += 8 * start;
2416         fl = fw - 8;
2417     }
2418     for (i = start; i < imax; ++i) {
2419         OPJ_FLOAT32 tmp1_1 = fl[0];
2420         OPJ_FLOAT32 tmp1_2 = fl[1];
2421         OPJ_FLOAT32 tmp1_3 = fl[2];
2422         OPJ_FLOAT32 tmp1_4 = fl[3];
2423         OPJ_FLOAT32 tmp2_1 = fw[-4];
2424         OPJ_FLOAT32 tmp2_2 = fw[-3];
2425         OPJ_FLOAT32 tmp2_3 = fw[-2];
2426         OPJ_FLOAT32 tmp2_4 = fw[-1];
2427         OPJ_FLOAT32 tmp3_1 = fw[0];
2428         OPJ_FLOAT32 tmp3_2 = fw[1];
2429         OPJ_FLOAT32 tmp3_3 = fw[2];
2430         OPJ_FLOAT32 tmp3_4 = fw[3];
2431         fw[-4] = tmp2_1 + ((tmp1_1 + tmp3_1) * c);
2432         fw[-3] = tmp2_2 + ((tmp1_2 + tmp3_2) * c);
2433         fw[-2] = tmp2_3 + ((tmp1_3 + tmp3_3) * c);
2434         fw[-1] = tmp2_4 + ((tmp1_4 + tmp3_4) * c);
2435         fl = fw;
2436         fw += 8;
2437     }
2438     if (m < end) {
2439         assert(m + 1 == end);
2440         c += c;
2441         fw[-4] = fw[-4] + fl[0] * c;
2442         fw[-3] = fw[-3] + fl[1] * c;
2443         fw[-2] = fw[-2] + fl[2] * c;
2444         fw[-1] = fw[-1] + fl[3] * c;
2445     }
2446 }
2447 
2448 #endif
2449 
2450 /* <summary>                             */
2451 /* Inverse 9-7 wavelet transform in 1-D. */
2452 /* </summary>                            */
opj_v4dwt_decode(opj_v4dwt_t * OPJ_RESTRICT dwt)2453 static void opj_v4dwt_decode(opj_v4dwt_t* OPJ_RESTRICT dwt)
2454 {
2455     OPJ_INT32 a, b;
2456     if (dwt->cas == 0) {
2457         if (!((dwt->dn > 0) || (dwt->sn > 1))) {
2458             return;
2459         }
2460         a = 0;
2461         b = 1;
2462     } else {
2463         if (!((dwt->sn > 0) || (dwt->dn > 1))) {
2464             return;
2465         }
2466         a = 1;
2467         b = 0;
2468     }
2469 #ifdef __SSE__
2470     opj_v4dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2471                                _mm_set1_ps(opj_K));
2472     opj_v4dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2473                                _mm_set1_ps(opj_c13318));
2474     opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2475                                dwt->win_l_x0, dwt->win_l_x1,
2476                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2477                                _mm_set1_ps(opj_dwt_delta));
2478     opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2479                                dwt->win_h_x0, dwt->win_h_x1,
2480                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2481                                _mm_set1_ps(opj_dwt_gamma));
2482     opj_v4dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1,
2483                                dwt->win_l_x0, dwt->win_l_x1,
2484                                (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2485                                _mm_set1_ps(opj_dwt_beta));
2486     opj_v4dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1,
2487                                dwt->win_h_x0, dwt->win_h_x1,
2488                                (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2489                                _mm_set1_ps(opj_dwt_alpha));
2490 #else
2491     opj_v4dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1,
2492                            opj_K);
2493     opj_v4dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1,
2494                            opj_c13318);
2495     opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2496                            dwt->win_l_x0, dwt->win_l_x1,
2497                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2498                            opj_dwt_delta);
2499     opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2500                            dwt->win_h_x0, dwt->win_h_x1,
2501                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2502                            opj_dwt_gamma);
2503     opj_v4dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1,
2504                            dwt->win_l_x0, dwt->win_l_x1,
2505                            (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a),
2506                            opj_dwt_beta);
2507     opj_v4dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1,
2508                            dwt->win_h_x0, dwt->win_h_x1,
2509                            (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b),
2510                            opj_dwt_alpha);
2511 #endif
2512 }
2513 
2514 
2515 /* <summary>                             */
2516 /* Inverse 9-7 wavelet transform in 2-D. */
2517 /* </summary>                            */
2518 static
opj_dwt_decode_tile_97(opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2519 OPJ_BOOL opj_dwt_decode_tile_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2520                                 OPJ_UINT32 numres)
2521 {
2522     opj_v4dwt_t h;
2523     opj_v4dwt_t v;
2524 
2525     opj_tcd_resolution_t* res = tilec->resolutions;
2526 
2527     OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 -
2528                                  res->x0);    /* width of the resolution level computed */
2529     OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 -
2530                                  res->y0);    /* height of the resolution level computed */
2531 
2532     OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions -
2533                                                                1].x1 -
2534                                 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0);
2535 
2536     OPJ_SIZE_T l_data_size;
2537 
2538     l_data_size = opj_dwt_max_resolution(res, numres);
2539     /* overflow check */
2540     if (l_data_size > (SIZE_MAX - 5U)) {
2541         /* FIXME event manager error callback */
2542         return OPJ_FALSE;
2543     }
2544     l_data_size += 5U;
2545     /* overflow check */
2546     if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2547         /* FIXME event manager error callback */
2548         return OPJ_FALSE;
2549     }
2550     h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
2551     if (!h.wavelet) {
2552         /* FIXME event manager error callback */
2553         return OPJ_FALSE;
2554     }
2555     v.wavelet = h.wavelet;
2556 
2557     while (--numres) {
2558         OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data;
2559         OPJ_UINT32 j;
2560 
2561         h.sn = (OPJ_INT32)rw;
2562         v.sn = (OPJ_INT32)rh;
2563 
2564         ++res;
2565 
2566         rw = (OPJ_UINT32)(res->x1 -
2567                           res->x0);   /* width of the resolution level computed */
2568         rh = (OPJ_UINT32)(res->y1 -
2569                           res->y0);   /* height of the resolution level computed */
2570 
2571         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2572         h.cas = res->x0 % 2;
2573 
2574         h.win_l_x0 = 0;
2575         h.win_l_x1 = (OPJ_UINT32)h.sn;
2576         h.win_h_x0 = 0;
2577         h.win_h_x1 = (OPJ_UINT32)h.dn;
2578         for (j = 0; j + 3 < rh; j += 4) {
2579             OPJ_UINT32 k;
2580             opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2581             opj_v4dwt_decode(&h);
2582 
2583             for (k = 0; k < rw; k++) {
2584                 aj[k      ] = h.wavelet[k].f[0];
2585                 aj[k + (OPJ_SIZE_T)w  ] = h.wavelet[k].f[1];
2586                 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
2587                 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3];
2588             }
2589 
2590             aj += w * 4;
2591         }
2592 
2593         if (j < rh) {
2594             OPJ_UINT32 k;
2595             opj_v4dwt_interleave_h(&h, aj, w, rh - j);
2596             opj_v4dwt_decode(&h);
2597             for (k = 0; k < rw; k++) {
2598                 switch (rh - j) {
2599                 case 3:
2600                     aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2];
2601                 /* FALLTHRU */
2602                 case 2:
2603                     aj[k + (OPJ_SIZE_T)w  ] = h.wavelet[k].f[1];
2604                 /* FALLTHRU */
2605                 case 1:
2606                     aj[k] = h.wavelet[k].f[0];
2607                 }
2608             }
2609         }
2610 
2611         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2612         v.cas = res->y0 % 2;
2613         v.win_l_x0 = 0;
2614         v.win_l_x1 = (OPJ_UINT32)v.sn;
2615         v.win_h_x0 = 0;
2616         v.win_h_x1 = (OPJ_UINT32)v.dn;
2617 
2618         aj = (OPJ_FLOAT32*) tilec->data;
2619         for (j = rw; j > 3; j -= 4) {
2620             OPJ_UINT32 k;
2621 
2622             opj_v4dwt_interleave_v(&v, aj, w, 4);
2623             opj_v4dwt_decode(&v);
2624 
2625             for (k = 0; k < rh; ++k) {
2626                 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], 4 * sizeof(OPJ_FLOAT32));
2627             }
2628             aj += 4;
2629         }
2630 
2631         if (rw & 0x03) {
2632             OPJ_UINT32 k;
2633 
2634             j = rw & 0x03;
2635 
2636             opj_v4dwt_interleave_v(&v, aj, w, j);
2637             opj_v4dwt_decode(&v);
2638 
2639             for (k = 0; k < rh; ++k) {
2640                 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k],
2641                        (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32));
2642             }
2643         }
2644     }
2645 
2646     opj_aligned_free(h.wavelet);
2647     return OPJ_TRUE;
2648 }
2649 
2650 static
opj_dwt_decode_partial_97(opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2651 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2652                                    OPJ_UINT32 numres)
2653 {
2654     opj_sparse_array_int32_t* sa;
2655     opj_v4dwt_t h;
2656     opj_v4dwt_t v;
2657     OPJ_UINT32 resno;
2658     /* This value matches the maximum left/right extension given in tables */
2659     /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */
2660     /* we currently use 3. */
2661     const OPJ_UINT32 filter_width = 4U;
2662 
2663     opj_tcd_resolution_t* tr = tilec->resolutions;
2664     opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]);
2665 
2666     OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 -
2667                                  tr->x0);    /* width of the resolution level computed */
2668     OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 -
2669                                  tr->y0);    /* height of the resolution level computed */
2670 
2671     OPJ_SIZE_T l_data_size;
2672 
2673     /* Compute the intersection of the area of interest, expressed in tile coordinates */
2674     /* with the tile coordinates */
2675     OPJ_UINT32 win_tcx0 = tilec->win_x0;
2676     OPJ_UINT32 win_tcy0 = tilec->win_y0;
2677     OPJ_UINT32 win_tcx1 = tilec->win_x1;
2678     OPJ_UINT32 win_tcy1 = tilec->win_y1;
2679 
2680     if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) {
2681         return OPJ_TRUE;
2682     }
2683 
2684     sa = opj_dwt_init_sparse_array(tilec, numres);
2685     if (sa == NULL) {
2686         return OPJ_FALSE;
2687     }
2688 
2689     if (numres == 1U) {
2690         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2691                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2692                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2693                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2694                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2695                        tilec->data_win,
2696                        1, tr_max->win_x1 - tr_max->win_x0,
2697                        OPJ_TRUE);
2698         assert(ret);
2699         OPJ_UNUSED(ret);
2700         opj_sparse_array_int32_free(sa);
2701         return OPJ_TRUE;
2702     }
2703 
2704     l_data_size = opj_dwt_max_resolution(tr, numres);
2705     /* overflow check */
2706     if (l_data_size > (SIZE_MAX - 5U)) {
2707         /* FIXME event manager error callback */
2708         opj_sparse_array_int32_free(sa);
2709         return OPJ_FALSE;
2710     }
2711     l_data_size += 5U;
2712     /* overflow check */
2713     if (l_data_size > (SIZE_MAX / sizeof(opj_v4_t))) {
2714         /* FIXME event manager error callback */
2715         opj_sparse_array_int32_free(sa);
2716         return OPJ_FALSE;
2717     }
2718     h.wavelet = (opj_v4_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v4_t));
2719     if (!h.wavelet) {
2720         /* FIXME event manager error callback */
2721         opj_sparse_array_int32_free(sa);
2722         return OPJ_FALSE;
2723     }
2724     v.wavelet = h.wavelet;
2725 
2726     for (resno = 1; resno < numres; resno ++) {
2727         OPJ_UINT32 j;
2728         /* Window of interest subband-based coordinates */
2729         OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1;
2730         OPJ_UINT32 win_hl_x0, win_hl_x1;
2731         OPJ_UINT32 win_lh_y0, win_lh_y1;
2732         /* Window of interest tile-resolution-based coordinates */
2733         OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1;
2734         /* Tile-resolution subband-based coordinates */
2735         OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0;
2736 
2737         ++tr;
2738 
2739         h.sn = (OPJ_INT32)rw;
2740         v.sn = (OPJ_INT32)rh;
2741 
2742         rw = (OPJ_UINT32)(tr->x1 - tr->x0);
2743         rh = (OPJ_UINT32)(tr->y1 - tr->y0);
2744 
2745         h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn);
2746         h.cas = tr->x0 % 2;
2747 
2748         v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn);
2749         v.cas = tr->y0 % 2;
2750 
2751         /* Get the subband coordinates for the window of interest */
2752         /* LL band */
2753         opj_dwt_get_band_coordinates(tilec, resno, 0,
2754                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2755                                      &win_ll_x0, &win_ll_y0,
2756                                      &win_ll_x1, &win_ll_y1);
2757 
2758         /* HL band */
2759         opj_dwt_get_band_coordinates(tilec, resno, 1,
2760                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2761                                      &win_hl_x0, NULL, &win_hl_x1, NULL);
2762 
2763         /* LH band */
2764         opj_dwt_get_band_coordinates(tilec, resno, 2,
2765                                      win_tcx0, win_tcy0, win_tcx1, win_tcy1,
2766                                      NULL, &win_lh_y0, NULL, &win_lh_y1);
2767 
2768         /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */
2769         tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0;
2770         tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0;
2771         tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0;
2772         tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0;
2773 
2774         /* Subtract the origin of the bands for this tile, to the subwindow */
2775         /* of interest band coordinates, so as to get them relative to the */
2776         /* tile */
2777         win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0);
2778         win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0);
2779         win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0);
2780         win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0);
2781         win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0);
2782         win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0);
2783         win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0);
2784         win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0);
2785 
2786         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1);
2787         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1);
2788 
2789         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1);
2790         opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1);
2791 
2792         /* Compute the tile-resolution-based coordinates for the window of interest */
2793         if (h.cas == 0) {
2794             win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1);
2795             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw);
2796         } else {
2797             win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1);
2798             win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw);
2799         }
2800 
2801         if (v.cas == 0) {
2802             win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1);
2803             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh);
2804         } else {
2805             win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1);
2806             win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh);
2807         }
2808 
2809         h.win_l_x0 = win_ll_x0;
2810         h.win_l_x1 = win_ll_x1;
2811         h.win_h_x0 = win_hl_x0;
2812         h.win_h_x1 = win_hl_x1;
2813         for (j = 0; j + 3 < rh; j += 4) {
2814             if ((j + 3 >= win_ll_y0 && j < win_ll_y1) ||
2815                     (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2816                      j < win_lh_y1 + (OPJ_UINT32)v.sn)) {
2817                 opj_v4dwt_interleave_partial_h(&h, sa, j, opj_uint_min(4U, rh - j));
2818                 opj_v4dwt_decode(&h);
2819                 if (!opj_sparse_array_int32_write(sa,
2820                                                   win_tr_x0, j,
2821                                                   win_tr_x1, j + 4,
2822                                                   (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
2823                                                   4, 1, OPJ_TRUE)) {
2824                     /* FIXME event manager error callback */
2825                     opj_sparse_array_int32_free(sa);
2826                     opj_aligned_free(h.wavelet);
2827                     return OPJ_FALSE;
2828                 }
2829             }
2830         }
2831 
2832         if (j < rh &&
2833                 ((j + 3 >= win_ll_y0 && j < win_ll_y1) ||
2834                  (j + 3 >= win_lh_y0 + (OPJ_UINT32)v.sn &&
2835                   j < win_lh_y1 + (OPJ_UINT32)v.sn))) {
2836             opj_v4dwt_interleave_partial_h(&h, sa, j, rh - j);
2837             opj_v4dwt_decode(&h);
2838             if (!opj_sparse_array_int32_write(sa,
2839                                               win_tr_x0, j,
2840                                               win_tr_x1, rh,
2841                                               (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0],
2842                                               4, 1, OPJ_TRUE)) {
2843                 /* FIXME event manager error callback */
2844                 opj_sparse_array_int32_free(sa);
2845                 opj_aligned_free(h.wavelet);
2846                 return OPJ_FALSE;
2847             }
2848         }
2849 
2850         v.win_l_x0 = win_ll_y0;
2851         v.win_l_x1 = win_ll_y1;
2852         v.win_h_x0 = win_lh_y0;
2853         v.win_h_x1 = win_lh_y1;
2854         for (j = win_tr_x0; j < win_tr_x1; j += 4) {
2855             OPJ_UINT32 nb_elts = opj_uint_min(4U, win_tr_x1 - j);
2856 
2857             opj_v4dwt_interleave_partial_v(&v, sa, j, nb_elts);
2858             opj_v4dwt_decode(&v);
2859 
2860             if (!opj_sparse_array_int32_write(sa,
2861                                               j, win_tr_y0,
2862                                               j + nb_elts, win_tr_y1,
2863                                               (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0],
2864                                               1, 4, OPJ_TRUE)) {
2865                 /* FIXME event manager error callback */
2866                 opj_sparse_array_int32_free(sa);
2867                 opj_aligned_free(h.wavelet);
2868                 return OPJ_FALSE;
2869             }
2870         }
2871     }
2872 
2873     {
2874         OPJ_BOOL ret = opj_sparse_array_int32_read(sa,
2875                        tr_max->win_x0 - (OPJ_UINT32)tr_max->x0,
2876                        tr_max->win_y0 - (OPJ_UINT32)tr_max->y0,
2877                        tr_max->win_x1 - (OPJ_UINT32)tr_max->x0,
2878                        tr_max->win_y1 - (OPJ_UINT32)tr_max->y0,
2879                        tilec->data_win,
2880                        1, tr_max->win_x1 - tr_max->win_x0,
2881                        OPJ_TRUE);
2882         assert(ret);
2883         OPJ_UNUSED(ret);
2884     }
2885     opj_sparse_array_int32_free(sa);
2886 
2887     opj_aligned_free(h.wavelet);
2888     return OPJ_TRUE;
2889 }
2890 
2891 
opj_dwt_decode_real(opj_tcd_t * p_tcd,opj_tcd_tilecomp_t * OPJ_RESTRICT tilec,OPJ_UINT32 numres)2892 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd,
2893                              opj_tcd_tilecomp_t* OPJ_RESTRICT tilec,
2894                              OPJ_UINT32 numres)
2895 {
2896     if (p_tcd->whole_tile_decoding) {
2897         return opj_dwt_decode_tile_97(tilec, numres);
2898     } else {
2899         return opj_dwt_decode_partial_97(tilec, numres);
2900     }
2901 }
2902