1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 #include "precomp.hpp"
45 #include "opencl_kernels_core.hpp"
46 
47 #ifdef __APPLE__
48 #undef CV_NEON
49 #define CV_NEON 0
50 #endif
51 
52 namespace cv
53 {
54 
55 /****************************************************************************************\
56 *                                       split & merge                                    *
57 \****************************************************************************************/
58 
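// Vector kernels for split: the VSplit2/3/4 templates below are specialized per
// element type. On NEON they wrap the de-interleaving structure loads (vldNq/vldN)
// plus plain stores; on SSE2 they load contiguous vectors and pull them apart with
// the _mm_deinterleave_* helpers. The unspecialized SSE2 templates keep
// support == false, so split_() silently falls back to its scalar loop for types
// without a vector kernel.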
59 #if CV_NEON
60 template<typename T> struct VSplit2;
61 template<typename T> struct VSplit3;
62 template<typename T> struct VSplit4;
63 
64 #define SPLIT2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
65     template<>                                                                    \
66     struct name<data_type>                                                        \
67     {                                                                             \
68         void operator()(const data_type* src, data_type* dst0,                    \
69                         data_type* dst1) const                                    \
70         {                                                                         \
71             reg_type r = load_func(src);                                          \
72             store_func(dst0, r.val[0]);                                           \
73             store_func(dst1, r.val[1]);                                           \
74         }                                                                         \
75     }
76 
77 #define SPLIT3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
78     template<>                                                                    \
79     struct name<data_type>                                                        \
80     {                                                                             \
81         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
82                         data_type* dst2) const                                    \
83         {                                                                         \
84             reg_type r = load_func(src);                                          \
85             store_func(dst0, r.val[0]);                                           \
86             store_func(dst1, r.val[1]);                                           \
87             store_func(dst2, r.val[2]);                                           \
88         }                                                                         \
89     }
90 
91 #define SPLIT4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
92     template<>                                                                    \
93     struct name<data_type>                                                        \
94     {                                                                             \
95         void operator()(const data_type* src, data_type* dst0, data_type* dst1,   \
96                         data_type* dst2, data_type* dst3) const                   \
97         {                                                                         \
98             reg_type r = load_func(src);                                          \
99             store_func(dst0, r.val[0]);                                           \
100             store_func(dst1, r.val[1]);                                           \
101             store_func(dst2, r.val[2]);                                           \
102             store_func(dst3, r.val[3]);                                           \
103         }                                                                         \
104     }
105 
106 SPLIT2_KERNEL_TEMPLATE(VSplit2, uchar ,  uint8x16x2_t, vld2q_u8 , vst1q_u8 );
107 SPLIT2_KERNEL_TEMPLATE(VSplit2, ushort,  uint16x8x2_t, vld2q_u16, vst1q_u16);
108 SPLIT2_KERNEL_TEMPLATE(VSplit2, int   ,   int32x4x2_t, vld2q_s32, vst1q_s32);
109 SPLIT2_KERNEL_TEMPLATE(VSplit2, int64 ,   int64x1x2_t, vld2_s64 , vst1_s64 );
110 
111 SPLIT3_KERNEL_TEMPLATE(VSplit3, uchar ,  uint8x16x3_t, vld3q_u8 , vst1q_u8 );
112 SPLIT3_KERNEL_TEMPLATE(VSplit3, ushort,  uint16x8x3_t, vld3q_u16, vst1q_u16);
113 SPLIT3_KERNEL_TEMPLATE(VSplit3, int   ,   int32x4x3_t, vld3q_s32, vst1q_s32);
114 SPLIT3_KERNEL_TEMPLATE(VSplit3, int64 ,   int64x1x3_t, vld3_s64 , vst1_s64 );
115 
116 SPLIT4_KERNEL_TEMPLATE(VSplit4, uchar ,  uint8x16x4_t, vld4q_u8 , vst1q_u8 );
117 SPLIT4_KERNEL_TEMPLATE(VSplit4, ushort,  uint16x8x4_t, vld4q_u16, vst1q_u16);
118 SPLIT4_KERNEL_TEMPLATE(VSplit4, int   ,   int32x4x4_t, vld4q_s32, vst1q_s32);
119 SPLIT4_KERNEL_TEMPLATE(VSplit4, int64 ,   int64x1x4_t, vld4_s64 , vst1_s64 );
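// Note: the int64 kernels use the non-quadword forms (vldN_s64/vst1_s64), which
// handle a single element per plane per call; split_() accounts for this by using
// inc_i == 1 when sizeof(T) == 8.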
120 
121 #elif CV_SSE2
122 
123 template <typename T>
124 struct VSplit2
125 {
126     VSplit2() : support(false) { }
127     void operator()(const T *, T *, T *) const { }
128 
129     bool support;
130 };
131 
132 template <typename T>
133 struct VSplit3
134 {
135     VSplit3() : support(false) { }
136     void operator()(const T *, T *, T *, T *) const { }
137 
138     bool support;
139 };
140 
141 template <typename T>
142 struct VSplit4
143 {
144     VSplit4() : support(false) { }
145     void operator()(const T *, T *, T *, T *, T *) const { }
146 
147     bool support;
148 };
149 
150 #define SPLIT2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
151 template <>                                                                                \
152 struct VSplit2<data_type>                                                                  \
153 {                                                                                          \
154     enum                                                                                   \
155     {                                                                                      \
156         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
157     };                                                                                     \
158                                                                                            \
159     VSplit2()                                                                              \
160     {                                                                                      \
161         support = checkHardwareSupport(CV_CPU_SSE2);                                       \
162     }                                                                                      \
163                                                                                            \
164     void operator()(const data_type * src,                                                 \
165                     data_type * dst0, data_type * dst1) const                              \
166     {                                                                                      \
167         reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
168         reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
169         reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
170         reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
171                                                                                            \
172         _mm_deinterleave(v_src0, v_src1, v_src2, v_src3);                                  \
173                                                                                            \
174         _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
175         _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
176         _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
177         _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
178     }                                                                                      \
179                                                                                            \
180     bool support;                                                                          \
181 }
182 
183 #define SPLIT3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
184 template <>                                                                                \
185 struct VSplit3<data_type>                                                                  \
186 {                                                                                          \
187     enum                                                                                   \
188     {                                                                                      \
189         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
190     };                                                                                     \
191                                                                                            \
192     VSplit3()                                                                              \
193     {                                                                                      \
194         support = checkHardwareSupport(CV_CPU_SSE2);                                       \
195     }                                                                                      \
196                                                                                            \
197     void operator()(const data_type * src,                                                 \
198                     data_type * dst0, data_type * dst1, data_type * dst2) const            \
199     {                                                                                      \
200         reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
201         reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
202         reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
203         reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
204         reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
205         reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
206                                                                                            \
207         _mm_deinterleave(v_src0, v_src1, v_src2,                                           \
208                          v_src3, v_src4, v_src5);                                          \
209                                                                                            \
210         _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
211         _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
212         _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
213         _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
214         _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
215         _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
216     }                                                                                      \
217                                                                                            \
218     bool support;                                                                          \
219 }
220 
221 #define SPLIT4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_deinterleave, flavor)   \
222 template <>                                                                                \
223 struct VSplit4<data_type>                                                                  \
224 {                                                                                          \
225     enum                                                                                   \
226     {                                                                                      \
227         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
228     };                                                                                     \
229                                                                                            \
230     VSplit4()                                                                              \
231     {                                                                                      \
232         support = checkHardwareSupport(CV_CPU_SSE2);                                       \
233     }                                                                                      \
234                                                                                            \
235     void operator()(const data_type * src, data_type * dst0, data_type * dst1,             \
236                     data_type * dst2, data_type * dst3) const                              \
237     {                                                                                      \
238         reg_type v_src0 = _mm_loadu_##flavor((cast_type const *)(src));                    \
239         reg_type v_src1 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC));     \
240         reg_type v_src2 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 2)); \
241         reg_type v_src3 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 3)); \
242         reg_type v_src4 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 4)); \
243         reg_type v_src5 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 5)); \
244         reg_type v_src6 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 6)); \
245         reg_type v_src7 = _mm_loadu_##flavor((cast_type const *)(src + ELEMS_IN_VEC * 7)); \
246                                                                                            \
247         _mm_deinterleave(v_src0, v_src1, v_src2, v_src3,                                   \
248                          v_src4, v_src5, v_src6, v_src7);                                  \
249                                                                                            \
250         _mm_storeu_##flavor((cast_type *)(dst0), v_src0);                                  \
251         _mm_storeu_##flavor((cast_type *)(dst0 + ELEMS_IN_VEC), v_src1);                   \
252         _mm_storeu_##flavor((cast_type *)(dst1), v_src2);                                  \
253         _mm_storeu_##flavor((cast_type *)(dst1 + ELEMS_IN_VEC), v_src3);                   \
254         _mm_storeu_##flavor((cast_type *)(dst2), v_src4);                                  \
255         _mm_storeu_##flavor((cast_type *)(dst2 + ELEMS_IN_VEC), v_src5);                   \
256         _mm_storeu_##flavor((cast_type *)(dst3), v_src6);                                  \
257         _mm_storeu_##flavor((cast_type *)(dst3 + ELEMS_IN_VEC), v_src7);                   \
258     }                                                                                      \
259                                                                                            \
260     bool support;                                                                          \
261 }
262 
263 SPLIT2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
264 SPLIT2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
265 SPLIT2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
266 
267 SPLIT3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
268 SPLIT3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
269 SPLIT3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
270 
271 SPLIT4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_deinterleave_epi8, si128);
272 SPLIT4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_deinterleave_epi16, si128);
273 SPLIT4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_deinterleave_ps, ps);
274 
275 #endif
276 
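// Generic split: copies interleaved source pixels into cn separate planes.
// Channels are processed in groups: the first group covers cn % 4 channels
// (or 4 when cn is a multiple of 4) with dedicated 1/2/3/4-channel loops that
// use the vector kernels above where available plus a scalar tail, and the
// trailing loop at the end handles any remaining channels four at a time.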
277 template<typename T> static void
278 split_( const T* src, T** dst, int len, int cn )
279 {
280     int k = cn % 4 ? cn % 4 : 4;
281     int i, j;
282     if( k == 1 )
283     {
284         T* dst0 = dst[0];
285 
286         if(cn == 1)
287         {
288             memcpy(dst0, src, len * sizeof(T));
289         }
290         else
291         {
292             for( i = 0, j = 0 ; i < len; i++, j += cn )
293                 dst0[i] = src[j];
294         }
295     }
296     else if( k == 2 )
297     {
298         T *dst0 = dst[0], *dst1 = dst[1];
299         i = j = 0;
300 
301 #if CV_NEON
302         if(cn == 2)
303         {
304             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
305             int inc_j = 2 * inc_i;
306 
307             VSplit2<T> vsplit;
308             for( ; i < len - inc_i; i += inc_i, j += inc_j)
309                 vsplit(src + j, dst0 + i, dst1 + i);
310         }
311 #elif CV_SSE2
312         if (cn == 2)
313         {
314             int inc_i = 32/sizeof(T);
315             int inc_j = 2 * inc_i;
316 
317             VSplit2<T> vsplit;
318             if (vsplit.support)
319             {
320                 for( ; i <= len - inc_i; i += inc_i, j += inc_j)
321                     vsplit(src + j, dst0 + i, dst1 + i);
322             }
323         }
324 #endif
325         for( ; i < len; i++, j += cn )
326         {
327             dst0[i] = src[j];
328             dst1[i] = src[j+1];
329         }
330     }
331     else if( k == 3 )
332     {
333         T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2];
334         i = j = 0;
335 
336 #if CV_NEON
337         if(cn == 3)
338         {
339             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
340             int inc_j = 3 * inc_i;
341 
342             VSplit3<T> vsplit;
343             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
344                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
345         }
346 #elif CV_SSE2
347         if (cn == 3)
348         {
349             int inc_i = 32/sizeof(T);
350             int inc_j = 3 * inc_i;
351 
352             VSplit3<T> vsplit;
353 
354             if (vsplit.support)
355             {
356                 for( ; i <= len - inc_i; i += inc_i, j += inc_j)
357                     vsplit(src + j, dst0 + i, dst1 + i, dst2 + i);
358             }
359         }
360 #endif
361         for( ; i < len; i++, j += cn )
362         {
363             dst0[i] = src[j];
364             dst1[i] = src[j+1];
365             dst2[i] = src[j+2];
366         }
367     }
368     else
369     {
370         T *dst0 = dst[0], *dst1 = dst[1], *dst2 = dst[2], *dst3 = dst[3];
371         i = j = 0;
372 
373 #if CV_NEON
374         if(cn == 4)
375         {
376             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
377             int inc_j = 4 * inc_i;
378 
379             VSplit4<T> vsplit;
380             for( ; i <= len - inc_i; i += inc_i, j += inc_j)
381                 vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
382         }
383 #elif CV_SSE2
384         if (cn == 4)
385         {
386             int inc_i = 32/sizeof(T);
387             int inc_j = 4 * inc_i;
388 
389             VSplit4<T> vsplit;
390             if (vsplit.support)
391             {
392                 for( ; i <= len - inc_i; i += inc_i, j += inc_j)
393                     vsplit(src + j, dst0 + i, dst1 + i, dst2 + i, dst3 + i);
394             }
395         }
396 #endif
397         for( ; i < len; i++, j += cn )
398         {
399             dst0[i] = src[j]; dst1[i] = src[j+1];
400             dst2[i] = src[j+2]; dst3[i] = src[j+3];
401         }
402     }
403 
404     for( ; k < cn; k += 4 )
405     {
406         T *dst0 = dst[k], *dst1 = dst[k+1], *dst2 = dst[k+2], *dst3 = dst[k+3];
407         for( i = 0, j = k; i < len; i++, j += cn )
408         {
409             dst0[i] = src[j]; dst1[i] = src[j+1];
410             dst2[i] = src[j+2]; dst3[i] = src[j+3];
411         }
412     }
413 }
414 
415 
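// Vector kernels for merge: the mirror image of the split kernels above. On NEON
// they load one vector per plane (vld1q/vld1) and store them interleaved with the
// vstNq/vstN structure stores; on SSE2 they interleave in registers via the
// _mm_interleave_* helpers. The SSE2 ushort variants are only instantiated when
// SSE4.1 is available at compile time and additionally check it at run time
// through checkHardwareSupport().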
416 #if CV_NEON
417 template<typename T> struct VMerge2;
418 template<typename T> struct VMerge3;
419 template<typename T> struct VMerge4;
420 
421 #define MERGE2_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
422     template<>                                                                    \
423     struct name<data_type>{                                                       \
424         void operator()(const data_type* src0, const data_type* src1,             \
425                         data_type* dst){                                          \
426             reg_type r;                                                           \
427             r.val[0] = load_func(src0);                                           \
428             r.val[1] = load_func(src1);                                           \
429             store_func(dst, r);                                                   \
430         }                                                                         \
431     }
432 
433 #define MERGE3_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
434     template<>                                                                    \
435     struct name<data_type>{                                                       \
436         void operator()(const data_type* src0, const data_type* src1,             \
437                         const data_type* src2, data_type* dst){                   \
438             reg_type r;                                                           \
439             r.val[0] = load_func(src0);                                           \
440             r.val[1] = load_func(src1);                                           \
441             r.val[2] = load_func(src2);                                           \
442             store_func(dst, r);                                                   \
443         }                                                                         \
444     }
445 
446 #define MERGE4_KERNEL_TEMPLATE(name, data_type, reg_type, load_func, store_func)  \
447     template<>                                                                    \
448     struct name<data_type>{                                                       \
449         void operator()(const data_type* src0, const data_type* src1,             \
450                         const data_type* src2, const data_type* src3,             \
451                         data_type* dst){                                          \
452             reg_type r;                                                           \
453             r.val[0] = load_func(src0);                                           \
454             r.val[1] = load_func(src1);                                           \
455             r.val[2] = load_func(src2);                                           \
456             r.val[3] = load_func(src3);                                           \
457             store_func(dst, r);                                                   \
458         }                                                                         \
459     }
460 
461 MERGE2_KERNEL_TEMPLATE(VMerge2, uchar ,  uint8x16x2_t, vld1q_u8 , vst2q_u8 );
462 MERGE2_KERNEL_TEMPLATE(VMerge2, ushort,  uint16x8x2_t, vld1q_u16, vst2q_u16);
463 MERGE2_KERNEL_TEMPLATE(VMerge2, int   ,   int32x4x2_t, vld1q_s32, vst2q_s32);
464 MERGE2_KERNEL_TEMPLATE(VMerge2, int64 ,   int64x1x2_t, vld1_s64 , vst2_s64 );
465 
466 MERGE3_KERNEL_TEMPLATE(VMerge3, uchar ,  uint8x16x3_t, vld1q_u8 , vst3q_u8 );
467 MERGE3_KERNEL_TEMPLATE(VMerge3, ushort,  uint16x8x3_t, vld1q_u16, vst3q_u16);
468 MERGE3_KERNEL_TEMPLATE(VMerge3, int   ,   int32x4x3_t, vld1q_s32, vst3q_s32);
469 MERGE3_KERNEL_TEMPLATE(VMerge3, int64 ,   int64x1x3_t, vld1_s64 , vst3_s64 );
470 
471 MERGE4_KERNEL_TEMPLATE(VMerge4, uchar ,  uint8x16x4_t, vld1q_u8 , vst4q_u8 );
472 MERGE4_KERNEL_TEMPLATE(VMerge4, ushort,  uint16x8x4_t, vld1q_u16, vst4q_u16);
473 MERGE4_KERNEL_TEMPLATE(VMerge4, int   ,   int32x4x4_t, vld1q_s32, vst4q_s32);
474 MERGE4_KERNEL_TEMPLATE(VMerge4, int64 ,   int64x1x4_t, vld1_s64 , vst4_s64 );
475 
476 #elif CV_SSE2
477 
478 template <typename T>
479 struct VMerge2
480 {
481     VMerge2() : support(false) { }
482     void operator()(const T *, const T *, T *) const { }
483 
484     bool support;
485 };
486 
487 template <typename T>
488 struct VMerge3
489 {
490     VMerge3() : support(false) { }
491     void operator()(const T *, const T *, const T *, T *) const { }
492 
493     bool support;
494 };
495 
496 template <typename T>
497 struct VMerge4
498 {
499     VMerge4() : support(false) { }
500     void operator()(const T *, const T *, const T *, const T *, T *) const { }
501 
502     bool support;
503 };
504 
505 #define MERGE2_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
506 template <>                                                                                \
507 struct VMerge2<data_type>                                                                  \
508 {                                                                                          \
509     enum                                                                                   \
510     {                                                                                      \
511         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
512     };                                                                                     \
513                                                                                            \
514     VMerge2()                                                                              \
515     {                                                                                      \
516         support = checkHardwareSupport(se);                                                \
517     }                                                                                      \
518                                                                                            \
519     void operator()(const data_type * src0, const data_type * src1,                        \
520                     data_type * dst) const                                                 \
521     {                                                                                      \
522         reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
523         reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
524         reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
525         reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
526                                                                                            \
527         _mm_interleave(v_src0, v_src1, v_src2, v_src3);                                    \
528                                                                                            \
529         _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
530         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
531         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
532         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
533     }                                                                                      \
534                                                                                            \
535     bool support;                                                                          \
536 }
537 
538 #define MERGE3_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
539 template <>                                                                                \
540 struct VMerge3<data_type>                                                                  \
541 {                                                                                          \
542     enum                                                                                   \
543     {                                                                                      \
544         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
545     };                                                                                     \
546                                                                                            \
547     VMerge3()                                                                              \
548     {                                                                                      \
549         support = checkHardwareSupport(se);                                                \
550     }                                                                                      \
551                                                                                            \
552     void operator()(const data_type * src0, const data_type * src1, const data_type * src2,\
553                     data_type * dst) const                                                 \
554     {                                                                                      \
555         reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
556         reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
557         reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
558         reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
559         reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
560         reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
561                                                                                            \
562         _mm_interleave(v_src0, v_src1, v_src2,                                             \
563                        v_src3, v_src4, v_src5);                                            \
564                                                                                            \
565         _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
566         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
567         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
568         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
569         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
570         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
571     }                                                                                      \
572                                                                                            \
573     bool support;                                                                          \
574 }
575 
576 #define MERGE4_KERNEL_TEMPLATE(data_type, reg_type, cast_type, _mm_interleave, flavor, se) \
577 template <>                                                                                \
578 struct VMerge4<data_type>                                                                  \
579 {                                                                                          \
580     enum                                                                                   \
581     {                                                                                      \
582         ELEMS_IN_VEC = 16 / sizeof(data_type)                                              \
583     };                                                                                     \
584                                                                                            \
585     VMerge4()                                                                              \
586     {                                                                                      \
587         support = checkHardwareSupport(se);                                                \
588     }                                                                                      \
589                                                                                            \
590     void operator()(const data_type * src0, const data_type * src1,                        \
591                     const data_type * src2, const data_type * src3,                        \
592                     data_type * dst) const                                                 \
593     {                                                                                      \
594         reg_type v_src0 = _mm_loadu_##flavor((const cast_type *)(src0));                   \
595         reg_type v_src1 = _mm_loadu_##flavor((const cast_type *)(src0 + ELEMS_IN_VEC));    \
596         reg_type v_src2 = _mm_loadu_##flavor((const cast_type *)(src1));                   \
597         reg_type v_src3 = _mm_loadu_##flavor((const cast_type *)(src1 + ELEMS_IN_VEC));    \
598         reg_type v_src4 = _mm_loadu_##flavor((const cast_type *)(src2));                   \
599         reg_type v_src5 = _mm_loadu_##flavor((const cast_type *)(src2 + ELEMS_IN_VEC));    \
600         reg_type v_src6 = _mm_loadu_##flavor((const cast_type *)(src3));                   \
601         reg_type v_src7 = _mm_loadu_##flavor((const cast_type *)(src3 + ELEMS_IN_VEC));    \
602                                                                                            \
603         _mm_interleave(v_src0, v_src1, v_src2, v_src3,                                     \
604                        v_src4, v_src5, v_src6, v_src7);                                    \
605                                                                                            \
606         _mm_storeu_##flavor((cast_type *)(dst), v_src0);                                   \
607         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC), v_src1);                    \
608         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 2), v_src2);                \
609         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 3), v_src3);                \
610         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 4), v_src4);                \
611         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 5), v_src5);                \
612         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 6), v_src6);                \
613         _mm_storeu_##flavor((cast_type *)(dst + ELEMS_IN_VEC * 7), v_src7);                \
614     }                                                                                      \
615                                                                                            \
616     bool support;                                                                          \
617 }
618 
619 MERGE2_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
620 MERGE3_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
621 MERGE4_KERNEL_TEMPLATE( uchar, __m128i, __m128i, _mm_interleave_epi8, si128, CV_CPU_SSE2);
622 
623 #if CV_SSE4_1
624 MERGE2_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
625 MERGE3_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
626 MERGE4_KERNEL_TEMPLATE(ushort, __m128i, __m128i, _mm_interleave_epi16, si128, CV_CPU_SSE4_1);
627 #endif
628 
629 MERGE2_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
630 MERGE3_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
631 MERGE4_KERNEL_TEMPLATE(   int,  __m128,   float, _mm_interleave_ps, ps, CV_CPU_SSE2);
632 
633 #endif
634 
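// Generic merge: the inverse of split_(). Reads cn single-channel planes and
// writes interleaved pixels, again handling the first cn % 4 (or 4) channels with
// vectorized 2/3/4-channel loops plus a scalar tail, and any remaining channels
// four at a time in the loop at the end.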
635 template<typename T> static void
636 merge_( const T** src, T* dst, int len, int cn )
637 {
638     int k = cn % 4 ? cn % 4 : 4;
639     int i, j;
640     if( k == 1 )
641     {
642         const T* src0 = src[0];
643         for( i = j = 0; i < len; i++, j += cn )
644             dst[j] = src0[i];
645     }
646     else if( k == 2 )
647     {
648         const T *src0 = src[0], *src1 = src[1];
649         i = j = 0;
650 #if CV_NEON
651         if(cn == 2)
652         {
653             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
654             int inc_j = 2 * inc_i;
655 
656             VMerge2<T> vmerge;
657             for( ; i < len - inc_i; i += inc_i, j += inc_j)
658                 vmerge(src0 + i, src1 + i, dst + j);
659         }
660 #elif CV_SSE2
661         if(cn == 2)
662         {
663             int inc_i = 32/sizeof(T);
664             int inc_j = 2 * inc_i;
665 
666             VMerge2<T> vmerge;
667             if (vmerge.support)
668                 for( ; i < len - inc_i; i += inc_i, j += inc_j)
669                     vmerge(src0 + i, src1 + i, dst + j);
670         }
671 #endif
672         for( ; i < len; i++, j += cn )
673         {
674             dst[j] = src0[i];
675             dst[j+1] = src1[i];
676         }
677     }
678     else if( k == 3 )
679     {
680         const T *src0 = src[0], *src1 = src[1], *src2 = src[2];
681         i = j = 0;
682 #if CV_NEON
683         if(cn == 3)
684         {
685             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
686             int inc_j = 3 * inc_i;
687 
688             VMerge3<T> vmerge;
689             for( ; i < len - inc_i; i += inc_i, j += inc_j)
690                 vmerge(src0 + i, src1 + i, src2 + i, dst + j);
691         }
692 #elif CV_SSE2
693         if(cn == 3)
694         {
695             int inc_i = 32/sizeof(T);
696             int inc_j = 3 * inc_i;
697 
698             VMerge3<T> vmerge;
699             if (vmerge.support)
700                 for( ; i < len - inc_i; i += inc_i, j += inc_j)
701                     vmerge(src0 + i, src1 + i, src2 + i, dst + j);
702         }
703 #endif
704         for( ; i < len; i++, j += cn )
705         {
706             dst[j] = src0[i];
707             dst[j+1] = src1[i];
708             dst[j+2] = src2[i];
709         }
710     }
711     else
712     {
713         const T *src0 = src[0], *src1 = src[1], *src2 = src[2], *src3 = src[3];
714         i = j = 0;
715 #if CV_NEON
716         if(cn == 4)
717         {
718             int inc_i = (sizeof(T) == 8)? 1: 16/sizeof(T);
719             int inc_j = 4 * inc_i;
720 
721             VMerge4<T> vmerge;
722             for( ; i < len - inc_i; i += inc_i, j += inc_j)
723                 vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
724         }
725 #elif CV_SSE2
726         if(cn == 4)
727         {
728             int inc_i = 32/sizeof(T);
729             int inc_j = 4 * inc_i;
730 
731             VMerge4<T> vmerge;
732             if (vmerge.support)
733                 for( ; i < len - inc_i; i += inc_i, j += inc_j)
734                     vmerge(src0 + i, src1 + i, src2 + i, src3 + i, dst + j);
735         }
736 #endif
737         for( ; i < len; i++, j += cn )
738         {
739             dst[j] = src0[i]; dst[j+1] = src1[i];
740             dst[j+2] = src2[i]; dst[j+3] = src3[i];
741         }
742     }
743 
744     for( ; k < cn; k += 4 )
745     {
746         const T *src0 = src[k], *src1 = src[k+1], *src2 = src[k+2], *src3 = src[k+3];
747         for( i = 0, j = k; i < len; i++, j += cn )
748         {
749             dst[j] = src0[i]; dst[j+1] = src1[i];
750             dst[j+2] = src2[i]; dst[j+3] = src3[i];
751         }
752     }
753 }
754 
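// Depth-specific entry points used by the function tables below. Since split and
// merge only move bytes around, each wrapper also serves the signed type of the
// same size (e.g. split8u covers both CV_8U and CV_8S), and split32s/merge32s
// cover CV_32F as well.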
755 static void split8u(const uchar* src, uchar** dst, int len, int cn )
756 {
757     split_(src, dst, len, cn);
758 }
759 
760 static void split16u(const ushort* src, ushort** dst, int len, int cn )
761 {
762     split_(src, dst, len, cn);
763 }
764 
765 static void split32s(const int* src, int** dst, int len, int cn )
766 {
767     split_(src, dst, len, cn);
768 }
769 
770 static void split64s(const int64* src, int64** dst, int len, int cn )
771 {
772     split_(src, dst, len, cn);
773 }
774 
775 static void merge8u(const uchar** src, uchar* dst, int len, int cn )
776 {
777     merge_(src, dst, len, cn);
778 }
779 
780 static void merge16u(const ushort** src, ushort* dst, int len, int cn )
781 {
782     merge_(src, dst, len, cn);
783 }
784 
785 static void merge32s(const int** src, int* dst, int len, int cn )
786 {
787     merge_(src, dst, len, cn);
788 }
789 
790 static void merge64s(const int64** src, int64* dst, int len, int cn )
791 {
792     merge_(src, dst, len, cn);
793 }
794 
795 typedef void (*SplitFunc)(const uchar* src, uchar** dst, int len, int cn);
796 typedef void (*MergeFunc)(const uchar** src, uchar* dst, int len, int cn);
797 
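// Both tables are indexed by CV_MAT_DEPTH (CV_8U..CV_64F, plus a trailing 0 entry),
// so an unsupported depth is caught by the CV_Assert( func != 0 ) in cv::split().
// The GET_OPTIMIZED wrapper lets platform-optimized replacements be substituted
// when the build provides them.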
798 static SplitFunc getSplitFunc(int depth)
799 {
800     static SplitFunc splitTab[] =
801     {
802         (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split8u), (SplitFunc)GET_OPTIMIZED(split16u), (SplitFunc)GET_OPTIMIZED(split16u),
803         (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split32s), (SplitFunc)GET_OPTIMIZED(split64s), 0
804     };
805 
806     return splitTab[depth];
807 }
808 
809 static MergeFunc getMergeFunc(int depth)
810 {
811     static MergeFunc mergeTab[] =
812     {
813         (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge8u), (MergeFunc)GET_OPTIMIZED(merge16u), (MergeFunc)GET_OPTIMIZED(merge16u),
814         (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge32s), (MergeFunc)GET_OPTIMIZED(merge64s), 0
815     };
816 
817     return mergeTab[depth];
818 }
819 
820 }
821 
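// Splits a multi-channel Mat into cn single-channel Mats. The caller supplies an
// array of at least src.channels() Mat headers; each one is (re)allocated here.
// A minimal usage sketch (bgrImage is a hypothetical 3-channel input):
//     Mat planes[3];
//     cv::split(bgrImage, planes);
// The matrix is traversed plane by plane through NAryMatIterator, in blocks of
// roughly BLOCK_SIZE bytes when cn > 4 so the per-channel writes stay cache-friendly.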
822 void cv::split(const Mat& src, Mat* mv)
823 {
824     int k, depth = src.depth(), cn = src.channels();
825     if( cn == 1 )
826     {
827         src.copyTo(mv[0]);
828         return;
829     }
830 
831     SplitFunc func = getSplitFunc(depth);
832     CV_Assert( func != 0 );
833 
834     int esz = (int)src.elemSize(), esz1 = (int)src.elemSize1();
835     int blocksize0 = (BLOCK_SIZE + esz-1)/esz;
836     AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
837     const Mat** arrays = (const Mat**)(uchar*)_buf;
838     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
839 
840     arrays[0] = &src;
841     for( k = 0; k < cn; k++ )
842     {
843         mv[k].create(src.dims, src.size, depth);
844         arrays[k+1] = &mv[k];
845     }
846 
847     NAryMatIterator it(arrays, ptrs, cn+1);
848     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
849 
850     for( size_t i = 0; i < it.nplanes; i++, ++it )
851     {
852         for( int j = 0; j < total; j += blocksize )
853         {
854             int bsz = std::min(total - j, blocksize);
855             func( ptrs[0], &ptrs[1], bsz, cn );
856 
857             if( j + blocksize < total )
858             {
859                 ptrs[0] += bsz*esz;
860                 for( k = 0; k < cn; k++ )
861                     ptrs[k+1] += bsz*esz1;
862             }
863         }
864     }
865 }
866 
867 #ifdef HAVE_OPENCL
868 
869 namespace cv {
870 
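// OpenCL path for split: builds a "split" kernel from split_merge.cl, expanding one
// DECLARE_DST_PARAM / DECLARE_INDEX / PROCESS_ELEM macro per channel, creates one
// single-channel UMat per channel, and lets each work item handle rowsPerWI rows
// (4 on Intel devices, 1 elsewhere).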
871 static bool ocl_split( InputArray _m, OutputArrayOfArrays _mv )
872 {
873     int type = _m.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type),
874             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
875 
876     String dstargs, processelem, indexdecl;
877     for (int i = 0; i < cn; ++i)
878     {
879         dstargs += format("DECLARE_DST_PARAM(%d)", i);
880         indexdecl += format("DECLARE_INDEX(%d)", i);
881         processelem += format("PROCESS_ELEM(%d)", i);
882     }
883 
884     ocl::Kernel k("split", ocl::core::split_merge_oclsrc,
885                   format("-D T=%s -D OP_SPLIT -D cn=%d -D DECLARE_DST_PARAMS=%s"
886                          " -D PROCESS_ELEMS_N=%s -D DECLARE_INDEX_N=%s",
887                          ocl::memopTypeToStr(depth), cn, dstargs.c_str(),
888                          processelem.c_str(), indexdecl.c_str()));
889     if (k.empty())
890         return false;
891 
892     Size size = _m.size();
893     _mv.create(cn, 1, depth);
894     for (int i = 0; i < cn; ++i)
895         _mv.create(size, depth, i);
896 
897     std::vector<UMat> dst;
898     _mv.getUMatVector(dst);
899 
900     int argidx = k.set(0, ocl::KernelArg::ReadOnly(_m.getUMat()));
901     for (int i = 0; i < cn; ++i)
902         argidx = k.set(argidx, ocl::KernelArg::WriteOnlyNoSize(dst[i]));
903     k.set(argidx, rowsPerWI);
904 
905     size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
906     return k.run(2, globalsize, NULL, false);
907 }
908 
909 }
910 
911 #endif
912 
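// InputArray/OutputArrayOfArrays front-end. Tries the OpenCL path first when the
// output is a UMat vector, otherwise allocates one single-channel Mat per source
// channel and forwards to the Mat*-based overload above.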
913 void cv::split(InputArray _m, OutputArrayOfArrays _mv)
914 {
915     CV_OCL_RUN(_m.dims() <= 2 && _mv.isUMatVector(),
916                ocl_split(_m, _mv))
917 
918     Mat m = _m.getMat();
919     if( m.empty() )
920     {
921         _mv.release();
922         return;
923     }
924 
925     CV_Assert( !_mv.fixedType() || _mv.empty() || _mv.type() == m.depth() );
926 
927     Size size = m.size();
928     int depth = m.depth(), cn = m.channels();
929     _mv.create(cn, 1, depth);
930     for (int i = 0; i < cn; ++i)
931         _mv.create(size, depth, i);
932 
933     std::vector<Mat> dst;
934     _mv.getMatVector(dst);
935 
936     split(m, &dst[0]);
937 }
938 
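// Merges n source matrices into one multi-channel matrix. A minimal usage sketch
// (b, g, r are hypothetical single-channel planes of equal size and depth):
//     Mat planes[3] = { b, g, r };
//     Mat bgr;
//     cv::merge(planes, 3, bgr);
// All sources must share size and depth; sources may themselves be multi-channel,
// in which case the work is delegated to mixChannels() below.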
939 void cv::merge(const Mat* mv, size_t n, OutputArray _dst)
940 {
941     CV_Assert( mv && n > 0 );
942 
943     int depth = mv[0].depth();
944     bool allch1 = true;
945     int k, cn = 0;
946     size_t i;
947 
948     for( i = 0; i < n; i++ )
949     {
950         CV_Assert(mv[i].size == mv[0].size && mv[i].depth() == depth);
951         allch1 = allch1 && mv[i].channels() == 1;
952         cn += mv[i].channels();
953     }
954 
955     CV_Assert( 0 < cn && cn <= CV_CN_MAX );
956     _dst.create(mv[0].dims, mv[0].size, CV_MAKETYPE(depth, cn));
957     Mat dst = _dst.getMat();
958 
959     if( n == 1 )
960     {
961         mv[0].copyTo(dst);
962         return;
963     }
964 
965     if( !allch1 )
966     {
967         AutoBuffer<int> pairs(cn*2);
968         int j, ni=0;
969 
970         for( i = 0, j = 0; i < n; i++, j += ni )
971         {
972             ni = mv[i].channels();
973             for( k = 0; k < ni; k++ )
974             {
975                 pairs[(j+k)*2] = j + k;
976                 pairs[(j+k)*2+1] = j + k;
977             }
978         }
979         mixChannels( mv, n, &dst, 1, &pairs[0], cn );
980         return;
981     }
982 
983     size_t esz = dst.elemSize(), esz1 = dst.elemSize1();
984     int blocksize0 = (int)((BLOCK_SIZE + esz-1)/esz);
985     AutoBuffer<uchar> _buf((cn+1)*(sizeof(Mat*) + sizeof(uchar*)) + 16);
986     const Mat** arrays = (const Mat**)(uchar*)_buf;
987     uchar** ptrs = (uchar**)alignPtr(arrays + cn + 1, 16);
988 
989     arrays[0] = &dst;
990     for( k = 0; k < cn; k++ )
991         arrays[k+1] = &mv[k];
992 
993     NAryMatIterator it(arrays, ptrs, cn+1);
994     int total = (int)it.size, blocksize = cn <= 4 ? total : std::min(total, blocksize0);
995     MergeFunc func = getMergeFunc(depth);
996 
997     for( i = 0; i < it.nplanes; i++, ++it )
998     {
999         for( int j = 0; j < total; j += blocksize )
1000         {
1001             int bsz = std::min(total - j, blocksize);
1002             func( (const uchar**)&ptrs[1], ptrs[0], bsz, cn );
1003 
1004             if( j + blocksize < total )
1005             {
1006                 ptrs[0] += bsz*esz;
1007                 for( int t = 0; t < cn; t++ )
1008                     ptrs[t+1] += bsz*esz1;
1009             }
1010         }
1011     }
1012 }
1013 
1014 #ifdef HAVE_OPENCL
1015 
1016 namespace cv {
1017 
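// OpenCL path for merge: multi-channel inputs are flattened into ksrc, one UMat
// view per source channel (adjusting only the byte offset), so the generated kernel
// always reads dcn single-channel-like inputs and writes one dcn-channel output.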
1018 static bool ocl_merge( InputArrayOfArrays _mv, OutputArray _dst )
1019 {
1020     std::vector<UMat> src, ksrc;
1021     _mv.getUMatVector(src);
1022     CV_Assert(!src.empty());
1023 
1024     int type = src[0].type(), depth = CV_MAT_DEPTH(type),
1025             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
1026     Size size = src[0].size();
1027 
1028     for (size_t i = 0, srcsize = src.size(); i < srcsize; ++i)
1029     {
1030         int itype = src[i].type(), icn = CV_MAT_CN(itype), idepth = CV_MAT_DEPTH(itype),
1031                 esz1 = CV_ELEM_SIZE1(idepth);
1032         if (src[i].dims > 2)
1033             return false;
1034 
1035         CV_Assert(size == src[i].size() && depth == idepth);
1036 
1037         for (int cn = 0; cn < icn; ++cn)
1038         {
1039             UMat tsrc = src[i];
1040             tsrc.offset += cn * esz1;
1041             ksrc.push_back(tsrc);
1042         }
1043     }
1044     int dcn = (int)ksrc.size();
1045 
1046     String srcargs, processelem, cndecl, indexdecl;
1047     for (int i = 0; i < dcn; ++i)
1048     {
1049         srcargs += format("DECLARE_SRC_PARAM(%d)", i);
1050         processelem += format("PROCESS_ELEM(%d)", i);
1051         indexdecl += format("DECLARE_INDEX(%d)", i);
1052         cndecl += format(" -D scn%d=%d", i, ksrc[i].channels());
1053     }
1054 
1055     ocl::Kernel k("merge", ocl::core::split_merge_oclsrc,
1056                   format("-D OP_MERGE -D cn=%d -D T=%s -D DECLARE_SRC_PARAMS_N=%s"
1057                          " -D DECLARE_INDEX_N=%s -D PROCESS_ELEMS_N=%s%s",
1058                          dcn, ocl::memopTypeToStr(depth), srcargs.c_str(),
1059                          indexdecl.c_str(), processelem.c_str(), cndecl.c_str()));
1060     if (k.empty())
1061         return false;
1062 
1063     _dst.create(size, CV_MAKE_TYPE(depth, dcn));
1064     UMat dst = _dst.getUMat();
1065 
1066     int argidx = 0;
1067     for (int i = 0; i < dcn; ++i)
1068         argidx = k.set(argidx, ocl::KernelArg::ReadOnlyNoSize(ksrc[i]));
1069     argidx = k.set(argidx, ocl::KernelArg::WriteOnly(dst));
1070     k.set(argidx, rowsPerWI);
1071 
1072     size_t globalsize[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI };
1073     return k.run(2, globalsize, NULL, false);
1074 }
1075 
1076 }
1077 
1078 #endif
1079 
1080 void cv::merge(InputArrayOfArrays _mv, OutputArray _dst)
1081 {
1082     CV_OCL_RUN(_mv.isUMatVector() && _dst.isUMat(),
1083                ocl_merge(_mv, _dst))
1084 
1085     std::vector<Mat> mv;
1086     _mv.getMatVector(mv);
1087     merge(!mv.empty() ? &mv[0] : 0, mv.size(), _dst);
1088 }
1089 
1090 /****************************************************************************************\
1091 *                       Generalized split/merge: mixing channels                         *
1092 \****************************************************************************************/
1093 
1094 namespace cv
1095 {
1096 
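// Core of mixChannels: for each of the npairs (src, dst) channel pairs, copies
// every sdelta[k]-th element of src[k] into every ddelta[k]-th slot of dst[k],
// two elements per iteration. A NULL source pointer means "no source channel"
// (fromTo index < 0) and the destination channel is filled with zeros instead.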
1097 template<typename T> static void
1098 mixChannels_( const T** src, const int* sdelta,
1099               T** dst, const int* ddelta,
1100               int len, int npairs )
1101 {
1102     int i, k;
1103     for( k = 0; k < npairs; k++ )
1104     {
1105         const T* s = src[k];
1106         T* d = dst[k];
1107         int ds = sdelta[k], dd = ddelta[k];
1108         if( s )
1109         {
1110             for( i = 0; i <= len - 2; i += 2, s += ds*2, d += dd*2 )
1111             {
1112                 T t0 = s[0], t1 = s[ds];
1113                 d[0] = t0; d[dd] = t1;
1114             }
1115             if( i < len )
1116                 d[0] = s[0];
1117         }
1118         else
1119         {
1120             for( i = 0; i <= len - 2; i += 2, d += dd*2 )
1121                 d[0] = d[dd] = 0;
1122             if( i < len )
1123                 d[0] = 0;
1124         }
1125     }
1126 }
1127 
1128 
1129 static void mixChannels8u( const uchar** src, const int* sdelta,
1130                            uchar** dst, const int* ddelta,
1131                            int len, int npairs )
1132 {
1133     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
1134 }
1135 
1136 static void mixChannels16u( const ushort** src, const int* sdelta,
1137                             ushort** dst, const int* ddelta,
1138                             int len, int npairs )
1139 {
1140     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
1141 }
1142 
1143 static void mixChannels32s( const int** src, const int* sdelta,
1144                             int** dst, const int* ddelta,
1145                             int len, int npairs )
1146 {
1147     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
1148 }
1149 
1150 static void mixChannels64s( const int64** src, const int* sdelta,
1151                             int64** dst, const int* ddelta,
1152                             int len, int npairs )
1153 {
1154     mixChannels_(src, sdelta, dst, ddelta, len, npairs);
1155 }
1156 
1157 typedef void (*MixChannelsFunc)( const uchar** src, const int* sdelta,
1158         uchar** dst, const int* ddelta, int len, int npairs );
1159 
1160 static MixChannelsFunc getMixchFunc(int depth)
1161 {
1162     static MixChannelsFunc mixchTab[] =
1163     {
1164         (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels8u, (MixChannelsFunc)mixChannels16u,
1165         (MixChannelsFunc)mixChannels16u, (MixChannelsFunc)mixChannels32s, (MixChannelsFunc)mixChannels32s,
1166         (MixChannelsFunc)mixChannels64s, 0
1167     };
1168 
1169     return mixchTab[depth];
1170 }
1171 
1172 }
1173 
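// fromTo holds npairs (input, output) pairs of global channel indices: the input
// index is counted across the channels of all nsrcs source matrices in order, the
// output index across all ndsts destination matrices. A negative input index means
// the corresponding output channel is zero-filled. The tab/sdelta/ddelta arrays
// built below cache, per pair, the array index, the byte offset of the channel and
// the element stride used by the per-plane copy loop.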
1174 void cv::mixChannels( const Mat* src, size_t nsrcs, Mat* dst, size_t ndsts, const int* fromTo, size_t npairs )
1175 {
1176     if( npairs == 0 )
1177         return;
1178     CV_Assert( src && nsrcs > 0 && dst && ndsts > 0 && fromTo && npairs > 0 );
1179 
1180     size_t i, j, k, esz1 = dst[0].elemSize1();
1181     int depth = dst[0].depth();
1182 
1183     AutoBuffer<uchar> buf((nsrcs + ndsts + 1)*(sizeof(Mat*) + sizeof(uchar*)) + npairs*(sizeof(uchar*)*2 + sizeof(int)*6));
1184     const Mat** arrays = (const Mat**)(uchar*)buf;
1185     uchar** ptrs = (uchar**)(arrays + nsrcs + ndsts);
1186     const uchar** srcs = (const uchar**)(ptrs + nsrcs + ndsts + 1);
1187     uchar** dsts = (uchar**)(srcs + npairs);
1188     int* tab = (int*)(dsts + npairs);
1189     int *sdelta = (int*)(tab + npairs*4), *ddelta = sdelta + npairs;
1190 
1191     for( i = 0; i < nsrcs; i++ )
1192         arrays[i] = &src[i];
1193     for( i = 0; i < ndsts; i++ )
1194         arrays[i + nsrcs] = &dst[i];
1195     ptrs[nsrcs + ndsts] = 0;
1196 
1197     for( i = 0; i < npairs; i++ )
1198     {
1199         int i0 = fromTo[i*2], i1 = fromTo[i*2+1];
1200         if( i0 >= 0 )
1201         {
1202             for( j = 0; j < nsrcs; i0 -= src[j].channels(), j++ )
1203                 if( i0 < src[j].channels() )
1204                     break;
1205             CV_Assert(j < nsrcs && src[j].depth() == depth);
1206             tab[i*4] = (int)j; tab[i*4+1] = (int)(i0*esz1);
1207             sdelta[i] = src[j].channels();
1208         }
1209         else
1210         {
1211             tab[i*4] = (int)(nsrcs + ndsts); tab[i*4+1] = 0;
1212             sdelta[i] = 0;
1213         }
1214 
1215         for( j = 0; j < ndsts; i1 -= dst[j].channels(), j++ )
1216             if( i1 < dst[j].channels() )
1217                 break;
1218         CV_Assert(i1 >= 0 && j < ndsts && dst[j].depth() == depth);
1219         tab[i*4+2] = (int)(j + nsrcs); tab[i*4+3] = (int)(i1*esz1);
1220         ddelta[i] = dst[j].channels();
1221     }
1222 
1223     NAryMatIterator it(arrays, ptrs, (int)(nsrcs + ndsts));
1224     int total = (int)it.size, blocksize = std::min(total, (int)((BLOCK_SIZE + esz1-1)/esz1));
1225     MixChannelsFunc func = getMixchFunc(depth);
1226 
1227     for( i = 0; i < it.nplanes; i++, ++it )
1228     {
1229         for( k = 0; k < npairs; k++ )
1230         {
1231             srcs[k] = ptrs[tab[k*4]] + tab[k*4+1];
1232             dsts[k] = ptrs[tab[k*4+2]] + tab[k*4+3];
1233         }
1234 
1235         for( int t = 0; t < total; t += blocksize )
1236         {
1237             int bsz = std::min(total - t, blocksize);
1238             func( srcs, sdelta, dsts, ddelta, bsz, (int)npairs );
1239 
1240             if( t + blocksize < total )
1241                 for( k = 0; k < npairs; k++ )
1242                 {
1243                     srcs[k] += blocksize*sdelta[k]*esz1;
1244                     dsts[k] += blocksize*ddelta[k]*esz1;
1245                 }
1246         }
1247     }
1248 }
1249 
1250 #ifdef HAVE_OPENCL
1251 
1252 namespace cv {
1253 
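// Map a global 0-based channel index over the concatenated channels of 'um' to a
// (matrix index, channel-within-matrix) pair; both outputs are set to -1 if the
// index lies beyond the total number of channels.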
1254 static void getUMatIndex(const std::vector<UMat> & um, int cn, int & idx, int & cnidx)
1255 {
1256     int totalChannels = 0;
1257     for (size_t i = 0, size = um.size(); i < size; ++i)
1258     {
1259         int ccn = um[i].channels();
1260         totalChannels += ccn;
1261 
1262         if (totalChannels == cn)
1263         {
1264             idx = (int)(i + 1);
1265             cnidx = 0;
1266             return;
1267         }
1268         else if (totalChannels > cn)
1269         {
1270             idx = (int)i;
1271             cnidx = i == 0 ? cn : (cn - totalChannels + ccn);
1272             return;
1273         }
1274     }
1275 
1276     idx = cnidx = -1;
1277 }
1278 
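// OpenCL path: one DECLARE_INPUT_MAT/DECLARE_OUTPUT_MAT/DECLARE_INDEX/PROCESS_ELEM
// macro instance is emitted per channel pair, and each kernel argument is the source
// or destination UMat with its offset advanced to the selected channel, so a single
// generated kernel copies all requested channels in one pass.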
1279 static bool ocl_mixChannels(InputArrayOfArrays _src, InputOutputArrayOfArrays _dst,
1280                             const int* fromTo, size_t npairs)
1281 {
1282     std::vector<UMat> src, dst;
1283     _src.getUMatVector(src);
1284     _dst.getUMatVector(dst);
1285 
1286     size_t nsrc = src.size(), ndst = dst.size();
1287     CV_Assert(nsrc > 0 && ndst > 0);
1288 
1289     Size size = src[0].size();
1290     int depth = src[0].depth(), esz = CV_ELEM_SIZE(depth),
1291             rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
1292 
1293     for (size_t i = 1, ssize = src.size(); i < ssize; ++i)
1294         CV_Assert(src[i].size() == size && src[i].depth() == depth);
1295     for (size_t i = 0, dsize = dst.size(); i < dsize; ++i)
1296         CV_Assert(dst[i].size() == size && dst[i].depth() == depth);
1297 
1298     String declsrc, decldst, declproc, declcn, indexdecl;
1299     std::vector<UMat> srcargs(npairs), dstargs(npairs);
1300 
1301     for (size_t i = 0; i < npairs; ++i)
1302     {
1303         int scn = fromTo[i<<1], dcn = fromTo[(i<<1) + 1];
1304         int src_idx, src_cnidx, dst_idx, dst_cnidx;
1305 
1306         getUMatIndex(src, scn, src_idx, src_cnidx);
1307         getUMatIndex(dst, dcn, dst_idx, dst_cnidx);
1308 
1309         CV_Assert(dst_idx >= 0 && src_idx >= 0);
1310 
1311         srcargs[i] = src[src_idx];
1312         srcargs[i].offset += src_cnidx * esz;
1313 
1314         dstargs[i] = dst[dst_idx];
1315         dstargs[i].offset += dst_cnidx * esz;
1316 
1317         declsrc += format("DECLARE_INPUT_MAT(%d)", i);
1318         decldst += format("DECLARE_OUTPUT_MAT(%d)", i);
1319         indexdecl += format("DECLARE_INDEX(%d)", i);
1320         declproc += format("PROCESS_ELEM(%d)", i);
1321         declcn += format(" -D scn%d=%d -D dcn%d=%d", i, src[src_idx].channels(), i, dst[dst_idx].channels());
1322     }
1323 
1324     ocl::Kernel k("mixChannels", ocl::core::mixchannels_oclsrc,
1325                   format("-D T=%s -D DECLARE_INPUT_MAT_N=%s -D DECLARE_OUTPUT_MAT_N=%s"
1326                          " -D PROCESS_ELEM_N=%s -D DECLARE_INDEX_N=%s%s",
1327                          ocl::memopTypeToStr(depth), declsrc.c_str(), decldst.c_str(),
1328                          declproc.c_str(), indexdecl.c_str(), declcn.c_str()));
1329     if (k.empty())
1330         return false;
1331 
1332     int argindex = 0;
1333     for (size_t i = 0; i < npairs; ++i)
1334         argindex = k.set(argindex, ocl::KernelArg::ReadOnlyNoSize(srcargs[i]));
1335     for (size_t i = 0; i < npairs; ++i)
1336         argindex = k.set(argindex, ocl::KernelArg::WriteOnlyNoSize(dstargs[i]));
1337     argindex = k.set(argindex, size.height);
1338     argindex = k.set(argindex, size.width);
1339     k.set(argindex, rowsPerWI);
1340 
1341     size_t globalsize[2] = { size.width, (size.height + rowsPerWI - 1) / rowsPerWI };
1342     return k.run(2, globalsize, NULL, false);
1343 }
1344 
1345 }
1346 
1347 #endif
1348 
1349 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
1350                  const int* fromTo, size_t npairs)
1351 {
1352     if (npairs == 0 || fromTo == NULL)
1353         return;
1354 
1355     CV_OCL_RUN(dst.isUMatVector(),
1356                ocl_mixChannels(src, dst, fromTo, npairs))
1357 
1358     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
1359             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
1360             src.kind() != _InputArray::STD_VECTOR_UMAT;
1361     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
1362             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
1363             dst.kind() != _InputArray::STD_VECTOR_UMAT;
1364     int i;
1365     int nsrc = src_is_mat ? 1 : (int)src.total();
1366     int ndst = dst_is_mat ? 1 : (int)dst.total();
1367 
1368     CV_Assert(nsrc > 0 && ndst > 0);
1369     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
1370     Mat* buf = _buf;
1371     for( i = 0; i < nsrc; i++ )
1372         buf[i] = src.getMat(src_is_mat ? -1 : i);
1373     for( i = 0; i < ndst; i++ )
1374         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
1375     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, fromTo, npairs);
1376 }
1377 
1378 void cv::mixChannels(InputArrayOfArrays src, InputOutputArrayOfArrays dst,
1379                      const std::vector<int>& fromTo)
1380 {
1381     if (fromTo.empty())
1382         return;
1383 
1384     CV_OCL_RUN(dst.isUMatVector(),
1385                ocl_mixChannels(src, dst, &fromTo[0], fromTo.size()>>1))
1386 
1387     bool src_is_mat = src.kind() != _InputArray::STD_VECTOR_MAT &&
1388             src.kind() != _InputArray::STD_VECTOR_VECTOR &&
1389             src.kind() != _InputArray::STD_VECTOR_UMAT;
1390     bool dst_is_mat = dst.kind() != _InputArray::STD_VECTOR_MAT &&
1391             dst.kind() != _InputArray::STD_VECTOR_VECTOR &&
1392             dst.kind() != _InputArray::STD_VECTOR_UMAT;
1393     int i;
1394     int nsrc = src_is_mat ? 1 : (int)src.total();
1395     int ndst = dst_is_mat ? 1 : (int)dst.total();
1396 
1397     CV_Assert(fromTo.size()%2 == 0 && nsrc > 0 && ndst > 0);
1398     cv::AutoBuffer<Mat> _buf(nsrc + ndst);
1399     Mat* buf = _buf;
1400     for( i = 0; i < nsrc; i++ )
1401         buf[i] = src.getMat(src_is_mat ? -1 : i);
1402     for( i = 0; i < ndst; i++ )
1403         buf[nsrc + i] = dst.getMat(dst_is_mat ? -1 : i);
1404     mixChannels(&buf[0], nsrc, &buf[nsrc], ndst, &fromTo[0], fromTo.size()/2);
1405 }
1406 
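// extractChannel is a thin wrapper around mixChannels with the single pair
// { coi, 0 }: channel 'coi' of the source is copied into a freshly created
// single-channel destination of the same depth and size.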
1407 void cv::extractChannel(InputArray _src, OutputArray _dst, int coi)
1408 {
1409     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
1410     CV_Assert( 0 <= coi && coi < cn );
1411     int ch[] = { coi, 0 };
1412 
1413     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
1414     {
1415         UMat src = _src.getUMat();
1416         _dst.create(src.dims, &src.size[0], depth);
1417         UMat dst = _dst.getUMat();
1418         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
1419         return;
1420     }
1421 
1422     Mat src = _src.getMat();
1423     _dst.create(src.dims, &src.size[0], depth);
1424     Mat dst = _dst.getMat();
1425     mixChannels(&src, 1, &dst, 1, ch, 1);
1426 }
1427 
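// insertChannel is the inverse wrapper: the single-channel source is copied into
// channel 'coi' of the (already allocated) destination via the pair { 0, coi }.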
1428 void cv::insertChannel(InputArray _src, InputOutputArray _dst, int coi)
1429 {
1430     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
1431     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
1432     CV_Assert( _src.sameSize(_dst) && sdepth == ddepth );
1433     CV_Assert( 0 <= coi && coi < dcn && scn == 1 );
1434 
1435     int ch[] = { 0, coi };
1436     if (ocl::useOpenCL() && _src.dims() <= 2 && _dst.isUMat())
1437     {
1438         UMat src = _src.getUMat(), dst = _dst.getUMat();
1439         mixChannels(std::vector<UMat>(1, src), std::vector<UMat>(1, dst), ch, 1);
1440         return;
1441     }
1442 
1443     Mat src = _src.getMat(), dst = _dst.getMat();
1444     mixChannels(&src, 1, &dst, 1, ch, 1);
1445 }
1446 
1447 /****************************************************************************************\
1448 *                                convertScale[Abs]                                       *
1449 \****************************************************************************************/
1450 
1451 namespace cv
1452 {
1453 
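// The generic cvtScaleAbs_SIMD functor performs no vectorized work and returns 0,
// so the scalar loop in cvtScaleAbs_ handles the whole row; the specializations
// below (SSE2 or NEON, depending on the build) vectorize the common source types
// converting to uchar.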
1454 template<typename T, typename DT, typename WT>
1455 struct cvtScaleAbs_SIMD
1456 {
1457     int operator () (const T *, DT *, int, WT, WT) const
1458     {
1459         return 0;
1460     }
1461 };
1462 
1463 #if CV_SSE2
1464 
1465 template <>
1466 struct cvtScaleAbs_SIMD<uchar, uchar, float>
1467 {
1468     int operator () (const uchar * src, uchar * dst, int width,
1469                      float scale, float shift) const
1470     {
1471         int x = 0;
1472 
1473         if (USE_SSE2)
1474         {
1475             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1476                 v_zero_f = _mm_setzero_ps();
1477             __m128i v_zero_i = _mm_setzero_si128();
1478 
1479             for ( ; x <= width - 16; x += 16)
1480             {
1481                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1482                 __m128i v_src12 = _mm_unpacklo_epi8(v_src, v_zero_i), v_src_34 = _mm_unpackhi_epi8(v_src, v_zero_i);
1483                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src12, v_zero_i)), v_scale), v_shift);
1484                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1485                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src12, v_zero_i)), v_scale), v_shift);
1486                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1487                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
1488                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
1489                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_34, v_zero_i)), v_scale), v_shift);
1490                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
1491 
1492                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
1493                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
1494                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
1495             }
1496         }
1497 
1498         return x;
1499     }
1500 };
1501 
1502 template <>
1503 struct cvtScaleAbs_SIMD<schar, uchar, float>
1504 {
1505     int operator () (const schar * src, uchar * dst, int width,
1506                      float scale, float shift) const
1507     {
1508         int x = 0;
1509 
1510         if (USE_SSE2)
1511         {
1512             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1513                 v_zero_f = _mm_setzero_ps();
1514             __m128i v_zero_i = _mm_setzero_si128();
1515 
1516             for ( ; x <= width - 16; x += 16)
1517             {
1518                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1519                 __m128i v_src_12 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero_i, v_src), 8),
1520                         v_src_34 = _mm_srai_epi16(_mm_unpackhi_epi8(v_zero_i, v_src), 8);
1521                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
1522                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
1523                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1524                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
1525                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_12), 16)), v_scale), v_shift);
1526                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1527                 __m128 v_dst3 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
1528                     _mm_srai_epi32(_mm_unpacklo_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
1529                 v_dst3 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst3), v_dst3);
1530                 __m128 v_dst4 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(
1531                     _mm_srai_epi32(_mm_unpackhi_epi16(v_zero_i, v_src_34), 16)), v_scale), v_shift);
1532                 v_dst4 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst4), v_dst4);
1533 
1534                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)),
1535                                                    _mm_packs_epi32(_mm_cvtps_epi32(v_dst3), _mm_cvtps_epi32(v_dst4)));
1536                 _mm_storeu_si128((__m128i *)(dst + x), v_dst_i);
1537             }
1538         }
1539 
1540         return x;
1541     }
1542 };
1543 
1544 template <>
1545 struct cvtScaleAbs_SIMD<ushort, uchar, float>
1546 {
1547     int operator () (const ushort * src, uchar * dst, int width,
1548                      float scale, float shift) const
1549     {
1550         int x = 0;
1551 
1552         if (USE_SSE2)
1553         {
1554             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1555                 v_zero_f = _mm_setzero_ps();
1556             __m128i v_zero_i = _mm_setzero_si128();
1557 
1558             for ( ; x <= width - 8; x += 8)
1559             {
1560                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1561                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero_i)), v_scale), v_shift);
1562                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1563                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero_i)), v_scale), v_shift);
1564                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1565 
1566                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
1567                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1568             }
1569         }
1570 
1571         return x;
1572     }
1573 };
1574 
1575 template <>
1576 struct cvtScaleAbs_SIMD<short, uchar, float>
1577 {
1578     int operator () (const short * src, uchar * dst, int width,
1579                      float scale, float shift) const
1580     {
1581         int x = 0;
1582 
1583         if (USE_SSE2)
1584         {
1585             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1586                 v_zero_f = _mm_setzero_ps();
1587             __m128i v_zero_i = _mm_setzero_si128();
1588 
1589             for ( ; x <= width - 8; x += 8)
1590             {
1591                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1592                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_src, v_src), 16)), v_scale), v_shift);
1593                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1594                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_src, v_src), 16)), v_scale), v_shift);
1595                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1596 
1597                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2)), v_zero_i);
1598                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1599             }
1600         }
1601 
1602         return x;
1603     }
1604 };
1605 
1606 template <>
1607 struct cvtScaleAbs_SIMD<int, uchar, float>
1608 {
1609     int operator () (const int * src, uchar * dst, int width,
1610                      float scale, float shift) const
1611     {
1612         int x = 0;
1613 
1614         if (USE_SSE2)
1615         {
1616             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1617                 v_zero_f = _mm_setzero_ps();
1618             __m128i v_zero_i = _mm_setzero_si128();
1619 
1620             for ( ; x <= width - 8; x += 4)
1621             {
1622                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src + x));
1623                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
1624                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1625 
1626                 __m128i v_dst_i = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(v_dst1), v_zero_i), v_zero_i);
1627                 _mm_storel_epi64((__m128i *)(dst + x), v_dst_i);
1628             }
1629         }
1630 
1631         return x;
1632     }
1633 };
1634 
1635 template <>
1636 struct cvtScaleAbs_SIMD<float, uchar, float>
1637 {
1638     int operator () (const float * src, uchar * dst, int width,
1639                      float scale, float shift) const
1640     {
1641         int x = 0;
1642 
1643         if (USE_SSE2)
1644         {
1645             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1646                 v_zero_f = _mm_setzero_ps();
1647             __m128i v_zero_i = _mm_setzero_si128();
1648 
1649             for ( ; x <= width - 8; x += 4)
1650             {
1651                 __m128 v_dst = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src + x), v_scale), v_shift);
1652                 v_dst = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst), v_dst);
1653 
1654                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst), v_zero_i);
1655                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
1656             }
1657         }
1658 
1659         return x;
1660     }
1661 };
1662 
1663 template <>
1664 struct cvtScaleAbs_SIMD<double, uchar, float>
1665 {
1666     int operator () (const double * src, uchar * dst, int width,
1667                      float scale, float shift) const
1668     {
1669         int x = 0;
1670 
1671         if (USE_SSE2)
1672         {
1673             __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift),
1674                 v_zero_f = _mm_setzero_ps();
1675             __m128i v_zero_i = _mm_setzero_si128();
1676 
1677             for ( ; x <= width - 8; x += 8)
1678             {
1679                 __m128 v_src1 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
1680                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
1681                 __m128 v_src2 = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
1682                                               _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
1683 
1684                 __m128 v_dst1 = _mm_add_ps(_mm_mul_ps(v_src1, v_scale), v_shift);
1685                 v_dst1 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst1), v_dst1);
1686 
1687                 __m128 v_dst2 = _mm_add_ps(_mm_mul_ps(v_src2, v_scale), v_shift);
1688                 v_dst2 = _mm_max_ps(_mm_sub_ps(v_zero_f, v_dst2), v_dst2);
1689 
1690                 __m128i v_dst_i = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1),
1691                                                   _mm_cvtps_epi32(v_dst2));
1692 
1693                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst_i, v_zero_i));
1694             }
1695         }
1696 
1697         return x;
1698     }
1699 };
1700 
1701 #elif CV_NEON
1702 
1703 template <>
1704 struct cvtScaleAbs_SIMD<uchar, uchar, float>
1705 {
1706     int operator () (const uchar * src, uchar * dst, int width,
1707                      float scale, float shift) const
1708     {
1709         int x = 0;
1710         float32x4_t v_shift = vdupq_n_f32(shift);
1711 
1712         for ( ; x <= width - 16; x += 16)
1713         {
1714             uint8x16_t v_src = vld1q_u8(src + x);
1715             uint16x8_t v_half = vmovl_u8(vget_low_u8(v_src));
1716 
1717             uint32x4_t v_quat = vmovl_u16(vget_low_u16(v_half));
1718             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1719             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1720 
1721             v_quat = vmovl_u16(vget_high_u16(v_half));
1722             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1723             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1724 
1725             v_half = vmovl_u8(vget_high_u8(v_src));
1726 
1727             v_quat = vmovl_u16(vget_low_u16(v_half));
1728             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1729             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
1730 
1731             v_quat = vmovl_u16(vget_high_u16(v_half));
1732             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_u32(v_quat), scale);
1733             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
1734 
1735             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1736                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1737             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
1738                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
1739 
1740             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
1741         }
1742 
1743         return x;
1744     }
1745 };
1746 
1747 template <>
1748 struct cvtScaleAbs_SIMD<schar, uchar, float>
1749 {
1750     int operator () (const schar * src, uchar * dst, int width,
1751                      float scale, float shift) const
1752     {
1753         int x = 0;
1754         float32x4_t v_shift = vdupq_n_f32(shift);
1755 
1756         for ( ; x <= width - 16; x += 16)
1757         {
1758             int8x16_t v_src = vld1q_s8(src + x);
1759             int16x8_t v_half = vmovl_s8(vget_low_s8(v_src));
1760 
1761             int32x4_t v_quat = vmovl_s16(vget_low_s16(v_half));
1762             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1763             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1764 
1765             v_quat = vmovl_s16(vget_high_s16(v_half));
1766             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1767             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1768 
1769             v_half = vmovl_s8(vget_high_s8(v_src));
1770 
1771             v_quat = vmovl_s16(vget_low_s16(v_half));
1772             float32x4_t v_dst_2 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1773             v_dst_2 = vabsq_f32(vaddq_f32(v_dst_2, v_shift));
1774 
1775             v_quat = vmovl_s16(vget_high_s16(v_half));
1776             float32x4_t v_dst_3 = vmulq_n_f32(vcvtq_f32_s32(v_quat), scale);
1777             v_dst_3 = vabsq_f32(vaddq_f32(v_dst_3, v_shift));
1778 
1779             uint16x8_t v_dsti_0 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1780                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1781             uint16x8_t v_dsti_1 = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_2)),
1782                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_3)));
1783 
1784             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_dsti_0), vqmovn_u16(v_dsti_1)));
1785         }
1786 
1787         return x;
1788     }
1789 };
1790 
1791 template <>
1792 struct cvtScaleAbs_SIMD<ushort, uchar, float>
1793 {
1794     int operator () (const ushort * src, uchar * dst, int width,
1795                      float scale, float shift) const
1796     {
1797         int x = 0;
1798         float32x4_t v_shift = vdupq_n_f32(shift);
1799 
1800         for ( ; x <= width - 8; x += 8)
1801         {
1802             uint16x8_t v_src = vld1q_u16(src + x);
1803 
1804             uint32x4_t v_half = vmovl_u16(vget_low_u16(v_src));
1805             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
1806             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1807 
1808             v_half = vmovl_u16(vget_high_u16(v_src));
1809             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_u32(v_half), scale);
1810             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1811 
1812             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1813                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1814 
1815             vst1_u8(dst + x, vqmovn_u16(v_dst));
1816         }
1817 
1818         return x;
1819     }
1820 };
1821 
1822 template <>
1823 struct cvtScaleAbs_SIMD<short, uchar, float>
1824 {
1825     int operator () (const short * src, uchar * dst, int width,
1826                      float scale, float shift) const
1827     {
1828         int x = 0;
1829         float32x4_t v_shift = vdupq_n_f32(shift);
1830 
1831         for ( ; x <= width - 8; x += 8)
1832         {
1833             int16x8_t v_src = vld1q_s16(src + x);
1834 
1835             int32x4_t v_half = vmovl_s16(vget_low_s16(v_src));
1836             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
1837             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1838 
1839             v_half = vmovl_s16(vget_high_s16(v_src));
1840             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(v_half), scale);
1841             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1842 
1843             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst_0)),
1844                 vqmovn_u32(cv_vrndq_u32_f32(v_dst_1)));
1845 
1846             vst1_u8(dst + x, vqmovn_u16(v_dst));
1847         }
1848 
1849         return x;
1850     }
1851 };
1852 
1853 template <>
1854 struct cvtScaleAbs_SIMD<int, uchar, float>
1855 {
1856     int operator () (const int * src, uchar * dst, int width,
1857                      float scale, float shift) const
1858     {
1859         int x = 0;
1860         float32x4_t v_shift = vdupq_n_f32(shift);
1861 
1862         for ( ; x <= width - 8; x += 8)
1863         {
1864             float32x4_t v_dst_0 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x)), scale);
1865             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1866             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
1867 
1868             float32x4_t v_dst_1 = vmulq_n_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), scale);
1869             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1870             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
1871 
1872             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
1873             vst1_u8(dst + x, vqmovn_u16(v_dst));
1874         }
1875 
1876         return x;
1877     }
1878 };
1879 
1880 template <>
1881 struct cvtScaleAbs_SIMD<float, uchar, float>
1882 {
1883     int operator () (const float * src, uchar * dst, int width,
1884                      float scale, float shift) const
1885     {
1886         int x = 0;
1887         float32x4_t v_shift = vdupq_n_f32(shift);
1888 
1889         for ( ; x <= width - 8; x += 8)
1890         {
1891             float32x4_t v_dst_0 = vmulq_n_f32(vld1q_f32(src + x), scale);
1892             v_dst_0 = vabsq_f32(vaddq_f32(v_dst_0, v_shift));
1893             uint16x4_t v_dsti_0 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_0));
1894 
1895             float32x4_t v_dst_1 = vmulq_n_f32(vld1q_f32(src + x + 4), scale);
1896             v_dst_1 = vabsq_f32(vaddq_f32(v_dst_1, v_shift));
1897             uint16x4_t v_dsti_1 = vqmovn_u32(cv_vrndq_u32_f32(v_dst_1));
1898 
1899             uint16x8_t v_dst = vcombine_u16(v_dsti_0, v_dsti_1);
1900             vst1_u8(dst + x, vqmovn_u16(v_dst));
1901         }
1902 
1903         return x;
1904     }
1905 };
1906 
1907 #endif
1908 
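// Row-wise kernel for convertScaleAbs: the SIMD functor processes a prefix of each
// row and returns the number of elements it handled, then the scalar tail computes
// saturate_cast<DT>(|src*scale + shift|) for the remaining elements.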
1909 template<typename T, typename DT, typename WT> static void
1910 cvtScaleAbs_( const T* src, size_t sstep,
1911               DT* dst, size_t dstep, Size size,
1912               WT scale, WT shift )
1913 {
1914     sstep /= sizeof(src[0]);
1915     dstep /= sizeof(dst[0]);
1916     cvtScaleAbs_SIMD<T, DT, WT> vop;
1917 
1918     for( ; size.height--; src += sstep, dst += dstep )
1919     {
1920         int x = vop(src, dst, size.width, scale, shift);
1921 
1922         #if CV_ENABLE_UNROLLED
1923         for( ; x <= size.width - 4; x += 4 )
1924         {
1925             DT t0, t1;
1926             t0 = saturate_cast<DT>(std::abs(src[x]*scale + shift));
1927             t1 = saturate_cast<DT>(std::abs(src[x+1]*scale + shift));
1928             dst[x] = t0; dst[x+1] = t1;
1929             t0 = saturate_cast<DT>(std::abs(src[x+2]*scale + shift));
1930             t1 = saturate_cast<DT>(std::abs(src[x+3]*scale + shift));
1931             dst[x+2] = t0; dst[x+3] = t1;
1932         }
1933         #endif
1934         for( ; x < size.width; x++ )
1935             dst[x] = saturate_cast<DT>(std::abs(src[x]*scale + shift));
1936     }
1937 }
1938 
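// cvtScale_SIMD mirrors cvtScaleAbs_SIMD but without the absolute value: the
// generic functor returns 0 and the specializations below vectorize the supported
// source/destination type combinations. The SSE4.1 variants additionally verify
// support at runtime via checkHardwareSupport(CV_CPU_SSE4_1).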
1939 template <typename T, typename DT, typename WT>
1940 struct cvtScale_SIMD
1941 {
1942     int operator () (const T *, DT *, int, WT, WT) const
1943     {
1944         return 0;
1945     }
1946 };
1947 
1948 #if CV_SSE2
1949 
1950 // from uchar
1951 
1952 template <>
1953 struct cvtScale_SIMD<uchar, uchar, float>
1954 {
1955     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
1956     {
1957         int x = 0;
1958 
1959         if (!USE_SSE2)
1960             return x;
1961 
1962         __m128i v_zero = _mm_setzero_si128();
1963         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
1964 
1965         for ( ; x <= width - 8; x += 8)
1966         {
1967             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
1968             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
1969             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
1970 
1971             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
1972             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
1973 
1974             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
1975                                             _mm_cvtps_epi32(v_dst_1));
1976             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
1977         }
1978 
1979         return x;
1980     }
1981 };
1982 
1983 template <>
1984 struct cvtScale_SIMD<uchar, schar, float>
1985 {
1986     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
1987     {
1988         int x = 0;
1989 
1990         if (!USE_SSE2)
1991             return x;
1992 
1993         __m128i v_zero = _mm_setzero_si128();
1994         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
1995 
1996         for ( ; x <= width - 8; x += 8)
1997         {
1998             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
1999             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2000             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2001 
2002             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2003             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2004 
2005             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2006                                             _mm_cvtps_epi32(v_dst_1));
2007             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
2008         }
2009 
2010         return x;
2011     }
2012 };
2013 
2014 #if CV_SSE4_1
2015 
2016 template <>
2017 struct cvtScale_SIMD<uchar, ushort, float>
2018 {
2019     cvtScale_SIMD()
2020     {
2021         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2022     }
2023 
2024     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
2025     {
2026         int x = 0;
2027 
2028         if (!haveSSE)
2029             return x;
2030 
2031         __m128i v_zero = _mm_setzero_si128();
2032         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2033 
2034         for ( ; x <= width - 8; x += 8)
2035         {
2036             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
2037             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2038             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2039 
2040             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2041             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2042 
2043             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
2044                                              _mm_cvtps_epi32(v_dst_1));
2045             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2046         }
2047 
2048         return x;
2049     }
2050 
2051     bool haveSSE;
2052 };
2053 
2054 #endif
2055 
2056 template <>
2057 struct cvtScale_SIMD<uchar, short, float>
2058 {
2059     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
2060     {
2061         int x = 0;
2062 
2063         if (!USE_SSE2)
2064             return x;
2065 
2066         __m128i v_zero = _mm_setzero_si128();
2067         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2068 
2069         for ( ; x <= width - 8; x += 8)
2070         {
2071             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
2072             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2073             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2074 
2075             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2076             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2077 
2078             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2079                                             _mm_cvtps_epi32(v_dst_1));
2080             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2081         }
2082 
2083         return x;
2084     }
2085 };
2086 
2087 template <>
2088 struct cvtScale_SIMD<uchar, int, float>
2089 {
2090     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
2091     {
2092         int x = 0;
2093 
2094         if (!USE_SSE2)
2095             return x;
2096 
2097         __m128i v_zero = _mm_setzero_si128();
2098         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2099 
2100         for ( ; x <= width - 8; x += 8)
2101         {
2102             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
2103             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2104             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2105 
2106             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2107             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2108 
2109             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
2110             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
2111         }
2112 
2113         return x;
2114     }
2115 };
2116 
2117 template <>
2118 struct cvtScale_SIMD<uchar, float, float>
2119 {
2120     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
2121     {
2122         int x = 0;
2123 
2124         if (!USE_SSE2)
2125             return x;
2126 
2127         __m128i v_zero = _mm_setzero_si128();
2128         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2129 
2130         for ( ; x <= width - 8; x += 8)
2131         {
2132             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
2133             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2134             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2135 
2136             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2137             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2138 
2139             _mm_storeu_ps(dst + x, v_dst_0);
2140             _mm_storeu_ps(dst + x + 4, v_dst_1);
2141         }
2142 
2143         return x;
2144     }
2145 };
2146 
2147 template <>
2148 struct cvtScale_SIMD<uchar, double, double>
2149 {
2150     int operator () (const uchar * src, double * dst, int width, double scale, double shift) const
2151     {
2152         int x = 0;
2153 
2154         if (!USE_SSE2)
2155             return x;
2156 
2157         __m128i v_zero = _mm_setzero_si128();
2158         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
2159 
2160         for ( ; x <= width - 8; x += 8)
2161         {
2162             __m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
2163 
2164             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
2165             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2166             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2167             _mm_storeu_pd(dst + x, v_dst_0);
2168             _mm_storeu_pd(dst + x + 2, v_dst_1);
2169 
2170             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
2171             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2172             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2173             _mm_storeu_pd(dst + x + 4, v_dst_0);
2174             _mm_storeu_pd(dst + x + 6, v_dst_1);
2175         }
2176 
2177         return x;
2178     }
2179 };
2180 
2181 // from schar
2182 
2183 template <>
2184 struct cvtScale_SIMD<schar, uchar, float>
2185 {
2186     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
2187     {
2188         int x = 0;
2189 
2190         if (!USE_SSE2)
2191             return x;
2192 
2193         __m128i v_zero = _mm_setzero_si128();
2194         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2195 
2196         for ( ; x <= width - 8; x += 8)
2197         {
2198             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2199             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2200             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2201 
2202             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2203             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2204 
2205             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2206                                             _mm_cvtps_epi32(v_dst_1));
2207             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
2208         }
2209 
2210         return x;
2211     }
2212 };
2213 
2214 template <>
2215 struct cvtScale_SIMD<schar, schar, float>
2216 {
2217     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
2218     {
2219         int x = 0;
2220 
2221         if (!USE_SSE2)
2222             return x;
2223 
2224         __m128i v_zero = _mm_setzero_si128();
2225         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2226 
2227         for ( ; x <= width - 8; x += 8)
2228         {
2229             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2230             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2231             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2232 
2233             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2234             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2235 
2236             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2237                                             _mm_cvtps_epi32(v_dst_1));
2238             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
2239         }
2240 
2241         return x;
2242     }
2243 };
2244 
2245 #if CV_SSE4_1
2246 
2247 template <>
2248 struct cvtScale_SIMD<schar, ushort, float>
2249 {
2250     cvtScale_SIMD()
2251     {
2252         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2253     }
2254 
2255     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
2256     {
2257         int x = 0;
2258 
2259         if (!haveSSE)
2260             return x;
2261 
2262         __m128i v_zero = _mm_setzero_si128();
2263         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2264 
2265         for ( ; x <= width - 8; x += 8)
2266         {
2267             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2268             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2269             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2270 
2271             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2272             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2273 
2274             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
2275                                              _mm_cvtps_epi32(v_dst_1));
2276             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2277         }
2278 
2279         return x;
2280     }
2281 
2282     bool haveSSE;
2283 };
2284 
2285 #endif
2286 
2287 template <>
2288 struct cvtScale_SIMD<schar, short, float>
2289 {
2290     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
2291     {
2292         int x = 0;
2293 
2294         if (!USE_SSE2)
2295             return x;
2296 
2297         __m128i v_zero = _mm_setzero_si128();
2298         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2299 
2300         for ( ; x <= width - 8; x += 8)
2301         {
2302             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2303             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2304             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2305 
2306             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2307             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2308 
2309             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2310                                             _mm_cvtps_epi32(v_dst_1));
2311             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2312         }
2313 
2314         return x;
2315     }
2316 };
2317 
2318 template <>
2319 struct cvtScale_SIMD<schar, int, float>
2320 {
2321     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
2322     {
2323         int x = 0;
2324 
2325         if (!USE_SSE2)
2326             return x;
2327 
2328         __m128i v_zero = _mm_setzero_si128();
2329         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2330 
2331         for ( ; x <= width - 8; x += 8)
2332         {
2333             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2334             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2335             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2336 
2337             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2338             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2339 
2340             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
2341             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
2342         }
2343 
2344         return x;
2345     }
2346 };
2347 
2348 template <>
2349 struct cvtScale_SIMD<schar, float, float>
2350 {
2351     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
2352     {
2353         int x = 0;
2354 
2355         if (!USE_SSE2)
2356             return x;
2357 
2358         __m128i v_zero = _mm_setzero_si128();
2359         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2360 
2361         for ( ; x <= width - 8; x += 8)
2362         {
2363             __m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
2364             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2365             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2366 
2367             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2368             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2369 
2370             _mm_storeu_ps(dst + x, v_dst_0);
2371             _mm_storeu_ps(dst + x + 4, v_dst_1);
2372         }
2373 
2374         return x;
2375     }
2376 };
2377 
2378 template <>
2379 struct cvtScale_SIMD<schar, double, double>
2380 {
2381     int operator () (const schar * src, double * dst, int width, double scale, double shift) const
2382     {
2383         int x = 0;
2384 
2385         if (!USE_SSE2)
2386             return x;
2387 
2388         __m128i v_zero = _mm_setzero_si128();
2389         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
2390 
2391         for ( ; x <= width - 8; x += 8)
2392         {
2393             __m128i v_src = _mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x)));
2394             v_src = _mm_srai_epi16(v_src, 8);
2395 
2396             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
2397             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2398             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2399             _mm_storeu_pd(dst + x, v_dst_0);
2400             _mm_storeu_pd(dst + x + 2, v_dst_1);
2401 
2402             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
2403             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2404             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2405             _mm_storeu_pd(dst + x + 4, v_dst_0);
2406             _mm_storeu_pd(dst + x + 6, v_dst_1);
2407         }
2408 
2409         return x;
2410     }
2411 };
2412 
2413 // from ushort
2414 
2415 template <>
2416 struct cvtScale_SIMD<ushort, uchar, float>
2417 {
2418     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
2419     {
2420         int x = 0;
2421 
2422         if (!USE_SSE2)
2423             return x;
2424 
2425         __m128i v_zero = _mm_setzero_si128();
2426         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2427 
2428         for ( ; x <= width - 8; x += 8)
2429         {
2430             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2431             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2432             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2433 
2434             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2435             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2436 
2437             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2438                                             _mm_cvtps_epi32(v_dst_1));
2439             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
2440         }
2441 
2442         return x;
2443     }
2444 };
2445 
2446 template <>
2447 struct cvtScale_SIMD<ushort, schar, float>
2448 {
2449     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
2450     {
2451         int x = 0;
2452 
2453         if (!USE_SSE2)
2454             return x;
2455 
2456         __m128i v_zero = _mm_setzero_si128();
2457         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2458 
2459         for ( ; x <= width - 8; x += 8)
2460         {
2461             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2462             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2463             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2464 
2465             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2466             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2467 
2468             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2469                                             _mm_cvtps_epi32(v_dst_1));
2470             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
2471         }
2472 
2473         return x;
2474     }
2475 };
2476 
2477 #if CV_SSE4_1
2478 
2479 template <>
2480 struct cvtScale_SIMD<ushort, ushort, float>
2481 {
2482     cvtScale_SIMD()
2483     {
2484         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2485     }
2486 
2487     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
2488     {
2489         int x = 0;
2490 
2491         if (!haveSSE)
2492             return x;
2493 
2494         __m128i v_zero = _mm_setzero_si128();
2495         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2496 
2497         for ( ; x <= width - 8; x += 8)
2498         {
2499             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2500             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2501             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2502 
2503             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2504             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2505 
2506             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
2507                                              _mm_cvtps_epi32(v_dst_1));
2508             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2509         }
2510 
2511         return x;
2512     }
2513 
2514     bool haveSSE;
2515 };
2516 
2517 #endif
2518 
2519 template <>
2520 struct cvtScale_SIMD<ushort, short, float>
2521 {
2522     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
2523     {
2524         int x = 0;
2525 
2526         if (!USE_SSE2)
2527             return x;
2528 
2529         __m128i v_zero = _mm_setzero_si128();
2530         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2531 
2532         for ( ; x <= width - 8; x += 8)
2533         {
2534             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2535             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2536             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2537 
2538             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2539             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2540 
2541             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2542                                             _mm_cvtps_epi32(v_dst_1));
2543             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2544         }
2545 
2546         return x;
2547     }
2548 };
2549 
2550 template <>
2551 struct cvtScale_SIMD<ushort, int, float>
2552 {
2553     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
2554     {
2555         int x = 0;
2556 
2557         if (!USE_SSE2)
2558             return x;
2559 
2560         __m128i v_zero = _mm_setzero_si128();
2561         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2562 
2563         for ( ; x <= width - 8; x += 8)
2564         {
2565             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2566             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2567             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2568 
2569             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2570             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2571 
2572             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
2573             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
2574         }
2575 
2576         return x;
2577     }
2578 };
2579 
2580 template <>
2581 struct cvtScale_SIMD<ushort, float, float>
2582 {
2583     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
2584     {
2585         int x = 0;
2586 
2587         if (!USE_SSE2)
2588             return x;
2589 
2590         __m128i v_zero = _mm_setzero_si128();
2591         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2592 
2593         for ( ; x <= width - 8; x += 8)
2594         {
2595             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2596             __m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
2597             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2598 
2599             v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
2600             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2601 
2602             _mm_storeu_ps(dst + x, v_dst_0);
2603             _mm_storeu_ps(dst + x + 4, v_dst_1);
2604         }
2605 
2606         return x;
2607     }
2608 };
2609 
2610 template <>
2611 struct cvtScale_SIMD<ushort, double, double>
2612 {
2613     int operator () (const ushort * src, double * dst, int width, double scale, double shift) const
2614     {
2615         int x = 0;
2616 
2617         if (!USE_SSE2)
2618             return x;
2619 
2620         __m128i v_zero = _mm_setzero_si128();
2621         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
2622 
2623         for ( ; x <= width - 8; x += 8)
2624         {
2625             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2626 
2627             __m128i v_src_s32 = _mm_unpacklo_epi16(v_src, v_zero);
2628             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2629             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2630             _mm_storeu_pd(dst + x, v_dst_0);
2631             _mm_storeu_pd(dst + x + 2, v_dst_1);
2632 
2633             v_src_s32 = _mm_unpackhi_epi16(v_src, v_zero);
2634             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2635             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2636             _mm_storeu_pd(dst + x + 4, v_dst_0);
2637             _mm_storeu_pd(dst + x + 6, v_dst_1);
2638         }
2639 
2640         return x;
2641     }
2642 };
2643 
2644 // from short
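// The short kernels widen each 16-bit lane to 32 bits with its sign preserved:
// the value is unpacked into the high half of a 32-bit lane (against zero) and
// then shifted right arithmetically by 16, which amounts to sign extension.
// The widened values are converted to float, scaled, shifted, rounded and
// packed back down to the destination type with saturation.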
2645 
2646 template <>
2647 struct cvtScale_SIMD<short, uchar, float>
2648 {
2649     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
2650     {
2651         int x = 0;
2652 
2653         if (!USE_SSE2)
2654             return x;
2655 
2656         __m128i v_zero = _mm_setzero_si128();
2657         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2658 
2659         for ( ; x <= width - 8; x += 8)
2660         {
2661             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2662             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2663             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2664 
2665             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2666             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2667 
2668             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2669                                             _mm_cvtps_epi32(v_dst_1));
2670             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
2671         }
2672 
2673         return x;
2674     }
2675 };
2676 
2677 template <>
2678 struct cvtScale_SIMD<short, schar, float>
2679 {
2680     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
2681     {
2682         int x = 0;
2683 
2684         if (!USE_SSE2)
2685             return x;
2686 
2687         __m128i v_zero = _mm_setzero_si128();
2688         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2689 
2690         for ( ; x <= width - 8; x += 8)
2691         {
2692             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2693             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2694             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2695 
2696             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2697             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2698 
2699             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2700                                             _mm_cvtps_epi32(v_dst_1));
2701             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
2702         }
2703 
2704         return x;
2705     }
2706 };
2707 
2708 #if CV_SSE4_1
2709 
2710 template <>
2711 struct cvtScale_SIMD<short, ushort, float>
2712 {
2713     cvtScale_SIMD()
2714     {
2715         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2716     }
2717 
2718     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
2719     {
2720         int x = 0;
2721 
2722         if (!haveSSE)
2723             return x;
2724 
2725         __m128i v_zero = _mm_setzero_si128();
2726         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2727 
2728         for ( ; x <= width - 8; x += 8)
2729         {
2730             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2731             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2732             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2733 
2734             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2735             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2736 
2737             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
2738                                              _mm_cvtps_epi32(v_dst_1));
2739             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2740         }
2741 
2742         return x;
2743     }
2744 
2745     bool haveSSE;
2746 };
2747 
2748 #endif
2749 
2750 template <>
2751 struct cvtScale_SIMD<short, short, float>
2752 {
2753     int operator () (const short * src, short * dst, int width, float scale, float shift) const
2754     {
2755         int x = 0;
2756 
2757         if (!USE_SSE2)
2758             return x;
2759 
2760         __m128i v_zero = _mm_setzero_si128();
2761         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2762 
2763         for ( ; x <= width - 8; x += 8)
2764         {
2765             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2766             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2767             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2768 
2769             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2770             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2771 
2772             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2773                                             _mm_cvtps_epi32(v_dst_1));
2774             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2775         }
2776 
2777         return x;
2778     }
2779 };
2780 
2781 template <>
2782 struct cvtScale_SIMD<short, int, float>
2783 {
2784     int operator () (const short * src, int * dst, int width, float scale, float shift) const
2785     {
2786         int x = 0;
2787 
2788         if (!USE_SSE2)
2789             return x;
2790 
2791         __m128i v_zero = _mm_setzero_si128();
2792         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2793 
2794         for ( ; x <= width - 8; x += 8)
2795         {
2796             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2797             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2798             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2799 
2800             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2801             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2802 
2803             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
2804             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
2805         }
2806 
2807         return x;
2808     }
2809 };
2810 
2811 template <>
2812 struct cvtScale_SIMD<short, float, float>
2813 {
2814     int operator () (const short * src, float * dst, int width, float scale, float shift) const
2815     {
2816         int x = 0;
2817 
2818         if (!USE_SSE2)
2819             return x;
2820 
2821         __m128i v_zero = _mm_setzero_si128();
2822         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2823 
2824         for ( ; x <= width - 8; x += 8)
2825         {
2826             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2827             __m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
2828             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2829 
2830             v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
2831             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
2832 
2833             _mm_storeu_ps(dst + x, v_dst_0);
2834             _mm_storeu_ps(dst + x + 4, v_dst_1);
2835         }
2836 
2837         return x;
2838     }
2839 };
2840 
2841 template <>
2842 struct cvtScale_SIMD<short, double, double>
2843 {
2844     int operator () (const short * src, double * dst, int width, double scale, double shift) const
2845     {
2846         int x = 0;
2847 
2848         if (!USE_SSE2)
2849             return x;
2850 
2851         __m128i v_zero = _mm_setzero_si128();
2852         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
2853 
2854         for ( ; x <= width - 8; x += 8)
2855         {
2856             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2857 
2858             __m128i v_src_s32 = _mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16);
2859             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2860             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2861             _mm_storeu_pd(dst + x, v_dst_0);
2862             _mm_storeu_pd(dst + x + 2, v_dst_1);
2863 
2864             v_src_s32 = _mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16);
2865             v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src_s32), v_scale), v_shift);
2866             v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(_mm_srli_si128(v_src_s32, 8)), v_scale), v_shift);
2867             _mm_storeu_pd(dst + x + 4, v_dst_0);
2868             _mm_storeu_pd(dst + x + 6, v_dst_1);
2869         }
2870 
2871         return x;
2872     }
2873 };
2874 
2875 // from int
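// 32-bit sources are loaded four at a time and converted directly with
// _mm_cvtepi32_ps.  Kernels with a narrow destination (uchar/schar/ushort/short)
// keep the scale/shift in float, while the int, float and double destinations
// use a double intermediate so large 32-bit values are not truncated by the
// 24-bit float mantissa.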
2876 
2877 template <>
2878 struct cvtScale_SIMD<int, uchar, float>
2879 {
2880     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
2881     {
2882         int x = 0;
2883 
2884         if (!USE_SSE2)
2885             return x;
2886 
2887         __m128i v_zero = _mm_setzero_si128();
2888         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2889 
2890         for ( ; x <= width - 8; x += 8)
2891         {
2892             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2893             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2894 
2895             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
2896             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2897 
2898             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2899                                             _mm_cvtps_epi32(v_dst_1));
2900             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
2901         }
2902 
2903         return x;
2904     }
2905 };
2906 
2907 template <>
2908 struct cvtScale_SIMD<int, schar, float>
2909 {
2910     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
2911     {
2912         int x = 0;
2913 
2914         if (!USE_SSE2)
2915             return x;
2916 
2917         __m128i v_zero = _mm_setzero_si128();
2918         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2919 
2920         for ( ; x <= width - 8; x += 8)
2921         {
2922             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2923             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2924 
2925             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
2926             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2927 
2928             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2929                                             _mm_cvtps_epi32(v_dst_1));
2930             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
2931         }
2932 
2933         return x;
2934     }
2935 };
2936 
2937 #if CV_SSE4_1
2938 
2939 template <>
2940 struct cvtScale_SIMD<int, ushort, float>
2941 {
2942     cvtScale_SIMD()
2943     {
2944         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2945     }
2946 
2947     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
2948     {
2949         int x = 0;
2950 
2951         if (!haveSSE)
2952             return x;
2953 
2954         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2955 
2956         for ( ; x <= width - 8; x += 8)
2957         {
2958             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2959             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2960 
2961             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
2962             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2963 
2964             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
2965                                              _mm_cvtps_epi32(v_dst_1));
2966             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
2967         }
2968 
2969         return x;
2970     }
2971 
2972     bool haveSSE;
2973 };
2974 
2975 #endif
2976 
2977 template <>
2978 struct cvtScale_SIMD<int, short, float>
2979 {
2980     int operator () (const int * src, short * dst, int width, float scale, float shift) const
2981     {
2982         int x = 0;
2983 
2984         if (!USE_SSE2)
2985             return x;
2986 
2987         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
2988 
2989         for ( ; x <= width - 8; x += 8)
2990         {
2991             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
2992             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2993 
2994             v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
2995             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
2996 
2997             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
2998                                             _mm_cvtps_epi32(v_dst_1));
2999             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
3000         }
3001 
3002         return x;
3003     }
3004 };
3005 
3006 template <>
3007 struct cvtScale_SIMD<int, int, double>
3008 {
3009     int operator () (const int * src, int * dst, int width, double scale, double shift) const
3010     {
3011         int x = 0;
3012 
3013         if (!USE_SSE2)
3014             return x;
3015 
3016         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3017 
3018         for ( ; x <= width - 4; x += 4)
3019         {
3020             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
3021             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3022 
3023             v_src = _mm_srli_si128(v_src, 8);
3024             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3025 
3026             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_0)),
3027                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst_1)));
3028 
3029             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
3030         }
3031 
3032         return x;
3033     }
3034 };
3035 
3036 template <>
3037 struct cvtScale_SIMD<int, float, double>
3038 {
3039     int operator () (const int * src, float * dst, int width, double scale, double shift) const
3040     {
3041         int x = 0;
3042 
3043         if (!USE_SSE2)
3044             return x;
3045 
3046         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3047 
3048         for ( ; x <= width - 4; x += 4)
3049         {
3050             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
3051             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3052 
3053             v_src = _mm_srli_si128(v_src, 8);
3054             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3055 
3056             _mm_storeu_ps(dst + x, _mm_movelh_ps(_mm_cvtpd_ps(v_dst_0),
3057                                                  _mm_cvtpd_ps(v_dst_1)));
3058         }
3059 
3060         return x;
3061     }
3062 };
3063 
3064 template <>
3065 struct cvtScale_SIMD<int, double, double>
3066 {
3067     int operator () (const int * src, double * dst, int width, double scale, double shift) const
3068     {
3069         int x = 0;
3070 
3071         if (!USE_SSE2)
3072             return x;
3073 
3074         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3075 
3076         for ( ; x <= width - 4; x += 4)
3077         {
3078             __m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
3079             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3080 
3081             v_src = _mm_srli_si128(v_src, 8);
3082             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtepi32_pd(v_src), v_scale), v_shift);
3083 
3084             _mm_storeu_pd(dst + x, v_dst_0);
3085             _mm_storeu_pd(dst + x + 2, v_dst_1);
3086         }
3087 
3088         return x;
3089     }
3090 };
3091 
3092 // from float
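// float sources need no widening: four values are loaded per __m128, multiplied
// by the scale and offset by the shift, then either stored as-is, rounded to
// integers with _mm_cvtps_epi32, or promoted to double with _mm_cvtps_pd.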
3093 
3094 template <>
3095 struct cvtScale_SIMD<float, uchar, float>
3096 {
3097     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
3098     {
3099         int x = 0;
3100 
3101         if (!USE_SSE2)
3102             return x;
3103 
3104         __m128i v_zero = _mm_setzero_si128();
3105         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3106 
3107         for ( ; x <= width - 8; x += 8)
3108         {
3109             __m128 v_src = _mm_loadu_ps(src + x);
3110             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3111 
3112             v_src = _mm_loadu_ps(src + x + 4);
3113             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3114 
3115             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3116                                             _mm_cvtps_epi32(v_dst_1));
3117             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
3118         }
3119 
3120         return x;
3121     }
3122 };
3123 
3124 template <>
3125 struct cvtScale_SIMD<float, schar, float>
3126 {
3127     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
3128     {
3129         int x = 0;
3130 
3131         if (!USE_SSE2)
3132             return x;
3133 
3134         __m128i v_zero = _mm_setzero_si128();
3135         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3136 
3137         for ( ; x <= width - 8; x += 8)
3138         {
3139             __m128 v_src = _mm_loadu_ps(src + x);
3140             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3141 
3142             v_src = _mm_loadu_ps(src + x + 4);
3143             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3144 
3145             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3146                                             _mm_cvtps_epi32(v_dst_1));
3147             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
3148         }
3149 
3150         return x;
3151     }
3152 };
3153 
3154 #if CV_SSE4_1
3155 
3156 template <>
3157 struct cvtScale_SIMD<float, ushort, float>
3158 {
3159     cvtScale_SIMD()
3160     {
3161         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
3162     }
3163 
3164     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
3165     {
3166         int x = 0;
3167 
3168         if (!haveSSE)
3169             return x;
3170 
3171         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3172 
3173         for ( ; x <= width - 8; x += 8)
3174         {
3175             __m128 v_src = _mm_loadu_ps(src + x);
3176             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3177 
3178             v_src = _mm_loadu_ps(src + x + 4);
3179             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3180 
3181             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
3182                                              _mm_cvtps_epi32(v_dst_1));
3183             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
3184         }
3185 
3186         return x;
3187     }
3188 
3189     bool haveSSE;
3190 };
3191 
3192 #endif
3193 
3194 template <>
3195 struct cvtScale_SIMD<float, short, float>
3196 {
3197     int operator () (const float * src, short * dst, int width, float scale, float shift) const
3198     {
3199         int x = 0;
3200 
3201         if (!USE_SSE2)
3202             return x;
3203 
3204         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3205 
3206         for ( ; x <= width - 8; x += 8)
3207         {
3208             __m128 v_src = _mm_loadu_ps(src + x);
3209             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3210 
3211             v_src = _mm_loadu_ps(src + x + 4);
3212             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3213 
3214             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3215                                             _mm_cvtps_epi32(v_dst_1));
3216             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
3217         }
3218 
3219         return x;
3220     }
3221 };
3222 
3223 template <>
3224 struct cvtScale_SIMD<float, int, float>
3225 {
3226     int operator () (const float * src, int * dst, int width, float scale, float shift) const
3227     {
3228         int x = 0;
3229 
3230         if (!USE_SSE2)
3231             return x;
3232 
3233         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3234 
3235         for ( ; x <= width - 8; x += 8)
3236         {
3237             __m128 v_src = _mm_loadu_ps(src + x);
3238             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3239 
3240             v_src = _mm_loadu_ps(src + x + 4);
3241             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3242 
3243             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_dst_0));
3244             _mm_storeu_si128((__m128i *)(dst + x + 4), _mm_cvtps_epi32(v_dst_1));
3245         }
3246 
3247         return x;
3248     }
3249 };
3250 
3251 template <>
3252 struct cvtScale_SIMD<float, float, float>
3253 {
3254     int operator () (const float * src, float * dst, int width, float scale, float shift) const
3255     {
3256         int x = 0;
3257 
3258         if (!USE_SSE2)
3259             return x;
3260 
3261         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3262 
3263         for ( ; x <= width - 4; x += 4)
3264         {
3265             __m128 v_src = _mm_loadu_ps(src + x);
3266             __m128 v_dst = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3267             _mm_storeu_ps(dst + x, v_dst);
3268         }
3269 
3270         return x;
3271     }
3272 };
3273 
3274 template <>
3275 struct cvtScale_SIMD<float, double, double>
3276 {
3277     int operator () (const float * src, double * dst, int width, double scale, double shift) const
3278     {
3279         int x = 0;
3280 
3281         if (!USE_SSE2)
3282             return x;
3283 
3284         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3285 
3286         for ( ; x <= width - 4; x += 4)
3287         {
3288             __m128 v_src = _mm_loadu_ps(src + x);
3289             __m128d v_dst_0 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
3290             v_src = _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(v_src), 8));
3291             __m128d v_dst_1 = _mm_add_pd(_mm_mul_pd(_mm_cvtps_pd(v_src), v_scale), v_shift);
3292 
3293             _mm_storeu_pd(dst + x, v_dst_0);
3294             _mm_storeu_pd(dst + x + 2, v_dst_1);
3295         }
3296 
3297         return x;
3298     }
3299 };
3300 
3301 // from double
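// double sources are processed two lanes at a time; for the float-based kernels
// each pair is narrowed with _mm_cvtpd_ps and two pairs are merged into a single
// __m128 with _mm_movelh_ps before the usual scale/shift/pack sequence.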
3302 
3303 template <>
3304 struct cvtScale_SIMD<double, uchar, float>
3305 {
3306     int operator () (const double * src, uchar * dst, int width, float scale, float shift) const
3307     {
3308         int x = 0;
3309 
3310         if (!USE_SSE2)
3311             return x;
3312 
3313         __m128i v_zero = _mm_setzero_si128();
3314         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3315 
3316         for ( ; x <= width - 8; x += 8)
3317         {
3318             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
3319                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
3320             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3321 
3322             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
3323                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
3324             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3325 
3326             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3327                                             _mm_cvtps_epi32(v_dst_1));
3328             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_zero));
3329         }
3330 
3331         return x;
3332     }
3333 };
3334 
3335 template <>
3336 struct cvtScale_SIMD<double, schar, float>
3337 {
3338     int operator () (const double * src, schar * dst, int width, float scale, float shift) const
3339     {
3340         int x = 0;
3341 
3342         if (!USE_SSE2)
3343             return x;
3344 
3345         __m128i v_zero = _mm_setzero_si128();
3346         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3347 
3348         for ( ; x <= width - 8; x += 8)
3349         {
3350             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
3351                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
3352             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3353 
3354             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
3355                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
3356             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3357 
3358             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3359                                             _mm_cvtps_epi32(v_dst_1));
3360             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_zero));
3361         }
3362 
3363         return x;
3364     }
3365 };
3366 
3367 #if CV_SSE4_1
3368 
3369 template <>
3370 struct cvtScale_SIMD<double, ushort, float>
3371 {
3372     cvtScale_SIMD()
3373     {
3374         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
3375     }
3376 
3377     int operator () (const double * src, ushort * dst, int width, float scale, float shift) const
3378     {
3379         int x = 0;
3380 
3381         if (!haveSSE)
3382             return x;
3383 
3384         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3385 
3386         for ( ; x <= width - 8; x += 8)
3387         {
3388             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
3389                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
3390             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3391 
3392             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
3393                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
3394             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3395 
3396             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
3397                                              _mm_cvtps_epi32(v_dst_1));
3398             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
3399         }
3400 
3401         return x;
3402     }
3403 
3404     bool haveSSE;
3405 };
3406 
3407 #endif
3408 
3409 template <>
3410 struct cvtScale_SIMD<double, short, float>
3411 {
3412     int operator () (const double * src, short * dst, int width, float scale, float shift) const
3413     {
3414         int x = 0;
3415 
3416         if (!USE_SSE2)
3417             return x;
3418 
3419         __m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
3420 
3421         for ( ; x <= width - 8; x += 8)
3422         {
3423             __m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
3424                                          _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
3425             __m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3426 
3427             v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
3428                                   _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
3429             __m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
3430 
3431             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_dst_0),
3432                                             _mm_cvtps_epi32(v_dst_1));
3433             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
3434         }
3435 
3436         return x;
3437     }
3438 };
3439 
3440 template <>
3441 struct cvtScale_SIMD<double, int, double>
3442 {
3443     int operator () (const double * src, int * dst, int width, double scale, double shift) const
3444     {
3445         int x = 0;
3446 
3447         if (!USE_SSE2)
3448             return x;
3449 
3450         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3451 
3452         for ( ; x <= width - 4; x += 4)
3453         {
3454             __m128d v_src = _mm_loadu_pd(src + x);
3455             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
3456 
3457             v_src = _mm_loadu_pd(src + x + 2);
3458             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
3459 
3460             __m128 v_dst = _mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_dst0)),
3461                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_dst1)));
3462 
3463             _mm_storeu_si128((__m128i *)(dst + x), _mm_castps_si128(v_dst));
3464         }
3465 
3466         return x;
3467     }
3468 };
3469 
3470 template <>
3471 struct cvtScale_SIMD<double, float, double>
3472 {
3473     int operator () (const double * src, float * dst, int width, double scale, double shift) const
3474     {
3475         int x = 0;
3476 
3477         if (!USE_SSE2)
3478             return x;
3479 
3480         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3481 
3482         for ( ; x <= width - 4; x += 4)
3483         {
3484             __m128d v_src = _mm_loadu_pd(src + x);
3485             __m128d v_dst0 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
3486 
3487             v_src = _mm_loadu_pd(src + x + 2);
3488             __m128d v_dst1 = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
3489 
3490             __m128 v_dst = _mm_movelh_ps(_mm_cvtpd_ps(v_dst0),
3491                                          _mm_cvtpd_ps(v_dst1));
3492 
3493             _mm_storeu_ps(dst + x, v_dst);
3494         }
3495 
3496         return x;
3497     }
3498 };
3499 
3500 template <>
3501 struct cvtScale_SIMD<double, double, double>
3502 {
3503     int operator () (const double * src, double * dst, int width, double scale, double shift) const
3504     {
3505         int x = 0;
3506 
3507         if (!USE_SSE2)
3508             return x;
3509 
3510         __m128d v_scale = _mm_set1_pd(scale), v_shift = _mm_set1_pd(shift);
3511 
3512         for ( ; x <= width - 2; x += 2)
3513         {
3514             __m128d v_src = _mm_loadu_pd(src + x);
3515             __m128d v_dst = _mm_add_pd(_mm_mul_pd(v_src, v_scale), v_shift);
3516             _mm_storeu_pd(dst + x, v_dst);
3517         }
3518 
3519         return x;
3520     }
3521 };
3522 
3523 #elif CV_NEON
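// NEON variants of the same scale/shift kernels.  CV_NEON is a compile-time
// switch, so no run-time capability check is needed; rounding float-to-int
// conversion goes through the cv_vrndq_* helpers and narrowing uses the
// saturating vqmovn family.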
3524 
3525 // from uchar
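// uchar lanes are widened in two steps (vmovl_u8 to u16, then vmovl_u16 to u32)
// before the 32-bit values are converted to float with vcvtq_f32_u32.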
3526 
3527 template <>
3528 struct cvtScale_SIMD<uchar, uchar, float>
3529 {
3530     int operator () (const uchar * src, uchar * dst, int width, float scale, float shift) const
3531     {
3532         int x = 0;
3533         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3534 
3535         for ( ; x <= width - 8; x += 8)
3536         {
3537             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3538             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3539             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3540 
3541             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3542                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3543             vst1_u8(dst + x, vqmovn_u16(v_dst));
3544         }
3545 
3546         return x;
3547     }
3548 };
3549 
3550 template <>
3551 struct cvtScale_SIMD<uchar, schar, float>
3552 {
3553     int operator () (const uchar * src, schar * dst, int width, float scale, float shift) const
3554     {
3555         int x = 0;
3556         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3557 
3558         for ( ; x <= width - 8; x += 8)
3559         {
3560             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3561             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3562             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3563 
3564             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3565                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3566             vst1_s8(dst + x, vqmovn_s16(v_dst));
3567         }
3568 
3569         return x;
3570     }
3571 };
3572 
3573 template <>
3574 struct cvtScale_SIMD<uchar, ushort, float>
3575 {
3576     int operator () (const uchar * src, ushort * dst, int width, float scale, float shift) const
3577     {
3578         int x = 0;
3579         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3580 
3581         for ( ; x <= width - 8; x += 8)
3582         {
3583             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3584             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3585             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3586 
3587             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3588                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3589             vst1q_u16(dst + x, v_dst);
3590         }
3591 
3592         return x;
3593     }
3594 };
3595 
3596 template <>
3597 struct cvtScale_SIMD<uchar, short, float>
3598 {
3599     int operator () (const uchar * src, short * dst, int width, float scale, float shift) const
3600     {
3601         int x = 0;
3602         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3603 
3604         for ( ; x <= width - 8; x += 8)
3605         {
3606             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3607             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3608             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3609 
3610             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3611                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3612             vst1q_s16(dst + x, v_dst);
3613         }
3614 
3615         return x;
3616     }
3617 };
3618 
3619 template <>
3620 struct cvtScale_SIMD<uchar, int, float>
3621 {
3622     int operator () (const uchar * src, int * dst, int width, float scale, float shift) const
3623     {
3624         int x = 0;
3625         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3626 
3627         for ( ; x <= width - 8; x += 8)
3628         {
3629             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3630             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3631             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3632 
3633             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
3634             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
3635         }
3636 
3637         return x;
3638     }
3639 };
3640 
3641 template <>
3642 struct cvtScale_SIMD<uchar, float, float>
3643 {
3644     int operator () (const uchar * src, float * dst, int width, float scale, float shift) const
3645     {
3646         int x = 0;
3647         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3648 
3649         for ( ; x <= width - 8; x += 8)
3650         {
3651             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
3652             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
3653             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
3654         }
3655 
3656         return x;
3657     }
3658 };
3659 
3660 // from schar
3661 
3662 template <>
3663 struct cvtScale_SIMD<schar, uchar, float>
3664 {
3665     int operator () (const schar * src, uchar * dst, int width, float scale, float shift) const
3666     {
3667         int x = 0;
3668         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3669 
3670         for ( ; x <= width - 8; x += 8)
3671         {
3672             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3673             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3674             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3675 
3676             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3677                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3678             vst1_u8(dst + x, vqmovn_u16(v_dst));
3679         }
3680 
3681         return x;
3682     }
3683 };
3684 
3685 template <>
3686 struct cvtScale_SIMD<schar, schar, float>
3687 {
3688     int operator () (const schar * src, schar * dst, int width, float scale, float shift) const
3689     {
3690         int x = 0;
3691         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3692 
3693         for ( ; x <= width - 8; x += 8)
3694         {
3695             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3696             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3697             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3698 
3699             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3700                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3701             vst1_s8(dst + x, vqmovn_s16(v_dst));
3702         }
3703 
3704         return x;
3705     }
3706 };
3707 
3708 template <>
3709 struct cvtScale_SIMD<schar, ushort, float>
3710 {
3711     int operator () (const schar * src, ushort * dst, int width, float scale, float shift) const
3712     {
3713         int x = 0;
3714         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3715 
3716         for ( ; x <= width - 8; x += 8)
3717         {
3718             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3719             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3720             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3721 
3722             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3723                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3724             vst1q_u16(dst + x, v_dst);
3725         }
3726 
3727         return x;
3728     }
3729 };
3730 
3731 template <>
3732 struct cvtScale_SIMD<schar, short, float>
3733 {
3734     int operator () (const schar * src, short * dst, int width, float scale, float shift) const
3735     {
3736         int x = 0;
3737         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3738 
3739         for ( ; x <= width - 8; x += 8)
3740         {
3741             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3742             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3743             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3744 
3745             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3746                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3747             vst1q_s16(dst + x, v_dst);
3748         }
3749 
3750         return x;
3751     }
3752 };
3753 
3754 template <>
3755 struct cvtScale_SIMD<schar, int, float>
3756 {
3757     int operator () (const schar * src, int * dst, int width, float scale, float shift) const
3758     {
3759         int x = 0;
3760         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3761 
3762         for ( ; x <= width - 8; x += 8)
3763         {
3764             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3765             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3766             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3767 
3768             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
3769             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
3770         }
3771 
3772         return x;
3773     }
3774 };
3775 
3776 template <>
3777 struct cvtScale_SIMD<schar, float, float>
3778 {
3779     int operator () (const schar * src, float * dst, int width, float scale, float shift) const
3780     {
3781         int x = 0;
3782         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3783 
3784         for ( ; x <= width - 8; x += 8)
3785         {
3786             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
3787             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
3788             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
3789         }
3790 
3791         return x;
3792     }
3793 };
3794 
3795 // from ushort
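// ushort lanes are loaded with vld1q_u16 and widened once (vmovl_u16) before the
// float conversion; results are narrowed back with the saturating vqmovn helpers.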
3796 
3797 template <>
3798 struct cvtScale_SIMD<ushort, uchar, float>
3799 {
3800     int operator () (const ushort * src, uchar * dst, int width, float scale, float shift) const
3801     {
3802         int x = 0;
3803         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3804 
3805         for ( ; x <= width - 8; x += 8)
3806         {
3807             uint16x8_t v_src = vld1q_u16(src + x);
3808             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3809             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3810 
3811             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3812                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3813             vst1_u8(dst + x, vqmovn_u16(v_dst));
3814         }
3815 
3816         return x;
3817     }
3818 };
3819 
3820 template <>
3821 struct cvtScale_SIMD<ushort, schar, float>
3822 {
3823     int operator () (const ushort * src, schar * dst, int width, float scale, float shift) const
3824     {
3825         int x = 0;
3826         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3827 
3828         for ( ; x <= width - 8; x += 8)
3829         {
3830             uint16x8_t v_src = vld1q_u16(src + x);
3831             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3832             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3833 
3834             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3835                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3836             vst1_s8(dst + x, vqmovn_s16(v_dst));
3837         }
3838 
3839         return x;
3840     }
3841 };
3842 
3843 template <>
3844 struct cvtScale_SIMD<ushort, ushort, float>
3845 {
3846     int operator () (const ushort * src, ushort * dst, int width, float scale, float shift) const
3847     {
3848         int x = 0;
3849         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3850 
3851         for ( ; x <= width - 8; x += 8)
3852         {
3853             uint16x8_t v_src = vld1q_u16(src + x);
3854             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3855             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3856 
3857             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3858                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3859             vst1q_u16(dst + x, v_dst);
3860         }
3861 
3862         return x;
3863     }
3864 };
3865 
3866 template <>
3867 struct cvtScale_SIMD<ushort, short, float>
3868 {
3869     int operator () (const ushort * src, short * dst, int width, float scale, float shift) const
3870     {
3871         int x = 0;
3872         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3873 
3874         for ( ; x <= width - 8; x += 8)
3875         {
3876             uint16x8_t v_src = vld1q_u16(src + x);
3877             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3878             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3879 
3880             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3881                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3882             vst1q_s16(dst + x, v_dst);
3883         }
3884 
3885         return x;
3886     }
3887 };
3888 
3889 template <>
3890 struct cvtScale_SIMD<ushort, int, float>
3891 {
3892     int operator () (const ushort * src, int * dst, int width, float scale, float shift) const
3893     {
3894         int x = 0;
3895         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3896 
3897         for ( ; x <= width - 8; x += 8)
3898         {
3899             uint16x8_t v_src = vld1q_u16(src + x);
3900             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift);
3901             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift);
3902 
3903             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_dst1));
3904             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_dst2));
3905         }
3906 
3907         return x;
3908     }
3909 };
3910 
3911 template <>
3912 struct cvtScale_SIMD<ushort, float, float>
3913 {
3914     int operator () (const ushort * src, float * dst, int width, float scale, float shift) const
3915     {
3916         int x = 0;
3917         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3918 
3919         for ( ; x <= width - 8; x += 8)
3920         {
3921             uint16x8_t v_src = vld1q_u16(src + x);
3922             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))), v_scale), v_shift));
3923             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))), v_scale), v_shift));
3924         }
3925 
3926         return x;
3927     }
3928 };
3929 
3930 // from short
3931 
3932 template <>
3933 struct cvtScale_SIMD<short, uchar, float>
3934 {
3935     int operator () (const short * src, uchar * dst, int width, float scale, float shift) const
3936     {
3937         int x = 0;
3938         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3939 
3940         for ( ; x <= width - 8; x += 8)
3941         {
3942             int16x8_t v_src = vld1q_s16(src + x);
3943             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3944             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3945 
3946             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3947                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3948             vst1_u8(dst + x, vqmovn_u16(v_dst));
3949         }
3950 
3951         return x;
3952     }
3953 };
3954 
3955 template <>
3956 struct cvtScale_SIMD<short, schar, float>
3957 {
3958     int operator () (const short * src, schar * dst, int width, float scale, float shift) const
3959     {
3960         int x = 0;
3961         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3962 
3963         for ( ; x <= width - 8; x += 8)
3964         {
3965             int16x8_t v_src = vld1q_s16(src + x);
3966             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3967             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3968 
3969             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
3970                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
3971             vst1_s8(dst + x, vqmovn_s16(v_dst));
3972         }
3973 
3974         return x;
3975     }
3976 };
3977 
3978 template <>
3979 struct cvtScale_SIMD<short, ushort, float>
3980 {
3981     int operator () (const short * src, ushort * dst, int width, float scale, float shift) const
3982     {
3983         int x = 0;
3984         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
3985 
3986         for ( ; x <= width - 8; x += 8)
3987         {
3988             int16x8_t v_src = vld1q_s16(src + x);
3989             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift);
3990             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift);
3991 
3992             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
3993                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
3994             vst1q_u16(dst + x, v_dst);
3995         }
3996 
3997         return x;
3998     }
3999 };
4000 
4001 template <>
4002 struct cvtScale_SIMD<short, float, float>
4003 {
4004     int operator () (const short * src, float * dst, int width, float scale, float shift) const
4005     {
4006         int x = 0;
4007         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4008 
4009         for ( ; x <= width - 8; x += 8)
4010         {
4011             int16x8_t v_src = vld1q_s16(src + x);
4012             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))), v_scale), v_shift));
4013             vst1q_f32(dst + x + 4, vaddq_f32(vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))), v_scale), v_shift));
4014         }
4015 
4016         return x;
4017     }
4018 };
4019 
4020 // from int
4021 
4022 template <>
4023 struct cvtScale_SIMD<int, uchar, float>
4024 {
4025     int operator () (const int * src, uchar * dst, int width, float scale, float shift) const
4026     {
4027         int x = 0;
4028         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4029 
4030         for ( ; x <= width - 8; x += 8)
4031         {
4032             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
4033             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
4034 
4035             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
4036                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
4037             vst1_u8(dst + x, vqmovn_u16(v_dst));
4038         }
4039 
4040         return x;
4041     }
4042 };
4043 
4044 template <>
4045 struct cvtScale_SIMD<int, schar, float>
4046 {
4047     int operator () (const int * src, schar * dst, int width, float scale, float shift) const
4048     {
4049         int x = 0;
4050         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4051 
4052         for ( ; x <= width - 8; x += 8)
4053         {
4054             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
4055             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
4056 
4057             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
4058                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
4059             vst1_s8(dst + x, vqmovn_s16(v_dst));
4060         }
4061 
4062         return x;
4063     }
4064 };
4065 
4066 template <>
4067 struct cvtScale_SIMD<int, ushort, float>
4068 {
4069     int operator () (const int * src, ushort * dst, int width, float scale, float shift) const
4070     {
4071         int x = 0;
4072         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4073 
4074         for ( ; x <= width - 8; x += 8)
4075         {
4076             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
4077             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
4078 
4079             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
4080                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
4081             vst1q_u16(dst + x, v_dst);
4082         }
4083 
4084         return x;
4085     }
4086 };
4087 
4088 template <>
4089 struct cvtScale_SIMD<int, short, float>
4090 {
4091     int operator () (const int * src, short * dst, int width, float scale, float shift) const
4092     {
4093         int x = 0;
4094         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4095 
4096         for ( ; x <= width - 8; x += 8)
4097         {
4098             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x)), v_scale), v_shift);
4099             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vcvtq_f32_s32(vld1q_s32(src + x + 4)), v_scale), v_shift);
4100 
4101             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
4102                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
4103             vst1q_s16(dst + x, v_dst);
4104         }
4105 
4106         return x;
4107     }
4108 };
4109 
4110 // from float
4111 
4112 template <>
4113 struct cvtScale_SIMD<float, uchar, float>
4114 {
4115     int operator () (const float * src, uchar * dst, int width, float scale, float shift) const
4116     {
4117         int x = 0;
4118         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4119 
4120         for ( ; x <= width - 8; x += 8)
4121         {
4122             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
4123             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
4124 
4125             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
4126                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
4127             vst1_u8(dst + x, vqmovn_u16(v_dst));
4128         }
4129 
4130         return x;
4131     }
4132 };
4133 
4134 template <>
4135 struct cvtScale_SIMD<float, schar, float>
4136 {
4137     int operator () (const float * src, schar * dst, int width, float scale, float shift) const
4138     {
4139         int x = 0;
4140         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4141 
4142         for ( ; x <= width - 8; x += 8)
4143         {
4144             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
4145             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
4146 
4147             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
4148                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
4149             vst1_s8(dst + x, vqmovn_s16(v_dst));
4150         }
4151 
4152         return x;
4153     }
4154 };
4155 
4156 template <>
4157 struct cvtScale_SIMD<float, ushort, float>
4158 {
4159     int operator () (const float * src, ushort * dst, int width, float scale, float shift) const
4160     {
4161         int x = 0;
4162         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4163 
4164         for ( ; x <= width - 8; x += 8)
4165         {
4166             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
4167             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
4168 
4169             uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
4170                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
4171             vst1q_u16(dst + x, v_dst);
4172         }
4173 
4174         return x;
4175     }
4176 };
4177 
4178 template <>
4179 struct cvtScale_SIMD<float, short, float>
4180 {
4181     int operator () (const float * src, short * dst, int width, float scale, float shift) const
4182     {
4183         int x = 0;
4184         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4185 
4186         for ( ; x <= width - 8; x += 8)
4187         {
4188             float32x4_t v_dst1 = vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift);
4189             float32x4_t v_dst2 = vaddq_f32(vmulq_f32(vld1q_f32(src + x + 4), v_scale), v_shift);
4190 
4191             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
4192                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
4193             vst1q_s16(dst + x, v_dst);
4194         }
4195 
4196         return x;
4197     }
4198 };
4199 
4200 template <>
4201 struct cvtScale_SIMD<float, int, float>
4202 {
4203     int operator () (const float * src, int * dst, int width, float scale, float shift) const
4204     {
4205         int x = 0;
4206         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4207 
4208         for ( ; x <= width - 4; x += 4)
4209             vst1q_s32(dst + x, cv_vrndq_s32_f32(vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift)));
4210 
4211         return x;
4212     }
4213 };
4214 
4215 template <>
4216 struct cvtScale_SIMD<float, float, float>
4217 {
4218     int operator () (const float * src, float * dst, int width, float scale, float shift) const
4219     {
4220         int x = 0;
4221         float32x4_t v_shift = vdupq_n_f32(shift), v_scale = vdupq_n_f32(scale);
4222 
4223         for ( ; x <= width - 4; x += 4)
4224             vst1q_f32(dst + x, vaddq_f32(vmulq_f32(vld1q_f32(src + x), v_scale), v_shift));
4225 
4226         return x;
4227     }
4228 };
4229 
4230 #endif
4231 
4232 template<typename T, typename DT, typename WT> static void
4233 cvtScale_( const T* src, size_t sstep,
4234            DT* dst, size_t dstep, Size size,
4235            WT scale, WT shift )
4236 {
4237     sstep /= sizeof(src[0]);
4238     dstep /= sizeof(dst[0]);
4239 
4240     cvtScale_SIMD<T, DT, WT> vop;
4241 
4242     for( ; size.height--; src += sstep, dst += dstep )
4243     {
4244         int x = vop(src, dst, size.width, scale, shift);
4245 
4246         #if CV_ENABLE_UNROLLED
4247         for( ; x <= size.width - 4; x += 4 )
4248         {
4249             DT t0, t1;
4250             t0 = saturate_cast<DT>(src[x]*scale + shift);
4251             t1 = saturate_cast<DT>(src[x+1]*scale + shift);
4252             dst[x] = t0; dst[x+1] = t1;
4253             t0 = saturate_cast<DT>(src[x+2]*scale + shift);
4254             t1 = saturate_cast<DT>(src[x+3]*scale + shift);
4255             dst[x+2] = t0; dst[x+3] = t1;
4256         }
4257         #endif
4258 
4259         for( ; x < size.width; x++ )
4260             dst[x] = saturate_cast<DT>(src[x]*scale + shift);
4261     }
4262 }
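// Illustrative only: for every element the generic loop above computes
// dst[x] = saturate_cast<DT>(src[x]*scale + shift), letting cvtScale_SIMD
// handle a vectorized prefix of the row and finishing it with the scalar tail.
// A minimal per-element sketch of the same contract (hypothetical helper,
// not used anywhere in this file):
//
//     template<typename T, typename DT, typename WT>
//     static inline DT cvtScaleOne(T v, WT scale, WT shift)
//     {
//         return saturate_cast<DT>(v*scale + shift);
//     }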
4263 
4264 // manually optimized template specializations (vz)
4265 template<> void
4266 cvtScale_<short, short, float>( const short* src, size_t sstep,
4267            short* dst, size_t dstep, Size size,
4268            float scale, float shift )
4269 {
4270     sstep /= sizeof(src[0]);
4271     dstep /= sizeof(dst[0]);
4272 
4273     for( ; size.height--; src += sstep, dst += dstep )
4274     {
4275         int x = 0;
4276         #if CV_SSE2
4277             if(USE_SSE2)
4278             {
4279                 __m128 scale128 = _mm_set1_ps (scale);
4280                 __m128 shift128 = _mm_set1_ps (shift);
4281                 for(; x <= size.width - 8; x += 8 )
4282                 {
4283                     __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
4284                     __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
4285                     __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
4286                     __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
4287                     rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
4288                     rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
4289                     r0 = _mm_cvtps_epi32(rf0);
4290                     r1 = _mm_cvtps_epi32(rf1);
4291                     r0 = _mm_packs_epi32(r0, r1);
4292                     _mm_storeu_si128((__m128i*)(dst + x), r0);
4293                 }
4294             }
4295         #elif CV_NEON
4296         float32x4_t v_shift = vdupq_n_f32(shift);
4297         for(; x <= size.width - 8; x += 8 )
4298         {
4299             int16x8_t v_src = vld1q_s16(src + x);
4300             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
4301             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
4302 
4303             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
4304             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
4305 
4306             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_tmp1)),
4307                                             vqmovn_s32(cv_vrndq_s32_f32(v_tmp2))));
4308         }
4309         #endif
4310 
4311         for(; x < size.width; x++ )
4312             dst[x] = saturate_cast<short>(src[x]*scale + shift);
4313     }
4314 }
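// Illustrative usage (assuming the call comes through Mat::convertTo): a
// scaled CV_16S -> CV_16S conversion such as
//
//     cv::Mat src16(rows, cols, CV_16SC1), dst16;
//     src16.convertTo(dst16, CV_16S, 0.5, 4.0);
//
// is dispatched via getConvertScaleFunc to cvtScale16s, which lands in the
// specialization above with scale = 0.5f and shift = 4.0f.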
4315 
4316 template<> void
4317 cvtScale_<short, int, float>( const short* src, size_t sstep,
4318            int* dst, size_t dstep, Size size,
4319            float scale, float shift )
4320 {
4321     sstep /= sizeof(src[0]);
4322     dstep /= sizeof(dst[0]);
4323 
4324     for( ; size.height--; src += sstep, dst += dstep )
4325     {
4326         int x = 0;
4327 
4328         #if CV_AVX2
4329         if (USE_AVX2)
4330         {
4331             __m256 scale256 = _mm256_set1_ps(scale);
4332             __m256 shift256 = _mm256_set1_ps(shift);
4333             const int shuffle = 0xD8;
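            // 0xD8 == 0b11011000 selects the 64-bit lanes in the order [0, 2, 1, 3],
            // so the unpacklo/unpackhi pair below yields elements 0..7 and 8..15
            // contiguously before the arithmetic shift sign-extends them to 32 bits.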
4334 
4335             for ( ; x <= size.width - 16; x += 16)
4336             {
4337                 __m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
4338                 v_src = _mm256_permute4x64_epi64(v_src, shuffle);
4339                 __m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
4340                 __m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
4341                 __m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
4342                 __m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
4343                 _mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
4344                 _mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
4345             }
4346         }
4347         #endif
4348         #if CV_SSE2
4349         if (USE_SSE2) // roughly 5x faster than the scalar loop below
4350         {
4351             __m128 scale128 = _mm_set1_ps (scale);
4352             __m128 shift128 = _mm_set1_ps (shift);
4353             for(; x <= size.width - 8; x += 8 )
4354             {
4355                 __m128i r0 = _mm_loadu_si128((const __m128i*)(src + x));
4356 
4357                 __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
4358                 __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(r0, r0), 16));
4359                 rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
4360                 rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
4361 
4362                 _mm_storeu_si128((__m128i*)(dst + x), _mm_cvtps_epi32(rf0));
4363                 _mm_storeu_si128((__m128i*)(dst + x + 4), _mm_cvtps_epi32(rf1));
4364             }
4365         }
4366         #elif CV_NEON
4367         float32x4_t v_shift = vdupq_n_f32(shift);
4368         for(; x <= size.width - 8; x += 8 )
4369         {
4370             int16x8_t v_src = vld1q_s16(src + x);
4371             float32x4_t v_tmp1 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src)));
4372             float32x4_t v_tmp2 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src)));
4373 
4374             v_tmp1 = vaddq_f32(vmulq_n_f32(v_tmp1, scale), v_shift);
4375             v_tmp2 = vaddq_f32(vmulq_n_f32(v_tmp2, scale), v_shift);
4376 
4377             vst1q_s32(dst + x, cv_vrndq_s32_f32(v_tmp1));
4378             vst1q_s32(dst + x + 4, cv_vrndq_s32_f32(v_tmp2));
4379         }
4380         #endif
4381 
4382         for(; x < size.width; x++ )
4383             dst[x] = saturate_cast<int>(src[x]*scale + shift);
4384     }
4385 }
4386 
4387 template <typename T, typename DT>
4388 struct Cvt_SIMD
4389 {
4390     int operator() (const T *, DT *, int) const
4391     {
4392         return 0;
4393     }
4394 };
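// Note on the protocol shared by Cvt_SIMD (and cvtScale_SIMD above): operator()
// returns how many elements it converted with vector code; the callers
// (cvt_ / cvtScale_) then finish the remaining size.width - x elements with a
// scalar saturate_cast loop. The generic functor above vectorizes nothing and
// simply returns 0.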
4395 
4396 #if CV_SSE2
4397 
4398 // from double
4399 
4400 template <>
4401 struct Cvt_SIMD<double, uchar>
4402 {
4403     int operator() (const double * src, uchar * dst, int width) const
4404     {
4405         int x = 0;
4406 
4407         if (!USE_SSE2)
4408             return x;
4409 
4410         for ( ; x <= width - 8; x += 8)
4411         {
4412             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4413             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4414             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
4415             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
4416 
4417             v_src0 = _mm_movelh_ps(v_src0, v_src1);
4418             v_src1 = _mm_movelh_ps(v_src2, v_src3);
4419 
4420             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
4421                                             _mm_cvtps_epi32(v_src1));
4422             _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_dst, v_dst));
4423         }
4424 
4425         return x;
4426     }
4427 };
4428 
4429 template <>
4430 struct Cvt_SIMD<double, schar>
4431 {
4432     int operator() (const double * src, schar * dst, int width) const
4433     {
4434         int x = 0;
4435 
4436         if (!USE_SSE2)
4437             return x;
4438 
4439         for ( ; x <= width - 8; x += 8)
4440         {
4441             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4442             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4443             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
4444             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
4445 
4446             v_src0 = _mm_movelh_ps(v_src0, v_src1);
4447             v_src1 = _mm_movelh_ps(v_src2, v_src3);
4448 
4449             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
4450                                             _mm_cvtps_epi32(v_src1));
4451             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst, v_dst));
4452         }
4453 
4454         return x;
4455     }
4456 };
4457 
4458 #if CV_SSE4_1
4459 
4460 template <>
4461 struct Cvt_SIMD<double, ushort>
4462 {
4463     bool haveSIMD;
4464     Cvt_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1); }
4465 
4466     int operator() (const double * src, ushort * dst, int width) const
4467     {
4468         int x = 0;
4469 
4470         if (!haveSIMD)
4471             return x;
4472 
4473         for ( ; x <= width - 8; x += 8)
4474         {
4475             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4476             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4477             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
4478             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
4479 
4480             v_src0 = _mm_movelh_ps(v_src0, v_src1);
4481             v_src1 = _mm_movelh_ps(v_src2, v_src3);
4482 
4483             __m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
4484                                              _mm_cvtps_epi32(v_src1));
4485             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
4486         }
4487 
4488         return x;
4489     }
4490 };
4491 
4492 #endif // CV_SSE4_1
4493 
4494 template <>
4495 struct Cvt_SIMD<double, short>
4496 {
4497     int operator() (const double * src, short * dst, int width) const
4498     {
4499         int x = 0;
4500 
4501         if (!USE_SSE2)
4502             return x;
4503 
4504         for ( ; x <= width - 8; x += 8)
4505         {
4506             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4507             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4508             __m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
4509             __m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
4510 
4511             v_src0 = _mm_movelh_ps(v_src0, v_src1);
4512             v_src1 = _mm_movelh_ps(v_src2, v_src3);
4513 
4514             __m128i v_dst = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
4515                                             _mm_cvtps_epi32(v_src1));
4516             _mm_storeu_si128((__m128i *)(dst + x), v_dst);
4517         }
4518 
4519         return x;
4520     }
4521 };
4522 
4523 template <>
4524 struct Cvt_SIMD<double, int>
4525 {
4526     int operator() (const double * src, int * dst, int width) const
4527     {
4528         int x = 0;
4529 
4530         if (!USE_SSE2)
4531             return x;
4532 
4533         for ( ; x <= width - 4; x += 4)
4534         {
4535             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4536             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4537             v_src0 = _mm_movelh_ps(v_src0, v_src1);
4538 
4539             _mm_storeu_si128((__m128i *)(dst + x), _mm_cvtps_epi32(v_src0));
4540         }
4541 
4542         return x;
4543     }
4544 };
4545 
4546 template <>
4547 struct Cvt_SIMD<double, float>
4548 {
4549     int operator() (const double * src, float * dst, int width) const
4550     {
4551         int x = 0;
4552 
4553         if (!USE_SSE2)
4554             return x;
4555 
4556         for ( ; x <= width - 4; x += 4)
4557         {
4558             __m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
4559             __m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
4560 
4561             _mm_storeu_ps(dst + x, _mm_movelh_ps(v_src0, v_src1));
4562         }
4563 
4564         return x;
4565     }
4566 };
4567 
4568 
4569 #elif CV_NEON
4570 
4571 // from uchar
4572 
4573 template <>
4574 struct Cvt_SIMD<uchar, schar>
4575 {
4576     int operator() (const uchar * src, schar * dst, int width) const
4577     {
4578         int x = 0;
4579 
4580         for ( ; x <= width - 8; x += 8)
4581             vst1_s8(dst + x, vqmovn_s16(vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x)))));
4582 
4583         return x;
4584     }
4585 };
4586 
4587 
4588 template <>
4589 struct Cvt_SIMD<uchar, ushort>
4590 {
4591     int operator() (const uchar * src, ushort * dst, int width) const
4592     {
4593         int x = 0;
4594 
4595         for ( ; x <= width - 8; x += 8)
4596             vst1q_u16(dst + x, vmovl_u8(vld1_u8(src + x)));
4597 
4598         return x;
4599     }
4600 };
4601 
4602 template <>
4603 struct Cvt_SIMD<uchar, short>
4604 {
4605     int operator() (const uchar * src, short * dst, int width) const
4606     {
4607         int x = 0;
4608 
4609         for ( ; x <= width - 8; x += 8)
4610             vst1q_s16(dst + x, vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src + x))));
4611 
4612         return x;
4613     }
4614 };
4615 
4616 template <>
4617 struct Cvt_SIMD<uchar, int>
4618 {
4619     int operator() (const uchar * src, int * dst, int width) const
4620     {
4621         int x = 0;
4622 
4623         for ( ; x <= width - 8; x += 8)
4624         {
4625             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
4626             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
4627             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
4628         }
4629 
4630         return x;
4631     }
4632 };
4633 
4634 template <>
4635 struct Cvt_SIMD<uchar, float>
4636 {
4637     int operator() (const uchar * src, float * dst, int width) const
4638     {
4639         int x = 0;
4640 
4641         for ( ; x <= width - 8; x += 8)
4642         {
4643             uint16x8_t v_src = vmovl_u8(vld1_u8(src + x));
4644             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
4645             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
4646         }
4647 
4648         return x;
4649     }
4650 };
4651 
4652 // from schar
4653 
4654 template <>
4655 struct Cvt_SIMD<schar, uchar>
4656 {
4657     int operator() (const schar * src, uchar * dst, int width) const
4658     {
4659         int x = 0;
4660 
4661         for ( ; x <= width - 8; x += 8)
4662             vst1_u8(dst + x, vqmovun_s16(vmovl_s8(vld1_s8(src + x))));
4663 
4664         return x;
4665     }
4666 };
4667 
4668 template <>
4669 struct Cvt_SIMD<schar, short>
4670 {
4671     int operator() (const schar * src, short * dst, int width) const
4672     {
4673         int x = 0;
4674 
4675         for ( ; x <= width - 8; x += 8)
4676             vst1q_s16(dst + x, vmovl_s8(vld1_s8(src + x)));
4677 
4678         return x;
4679     }
4680 };
4681 
4682 template <>
4683 struct Cvt_SIMD<schar, ushort>
4684 {
4685     int operator() (const schar * src, ushort * dst, int width) const
4686     {
4687         int x = 0;
4688 
4689         for ( ; x <= width - 8; x += 8)
4690         {
4691             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
4692             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(vmovl_s16(vget_low_s16(v_src))),
4693                                             vqmovun_s32(vmovl_s16(vget_high_s16(v_src)))));
4694         }
4695 
4696         return x;
4697     }
4698 };
4699 
4700 
4701 template <>
4702 struct Cvt_SIMD<schar, int>
4703 {
4704     int operator() (const schar * src, int * dst, int width) const
4705     {
4706         int x = 0;
4707 
4708         for ( ; x <= width - 8; x += 8)
4709         {
4710             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
4711             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
4712             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
4713         }
4714 
4715         return x;
4716     }
4717 };
4718 
4719 template <>
4720 struct Cvt_SIMD<schar, float>
4721 {
4722     int operator() (const schar * src, float * dst, int width) const
4723     {
4724         int x = 0;
4725 
4726         for ( ; x <= width - 8; x += 8)
4727         {
4728             int16x8_t v_src = vmovl_s8(vld1_s8(src + x));
4729             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
4730             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
4731         }
4732 
4733         return x;
4734     }
4735 };
4736 
4737 // from ushort
4738 
4739 template <>
4740 struct Cvt_SIMD<ushort, uchar>
4741 {
4742     int operator() (const ushort * src, uchar * dst, int width) const
4743     {
4744         int x = 0;
4745 
4746         for ( ; x <= width - 16; x += 16)
4747         {
4748             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
4749             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(v_src1), vqmovn_u16(v_src2)));
4750         }
4751 
4752         return x;
4753     }
4754 };
4755 
4756 template <>
4757 struct Cvt_SIMD<ushort, schar>
4758 {
4759     int operator() (const ushort * src, schar * dst, int width) const
4760     {
4761         int x = 0;
4762 
4763         for ( ; x <= width - 16; x += 16)
4764         {
4765             uint16x8_t v_src1 = vld1q_u16(src + x), v_src2 = vld1q_u16(src + x + 8);
4766             int32x4_t v_dst10 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src1)));
4767             int32x4_t v_dst11 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src1)));
4768             int32x4_t v_dst20 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src2)));
4769             int32x4_t v_dst21 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src2)));
4770 
4771             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst10), vqmovn_s32(v_dst11))),
4772                                           vqmovn_s16(vcombine_s16(vqmovn_s32(v_dst20), vqmovn_s32(v_dst21)))));
4773         }
4774 
4775         return x;
4776     }
4777 };
4778 
4779 template <>
4780 struct Cvt_SIMD<ushort, short>
4781 {
4782     int operator() (const ushort * src, short * dst, int width) const
4783     {
4784         int x = 0;
4785 
4786         for ( ; x <= width - 8; x += 8)
4787         {
4788             uint16x8_t v_src = vld1q_u16(src + x);
4789             int32x4_t v_dst0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src)));
4790             int32x4_t v_dst1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src)));
4791 
4792             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
4793         }
4794 
4795         return x;
4796     }
4797 };
4798 
4799 template <>
4800 struct Cvt_SIMD<ushort, int>
4801 {
4802     int operator() (const ushort * src, int * dst, int width) const
4803     {
4804         int x = 0;
4805 
4806         for ( ; x <= width - 8; x += 8)
4807         {
4808             uint16x8_t v_src = vld1q_u16(src + x);
4809             vst1q_s32(dst + x, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src))));
4810             vst1q_s32(dst + x + 4, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src))));
4811         }
4812 
4813         return x;
4814     }
4815 };
4816 
4817 template <>
4818 struct Cvt_SIMD<ushort, float>
4819 {
4820     int operator() (const ushort * src, float * dst, int width) const
4821     {
4822         int x = 0;
4823 
4824         for ( ; x <= width - 8; x += 8)
4825         {
4826             uint16x8_t v_src = vld1q_u16(src + x);
4827             vst1q_f32(dst + x, vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src))));
4828             vst1q_f32(dst + x + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src))));
4829         }
4830 
4831         return x;
4832     }
4833 };
4834 
4835 // from short
4836 
4837 template <>
4838 struct Cvt_SIMD<short, uchar>
4839 {
4840     int operator() (const short * src, uchar * dst, int width) const
4841     {
4842         int x = 0;
4843 
4844         for ( ; x <= width - 16; x += 16)
4845         {
4846             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
4847             vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_src1), vqmovun_s16(v_src2)));
4848         }
4849 
4850         return x;
4851     }
4852 };
4853 
4854 template <>
4855 struct Cvt_SIMD<short, schar>
4856 {
4857     int operator() (const short * src, schar * dst, int width) const
4858     {
4859         int x = 0;
4860 
4861         for ( ; x <= width - 16; x += 16)
4862         {
4863             int16x8_t v_src1 = vld1q_s16(src + x), v_src2 = vld1q_s16(src + x + 8);
4864             vst1q_s8(dst + x, vcombine_s8(vqmovn_s16(v_src1), vqmovn_s16(v_src2)));
4865         }
4866 
4867         return x;
4868     }
4869 };
4870 
4871 template <>
4872 struct Cvt_SIMD<short, ushort>
4873 {
4874     int operator() (const short * src, ushort * dst, int width) const
4875     {
4876         int x = 0;
4877 
4878         for ( ; x <= width - 8; x += 8)
4879         {
4880             int16x8_t v_src = vld1q_s16(src + x);
4881             uint16x4_t v_dst1 = vqmovun_s32(vmovl_s16(vget_low_s16(v_src)));
4882             uint16x4_t v_dst2 = vqmovun_s32(vmovl_s16(vget_high_s16(v_src)));
4883             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
4884         }
4885 
4886         return x;
4887     }
4888 };
4889 
4890 template <>
4891 struct Cvt_SIMD<short, int>
4892 {
4893     int operator() (const short * src, int * dst, int width) const
4894     {
4895         int x = 0;
4896 
4897         for ( ; x <= width - 8; x += 8)
4898         {
4899             int16x8_t v_src = vld1q_s16(src + x);
4900             vst1q_s32(dst + x, vmovl_s16(vget_low_s16(v_src)));
4901             vst1q_s32(dst + x + 4, vmovl_s16(vget_high_s16(v_src)));
4902         }
4903 
4904         return x;
4905     }
4906 };
4907 
4908 template <>
4909 struct Cvt_SIMD<short, float>
4910 {
4911     int operator() (const short * src, float * dst, int width) const
4912     {
4913         int x = 0;
4914 
4915         for ( ; x <= width - 8; x += 8)
4916         {
4917             int16x8_t v_src = vld1q_s16(src + x);
4918             vst1q_f32(dst + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src))));
4919             vst1q_f32(dst + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src))));
4920         }
4921 
4922         return x;
4923     }
4924 };
4925 
4926 // from int
4927 
4928 template <>
4929 struct Cvt_SIMD<int, uchar>
4930 {
4931     int operator() (const int * src, uchar * dst, int width) const
4932     {
4933         int x = 0;
4934 
4935         for ( ; x <= width - 16; x += 16)
4936         {
4937             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
4938             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
4939             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
4940             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovun_s32(v_src3), vqmovun_s32(v_src4)));
4941             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
4942         }
4943 
4944         return x;
4945     }
4946 };
4947 
4948 template <>
4949 struct Cvt_SIMD<int, schar>
4950 {
4951     int operator() (const int * src, schar * dst, int width) const
4952     {
4953         int x = 0;
4954 
4955         for ( ; x <= width - 16; x += 16)
4956         {
4957             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
4958             int32x4_t v_src3 = vld1q_s32(src + x + 8), v_src4 = vld1q_s32(src + x + 12);
4959             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
4960             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
4961             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
4962         }
4963 
4964         return x;
4965     }
4966 };
4967 
4968 
4969 template <>
4970 struct Cvt_SIMD<int, ushort>
4971 {
4972     int operator() (const int * src, ushort * dst, int width) const
4973     {
4974         int x = 0;
4975 
4976         for ( ; x <= width - 8; x += 8)
4977         {
4978             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
4979             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_src1), vqmovun_s32(v_src2)));
4980         }
4981 
4982         return x;
4983     }
4984 };
4985 
4986 template <>
4987 struct Cvt_SIMD<int, short>
4988 {
4989     int operator() (const int * src, short * dst, int width) const
4990     {
4991         int x = 0;
4992 
4993         for ( ; x <= width - 8; x += 8)
4994         {
4995             int32x4_t v_src1 = vld1q_s32(src + x), v_src2 = vld1q_s32(src + x + 4);
4996             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
4997         }
4998 
4999         return x;
5000     }
5001 };
5002 
5003 template <>
5004 struct Cvt_SIMD<int, float>
5005 {
5006     int operator() (const int * src, float * dst, int width) const
5007     {
5008         int x = 0;
5009 
5010         for ( ; x <= width - 4; x += 4)
5011             vst1q_f32(dst + x, vcvtq_f32_s32(vld1q_s32(src + x)));
5012 
5013         return x;
5014     }
5015 };
5016 
5017 // from float
5018 
5019 template <>
5020 struct Cvt_SIMD<float, uchar>
5021 {
5022     int operator() (const float * src, uchar * dst, int width) const
5023     {
5024         int x = 0;
5025 
5026         for ( ; x <= width - 16; x += 16)
5027         {
5028             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
5029             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
5030             uint32x4_t v_src3 = cv_vrndq_u32_f32(vld1q_f32(src + x + 8));
5031             uint32x4_t v_src4 = cv_vrndq_u32_f32(vld1q_f32(src + x + 12));
5032             uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
5033             uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(v_src3), vqmovn_u32(v_src4)));
5034             vst1q_u8(dst + x, vcombine_u8(v_dst1, v_dst2));
5035         }
5036 
5037         return x;
5038     }
5039 };
5040 
5041 template <>
5042 struct Cvt_SIMD<float, schar>
5043 {
5044     int operator() (const float * src, schar * dst, int width) const
5045     {
5046         int x = 0;
5047 
5048         for ( ; x <= width - 16; x += 16)
5049         {
5050             int32x4_t v_src1 = cv_vrndq_s32_f32(vld1q_f32(src + x));
5051             int32x4_t v_src2 = cv_vrndq_s32_f32(vld1q_f32(src + x + 4));
5052             int32x4_t v_src3 = cv_vrndq_s32_f32(vld1q_f32(src + x + 8));
5053             int32x4_t v_src4 = cv_vrndq_s32_f32(vld1q_f32(src + x + 12));
5054             int8x8_t v_dst1 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src1), vqmovn_s32(v_src2)));
5055             int8x8_t v_dst2 = vqmovn_s16(vcombine_s16(vqmovn_s32(v_src3), vqmovn_s32(v_src4)));
5056             vst1q_s8(dst + x, vcombine_s8(v_dst1, v_dst2));
5057         }
5058 
5059         return x;
5060     }
5061 };
5062 
5063 
5064 template <>
5065 struct Cvt_SIMD<float, ushort>
5066 {
5067     int operator() (const float * src, ushort * dst, int width) const
5068     {
5069         int x = 0;
5070 
5071         for ( ; x <= width - 8; x += 8)
5072         {
5073             uint32x4_t v_src1 = cv_vrndq_u32_f32(vld1q_f32(src + x));
5074             uint32x4_t v_src2 = cv_vrndq_u32_f32(vld1q_f32(src + x + 4));
5075             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(v_src1), vqmovn_u32(v_src2)));
5076         }
5077 
5078         return x;
5079     }
5080 };
5081 
5082 template <>
5083 struct Cvt_SIMD<float, int>
5084 {
5085     int operator() (const float * src, int * dst, int width) const
5086     {
5087         int x = 0;
5088 
5089         for ( ; x <= width - 4; x += 4)
5090             vst1q_s32(dst + x, cv_vrndq_s32_f32(vld1q_f32(src + x)));
5091 
5092         return x;
5093     }
5094 };
5095 
5096 #endif
5097 
5098 template<typename T, typename DT> static void
5099 cvt_( const T* src, size_t sstep,
5100       DT* dst, size_t dstep, Size size )
5101 {
5102     sstep /= sizeof(src[0]);
5103     dstep /= sizeof(dst[0]);
5104     Cvt_SIMD<T, DT> vop;
5105 
5106     for( ; size.height--; src += sstep, dst += dstep )
5107     {
5108         int x = vop(src, dst, size.width);
5109         #if CV_ENABLE_UNROLLED
5110         for( ; x <= size.width - 4; x += 4 )
5111         {
5112             DT t0, t1;
5113             t0 = saturate_cast<DT>(src[x]);
5114             t1 = saturate_cast<DT>(src[x+1]);
5115             dst[x] = t0; dst[x+1] = t1;
5116             t0 = saturate_cast<DT>(src[x+2]);
5117             t1 = saturate_cast<DT>(src[x+3]);
5118             dst[x+2] = t0; dst[x+3] = t1;
5119         }
5120         #endif
5121         for( ; x < size.width; x++ )
5122             dst[x] = saturate_cast<DT>(src[x]);
5123     }
5124 }
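// Illustrative usage (assuming the call comes through Mat::convertTo with the
// default alpha = 1, beta = 0): an unscaled depth conversion such as
//
//     cv::Mat src8u(rows, cols, CV_8UC3), dst32f;
//     src8u.convertTo(dst32f, CV_32F);
//
// takes the noScale path, so getConvertFunc picks cvt8u32f, which is a thin
// wrapper around cvt_<uchar, float>.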
5125 
5126 // manually optimized template specialization (vz); covered by Core_ConvertScale/ElemWiseTest
5127 template<>  void
5128 cvt_<float, short>( const float* src, size_t sstep,
5129      short* dst, size_t dstep, Size size )
5130 {
5131     sstep /= sizeof(src[0]);
5132     dstep /= sizeof(dst[0]);
5133 
5134     for( ; size.height--; src += sstep, dst += dstep )
5135     {
5136         int x = 0;
5137         #if   CV_SSE2
5138         if(USE_SSE2)
5139         {
5140             for( ; x <= size.width - 8; x += 8 )
5141             {
5142                 __m128 src128 = _mm_loadu_ps (src + x);
5143                 __m128i src_int128 = _mm_cvtps_epi32 (src128);
5144 
5145                 src128 = _mm_loadu_ps (src + x + 4);
5146                 __m128i src1_int128 = _mm_cvtps_epi32 (src128);
5147 
5148                 src1_int128 = _mm_packs_epi32(src_int128, src1_int128);
5149                 _mm_storeu_si128((__m128i*)(dst + x),src1_int128);
5150             }
5151         }
5152         #elif CV_NEON
5153         for( ; x <= size.width - 8; x += 8 )
5154         {
5155             float32x4_t v_src1 = vld1q_f32(src + x), v_src2 = vld1q_f32(src + x + 4);
5156             int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_src1)),
5157                                            vqmovn_s32(cv_vrndq_s32_f32(v_src2)));
5158             vst1q_s16(dst + x, v_dst);
5159         }
5160         #endif
5161         for( ; x < size.width; x++ )
5162             dst[x] = saturate_cast<short>(src[x]);
5163     }
5164 
5165 }
5166 
5167 
5168 template<typename T> static void
5169 cpy_( const T* src, size_t sstep, T* dst, size_t dstep, Size size )
5170 {
5171     sstep /= sizeof(src[0]);
5172     dstep /= sizeof(dst[0]);
5173 
5174     for( ; size.height--; src += sstep, dst += dstep )
5175         memcpy(dst, src, size.width*sizeof(src[0]));
5176 }
5177 
5178 #define DEF_CVT_SCALE_ABS_FUNC(suffix, tfunc, stype, dtype, wtype) \
5179 static void cvtScaleAbs##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5180                          dtype* dst, size_t dstep, Size size, double* scale) \
5181 { \
5182     tfunc(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
5183 }
5184 
5185 #define DEF_CVT_SCALE_FUNC(suffix, stype, dtype, wtype) \
5186 static void cvtScale##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5187 dtype* dst, size_t dstep, Size size, double* scale) \
5188 { \
5189     cvtScale_(src, sstep, dst, dstep, size, (wtype)scale[0], (wtype)scale[1]); \
5190 }
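// For reference, one instantiation of DEF_CVT_SCALE_FUNC expands to a thin
// wrapper like the following sketch (spelled out here only for illustration;
// the real functions are generated by the macro invocations further below):
//
//     static void cvtScale16s32f( const short* src, size_t sstep, const uchar*, size_t,
//                                 float* dst, size_t dstep, Size size, double* scale)
//     {
//         cvtScale_(src, sstep, dst, dstep, size, (float)scale[0], (float)scale[1]);
//     }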
5191 
5192 #if defined(HAVE_IPP)
5193 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
5194 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5195                          dtype* dst, size_t dstep, Size size, double*) \
5196 { \
5197     CV_IPP_CHECK()\
5198     {\
5199         if (src && dst)\
5200         {\
5201             if (ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height)) >= 0) \
5202             {\
5203                 CV_IMPL_ADD(CV_IMPL_IPP)\
5204                 return; \
5205             }\
5206             setIppErrorStatus(); \
5207         }\
5208     }\
5209     cvt_(src, sstep, dst, dstep, size); \
5210 }
5211 
5212 #define DEF_CVT_FUNC_F2(suffix, stype, dtype, ippFavor) \
5213 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5214                          dtype* dst, size_t dstep, Size size, double*) \
5215 { \
5216     CV_IPP_CHECK()\
5217     {\
5218         if (src && dst)\
5219         {\
5220             if (ippiConvert_##ippFavor(src, (int)sstep, dst, (int)dstep, ippiSize(size.width, size.height), ippRndFinancial, 0) >= 0) \
5221             {\
5222                 CV_IMPL_ADD(CV_IMPL_IPP)\
5223                 return; \
5224             }\
5225             setIppErrorStatus(); \
5226         }\
5227     }\
5228     cvt_(src, sstep, dst, dstep, size); \
5229 }
5230 #else
5231 #define DEF_CVT_FUNC_F(suffix, stype, dtype, ippFavor) \
5232 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5233                          dtype* dst, size_t dstep, Size size, double*) \
5234 { \
5235     cvt_(src, sstep, dst, dstep, size); \
5236 }
5237 #define DEF_CVT_FUNC_F2 DEF_CVT_FUNC_F
5238 #endif
5239 
5240 #define DEF_CVT_FUNC(suffix, stype, dtype) \
5241 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5242                          dtype* dst, size_t dstep, Size size, double*) \
5243 { \
5244     cvt_(src, sstep, dst, dstep, size); \
5245 }
5246 
5247 #define DEF_CPY_FUNC(suffix, stype) \
5248 static void cvt##suffix( const stype* src, size_t sstep, const uchar*, size_t, \
5249                          stype* dst, size_t dstep, Size size, double*) \
5250 { \
5251     cpy_(src, sstep, dst, dstep, size); \
5252 }
5253 
5254 
5255 DEF_CVT_SCALE_ABS_FUNC(8u, cvtScaleAbs_, uchar, uchar, float)
5256 DEF_CVT_SCALE_ABS_FUNC(8s8u, cvtScaleAbs_, schar, uchar, float)
5257 DEF_CVT_SCALE_ABS_FUNC(16u8u, cvtScaleAbs_, ushort, uchar, float)
5258 DEF_CVT_SCALE_ABS_FUNC(16s8u, cvtScaleAbs_, short, uchar, float)
5259 DEF_CVT_SCALE_ABS_FUNC(32s8u, cvtScaleAbs_, int, uchar, float)
5260 DEF_CVT_SCALE_ABS_FUNC(32f8u, cvtScaleAbs_, float, uchar, float)
5261 DEF_CVT_SCALE_ABS_FUNC(64f8u, cvtScaleAbs_, double, uchar, float)
5262 
5263 DEF_CVT_SCALE_FUNC(8u,     uchar, uchar, float)
5264 DEF_CVT_SCALE_FUNC(8s8u,   schar, uchar, float)
5265 DEF_CVT_SCALE_FUNC(16u8u,  ushort, uchar, float)
5266 DEF_CVT_SCALE_FUNC(16s8u,  short, uchar, float)
5267 DEF_CVT_SCALE_FUNC(32s8u,  int, uchar, float)
5268 DEF_CVT_SCALE_FUNC(32f8u,  float, uchar, float)
5269 DEF_CVT_SCALE_FUNC(64f8u,  double, uchar, float)
5270 
5271 DEF_CVT_SCALE_FUNC(8u8s,   uchar, schar, float)
5272 DEF_CVT_SCALE_FUNC(8s,     schar, schar, float)
5273 DEF_CVT_SCALE_FUNC(16u8s,  ushort, schar, float)
5274 DEF_CVT_SCALE_FUNC(16s8s,  short, schar, float)
5275 DEF_CVT_SCALE_FUNC(32s8s,  int, schar, float)
5276 DEF_CVT_SCALE_FUNC(32f8s,  float, schar, float)
5277 DEF_CVT_SCALE_FUNC(64f8s,  double, schar, float)
5278 
5279 DEF_CVT_SCALE_FUNC(8u16u,  uchar, ushort, float)
5280 DEF_CVT_SCALE_FUNC(8s16u,  schar, ushort, float)
5281 DEF_CVT_SCALE_FUNC(16u,    ushort, ushort, float)
5282 DEF_CVT_SCALE_FUNC(16s16u, short, ushort, float)
5283 DEF_CVT_SCALE_FUNC(32s16u, int, ushort, float)
5284 DEF_CVT_SCALE_FUNC(32f16u, float, ushort, float)
5285 DEF_CVT_SCALE_FUNC(64f16u, double, ushort, float)
5286 
5287 DEF_CVT_SCALE_FUNC(8u16s,  uchar, short, float)
5288 DEF_CVT_SCALE_FUNC(8s16s,  schar, short, float)
5289 DEF_CVT_SCALE_FUNC(16u16s, ushort, short, float)
5290 DEF_CVT_SCALE_FUNC(16s,    short, short, float)
5291 DEF_CVT_SCALE_FUNC(32s16s, int, short, float)
5292 DEF_CVT_SCALE_FUNC(32f16s, float, short, float)
5293 DEF_CVT_SCALE_FUNC(64f16s, double, short, float)
5294 
5295 DEF_CVT_SCALE_FUNC(8u32s,  uchar, int, float)
5296 DEF_CVT_SCALE_FUNC(8s32s,  schar, int, float)
5297 DEF_CVT_SCALE_FUNC(16u32s, ushort, int, float)
5298 DEF_CVT_SCALE_FUNC(16s32s, short, int, float)
5299 DEF_CVT_SCALE_FUNC(32s,    int, int, double)
5300 DEF_CVT_SCALE_FUNC(32f32s, float, int, float)
5301 DEF_CVT_SCALE_FUNC(64f32s, double, int, double)
5302 
5303 DEF_CVT_SCALE_FUNC(8u32f,  uchar, float, float)
5304 DEF_CVT_SCALE_FUNC(8s32f,  schar, float, float)
5305 DEF_CVT_SCALE_FUNC(16u32f, ushort, float, float)
5306 DEF_CVT_SCALE_FUNC(16s32f, short, float, float)
5307 DEF_CVT_SCALE_FUNC(32s32f, int, float, double)
5308 DEF_CVT_SCALE_FUNC(32f,    float, float, float)
5309 DEF_CVT_SCALE_FUNC(64f32f, double, float, double)
5310 
5311 DEF_CVT_SCALE_FUNC(8u64f,  uchar, double, double)
5312 DEF_CVT_SCALE_FUNC(8s64f,  schar, double, double)
5313 DEF_CVT_SCALE_FUNC(16u64f, ushort, double, double)
5314 DEF_CVT_SCALE_FUNC(16s64f, short, double, double)
5315 DEF_CVT_SCALE_FUNC(32s64f, int, double, double)
5316 DEF_CVT_SCALE_FUNC(32f64f, float, double, double)
5317 DEF_CVT_SCALE_FUNC(64f,    double, double, double)
5318 
5319 DEF_CPY_FUNC(8u,     uchar)
5320 DEF_CVT_FUNC_F(8s8u,   schar, uchar, 8s8u_C1Rs)
5321 DEF_CVT_FUNC_F(16u8u,  ushort, uchar, 16u8u_C1R)
5322 DEF_CVT_FUNC_F(16s8u,  short, uchar, 16s8u_C1R)
5323 DEF_CVT_FUNC_F(32s8u,  int, uchar, 32s8u_C1R)
5324 DEF_CVT_FUNC_F2(32f8u,  float, uchar, 32f8u_C1RSfs)
5325 DEF_CVT_FUNC(64f8u,  double, uchar)
5326 
5327 DEF_CVT_FUNC_F2(8u8s,   uchar, schar, 8u8s_C1RSfs)
5328 DEF_CVT_FUNC_F2(16u8s,  ushort, schar, 16u8s_C1RSfs)
5329 DEF_CVT_FUNC_F2(16s8s,  short, schar, 16s8s_C1RSfs)
5330 DEF_CVT_FUNC_F(32s8s,  int, schar, 32s8s_C1R)
5331 DEF_CVT_FUNC_F2(32f8s,  float, schar, 32f8s_C1RSfs)
5332 DEF_CVT_FUNC(64f8s,  double, schar)
5333 
5334 DEF_CVT_FUNC_F(8u16u,  uchar, ushort, 8u16u_C1R)
5335 DEF_CVT_FUNC_F(8s16u,  schar, ushort, 8s16u_C1Rs)
5336 DEF_CPY_FUNC(16u,    ushort)
5337 DEF_CVT_FUNC_F(16s16u, short, ushort, 16s16u_C1Rs)
5338 DEF_CVT_FUNC_F2(32s16u, int, ushort, 32s16u_C1RSfs)
5339 DEF_CVT_FUNC_F2(32f16u, float, ushort, 32f16u_C1RSfs)
5340 DEF_CVT_FUNC(64f16u, double, ushort)
5341 
5342 DEF_CVT_FUNC_F(8u16s,  uchar, short, 8u16s_C1R)
5343 DEF_CVT_FUNC_F(8s16s,  schar, short, 8s16s_C1R)
5344 DEF_CVT_FUNC_F2(16u16s, ushort, short, 16u16s_C1RSfs)
5345 DEF_CVT_FUNC_F2(32s16s, int, short, 32s16s_C1RSfs)
5346 DEF_CVT_FUNC(32f16s, float, short)
5347 DEF_CVT_FUNC(64f16s, double, short)
5348 
5349 DEF_CVT_FUNC_F(8u32s,  uchar, int, 8u32s_C1R)
5350 DEF_CVT_FUNC_F(8s32s,  schar, int, 8s32s_C1R)
5351 DEF_CVT_FUNC_F(16u32s, ushort, int, 16u32s_C1R)
5352 DEF_CVT_FUNC_F(16s32s, short, int, 16s32s_C1R)
5353 DEF_CPY_FUNC(32s,    int)
5354 DEF_CVT_FUNC_F2(32f32s, float, int, 32f32s_C1RSfs)
5355 DEF_CVT_FUNC(64f32s, double, int)
5356 
5357 DEF_CVT_FUNC_F(8u32f,  uchar, float, 8u32f_C1R)
5358 DEF_CVT_FUNC_F(8s32f,  schar, float, 8s32f_C1R)
5359 DEF_CVT_FUNC_F(16u32f, ushort, float, 16u32f_C1R)
5360 DEF_CVT_FUNC_F(16s32f, short, float, 16s32f_C1R)
5361 DEF_CVT_FUNC_F(32s32f, int, float, 32s32f_C1R)
5362 DEF_CVT_FUNC(64f32f, double, float)
5363 
5364 DEF_CVT_FUNC(8u64f,  uchar, double)
5365 DEF_CVT_FUNC(8s64f,  schar, double)
5366 DEF_CVT_FUNC(16u64f, ushort, double)
5367 DEF_CVT_FUNC(16s64f, short, double)
5368 DEF_CVT_FUNC(32s64f, int, double)
5369 DEF_CVT_FUNC(32f64f, float, double)
5370 DEF_CPY_FUNC(64s,    int64)
5371 
5372 static BinaryFunc getCvtScaleAbsFunc(int depth)
5373 {
5374     static BinaryFunc cvtScaleAbsTab[] =
5375     {
5376         (BinaryFunc)cvtScaleAbs8u, (BinaryFunc)cvtScaleAbs8s8u, (BinaryFunc)cvtScaleAbs16u8u,
5377         (BinaryFunc)cvtScaleAbs16s8u, (BinaryFunc)cvtScaleAbs32s8u, (BinaryFunc)cvtScaleAbs32f8u,
5378         (BinaryFunc)cvtScaleAbs64f8u, 0
5379     };
5380 
5381     return cvtScaleAbsTab[depth];
5382 }
5383 
5384 BinaryFunc getConvertFunc(int sdepth, int ddepth)
5385 {
5386     static BinaryFunc cvtTab[][8] =
5387     {
5388         {
5389             (BinaryFunc)(cvt8u), (BinaryFunc)GET_OPTIMIZED(cvt8s8u), (BinaryFunc)GET_OPTIMIZED(cvt16u8u),
5390             (BinaryFunc)GET_OPTIMIZED(cvt16s8u), (BinaryFunc)GET_OPTIMIZED(cvt32s8u), (BinaryFunc)GET_OPTIMIZED(cvt32f8u),
5391             (BinaryFunc)GET_OPTIMIZED(cvt64f8u), 0
5392         },
5393         {
5394             (BinaryFunc)GET_OPTIMIZED(cvt8u8s), (BinaryFunc)cvt8u, (BinaryFunc)GET_OPTIMIZED(cvt16u8s),
5395             (BinaryFunc)GET_OPTIMIZED(cvt16s8s), (BinaryFunc)GET_OPTIMIZED(cvt32s8s), (BinaryFunc)GET_OPTIMIZED(cvt32f8s),
5396             (BinaryFunc)GET_OPTIMIZED(cvt64f8s), 0
5397         },
5398         {
5399             (BinaryFunc)GET_OPTIMIZED(cvt8u16u), (BinaryFunc)GET_OPTIMIZED(cvt8s16u), (BinaryFunc)cvt16u,
5400             (BinaryFunc)GET_OPTIMIZED(cvt16s16u), (BinaryFunc)GET_OPTIMIZED(cvt32s16u), (BinaryFunc)GET_OPTIMIZED(cvt32f16u),
5401             (BinaryFunc)GET_OPTIMIZED(cvt64f16u), 0
5402         },
5403         {
5404             (BinaryFunc)GET_OPTIMIZED(cvt8u16s), (BinaryFunc)GET_OPTIMIZED(cvt8s16s), (BinaryFunc)GET_OPTIMIZED(cvt16u16s),
5405             (BinaryFunc)cvt16u, (BinaryFunc)GET_OPTIMIZED(cvt32s16s), (BinaryFunc)GET_OPTIMIZED(cvt32f16s),
5406             (BinaryFunc)GET_OPTIMIZED(cvt64f16s), 0
5407         },
5408         {
5409             (BinaryFunc)GET_OPTIMIZED(cvt8u32s), (BinaryFunc)GET_OPTIMIZED(cvt8s32s), (BinaryFunc)GET_OPTIMIZED(cvt16u32s),
5410             (BinaryFunc)GET_OPTIMIZED(cvt16s32s), (BinaryFunc)cvt32s, (BinaryFunc)GET_OPTIMIZED(cvt32f32s),
5411             (BinaryFunc)GET_OPTIMIZED(cvt64f32s), 0
5412         },
5413         {
5414             (BinaryFunc)GET_OPTIMIZED(cvt8u32f), (BinaryFunc)GET_OPTIMIZED(cvt8s32f), (BinaryFunc)GET_OPTIMIZED(cvt16u32f),
5415             (BinaryFunc)GET_OPTIMIZED(cvt16s32f), (BinaryFunc)GET_OPTIMIZED(cvt32s32f), (BinaryFunc)cvt32s,
5416             (BinaryFunc)GET_OPTIMIZED(cvt64f32f), 0
5417         },
5418         {
5419             (BinaryFunc)GET_OPTIMIZED(cvt8u64f), (BinaryFunc)GET_OPTIMIZED(cvt8s64f), (BinaryFunc)GET_OPTIMIZED(cvt16u64f),
5420             (BinaryFunc)GET_OPTIMIZED(cvt16s64f), (BinaryFunc)GET_OPTIMIZED(cvt32s64f), (BinaryFunc)GET_OPTIMIZED(cvt32f64f),
5421             (BinaryFunc)(cvt64s), 0
5422         },
5423         {
5424             0, 0, 0, 0, 0, 0, 0, 0
5425         }
5426     };
5427 
5428     return cvtTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
5429 }
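// Illustrative only: the table above is indexed as [destination depth][source depth],
// e.g.
//
//     BinaryFunc f = getConvertFunc(CV_8U, CV_32F);   // cvt8u32f
//     BinaryFunc g = getConvertFunc(CV_32F, CV_32F);  // cvt32s, the 4-byte copy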
5430 
5431 static BinaryFunc getConvertScaleFunc(int sdepth, int ddepth)
5432 {
5433     static BinaryFunc cvtScaleTab[][8] =
5434     {
5435         {
5436             (BinaryFunc)GET_OPTIMIZED(cvtScale8u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8u),
5437             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8u),
5438             (BinaryFunc)cvtScale64f8u, 0
5439         },
5440         {
5441             (BinaryFunc)GET_OPTIMIZED(cvtScale8u8s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u8s),
5442             (BinaryFunc)GET_OPTIMIZED(cvtScale16s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s8s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f8s),
5443             (BinaryFunc)cvtScale64f8s, 0
5444         },
5445         {
5446             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16u), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale16u),
5447             (BinaryFunc)GET_OPTIMIZED(cvtScale16s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16u), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16u),
5448             (BinaryFunc)cvtScale64f16u, 0
5449         },
5450         {
5451             (BinaryFunc)GET_OPTIMIZED(cvtScale8u16s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u16s),
5452             (BinaryFunc)GET_OPTIMIZED(cvtScale16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s16s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f16s),
5453             (BinaryFunc)cvtScale64f16s, 0
5454         },
5455         {
5456             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32s), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32s),
5457             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32s), (BinaryFunc)GET_OPTIMIZED(cvtScale32f32s),
5458             (BinaryFunc)cvtScale64f32s, 0
5459         },
5460         {
5461             (BinaryFunc)GET_OPTIMIZED(cvtScale8u32f), (BinaryFunc)GET_OPTIMIZED(cvtScale8s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale16u32f),
5462             (BinaryFunc)GET_OPTIMIZED(cvtScale16s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32s32f), (BinaryFunc)GET_OPTIMIZED(cvtScale32f),
5463             (BinaryFunc)cvtScale64f32f, 0
5464         },
5465         {
5466             (BinaryFunc)cvtScale8u64f, (BinaryFunc)cvtScale8s64f, (BinaryFunc)cvtScale16u64f,
5467             (BinaryFunc)cvtScale16s64f, (BinaryFunc)cvtScale32s64f, (BinaryFunc)cvtScale32f64f,
5468             (BinaryFunc)cvtScale64f, 0
5469         },
5470         {
5471             0, 0, 0, 0, 0, 0, 0, 0
5472         }
5473     };
5474 
5475     return cvtScaleTab[CV_MAT_DEPTH(ddepth)][CV_MAT_DEPTH(sdepth)];
5476 }
5477 
5478 #ifdef HAVE_OPENCL
5479 
5480 static bool ocl_convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
5481 {
5482     const ocl::Device & d = ocl::Device::getDefault();
5483 
5484     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
5485     bool doubleSupport = d.doubleFPConfig() > 0;
5486     if (!doubleSupport && depth == CV_64F)
5487         return false;
5488 
5489     _dst.create(_src.size(), CV_8UC(cn));
5490     int kercn = 1;
5491     if (d.isIntel())
5492     {
5493         static const int vectorWidths[] = {4, 4, 4, 4, 4, 4, 4, -1};
5494         kercn = ocl::checkOptimalVectorWidth( vectorWidths, _src, _dst,
5495                                               noArray(), noArray(), noArray(),
5496                                               noArray(), noArray(), noArray(),
5497                                               noArray(), ocl::OCL_VECTOR_MAX);
5498     }
5499     else
5500         kercn = ocl::predictOptimalVectorWidthMax(_src, _dst);
5501 
5502     int rowsPerWI = d.isIntel() ? 4 : 1;
5503     char cvt[2][50];
5504     int wdepth = std::max(depth, CV_32F);
5505     String build_opt = format("-D OP_CONVERT_SCALE_ABS -D UNARY_OP -D dstT=%s -D srcT1=%s"
5506                          " -D workT=%s -D wdepth=%d -D convertToWT1=%s -D convertToDT=%s"
5507                          " -D workT1=%s -D rowsPerWI=%d%s",
5508                          ocl::typeToStr(CV_8UC(kercn)),
5509                          ocl::typeToStr(CV_MAKE_TYPE(depth, kercn)),
5510                          ocl::typeToStr(CV_MAKE_TYPE(wdepth, kercn)), wdepth,
5511                          ocl::convertTypeStr(depth, wdepth, kercn, cvt[0]),
5512                          ocl::convertTypeStr(wdepth, CV_8U, kercn, cvt[1]),
5513                          ocl::typeToStr(wdepth), rowsPerWI,
5514                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
5515     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, build_opt);
5516     if (k.empty())
5517         return false;
5518 
5519     UMat src = _src.getUMat();
5520     UMat dst = _dst.getUMat();
5521 
5522     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5523             dstarg = ocl::KernelArg::WriteOnly(dst, cn, kercn);
5524 
5525     if (wdepth == CV_32F)
5526         k.args(srcarg, dstarg, (float)alpha, (float)beta);
5527     else if (wdepth == CV_64F)
5528         k.args(srcarg, dstarg, alpha, beta);
5529 
5530     size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
5531     return k.run(2, globalsize, NULL, false);
5532 }
5533 
5534 #endif
5535 
5536 }
5537 
5538 void cv::convertScaleAbs( InputArray _src, OutputArray _dst, double alpha, double beta )
5539 {
5540     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
5541                ocl_convertScaleAbs(_src, _dst, alpha, beta))
5542 
5543     Mat src = _src.getMat();
5544     int cn = src.channels();
5545     double scale[] = {alpha, beta};
5546     _dst.create( src.dims, src.size, CV_8UC(cn) );
5547     Mat dst = _dst.getMat();
5548     BinaryFunc func = getCvtScaleAbsFunc(src.depth());
5549     CV_Assert( func != 0 );
5550 
5551     if( src.dims <= 2 )
5552     {
5553         Size sz = getContinuousSize(src, dst, cn);
5554         func( src.ptr(), src.step, 0, 0, dst.ptr(), dst.step, sz, scale );
5555     }
5556     else
5557     {
5558         const Mat* arrays[] = {&src, &dst, 0};
5559         uchar* ptrs[2];
5560         NAryMatIterator it(arrays, ptrs);
5561         Size sz((int)it.size*cn, 1);
5562 
5563         for( size_t i = 0; i < it.nplanes; i++, ++it )
5564             func( ptrs[0], 0, 0, 0, ptrs[1], 0, sz, scale );
5565     }
5566 }
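/* Usage sketch (illustrative, not part of this file): convertScaleAbs computes
   dst(i) = saturate_cast<uchar>(|src(i)*alpha + beta|), which is handy for
   visualizing signed intermediate results:

       cv::Mat grad(3, 3, CV_16SC1, cv::Scalar(-300));
       cv::Mat vis;
       cv::convertScaleAbs(grad, vis, 0.5, 0);   // every element becomes |-300*0.5| = 150
*/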
5567 
5568 void cv::Mat::convertTo(OutputArray _dst, int _type, double alpha, double beta) const
5569 {
5570     bool noScale = fabs(alpha-1) < DBL_EPSILON && fabs(beta) < DBL_EPSILON;
5571 
5572     if( _type < 0 )
5573         _type = _dst.fixedType() ? _dst.type() : type();
5574     else
5575         _type = CV_MAKETYPE(CV_MAT_DEPTH(_type), channels());
5576 
5577     int sdepth = depth(), ddepth = CV_MAT_DEPTH(_type);
5578     if( sdepth == ddepth && noScale )
5579     {
5580         copyTo(_dst);
5581         return;
5582     }
5583 
5584     Mat src = *this;
5585 
5586     BinaryFunc func = noScale ? getConvertFunc(sdepth, ddepth) : getConvertScaleFunc(sdepth, ddepth);
5587     double scale[] = {alpha, beta};
5588     int cn = channels();
5589     CV_Assert( func != 0 );
5590 
5591     if( dims <= 2 )
5592     {
5593         _dst.create( size(), _type );
5594         Mat dst = _dst.getMat();
5595         Size sz = getContinuousSize(src, dst, cn);
5596         func( src.data, src.step, 0, 0, dst.data, dst.step, sz, scale );
5597     }
5598     else
5599     {
5600         _dst.create( dims, size, _type );
5601         Mat dst = _dst.getMat();
5602         const Mat* arrays[] = {&src, &dst, 0};
5603         uchar* ptrs[2];
5604         NAryMatIterator it(arrays, ptrs);
5605         Size sz((int)(it.size*cn), 1);
5606 
5607         for( size_t i = 0; i < it.nplanes; i++, ++it )
5608             func(ptrs[0], 1, 0, 0, ptrs[1], 1, sz, scale);
5609     }
5610 }
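/* Usage sketch (illustrative only): the scaled-conversion path of convertTo,
   turning a float image into 8-bit with rounding and saturation:

       cv::Mat f(2, 2, CV_32FC1, cv::Scalar(0.25f));
       cv::Mat u;
       f.convertTo(u, CV_8U, 255.0, 0.0);        // each element is cvRound(0.25*255) = 64
*/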
5611 
5612 /****************************************************************************************\
5613 *                                    LUT Transform                                       *
5614 \****************************************************************************************/
5615 
5616 namespace cv
5617 {
5618 
5619 template<typename T> static void
5620 LUT8u_( const uchar* src, const T* lut, T* dst, int len, int cn, int lutcn )
5621 {
5622     if( lutcn == 1 )
5623     {
5624         for( int i = 0; i < len*cn; i++ )
5625             dst[i] = lut[src[i]];
5626     }
5627     else
5628     {
5629         for( int i = 0; i < len*cn; i += cn )
5630             for( int k = 0; k < cn; k++ )
5631                 dst[i+k] = lut[src[i+k]*cn+k];
5632     }
5633 }
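/* With a single-channel table (lutcn == 1) every channel shares the same 256
   entries.  With a multi-channel table (lutcn == cn) the k-th channel of each
   pixel is looked up in its own column, i.e. dst[i+k] = lut[src[i+k]*cn + k];
   for example, with a 3-channel LUT a source value of 10 in channel 2 reads
   lut[10*3 + 2]. */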
5634 
5635 static void LUT8u_8u( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn )
5636 {
5637     LUT8u_( src, lut, dst, len, cn, lutcn );
5638 }
5639 
5640 static void LUT8u_8s( const uchar* src, const schar* lut, schar* dst, int len, int cn, int lutcn )
5641 {
5642     LUT8u_( src, lut, dst, len, cn, lutcn );
5643 }
5644 
5645 static void LUT8u_16u( const uchar* src, const ushort* lut, ushort* dst, int len, int cn, int lutcn )
5646 {
5647     LUT8u_( src, lut, dst, len, cn, lutcn );
5648 }
5649 
5650 static void LUT8u_16s( const uchar* src, const short* lut, short* dst, int len, int cn, int lutcn )
5651 {
5652     LUT8u_( src, lut, dst, len, cn, lutcn );
5653 }
5654 
5655 static void LUT8u_32s( const uchar* src, const int* lut, int* dst, int len, int cn, int lutcn )
5656 {
5657     LUT8u_( src, lut, dst, len, cn, lutcn );
5658 }
5659 
5660 static void LUT8u_32f( const uchar* src, const float* lut, float* dst, int len, int cn, int lutcn )
5661 {
5662     LUT8u_( src, lut, dst, len, cn, lutcn );
5663 }
5664 
5665 static void LUT8u_64f( const uchar* src, const double* lut, double* dst, int len, int cn, int lutcn )
5666 {
5667     LUT8u_( src, lut, dst, len, cn, lutcn );
5668 }
5669 
5670 typedef void (*LUTFunc)( const uchar* src, const uchar* lut, uchar* dst, int len, int cn, int lutcn );
5671 
5672 static LUTFunc lutTab[] =
5673 {
5674     (LUTFunc)LUT8u_8u, (LUTFunc)LUT8u_8s, (LUTFunc)LUT8u_16u, (LUTFunc)LUT8u_16s,
5675     (LUTFunc)LUT8u_32s, (LUTFunc)LUT8u_32f, (LUTFunc)LUT8u_64f, 0
5676 };
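/* lutTab is indexed by the LUT depth (CV_8U ... CV_64F); the source values are
   always 8-bit, so only the output element type varies. */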
5677 
5678 #ifdef HAVE_OPENCL
5679 
5680 static bool ocl_LUT(InputArray _src, InputArray _lut, OutputArray _dst)
5681 {
5682     int lcn = _lut.channels(), dcn = _src.channels(), ddepth = _lut.depth();
5683 
5684     UMat src = _src.getUMat(), lut = _lut.getUMat();
5685     _dst.create(src.size(), CV_MAKETYPE(ddepth, dcn));
5686     UMat dst = _dst.getUMat();
5687     int kercn = lcn == 1 ? std::min(4, ocl::predictOptimalVectorWidth(_src, _dst)) : dcn;
5688 
5689     ocl::Kernel k("LUT", ocl::core::lut_oclsrc,
5690                   format("-D dcn=%d -D lcn=%d -D srcT=%s -D dstT=%s", kercn, lcn,
5691                          ocl::typeToStr(src.depth()), ocl::memopTypeToStr(ddepth)));
5692     if (k.empty())
5693         return false;
5694 
5695     k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::ReadOnlyNoSize(lut),
5696         ocl::KernelArg::WriteOnly(dst, dcn, kercn));
5697 
5698     size_t globalSize[2] = { dst.cols * dcn / kercn, (dst.rows + 3) / 4 };
5699     return k.run(2, globalSize, NULL, false);
5700 }
5701 
5702 #endif
5703 
5704 #if defined(HAVE_IPP)
5705 namespace ipp {
5706 
5707 #if 0 // there are no performance benefits (PR #2653)
5708 class IppLUTParallelBody_LUTC1 : public ParallelLoopBody
5709 {
5710 public:
5711     bool* ok;
5712     const Mat& src_;
5713     const Mat& lut_;
5714     Mat& dst_;
5715 
5716     typedef IppStatus (*IppFn)(const Ipp8u* pSrc, int srcStep, void* pDst, int dstStep,
5717                           IppiSize roiSize, const void* pTable, int nBitSize);
5718     IppFn fn;
5719 
5720     int width;
5721 
5722     IppLUTParallelBody_LUTC1(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
5723         : ok(_ok), src_(src), lut_(lut), dst_(dst)
5724     {
5725         width = dst.cols * dst.channels();
5726 
5727         size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
5728 
5729         fn =
5730                 elemSize1 == 1 ? (IppFn)ippiLUTPalette_8u_C1R :
5731                 elemSize1 == 4 ? (IppFn)ippiLUTPalette_8u32u_C1R :
5732                 NULL;
5733 
5734         *ok = (fn != NULL);
5735     }
5736 
5737     void operator()( const cv::Range& range ) const
5738     {
5739         if (!*ok)
5740             return;
5741 
5742         const int row0 = range.start;
5743         const int row1 = range.end;
5744 
5745         Mat src = src_.rowRange(row0, row1);
5746         Mat dst = dst_.rowRange(row0, row1);
5747 
5748         IppiSize sz = { width, dst.rows };
5749 
5750         CV_DbgAssert(fn != NULL);
5751         if (fn(src.data, (int)src.step[0], dst.data, (int)dst.step[0], sz, lut_.data, 8) < 0)
5752         {
5753             setIppErrorStatus();
5754             *ok = false;
5755         }
5756         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5757     }
5758 private:
5759     IppLUTParallelBody_LUTC1(const IppLUTParallelBody_LUTC1&);
5760     IppLUTParallelBody_LUTC1& operator=(const IppLUTParallelBody_LUTC1&);
5761 };
5762 #endif
5763 
5764 class IppLUTParallelBody_LUTCN : public ParallelLoopBody
5765 {
5766 public:
5767     bool *ok;
5768     const Mat& src_;
5769     const Mat& lut_;
5770     Mat& dst_;
5771 
5772     int lutcn;
5773 
5774     uchar* lutBuffer;
5775     uchar* lutTable[4];
5776 
5777     IppLUTParallelBody_LUTCN(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
5778         : ok(_ok), src_(src), lut_(lut), dst_(dst), lutBuffer(NULL)
5779     {
5780         lutcn = lut.channels();
5781         IppiSize sz256 = {256, 1};
5782 
5783         size_t elemSize1 = dst.elemSize1();
5784         CV_DbgAssert(elemSize1 == 1);
5785         lutBuffer = (uchar*)ippMalloc(256 * (int)elemSize1 * 4);
5786         lutTable[0] = lutBuffer + 0;
5787         lutTable[1] = lutBuffer + 1 * 256 * elemSize1;
5788         lutTable[2] = lutBuffer + 2 * 256 * elemSize1;
5789         lutTable[3] = lutBuffer + 3 * 256 * elemSize1;
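        // The interleaved 256-entry LUT is split below into planar per-channel
        // tables (one pointer per channel), which is the layout the
        // ippiLUTPalette_8u_C3R / _C4R calls in operator() expect.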
5790 
5791         CV_DbgAssert(lutcn == 3 || lutcn == 4);
5792         if (lutcn == 3)
5793         {
5794             IppStatus status = ippiCopy_8u_C3P3R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
5795             if (status < 0)
5796             {
5797                 setIppErrorStatus();
5798                 return;
5799             }
5800             CV_IMPL_ADD(CV_IMPL_IPP);
5801         }
5802         else if (lutcn == 4)
5803         {
5804             IppStatus status = ippiCopy_8u_C4P4R(lut.ptr(), (int)lut.step[0], lutTable, (int)lut.step[0], sz256);
5805             if (status < 0)
5806             {
5807                 setIppErrorStatus();
5808                 return;
5809             }
5810             CV_IMPL_ADD(CV_IMPL_IPP);
5811         }
5812 
5813         *ok = true;
5814     }
5815 
5816     ~IppLUTParallelBody_LUTCN()
5817     {
5818         if (lutBuffer != NULL)
5819             ippFree(lutBuffer);
5820         lutBuffer = NULL;
5821         lutTable[0] = NULL;
5822     }
5823 
5824     void operator()( const cv::Range& range ) const
5825     {
5826         if (!*ok)
5827             return;
5828 
5829         const int row0 = range.start;
5830         const int row1 = range.end;
5831 
5832         Mat src = src_.rowRange(row0, row1);
5833         Mat dst = dst_.rowRange(row0, row1);
5834 
5835         if (lutcn == 3)
5836         {
5837             if (ippiLUTPalette_8u_C3R(
5838                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
5839                     ippiSize(dst.size()), lutTable, 8) >= 0)
5840             {
5841                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5842                 return;
5843             }
5844         }
5845         else if (lutcn == 4)
5846         {
5847             if (ippiLUTPalette_8u_C4R(
5848                     src.ptr(), (int)src.step[0], dst.ptr(), (int)dst.step[0],
5849                     ippiSize(dst.size()), lutTable, 8) >= 0)
5850             {
5851                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
5852                 return;
5853             }
5854         }
5855         setIppErrorStatus();
5856         *ok = false;
5857     }
5858 private:
5859     IppLUTParallelBody_LUTCN(const IppLUTParallelBody_LUTCN&);
5860     IppLUTParallelBody_LUTCN& operator=(const IppLUTParallelBody_LUTCN&);
5861 };
5862 } // namespace ipp
5863 #endif // IPP
5864 
5865 class LUTParallelBody : public ParallelLoopBody
5866 {
5867 public:
5868     bool* ok;
5869     const Mat& src_;
5870     const Mat& lut_;
5871     Mat& dst_;
5872 
5873     LUTFunc func;
5874 
5875     LUTParallelBody(const Mat& src, const Mat& lut, Mat& dst, bool* _ok)
5876         : ok(_ok), src_(src), lut_(lut), dst_(dst)
5877     {
5878         func = lutTab[lut.depth()];
5879         *ok = (func != NULL);
5880     }
5881 
5882     void operator()( const cv::Range& range ) const
5883     {
5884         CV_DbgAssert(*ok);
5885 
5886         const int row0 = range.start;
5887         const int row1 = range.end;
5888 
5889         Mat src = src_.rowRange(row0, row1);
5890         Mat dst = dst_.rowRange(row0, row1);
5891 
5892         int cn = src.channels();
5893         int lutcn = lut_.channels();
5894 
5895         const Mat* arrays[] = {&src, &dst, 0};
5896         uchar* ptrs[2];
5897         NAryMatIterator it(arrays, ptrs);
5898         int len = (int)it.size;
5899 
5900         for( size_t i = 0; i < it.nplanes; i++, ++it )
5901             func(ptrs[0], lut_.ptr(), ptrs[1], len, cn, lutcn);
5902     }
5903 private:
5904     LUTParallelBody(const LUTParallelBody&);
5905     LUTParallelBody& operator=(const LUTParallelBody&);
5906 };
5907 
5908 }
5909 
5910 void cv::LUT( InputArray _src, InputArray _lut, OutputArray _dst )
5911 {
5912     int cn = _src.channels(), depth = _src.depth();
5913     int lutcn = _lut.channels();
5914 
5915     CV_Assert( (lutcn == cn || lutcn == 1) &&
5916         _lut.total() == 256 && _lut.isContinuous() &&
5917         (depth == CV_8U || depth == CV_8S) );
5918 
5919     CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
5920                ocl_LUT(_src, _lut, _dst))
5921 
5922     Mat src = _src.getMat(), lut = _lut.getMat();
5923     _dst.create(src.dims, src.size, CV_MAKETYPE(_lut.depth(), cn));
5924     Mat dst = _dst.getMat();
5925 
5926     if (_src.dims() <= 2)
5927     {
5928         bool ok = false;
5929         Ptr<ParallelLoopBody> body;
5930 #if defined(HAVE_IPP)
5931         CV_IPP_CHECK()
5932         {
5933             size_t elemSize1 = CV_ELEM_SIZE1(dst.depth());
5934 #if 0 // there are no performance benefits (PR #2653)
5935             if (lutcn == 1)
5936             {
5937                 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTC1(src, lut, dst, &ok);
5938                 body.reset(p);
5939             }
5940             else
5941 #endif
5942             if ((lutcn == 3 || lutcn == 4) && elemSize1 == 1)
5943             {
5944                 ParallelLoopBody* p = new ipp::IppLUTParallelBody_LUTCN(src, lut, dst, &ok);
5945                 body.reset(p);
5946             }
5947         }
5948 #endif
5949         if (body == NULL || ok == false)
5950         {
5951             ok = false;
5952             ParallelLoopBody* p = new LUTParallelBody(src, lut, dst, &ok);
5953             body.reset(p);
5954         }
5955         if (body != NULL && ok)
5956         {
5957             Range all(0, dst.rows);
5958             if (dst.total()>>18)
5959                 parallel_for_(all, *body, (double)std::max((size_t)1, dst.total()>>16));
5960             else
5961                 (*body)(all);
5962             if (ok)
5963                 return;
5964         }
5965     }
5966 
5967     LUTFunc func = lutTab[lut.depth()];
5968     CV_Assert( func != 0 );
5969 
5970     const Mat* arrays[] = {&src, &dst, 0};
5971     uchar* ptrs[2];
5972     NAryMatIterator it(arrays, ptrs);
5973     int len = (int)it.size;
5974 
5975     for( size_t i = 0; i < it.nplanes; i++, ++it )
5976         func(ptrs[0], lut.ptr(), ptrs[1], len, cn, lutcn);
5977 }
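/* Usage sketch (illustrative, not part of the library source): the classic
   8-bit gamma correction built on cv::LUT.  The table must hold exactly 256
   entries and have either one channel or the same channel count as the source:

       #include <cmath>
       cv::Mat table(1, 256, CV_8U);
       for( int i = 0; i < 256; i++ )
           table.at<uchar>(i) = cv::saturate_cast<uchar>(std::pow(i/255.0, 1.0/2.2)*255.0);
       cv::Mat img(4, 4, CV_8UC3, cv::Scalar(32, 64, 128)), corrected;
       cv::LUT(img, table, corrected);   // every channel is remapped through the same table
*/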
5978 
5979 namespace cv {
5980 
5981 #ifdef HAVE_OPENCL
5982 
5983 static bool ocl_normalize( InputArray _src, InputOutputArray _dst, InputArray _mask, int dtype,
5984                            double scale, double delta )
5985 {
5986     UMat src = _src.getUMat();
5987 
5988     if( _mask.empty() )
5989         src.convertTo( _dst, dtype, scale, delta );
5990     else if (src.channels() <= 4)
5991     {
5992         const ocl::Device & dev = ocl::Device::getDefault();
5993 
5994         int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
5995                 ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32F, std::max(sdepth, ddepth)),
5996                 rowsPerWI = dev.isIntel() ? 4 : 1;
5997 
5998         float fscale = static_cast<float>(scale), fdelta = static_cast<float>(delta);
5999         bool haveScale = std::fabs(scale - 1) > DBL_EPSILON,
6000                 haveZeroScale = !(std::fabs(scale) > DBL_EPSILON),
6001                 haveDelta = std::fabs(delta) > DBL_EPSILON,
6002                 doubleSupport = dev.doubleFPConfig() > 0;
6003 
6004         if (!haveScale && !haveDelta && stype == dtype)
6005         {
6006             _src.copyTo(_dst, _mask);
6007             return true;
6008         }
6009         if (haveZeroScale)
6010         {
6011             _dst.setTo(Scalar(delta), _mask);
6012             return true;
6013         }
6014 
6015         if ((sdepth == CV_64F || ddepth == CV_64F) && !doubleSupport)
6016             return false;
6017 
6018         char cvt[2][40];
6019         String opts = format("-D srcT=%s -D dstT=%s -D convertToWT=%s -D cn=%d -D rowsPerWI=%d"
6020                              " -D convertToDT=%s -D workT=%s%s%s%s -D srcT1=%s -D dstT1=%s",
6021                              ocl::typeToStr(stype), ocl::typeToStr(dtype),
6022                              ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), cn,
6023                              rowsPerWI, ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]),
6024                              ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
6025                              doubleSupport ? " -D DOUBLE_SUPPORT" : "",
6026                              haveScale ? " -D HAVE_SCALE" : "",
6027                              haveDelta ? " -D HAVE_DELTA" : "",
6028                              ocl::typeToStr(sdepth), ocl::typeToStr(ddepth));
6029 
6030         ocl::Kernel k("normalizek", ocl::core::normalize_oclsrc, opts);
6031         if (k.empty())
6032             return false;
6033 
6034         UMat mask = _mask.getUMat(), dst = _dst.getUMat();
6035 
6036         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
6037                 maskarg = ocl::KernelArg::ReadOnlyNoSize(mask),
6038                 dstarg = ocl::KernelArg::ReadWrite(dst);
6039 
6040         if (haveScale)
6041         {
6042             if (haveDelta)
6043                 k.args(srcarg, maskarg, dstarg, fscale, fdelta);
6044             else
6045                 k.args(srcarg, maskarg, dstarg, fscale);
6046         }
6047         else
6048         {
6049             if (haveDelta)
6050                 k.args(srcarg, maskarg, dstarg, fdelta);
6051             else
6052                 k.args(srcarg, maskarg, dstarg);
6053         }
6054 
6055         size_t globalsize[2] = { src.cols, (src.rows + rowsPerWI - 1) / rowsPerWI };
6056         return k.run(2, globalsize, NULL, false);
6057     }
6058     else
6059     {
6060         UMat temp;
6061         src.convertTo( temp, dtype, scale, delta );
6062         temp.copyTo( _dst, _mask );
6063     }
6064 
6065     return true;
6066 }
6067 
6068 #endif
6069 
6070 }
6071 
6072 void cv::normalize( InputArray _src, InputOutputArray _dst, double a, double b,
6073                     int norm_type, int rtype, InputArray _mask )
6074 {
6075     double scale = 1, shift = 0;
6076     if( norm_type == CV_MINMAX )
6077     {
6078         double smin = 0, smax = 0;
6079         double dmin = MIN( a, b ), dmax = MAX( a, b );
6080         minMaxLoc( _src, &smin, &smax, 0, 0, _mask );
6081         scale = (dmax - dmin)*(smax - smin > DBL_EPSILON ? 1./(smax - smin) : 0);
6082         shift = dmin - smin*scale;
6083     }
6084     else if( norm_type == CV_L2 || norm_type == CV_L1 || norm_type == CV_C )
6085     {
6086         scale = norm( _src, norm_type, _mask );
6087         scale = scale > DBL_EPSILON ? a/scale : 0.;
6088         shift = 0;
6089     }
6090     else
6091         CV_Error( CV_StsBadArg, "Unknown/unsupported norm type" );
6092 
6093     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
6094     if( rtype < 0 )
6095         rtype = _dst.fixedType() ? _dst.depth() : depth;
6096     _dst.createSameSize(_src, CV_MAKETYPE(rtype, cn));
6097 
6098     CV_OCL_RUN(_dst.isUMat(),
6099                ocl_normalize(_src, _dst, _mask, rtype, scale, shift))
6100 
6101     Mat src = _src.getMat(), dst = _dst.getMat();
6102     if( _mask.empty() )
6103         src.convertTo( dst, rtype, scale, shift );
6104     else
6105     {
6106         Mat temp;
6107         src.convertTo( temp, rtype, scale, shift );
6108         temp.copyTo( dst, _mask );
6109     }
6110 }
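/* Usage sketch (illustrative only): with NORM_MINMAX the source range
   [smin, smax] is mapped linearly onto [dmin, dmax] using the values computed
   above (scale = (dmax - dmin)/(smax - smin), shift = dmin - smin*scale):

       cv::Mat m = (cv::Mat_<float>(1, 3) << 2.f, 4.f, 6.f), n;
       cv::normalize(m, n, 0, 1, cv::NORM_MINMAX);   // n == [0, 0.5, 1]
*/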
6111 
6112 CV_IMPL void
6113 cvSplit( const void* srcarr, void* dstarr0, void* dstarr1, void* dstarr2, void* dstarr3 )
6114 {
6115     void* dptrs[] = { dstarr0, dstarr1, dstarr2, dstarr3 };
6116     cv::Mat src = cv::cvarrToMat(srcarr);
6117     int i, j, nz = 0;
6118     for( i = 0; i < 4; i++ )
6119         nz += dptrs[i] != 0;
6120     CV_Assert( nz > 0 );
6121     std::vector<cv::Mat> dvec(nz);
6122     std::vector<int> pairs(nz*2);
6123 
6124     for( i = j = 0; i < 4; i++ )
6125     {
6126         if( dptrs[i] != 0 )
6127         {
6128             dvec[j] = cv::cvarrToMat(dptrs[i]);
6129             CV_Assert( dvec[j].size() == src.size() );
6130             CV_Assert( dvec[j].depth() == src.depth() );
6131             CV_Assert( dvec[j].channels() == 1 );
6132             CV_Assert( i < src.channels() );
6133             pairs[j*2] = i;
6134             pairs[j*2+1] = j;
6135             j++;
6136         }
6137     }
6138     if( nz == src.channels() )
6139         cv::split( src, dvec );
6140     else
6141     {
6142         cv::mixChannels( &src, 1, &dvec[0], nz, &pairs[0], nz );
6143     }
6144 }
6145 
6146 
6147 CV_IMPL void
6148 cvMerge( const void* srcarr0, const void* srcarr1, const void* srcarr2,
6149          const void* srcarr3, void* dstarr )
6150 {
6151     const void* sptrs[] = { srcarr0, srcarr1, srcarr2, srcarr3 };
6152     cv::Mat dst = cv::cvarrToMat(dstarr);
6153     int i, j, nz = 0;
6154     for( i = 0; i < 4; i++ )
6155         nz += sptrs[i] != 0;
6156     CV_Assert( nz > 0 );
6157     std::vector<cv::Mat> svec(nz);
6158     std::vector<int> pairs(nz*2);
6159 
6160     for( i = j = 0; i < 4; i++ )
6161     {
6162         if( sptrs[i] != 0 )
6163         {
6164             svec[j] = cv::cvarrToMat(sptrs[i]);
6165             CV_Assert( svec[j].size == dst.size &&
6166                 svec[j].depth() == dst.depth() &&
6167                 svec[j].channels() == 1 && i < dst.channels() );
6168             pairs[j*2] = j;
6169             pairs[j*2+1] = i;
6170             j++;
6171         }
6172     }
6173 
6174     if( nz == dst.channels() )
6175         cv::merge( svec, dst );
6176     else
6177     {
6178         cv::mixChannels( &svec[0], nz, &dst, 1, &pairs[0], nz );
6179     }
6180 }
6181 
6182 
6183 CV_IMPL void
6184 cvMixChannels( const CvArr** src, int src_count,
6185                CvArr** dst, int dst_count,
6186                const int* from_to, int pair_count )
6187 {
6188     cv::AutoBuffer<cv::Mat> buf(src_count + dst_count);
6189 
6190     int i;
6191     for( i = 0; i < src_count; i++ )
6192         buf[i] = cv::cvarrToMat(src[i]);
6193     for( i = 0; i < dst_count; i++ )
6194         buf[i+src_count] = cv::cvarrToMat(dst[i]);
6195     cv::mixChannels(&buf[0], src_count, &buf[src_count], dst_count, from_to, pair_count);
6196 }
6197 
6198 CV_IMPL void
6199 cvConvertScaleAbs( const void* srcarr, void* dstarr,
6200                    double scale, double shift )
6201 {
6202     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6203     CV_Assert( src.size == dst.size && dst.type() == CV_8UC(src.channels()));
6204     cv::convertScaleAbs( src, dst, scale, shift );
6205 }
6206 
6207 CV_IMPL void
6208 cvConvertScale( const void* srcarr, void* dstarr,
6209                 double scale, double shift )
6210 {
6211     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
6212 
6213     CV_Assert( src.size == dst.size && src.channels() == dst.channels() );
6214     src.convertTo(dst, dst.type(), scale, shift);
6215 }
6216 
6217 CV_IMPL void cvLUT( const void* srcarr, void* dstarr, const void* lutarr )
6218 {
6219     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), lut = cv::cvarrToMat(lutarr);
6220 
6221     CV_Assert( dst.size() == src.size() && dst.type() == CV_MAKETYPE(lut.depth(), src.channels()) );
6222     cv::LUT( src, lut, dst );
6223 }
6224 
6225 CV_IMPL void cvNormalize( const CvArr* srcarr, CvArr* dstarr,
6226                           double a, double b, int norm_type, const CvArr* maskarr )
6227 {
6228     cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
6229     if( maskarr )
6230         mask = cv::cvarrToMat(maskarr);
6231     CV_Assert( dst.size() == src.size() && src.channels() == dst.channels() );
6232     cv::normalize( src, dst, a, b, norm_type, dst.type(), mask );
6233 }
6234 
6235 /* End of file. */
6236