1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 /********************************* COPYRIGHT NOTICE *******************************\
45   The function for RGB to Lab conversion is based on the MATLAB script
46   RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
47   See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
48 \**********************************************************************************/
49 
50 /********************************* COPYRIGHT NOTICE *******************************\
51   Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
52   from MD-Mathematische Dienste GmbH. Below is the copyright notice:
53 
54     IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
55     By downloading, copying, installing or using the software you agree
56     to this license. If you do not agree to this license, do not download,
57     install, copy or use the software.
58 
59     Contributors License Agreement:
60 
61       Copyright (c) 2002,
62       MD-Mathematische Dienste GmbH
63       Im Defdahl 5-10
64       44141 Dortmund
65       Germany
66       www.md-it.de
67 
68     Redistribution and use in source and binary forms,
69     with or without modification, are permitted provided
70     that the following conditions are met:
71 
72     Redistributions of source code must retain
73     the above copyright notice, this list of conditions and the following disclaimer.
74     Redistributions in binary form must reproduce the above copyright notice,
75     this list of conditions and the following disclaimer in the documentation
76     and/or other materials provided with the distribution.
77     The name of Contributor may not be used to endorse or promote products
78     derived from this software without specific prior written permission.
79 
80     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
81     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
82     THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
83     PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
84     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
85     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
86     OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
87     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
88     STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
89     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
90     THE POSSIBILITY OF SUCH DAMAGE.
91 \**********************************************************************************/
92 
93 #include "precomp.hpp"
94 #include "opencl_kernels_imgproc.hpp"
95 #include <limits>
96 
// Rounding right shift: adds half of the divisor, (1 << (n-1)), to x and then shifts right by n bits.
// n is expanded twice, so do not pass an expression with side effects; n must be >= 1
// because the macro computes 1 << (n-1).
#define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
98 
#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
// Per-depth constants that the ippiSwapChannels_*_C3C4Rf wrappers in this file pass as the
// trailing value argument of the corresponding IPP C3->C4 swap function.
#define MAX_IPP8u   255
#define MAX_IPP16u  65535
#define MAX_IPP32f  1.0
// Calls ippInit() during static initialization of this translation unit and stores the status.
// NOTE(review): sts is not read anywhere in the visible part of this file.
static IppStatus sts = ippInit();
#endif
105 
106 namespace cv
107 {
108 
// Builds cubic spline coefficients for the samples (xi = i, yi = f[i]), i = 0..n.
//   f   - sample values; f[0]..f[n] are read (n+1 elements)
//   n   - number of spline segments
//   tab - output, 4 coefficients per segment: tab[i*4 + 0..3] = {f[i], b, c, d}
// NOTE(review): for n > 1 the backward pass reads tab[(n-1)*4] and tab[(n-1)*4 + 1], which the
// forward pass never writes; callers presumably hand in zero-initialized storage - confirm.
template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
{
    const _Tp oneThird = (_Tp)0.3333333333333333;

    tab[0] = tab[1] = (_Tp)0;

    // forward sweep over the interior nodes: stash the elimination factors in slots 0 and 1
    for( int k = 1; k < n-1; k++ )
    {
        const _Tp* prev = tab + (k-1)*4;
        _Tp* cur = tab + k*4;

        _Tp rhs = 3*(f[k+1] - 2*f[k] + f[k-1]);
        _Tp inv = 1/(4 - prev[0]);
        cur[0] = inv;
        cur[1] = (rhs - prev[1])*inv;
    }

    // backward sweep: recover c from the stashed factors, then derive b and d for each segment
    _Tp cNext = 0;
    for( int k = n-1; k >= 0; k-- )
    {
        _Tp* cur = tab + k*4;

        _Tp c = cur[1] - cur[0]*cNext;
        _Tp b = f[k+1] - f[k] - (cNext + c*2)*oneThird;
        _Tp d = (cNext - c)*oneThird;

        cur[0] = f[k];
        cur[1] = b;
        cur[2] = c;
        cur[3] = d;
        cNext = c;
    }
}
133 
// interpolates value of a function at x, 0 <= x <= n using a cubic spline.
//   x   - evaluation point; its truncated integer part, clamped to [0, n-1], selects the segment
//   tab - coefficient table, 4 values per segment, read as tab[ix*4 + 0..3]
//   n   - number of segments
// Returns tab[0] + tab[1]*t + tab[2]*t^2 + tab[3]*t^3 (evaluated in Horner form) with t = x - ix.
// Because ix is clamped, an x outside [0, n] is evaluated with the first/last segment's polynomial.
template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
{
    // don't touch this function without urgent need - some versions of gcc fail to inline it correctly
    int ix = std::min(std::max(int(x), 0), n-1);
    x -= ix;
    tab += ix*4;
    return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
}
143 
144 
// Per-channel-type constants.
//   worktype_f - floating-point type declared for intermediate computations (float)
//   max()      - largest value of _Tp as reported by std::numeric_limits
//   half()     - max()/2 + 1, converted back to _Tp
template<typename _Tp> struct ColorChannel
{
    typedef float worktype_f;

    static _Tp max()
    {
        typedef std::numeric_limits<_Tp> limits;
        return limits::max();
    }

    static _Tp half()
    {
        return static_cast<_Tp>(max()/2 + 1);
    }
};

// float channels: the maximum is 1 and the half value is 0.5
template<> struct ColorChannel<float>
{
    typedef float worktype_f;

    static float max()
    {
        return 1.f;
    }

    static float half()
    {
        return 0.5f;
    }
};
158 
159 /*template<> struct ColorChannel<double>
160 {
161     typedef double worktype_f;
162     static double max() { return 1.; }
163     static double half() { return 0.5; }
164 };*/
165 
166 
167 ///////////////////////////// Top-level template function ////////////////////////////////
168 
169 template <typename Cvt>
170 class CvtColorLoop_Invoker : public ParallelLoopBody
171 {
172     typedef typename Cvt::channel_type _Tp;
173 public:
174 
CvtColorLoop_Invoker(const Mat & _src,Mat & _dst,const Cvt & _cvt)175     CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
176         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
177     {
178     }
179 
operator ()(const Range & range) const180     virtual void operator()(const Range& range) const
181     {
182         const uchar* yS = src.ptr<uchar>(range.start);
183         uchar* yD = dst.ptr<uchar>(range.start);
184 
185         for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
186             cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
187     }
188 
189 private:
190     const Mat& src;
191     Mat& dst;
192     const Cvt& cvt;
193 
194     const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
195 };
196 
197 template <typename Cvt>
CvtColorLoop(const Mat & src,Mat & dst,const Cvt & cvt)198 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
199 {
200     parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
201 }
202 
203 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
204 
// Untyped signatures that the concrete IPP primitives are cast to before being stored in the
// dispatch tables of this file. All share (src, srcStep, dst, dstStep, roiSize).

// channel reordering: takes an extra array giving the destination channel order
typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
// conversion/copy that needs no extra arguments
typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
// color-to-gray conversion: takes an extra array of float coefficients
typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
208 
209 template <typename Cvt>
210 class CvtColorIPPLoop_Invoker :
211         public ParallelLoopBody
212 {
213 public:
214 
CvtColorIPPLoop_Invoker(const Mat & _src,Mat & _dst,const Cvt & _cvt,bool * _ok)215     CvtColorIPPLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt, bool *_ok) :
216         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt), ok(_ok)
217     {
218         *ok = true;
219     }
220 
operator ()(const Range & range) const221     virtual void operator()(const Range& range) const
222     {
223         const void *yS = src.ptr<uchar>(range.start);
224         void *yD = dst.ptr<uchar>(range.start);
225         if( !cvt(yS, (int)src.step[0], yD, (int)dst.step[0], src.cols, range.end - range.start) )
226             *ok = false;
227         else
228         {
229             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
230         }
231     }
232 
233 private:
234     const Mat& src;
235     Mat& dst;
236     const Cvt& cvt;
237     bool *ok;
238 
239     const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
240 };
241 
242 template <typename Cvt>
CvtColorIPPLoop(const Mat & src,Mat & dst,const Cvt & cvt)243 bool CvtColorIPPLoop(const Mat& src, Mat& dst, const Cvt& cvt)
244 {
245     bool ok;
246     parallel_for_(Range(0, src.rows), CvtColorIPPLoop_Invoker<Cvt>(src, dst, cvt, &ok), src.total()/(double)(1<<16) );
247     return ok;
248 }
249 
250 template <typename Cvt>
CvtColorIPPLoopCopy(Mat & src,Mat & dst,const Cvt & cvt)251 bool CvtColorIPPLoopCopy(Mat& src, Mat& dst, const Cvt& cvt)
252 {
253     Mat temp;
254     Mat &source = src;
255     if( src.data == dst.data )
256     {
257         src.copyTo(temp);
258         source = temp;
259     }
260     bool ok;
261     parallel_for_(Range(0, source.rows), CvtColorIPPLoop_Invoker<Cvt>(source, dst, cvt, &ok),
262                   source.total()/(double)(1<<16) );
263     return ok;
264 }
265 
ippiSwapChannels_8u_C3C4Rf(const Ipp8u * pSrc,int srcStep,Ipp8u * pDst,int dstStep,IppiSize roiSize,const int * dstOrder)266 static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
267          IppiSize roiSize, const int *dstOrder)
268 {
269     return ippiSwapChannels_8u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
270 }
271 
ippiSwapChannels_16u_C3C4Rf(const Ipp16u * pSrc,int srcStep,Ipp16u * pDst,int dstStep,IppiSize roiSize,const int * dstOrder)272 static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
273          IppiSize roiSize, const int *dstOrder)
274 {
275     return ippiSwapChannels_16u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
276 }
277 
ippiSwapChannels_32f_C3C4Rf(const Ipp32f * pSrc,int srcStep,Ipp32f * pDst,int dstStep,IppiSize roiSize,const int * dstOrder)278 static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
279          IppiSize roiSize, const int *dstOrder)
280 {
281     return ippiSwapChannels_32f_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
282 }
283 
// IPP dispatch tables for channel reordering / alpha dropping. Each table has 8 slots; slots 0, 2
// and 5 hold the 8u, 16u and 32f variants and the remaining slots are null.
// NOTE(review): the slot layout matches OpenCV depth codes (CV_8U = 0, CV_16U = 2, CV_32F = 5), so
// the tables are presumably indexed by depth - the lookups are outside this part of the file.

// 3 channels -> 4 channels with reordering (through the local *_C3C4Rf adapters)
static ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
};

// AC4 -> C3 copy
static ippiGeneralFunc ippiCopyAC4C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
};

// 4 channels -> 3 channels with reordering
static ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
};

// 3 channels -> 3 channels with reordering
static ippiReorderFunc ippiSwapChannelsC3RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
};
307 
#if IPP_VERSION_X100 >= 801
// 4 channels -> 4 channels with reordering; only compiled when IPP_VERSION_X100 >= 801.
// Same 8-slot layout as the other tables: 8u in slot 0, 16u in slot 2, 32f in slot 5.
static ippiReorderFunc ippiSwapChannelsC4RTab[] =
{
    (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
    0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
};
#endif
315 
// IPP dispatch tables for gray conversions. 8 slots each: 8u in slot 0, 16u in slot 2, 32f in
// slot 5, all other slots null (presumably indexed by OpenCV depth code - lookups are not shown here).

// color -> gray with caller-supplied coefficients, 3-channel source
static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
};

// color -> gray with caller-supplied coefficients, AC4 source
static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
{
    (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
    0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
};

// RGB -> gray without a coefficient argument, 3-channel source
static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
};

// RGB -> gray without a coefficient argument, AC4 source
static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
{
    (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
    0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
};

// P3 -> C3 copy; IPPGray2BGRFunctor / IPPGray2BGRAFunctor call a function of this type with an
// array of three source pointers
static ippiGeneralFunc ippiCopyP3C3RTab[] =
{
    (ippiGeneralFunc)ippiCopy_8u_P3C3R, 0, (ippiGeneralFunc)ippiCopy_16u_P3C3R, 0,
    0, (ippiGeneralFunc)ippiCopy_32f_P3C3R, 0, 0
};
345 
// IPP dispatch tables for 3-channel color space conversions. 8 slots each: 8u in slot 0, 16u in
// slot 2, 32f in slot 5 where available, null elsewhere (presumably indexed by OpenCV depth code -
// lookups are not shown here). A null slot makes the IPP functors in this file return false.

static ippiGeneralFunc ippiRGB2XYZTab[] =
{
    (ippiGeneralFunc)ippiRGBToXYZ_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToXYZ_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToXYZ_32f_C3R, 0, 0
};

static ippiGeneralFunc ippiXYZ2RGBTab[] =
{
    (ippiGeneralFunc)ippiXYZToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiXYZToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiXYZToRGB_32f_C3R, 0, 0
};

// no 32f entry for RGB -> HSV
static ippiGeneralFunc ippiRGB2HSVTab[] =
{
    (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
    0, 0, 0, 0
};

// no 32f entry for HSV -> RGB
static ippiGeneralFunc ippiHSV2RGBTab[] =
{
    (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
    0, 0, 0, 0
};

static ippiGeneralFunc ippiRGB2HLSTab[] =
{
    (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
};

static ippiGeneralFunc ippiHLS2RGBTab[] =
{
    (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
};
381 
// RGB <-> LUV tables. The trailing "&& 0" makes this condition always false, so the block is
// currently compiled out regardless of HAVE_IPP_ICV_ONLY.
#if !defined(HAVE_IPP_ICV_ONLY) && 0
static ippiGeneralFunc ippiRGBToLUVTab[] =
{
    (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0
};

static ippiGeneralFunc ippiLUVToRGBTab[] =
{
    (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0,
    0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0
};
#endif
395 
396 struct IPPGeneralFunctor
397 {
IPPGeneralFunctorcv::IPPGeneralFunctor398     IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){}
operator ()cv::IPPGeneralFunctor399     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
400     {
401         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
402     }
403 private:
404     ippiGeneralFunc func;
405 };
406 
407 struct IPPReorderFunctor
408 {
IPPReorderFunctorcv::IPPReorderFunctor409     IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : func(_func)
410     {
411         order[0] = _order0;
412         order[1] = _order1;
413         order[2] = _order2;
414         order[3] = 3;
415     }
operator ()cv::IPPReorderFunctor416     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
417     {
418         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
419     }
420 private:
421     ippiReorderFunc func;
422     int order[4];
423 };
424 
425 struct IPPColor2GrayFunctor
426 {
IPPColor2GrayFunctorcv::IPPColor2GrayFunctor427     IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
428         func(_func)
429     {
430         coeffs[0] = 0.114f;
431         coeffs[1] = 0.587f;
432         coeffs[2] = 0.299f;
433     }
operator ()cv::IPPColor2GrayFunctor434     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
435     {
436         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
437     }
438 private:
439     ippiColor2GrayFunc func;
440     Ipp32f coeffs[3];
441 };
442 
443 struct IPPGray2BGRFunctor
444 {
IPPGray2BGRFunctorcv::IPPGray2BGRFunctor445     IPPGray2BGRFunctor(ippiGeneralFunc _func) :
446         func(_func)
447     {
448     }
449 
operator ()cv::IPPGray2BGRFunctor450     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
451     {
452         if (func == 0)
453             return false;
454 
455         const void* srcarray[3] = { src, src, src };
456         return func(srcarray, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
457     }
458 private:
459     ippiGeneralFunc func;
460 };
461 
462 struct IPPGray2BGRAFunctor
463 {
IPPGray2BGRAFunctorcv::IPPGray2BGRAFunctor464     IPPGray2BGRAFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _depth) :
465         func1(_func1), func2(_func2), depth(_depth)
466     {
467     }
468 
operator ()cv::IPPGray2BGRAFunctor469     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
470     {
471         if (func1 == 0 || func2 == 0)
472             return false;
473 
474         const void* srcarray[3] = { src, src, src };
475         Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
476         if(func1(srcarray, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
477             return false;
478         int order[4] = {0, 1, 2, 3};
479         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
480     }
481 private:
482     ippiGeneralFunc func1;
483     ippiReorderFunc func2;
484     int depth;
485 };
486 
487 struct IPPReorderGeneralFunctor
488 {
IPPReorderGeneralFunctorcv::IPPReorderGeneralFunctor489     IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
490         func1(_func1), func2(_func2), depth(_depth)
491     {
492         order[0] = _order0;
493         order[1] = _order1;
494         order[2] = _order2;
495         order[3] = 3;
496     }
operator ()cv::IPPReorderGeneralFunctor497     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
498     {
499         if (func1 == 0 || func2 == 0)
500             return false;
501 
502         Mat temp;
503         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
504         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
505             return false;
506         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
507     }
508 private:
509     ippiReorderFunc func1;
510     ippiGeneralFunc func2;
511     int order[4];
512     int depth;
513 };
514 
515 struct IPPGeneralReorderFunctor
516 {
IPPGeneralReorderFunctorcv::IPPGeneralReorderFunctor517     IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
518         func1(_func1), func2(_func2), depth(_depth)
519     {
520         order[0] = _order0;
521         order[1] = _order1;
522         order[2] = _order2;
523         order[3] = 3;
524     }
operator ()cv::IPPGeneralReorderFunctor525     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
526     {
527         if (func1 == 0 || func2 == 0)
528             return false;
529 
530         Mat temp;
531         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
532         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
533             return false;
534         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
535     }
536 private:
537     ippiGeneralFunc func1;
538     ippiReorderFunc func2;
539     int order[4];
540     int depth;
541 };
542 
543 #endif
544 
545 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
546 
547 template<typename _Tp> struct RGB2RGB
548 {
549     typedef _Tp channel_type;
550 
RGB2RGBcv::RGB2RGB551     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
operator ()cv::RGB2RGB552     void operator()(const _Tp* src, _Tp* dst, int n) const
553     {
554         int scn = srccn, dcn = dstcn, bidx = blueIdx;
555         if( dcn == 3 )
556         {
557             n *= 3;
558             for( int i = 0; i < n; i += 3, src += scn )
559             {
560                 _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
561                 dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
562             }
563         }
564         else if( scn == 3 )
565         {
566             n *= 3;
567             _Tp alpha = ColorChannel<_Tp>::max();
568             for( int i = 0; i < n; i += 3, dst += 4 )
569             {
570                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
571                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
572             }
573         }
574         else
575         {
576             n *= 4;
577             for( int i = 0; i < n; i += 4 )
578             {
579                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
580                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
581             }
582         }
583     }
584 
585     int srccn, dstcn, blueIdx;
586 };
587 
588 #if CV_NEON
589 
// NEON specialization of RGB2RGB for 8-bit channels. Same contract as the generic template:
// n is the pixel count, srccn/dstcn are channels per pixel and blueIdx (0 or 2) selects which
// of channels 0/2 is swapped into place. Every branch runs a 16-pixel NEON loop, then an
// 8-pixel NEON loop, then a scalar loop for the remaining pixels.
template<> struct RGB2RGB<uchar>
{
    typedef uchar channel_type;

    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    {
        // alpha value written when a fourth channel is added (16-lane and 8-lane versions)
        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
        v_alpha2 = vget_low_u8(v_alpha);
    }

    void operator()(const uchar * src, uchar * dst, int n) const
    {
        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
        if (dcn == 3)
        {
            // destination has 3 channels: i counts destination elements, src is advanced separately
            n *= 3;
            if (scn == 3)
            {
                // 3 -> 3: 16 pixels (48 bytes) per iteration
                for ( ; i <= n - 48; i += 48, src += 48 )
                {
                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }
                // 8 pixels (24 bytes) per iteration
                for ( ; i <= n - 24; i += 24, src += 24 )
                {
                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }
                // scalar tail
                for ( ; i < n; i += 3, src += 3 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
            else
            {
                // 4 -> 3: the source advances 4 bytes per pixel, the fourth channel is dropped
                for ( ; i <= n - 48; i += 48, src += 64 )
                {
                    uint8x16x4_t v_src = vld4q_u8(src);
                    uint8x16x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3q_u8(dst + i, v_dst);
                }
                for ( ; i <= n - 24; i += 24, src += 32 )
                {
                    uint8x8x4_t v_src = vld4_u8(src);
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_src.val[bidx];
                    v_dst.val[1] = v_src.val[1];
                    v_dst.val[2] = v_src.val[bidx ^ 2];
                    vst3_u8(dst + i, v_dst);
                }
                // scalar tail
                for ( ; i < n; i += 3, src += 4 )
                {
                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
                }
            }
        }
        else if (scn == 3)
        {
            // 3 -> 4: i counts source elements, dst advances 4 bytes per pixel, alpha is filled in
            n *= 3;
            for ( ; i <= n - 48; i += 48, dst += 64 )
            {
                uint8x16x3_t v_src = vld3q_u8(src + i);
                uint8x16x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha;
                vst4q_u8(dst, v_dst);
            }
            for ( ; i <= n - 24; i += 24, dst += 32 )
            {
                uint8x8x3_t v_src = vld3_u8(src + i);
                uint8x8x4_t v_dst;
                v_dst.val[bidx] = v_src.val[0];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[bidx ^ 2] = v_src.val[2];
                v_dst.val[3] = v_alpha2;
                vst4_u8(dst, v_dst);
            }
            // scalar tail
            uchar alpha = ColorChannel<uchar>::max();
            for (; i < n; i += 3, dst += 4 )
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
        else
        {
            // 4 -> 4: swap channels 0 and 2, keep channels 1 and 3 (bidx is not consulted here)
            n *= 4;
            for ( ; i <= n - 64; i += 64 )
            {
                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[2];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[0];
                v_dst.val[3] = v_src.val[3];
                vst4q_u8(dst + i, v_dst);
            }
            for ( ; i <= n - 32; i += 32 )
            {
                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
                v_dst.val[0] = v_src.val[2];
                v_dst.val[1] = v_src.val[1];
                v_dst.val[2] = v_src.val[0];
                v_dst.val[3] = v_src.val[3];
                vst4_u8(dst + i, v_dst);
            }
            // scalar tail
            for ( ; i < n; i += 4)
            {
                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
                dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
            }
        }
    }

    int srccn, dstcn, blueIdx;

    // constant alpha vectors set up in the constructor (16 lanes / 8 lanes)
    uint8x16_t v_alpha;
    uint8x8_t v_alpha2;
};
722 
723 #endif
724 
725 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
726 
// Row functor that unpacks 16-bit packed pixels (5-6-5 when greenBits == 6, otherwise 5-5-5 with
// the top bit used for alpha) into 3 or 4 8-bit channels.
//   _dstcn     - channels per destination pixel (3 or 4)
//   _blueIdx   - destination index (0 or 2) that receives the lowest 5 bits; index blueIdx^2
//                receives the highest color field
//   _greenBits - 6 selects the 5-6-5 layout, any other value the 5-5-5 layout
// Each field is shifted into the top bits of its byte; the low bits are left zero (no replication).
struct RGB5x52RGB
{
    typedef uchar channel_type;

    RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
        : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
    {
        #if CV_NEON
        // masks clearing the low 2 / 3 bits, plus constants for the alpha channel
        v_n3 = vdupq_n_u16(~3);
        v_n7 = vdupq_n_u16(~7);
        v_255 = vdupq_n_u8(255);
        v_0 = vdupq_n_u8(0);
        v_mask = vdupq_n_u16(0x8000);
        #endif
    }

    // src is read as n 16-bit values; dst receives n pixels of dstcn bytes each.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        if( greenBits == 6 )
        {
            // 5-6-5 layout
            #if CV_NEON
            // 16 pixels per iteration
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    v_dst.val[3] = v_255;
                    vst4q_u8(dst, v_dst);
                }
            }
            #endif
            // scalar path (all pixels without NEON, otherwise the remaining tail)
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);                // bits 0..4
                dst[1] = (uchar)((t >> 3) & ~3);            // bits 5..10
                dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);     // bits 11..15
                if( dcn == 4 )
                    dst[3] = 255;                           // 5-6-5 has no alpha bit: always opaque
            }
        }
        else
        {
            // 5-5-5 layout, bit 15 drives alpha
            #if CV_NEON
            // 16 pixels per iteration
            for ( ; i <= n - 16; i += 16, dst += dcn * 16)
            {
                uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
                uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
                uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
                uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
                                             vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
                if (dcn == 3)
                {
                    uint8x16x3_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    vst3q_u8(dst, v_dst);
                }
                else
                {
                    uint8x16x4_t v_dst;
                    v_dst.val[bidx] = v_b;
                    v_dst.val[1] = v_g;
                    v_dst.val[bidx^2] = v_r;
                    // the saturating narrow turns a set bit 15 into 0xFF, which then selects 255 over 0
                    v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
                                                        vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
                    vst4q_u8(dst, v_dst);
                }
            }
            #endif
            // scalar path (all pixels without NEON, otherwise the remaining tail)
            for( ; i < n; i++, dst += dcn )
            {
                unsigned t = ((const ushort*)src)[i];
                dst[bidx] = (uchar)(t << 3);                // bits 0..4
                dst[1] = (uchar)((t >> 2) & ~7);            // bits 5..9
                dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);     // bits 10..14
                if( dcn == 4 )
                    dst[3] = t & 0x8000 ? 255 : 0;          // bit 15 -> fully opaque or fully transparent
            }
        }
    }

    int dstcn, blueIdx, greenBits;
    #if CV_NEON
    uint16x8_t v_n3, v_n7, v_mask;
    uint8x16_t v_255, v_0;
    #endif
};
835 
836 
837 struct RGB2RGB5x5
838 {
839     typedef uchar channel_type;
840 
RGB2RGB5x5cv::RGB2RGB5x5841     RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
842         : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
843     {
844         #if CV_NEON
845         v_n3 = vdup_n_u8(~3);
846         v_n7 = vdup_n_u8(~7);
847         v_mask = vdupq_n_u16(0x8000);
848         v_0 = vdupq_n_u16(0);
849         v_full = vdupq_n_u16(0xffff);
850         #endif
851     }
852 
operator ()cv::RGB2RGB5x5853     void operator()(const uchar* src, uchar* dst, int n) const
854     {
855         int scn = srccn, bidx = blueIdx, i = 0;
856         if (greenBits == 6)
857         {
858             if (scn == 3)
859             {
860                 #if CV_NEON
861                 for ( ; i <= n - 8; i += 8, src += 24 )
862                 {
863                     uint8x8x3_t v_src = vld3_u8(src);
864                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
865                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
866                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
867                     vst1q_u16((ushort *)dst + i, v_dst);
868                 }
869                 #endif
870                 for ( ; i < n; i++, src += 3 )
871                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
872             }
873             else
874             {
875                 #if CV_NEON
876                 for ( ; i <= n - 8; i += 8, src += 32 )
877                 {
878                     uint8x8x4_t v_src = vld4_u8(src);
879                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
880                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
881                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
882                     vst1q_u16((ushort *)dst + i, v_dst);
883                 }
884                 #endif
885                 for ( ; i < n; i++, src += 4 )
886                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
887             }
888         }
889         else if (scn == 3)
890         {
891             #if CV_NEON
892             for ( ; i <= n - 8; i += 8, src += 24 )
893             {
894                 uint8x8x3_t v_src = vld3_u8(src);
895                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
896                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
897                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
898                 vst1q_u16((ushort *)dst + i, v_dst);
899             }
900             #endif
901             for ( ; i < n; i++, src += 3 )
902                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
903         }
904         else
905         {
906             #if CV_NEON
907             for ( ; i <= n - 8; i += 8, src += 32 )
908             {
909                 uint8x8x4_t v_src = vld4_u8(src);
910                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
911                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
912                 v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
913                                                    vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
914                 vst1q_u16((ushort *)dst + i, v_dst);
915             }
916             #endif
917             for ( ; i < n; i++, src += 4 )
918                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
919                     ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
920         }
921     }
922 
923     int srccn, blueIdx, greenBits;
924     #if CV_NEON
925     uint8x8_t v_n3, v_n7;
926     uint16x8_t v_mask, v_0, v_full;
927     #endif
928 };
929 
930 ///////////////////////////////// Color to/from Grayscale ////////////////////////////////
931 
932 template<typename _Tp>
933 struct Gray2RGB
934 {
935     typedef _Tp channel_type;
936 
Gray2RGBcv::Gray2RGB937     Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
operator ()cv::Gray2RGB938     void operator()(const _Tp* src, _Tp* dst, int n) const
939     {
940         if( dstcn == 3 )
941             for( int i = 0; i < n; i++, dst += 3 )
942             {
943                 dst[0] = dst[1] = dst[2] = src[i];
944             }
945         else
946         {
947             _Tp alpha = ColorChannel<_Tp>::max();
948             for( int i = 0; i < n; i++, dst += 4 )
949             {
950                 dst[0] = dst[1] = dst[2] = src[i];
951                 dst[3] = alpha;
952             }
953         }
954     }
955 
956     int dstcn;
957 };
958 
959 
960 struct Gray2RGB5x5
961 {
962     typedef uchar channel_type;
963 
Gray2RGB5x5cv::Gray2RGB5x5964     Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
965     {
966         #if CV_NEON
967         v_n7 = vdup_n_u8(~7);
968         v_n3 = vdup_n_u8(~3);
969         #elif CV_SSE2
970         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
971         v_n7 = _mm_set1_epi16(~7);
972         v_n3 = _mm_set1_epi16(~3);
973         v_zero = _mm_setzero_si128();
974         #endif
975     }
976 
operator ()cv::Gray2RGB5x5977     void operator()(const uchar* src, uchar* dst, int n) const
978     {
979         int i = 0;
980         if( greenBits == 6 )
981         {
982             #if CV_NEON
983             for ( ; i <= n - 8; i += 8 )
984             {
985                 uint8x8_t v_src = vld1_u8(src + i);
986                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
987                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
988                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
989                 vst1q_u16((ushort *)dst + i, v_dst);
990             }
991             #elif CV_SSE2
992             if (haveSIMD)
993             {
994                 for ( ; i <= n - 16; i += 16 )
995                 {
996                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
997 
998                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
999                     __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
1000                                     _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
1001                                                  _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
1002                     _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
1003 
1004                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
1005                     v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
1006                             _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
1007                                          _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
1008                     _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
1009                 }
1010             }
1011             #endif
1012             for ( ; i < n; i++ )
1013             {
1014                 int t = src[i];
1015                 ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
1016             }
1017         }
1018         else
1019         {
1020             #if CV_NEON
1021             for ( ; i <= n - 8; i += 8 )
1022             {
1023                 uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
1024                 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
1025                 vst1q_u16((ushort *)dst + i, v_dst);
1026             }
1027             #elif CV_SSE2
1028             if (haveSIMD)
1029             {
1030                 for ( ; i <= n - 16; i += 8 )
1031                 {
1032                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
1033 
1034                     __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
1035                     __m128i v_dst = _mm_or_si128(v_src_p,
1036                                     _mm_or_si128(_mm_slli_epi32(v_src_p, 5),
1037                                                  _mm_slli_epi16(v_src_p, 10)));
1038                     _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
1039 
1040                     v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
1041                     v_dst = _mm_or_si128(v_src_p,
1042                             _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
1043                                          _mm_slli_epi16(v_src_p, 10)));
1044                     _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
1045                 }
1046             }
1047             #endif
1048             for( ; i < n; i++ )
1049             {
1050                 int t = src[i] >> 3;
1051                 ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
1052             }
1053         }
1054     }
1055     int greenBits;
1056 
1057     #if CV_NEON
1058     uint8x8_t v_n7, v_n3;
1059     #elif CV_SSE2
1060     __m128i v_n7, v_n3, v_zero;
1061     bool haveSIMD;
1062     #endif
1063 };
1064 
1065 
#undef B2Y
#undef G2Y
#undef R2Y

// Fixed-point constants shared by the conversion functors in this file.
enum
{
    // Number of fractional bits carried by the integer luma weights below.
    yuv_shift = 14,

    // Luma weights 0.299 / 0.587 / 0.114 scaled by (1 << yuv_shift);
    // together they add up to exactly (1 << yuv_shift).
    R2Y = 4899,
    G2Y = 9617,
    B2Y = 1868,

    // Not referenced in this part of the file.
    xyz_shift = 12,
    BLOCK_SIZE = 256
};
1079 
1080 
1081 struct RGB5x52Gray
1082 {
1083     typedef uchar channel_type;
1084 
RGB5x52Graycv::RGB5x52Gray1085     RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
1086     {
1087         #if CV_NEON
1088         v_b2y = vdup_n_u16(B2Y);
1089         v_g2y = vdup_n_u16(G2Y);
1090         v_r2y = vdup_n_u16(R2Y);
1091         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1092         v_f8 = vdupq_n_u16(0xf8);
1093         v_fc = vdupq_n_u16(0xfc);
1094         #elif CV_SSE2
1095         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1096         v_b2y = _mm_set1_epi16(B2Y);
1097         v_g2y = _mm_set1_epi16(G2Y);
1098         v_r2y = _mm_set1_epi16(R2Y);
1099         v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
1100         v_f8 = _mm_set1_epi16(0xf8);
1101         v_fc = _mm_set1_epi16(0xfc);
1102         #endif
1103     }
1104 
operator ()cv::RGB5x52Gray1105     void operator()(const uchar* src, uchar* dst, int n) const
1106     {
1107         int i = 0;
1108         if( greenBits == 6 )
1109         {
1110             #if CV_NEON
1111             for ( ; i <= n - 8; i += 8)
1112             {
1113                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1114                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1115                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
1116                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);
1117 
1118                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1119                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1120                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1121                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1122                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1123                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1124 
1125                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1126             }
1127             #elif CV_SSE2
1128             if (haveSIMD)
1129             {
1130                 __m128i v_zero = _mm_setzero_si128();
1131 
1132                 for ( ; i <= n - 8; i += 8)
1133                 {
1134                     __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
1135                     __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
1136                             v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc),
1137                             v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8);
1138 
1139                     __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
1140                     __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
1141                     __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
1142                     __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
1143                     __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
1144                     __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
1145 
1146                     __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
1147                                                    _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1148                     v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
1149                                            _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
1150 
1151                     __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
1152                                                    _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1153                     v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
1154                                            _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
1155 
1156                     v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
1157                     v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
1158 
1159                     __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
1160                     _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
1161                 }
1162             }
1163             #endif
1164             for ( ; i < n; i++)
1165             {
1166                 int t = ((ushort*)src)[i];
1167                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1168                                            ((t >> 3) & 0xfc)*G2Y +
1169                                            ((t >> 8) & 0xf8)*R2Y, yuv_shift);
1170             }
1171         }
1172         else
1173         {
1174             #if CV_NEON
1175             for ( ; i <= n - 8; i += 8)
1176             {
1177                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
1178                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
1179                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
1180                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);
1181 
1182                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
1183                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
1184                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
1185                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
1186                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
1187                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
1188 
1189                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
1190             }
1191             #elif CV_SSE2
1192             if (haveSIMD)
1193             {
1194                 __m128i v_zero = _mm_setzero_si128();
1195 
1196                 for ( ; i <= n - 8; i += 8)
1197                 {
1198                     __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
1199                     __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
1200                             v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8),
1201                             v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8);
1202 
1203                     __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
1204                     __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
1205                     __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
1206                     __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
1207                     __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
1208                     __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
1209 
1210                     __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
1211                                                    _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1212                     v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
1213                                            _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
1214 
1215                     __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
1216                                                    _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1217                     v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
1218                                            _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
1219 
1220                     v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
1221                     v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
1222 
1223                     __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
1224                     _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
1225                 }
1226             }
1227             #endif
1228             for ( ; i < n; i++)
1229             {
1230                 int t = ((ushort*)src)[i];
1231                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
1232                                            ((t >> 2) & 0xf8)*G2Y +
1233                                            ((t >> 7) & 0xf8)*R2Y, yuv_shift);
1234             }
1235         }
1236     }
1237     int greenBits;
1238 
1239     #if CV_NEON
1240     uint16x4_t v_b2y, v_g2y, v_r2y;
1241     uint32x4_t v_delta;
1242     uint16x8_t v_f8, v_fc;
1243     #elif CV_SSE2
1244     bool haveSIMD;
1245     __m128i v_b2y, v_g2y, v_r2y;
1246     __m128i v_delta;
1247     __m128i v_f8, v_fc;
1248     #endif
1249 };
1250 
1251 
1252 template<typename _Tp> struct RGB2Gray
1253 {
1254     typedef _Tp channel_type;
1255 
RGB2Graycv::RGB2Gray1256     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1257     {
1258         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1259         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1260         if(blueIdx == 0)
1261             std::swap(coeffs[0], coeffs[2]);
1262     }
1263 
operator ()cv::RGB2Gray1264     void operator()(const _Tp* src, _Tp* dst, int n) const
1265     {
1266         int scn = srccn;
1267         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1268         for(int i = 0; i < n; i++, src += scn)
1269             dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
1270     }
1271     int srccn;
1272     float coeffs[3];
1273 };
1274 
1275 template<> struct RGB2Gray<uchar>
1276 {
1277     typedef uchar channel_type;
1278 
RGB2Graycv::RGB2Gray1279     RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
1280     {
1281         const int coeffs0[] = { R2Y, G2Y, B2Y };
1282         if(!coeffs) coeffs = coeffs0;
1283 
1284         int b = 0, g = 0, r = (1 << (yuv_shift-1));
1285         int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
1286 
1287         for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
1288         {
1289             tab[i] = b;
1290             tab[i+256] = g;
1291             tab[i+512] = r;
1292         }
1293     }
operator ()cv::RGB2Gray1294     void operator()(const uchar* src, uchar* dst, int n) const
1295     {
1296         int scn = srccn;
1297         const int* _tab = tab;
1298         for(int i = 0; i < n; i++, src += scn)
1299             dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
1300     }
1301     int srccn;
1302     int tab[256*3];
1303 };
1304 
1305 #if CV_NEON
1306 
1307 template <>
1308 struct RGB2Gray<ushort>
1309 {
1310     typedef ushort channel_type;
1311 
RGB2Graycv::RGB2Gray1312     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1313         srccn(_srccn)
1314     {
1315         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1316         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1317         if( blueIdx == 0 )
1318             std::swap(coeffs[0], coeffs[2]);
1319 
1320         v_cb = vdup_n_u16(coeffs[0]);
1321         v_cg = vdup_n_u16(coeffs[1]);
1322         v_cr = vdup_n_u16(coeffs[2]);
1323         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
1324     }
1325 
operator ()cv::RGB2Gray1326     void operator()(const ushort* src, ushort* dst, int n) const
1327     {
1328         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1329 
1330         for ( ; i <= n - 8; i += 8, src += scn * 8)
1331         {
1332             uint16x8_t v_b, v_r, v_g;
1333             if (scn == 3)
1334             {
1335                 uint16x8x3_t v_src = vld3q_u16(src);
1336                 v_b = v_src.val[0];
1337                 v_g = v_src.val[1];
1338                 v_r = v_src.val[2];
1339             }
1340             else
1341             {
1342                 uint16x8x4_t v_src = vld4q_u16(src);
1343                 v_b = v_src.val[0];
1344                 v_g = v_src.val[1];
1345                 v_r = v_src.val[2];
1346             }
1347 
1348             uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
1349                                            vmull_u16(vget_low_u16(v_b), v_cb),
1350                                                      vget_low_u16(v_g), v_cg),
1351                                                      vget_low_u16(v_r), v_cr);
1352             uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
1353                                            vmull_u16(vget_high_u16(v_b), v_cb),
1354                                                      vget_high_u16(v_g), v_cg),
1355                                                      vget_high_u16(v_r), v_cr);
1356 
1357             uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
1358             uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
1359 
1360             vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
1361         }
1362 
1363         for ( ; i <= n - 4; i += 4, src += scn * 4)
1364         {
1365             uint16x4_t v_b, v_r, v_g;
1366             if (scn == 3)
1367             {
1368                 uint16x4x3_t v_src = vld3_u16(src);
1369                 v_b = v_src.val[0];
1370                 v_g = v_src.val[1];
1371                 v_r = v_src.val[2];
1372             }
1373             else
1374             {
1375                 uint16x4x4_t v_src = vld4_u16(src);
1376                 v_b = v_src.val[0];
1377                 v_g = v_src.val[1];
1378                 v_r = v_src.val[2];
1379             }
1380 
1381             uint32x4_t v_dst = vmlal_u16(vmlal_u16(
1382                                          vmull_u16(v_b, v_cb),
1383                                                    v_g, v_cg),
1384                                                    v_r, v_cr);
1385 
1386             vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
1387         }
1388 
1389         for( ; i < n; i++, src += scn)
1390             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1391     }
1392 
1393     int srccn, coeffs[3];
1394     uint16x4_t v_cb, v_cg, v_cr;
1395     uint32x4_t v_delta;
1396 };
1397 
1398 template <>
1399 struct RGB2Gray<float>
1400 {
1401     typedef float channel_type;
1402 
RGB2Graycv::RGB2Gray1403     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1404     {
1405         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1406         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1407         if(blueIdx == 0)
1408             std::swap(coeffs[0], coeffs[2]);
1409 
1410         v_cb = vdupq_n_f32(coeffs[0]);
1411         v_cg = vdupq_n_f32(coeffs[1]);
1412         v_cr = vdupq_n_f32(coeffs[2]);
1413     }
1414 
operator ()cv::RGB2Gray1415     void operator()(const float * src, float * dst, int n) const
1416     {
1417         int scn = srccn, i = 0;
1418         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1419 
1420         if (scn == 3)
1421         {
1422             for ( ; i <= n - 8; i += 8, src += scn * 8)
1423             {
1424                 float32x4x3_t v_src = vld3q_f32(src);
1425                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1426 
1427                 v_src = vld3q_f32(src + scn * 4);
1428                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1429             }
1430 
1431             for ( ; i <= n - 4; i += 4, src += scn * 4)
1432             {
1433                 float32x4x3_t v_src = vld3q_f32(src);
1434                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1435             }
1436         }
1437         else
1438         {
1439             for ( ; i <= n - 8; i += 8, src += scn * 8)
1440             {
1441                 float32x4x4_t v_src = vld4q_f32(src);
1442                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1443 
1444                 v_src = vld4q_f32(src + scn * 4);
1445                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1446             }
1447 
1448             for ( ; i <= n - 4; i += 4, src += scn * 4)
1449             {
1450                 float32x4x4_t v_src = vld4q_f32(src);
1451                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
1452             }
1453         }
1454 
1455         for ( ; i < n; i++, src += scn)
1456             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1457     }
1458 
1459     int srccn;
1460     float coeffs[3];
1461     float32x4_t v_cb, v_cg, v_cr;
1462 };
1463 
1464 #elif CV_SSE2
1465 
1466 #if CV_SSE4_1
1467 
1468 template <>
1469 struct RGB2Gray<ushort>
1470 {
1471     typedef ushort channel_type;
1472 
RGB2Graycv::RGB2Gray1473     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
1474         srccn(_srccn)
1475     {
1476         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1477         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1478         if( blueIdx == 0 )
1479             std::swap(coeffs[0], coeffs[2]);
1480 
1481         v_cb = _mm_set1_epi16((short)coeffs[0]);
1482         v_cg = _mm_set1_epi16((short)coeffs[1]);
1483         v_cr = _mm_set1_epi16((short)coeffs[2]);
1484         v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
1485 
1486         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
1487     }
1488 
1489     // 16s x 8
processcv::RGB2Gray1490     void process(__m128i v_b, __m128i v_g, __m128i v_r,
1491                  __m128i & v_gray) const
1492     {
1493         __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr);
1494         __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg);
1495         __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb);
1496         __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr);
1497         __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg);
1498         __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb);
1499 
1500         __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r),
1501                                         _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
1502         v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0);
1503         v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift);
1504 
1505         __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r),
1506                                         _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
1507         v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1);
1508         v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift);
1509 
1510         v_gray = _mm_packus_epi32(v_gray0, v_gray1);
1511     }
1512 
operator ()cv::RGB2Gray1513     void operator()(const ushort* src, ushort* dst, int n) const
1514     {
1515         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
1516 
1517         if (scn == 3 && haveSIMD)
1518         {
1519             for ( ; i <= n - 16; i += 16, src += scn * 16)
1520             {
1521                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
1522                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
1523                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
1524                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
1525                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
1526                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
1527 
1528                 _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1529 
1530                 __m128i v_gray0;
1531                 process(v_r0, v_g0, v_b0,
1532                         v_gray0);
1533 
1534                 __m128i v_gray1;
1535                 process(v_r1, v_g1, v_b1,
1536                         v_gray1);
1537 
1538                 _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
1539                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
1540             }
1541         }
1542         else if (scn == 4 && haveSIMD)
1543         {
1544             for ( ; i <= n - 16; i += 16, src += scn * 16)
1545             {
1546                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
1547                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
1548                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
1549                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
1550                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
1551                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
1552                 __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
1553                 __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
1554 
1555                 _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
1556 
1557                 __m128i v_gray0;
1558                 process(v_r0, v_g0, v_b0,
1559                         v_gray0);
1560 
1561                 __m128i v_gray1;
1562                 process(v_r1, v_g1, v_b1,
1563                         v_gray1);
1564 
1565                 _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
1566                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
1567             }
1568         }
1569 
1570         for( ; i < n; i++, src += scn)
1571             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1572     }
1573 
1574     int srccn, coeffs[3];
1575     __m128i v_cb, v_cg, v_cr;
1576     __m128i v_delta;
1577     bool haveSIMD;
1578 };
1579 
1580 #endif // CV_SSE4_1
1581 
1582 template <>
1583 struct RGB2Gray<float>
1584 {
1585     typedef float channel_type;
1586 
RGB2Graycv::RGB2Gray1587     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
1588     {
1589         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
1590         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
1591         if(blueIdx == 0)
1592             std::swap(coeffs[0], coeffs[2]);
1593 
1594         v_cb = _mm_set1_ps(coeffs[0]);
1595         v_cg = _mm_set1_ps(coeffs[1]);
1596         v_cr = _mm_set1_ps(coeffs[2]);
1597 
1598         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1599     }
1600 
processcv::RGB2Gray1601     void process(__m128 v_b, __m128 v_g, __m128 v_r,
1602                  __m128 & v_gray) const
1603     {
1604         v_gray = _mm_mul_ps(v_r, v_cr);
1605         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
1606         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
1607     }
1608 
operator ()cv::RGB2Gray1609     void operator()(const float * src, float * dst, int n) const
1610     {
1611         int scn = srccn, i = 0;
1612         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1613 
1614         if (scn == 3 && haveSIMD)
1615         {
1616             for ( ; i <= n - 8; i += 8, src += scn * 8)
1617             {
1618                 __m128 v_r0 = _mm_loadu_ps(src);
1619                 __m128 v_r1 = _mm_loadu_ps(src + 4);
1620                 __m128 v_g0 = _mm_loadu_ps(src + 8);
1621                 __m128 v_g1 = _mm_loadu_ps(src + 12);
1622                 __m128 v_b0 = _mm_loadu_ps(src + 16);
1623                 __m128 v_b1 = _mm_loadu_ps(src + 20);
1624 
1625                 _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1626 
1627                 __m128 v_gray0;
1628                 process(v_r0, v_g0, v_b0,
1629                         v_gray0);
1630 
1631                 __m128 v_gray1;
1632                 process(v_r1, v_g1, v_b1,
1633                         v_gray1);
1634 
1635                 _mm_storeu_ps(dst + i, v_gray0);
1636                 _mm_storeu_ps(dst + i + 4, v_gray1);
1637             }
1638         }
1639         else if (scn == 4 && haveSIMD)
1640         {
1641             for ( ; i <= n - 8; i += 8, src += scn * 8)
1642             {
1643                 __m128 v_r0 = _mm_loadu_ps(src);
1644                 __m128 v_r1 = _mm_loadu_ps(src + 4);
1645                 __m128 v_g0 = _mm_loadu_ps(src + 8);
1646                 __m128 v_g1 = _mm_loadu_ps(src + 12);
1647                 __m128 v_b0 = _mm_loadu_ps(src + 16);
1648                 __m128 v_b1 = _mm_loadu_ps(src + 20);
1649                 __m128 v_a0 = _mm_loadu_ps(src + 24);
1650                 __m128 v_a1 = _mm_loadu_ps(src + 28);
1651 
1652                 _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
1653 
1654                 __m128 v_gray0;
1655                 process(v_r0, v_g0, v_b0,
1656                         v_gray0);
1657 
1658                 __m128 v_gray1;
1659                 process(v_r1, v_g1, v_b1,
1660                         v_gray1);
1661 
1662                 _mm_storeu_ps(dst + i, v_gray0);
1663                 _mm_storeu_ps(dst + i + 4, v_gray1);
1664             }
1665         }
1666 
1667         for ( ; i < n; i++, src += scn)
1668             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
1669     }
1670 
1671     int srccn;
1672     float coeffs[3];
1673     __m128 v_cb, v_cg, v_cr;
1674     bool haveSIMD;
1675 };
1676 
1677 #else
1678 
1679 template<> struct RGB2Gray<ushort>
1680 {
1681     typedef ushort channel_type;
1682 
RGB2Graycv::RGB2Gray1683     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
1684     {
1685         static const int coeffs0[] = { R2Y, G2Y, B2Y };
1686         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
1687         if( blueIdx == 0 )
1688             std::swap(coeffs[0], coeffs[2]);
1689     }
1690 
operator ()cv::RGB2Gray1691     void operator()(const ushort* src, ushort* dst, int n) const
1692     {
1693         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
1694         for(int i = 0; i < n; i++, src += scn)
1695             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
1696     }
1697     int srccn;
1698     int coeffs[3];
1699 };
1700 
1701 #endif
1702 
1703 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
1704 
1705 template<typename _Tp> struct RGB2YCrCb_f
1706 {
1707     typedef _Tp channel_type;
1708 
RGB2YCrCb_fcv::RGB2YCrCb_f1709     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
1710     {
1711         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1712         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1713         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1714     }
1715 
operator ()cv::RGB2YCrCb_f1716     void operator()(const _Tp* src, _Tp* dst, int n) const
1717     {
1718         int scn = srccn, bidx = blueIdx;
1719         const _Tp delta = ColorChannel<_Tp>::half();
1720         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1721         n *= 3;
1722         for(int i = 0; i < n; i += 3, src += scn)
1723         {
1724             _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
1725             _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
1726             _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
1727             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1728         }
1729     }
1730     int srccn, blueIdx;
1731     float coeffs[5];
1732 };
1733 
1734 #if CV_NEON
1735 
1736 template <>
1737 struct RGB2YCrCb_f<float>
1738 {
1739     typedef float channel_type;
1740 
RGB2YCrCb_fcv::RGB2YCrCb_f1741     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
1742         srccn(_srccn), blueIdx(_blueIdx)
1743     {
1744         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1745         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1746         if(blueIdx==0)
1747             std::swap(coeffs[0], coeffs[2]);
1748 
1749         v_c0 = vdupq_n_f32(coeffs[0]);
1750         v_c1 = vdupq_n_f32(coeffs[1]);
1751         v_c2 = vdupq_n_f32(coeffs[2]);
1752         v_c3 = vdupq_n_f32(coeffs[3]);
1753         v_c4 = vdupq_n_f32(coeffs[4]);
1754         v_delta = vdupq_n_f32(ColorChannel<float>::half());
1755     }
1756 
operator ()cv::RGB2YCrCb_f1757     void operator()(const float * src, float * dst, int n) const
1758     {
1759         int scn = srccn, bidx = blueIdx, i = 0;
1760         const float delta = ColorChannel<float>::half();
1761         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1762         n *= 3;
1763 
1764         if (scn == 3)
1765             for ( ; i <= n - 12; i += 12, src += 12)
1766             {
1767                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
1768                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1769                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1770                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1771 
1772                 vst3q_f32(dst + i, v_dst);
1773             }
1774         else
1775             for ( ; i <= n - 12; i += 12, src += 16)
1776             {
1777                 float32x4x4_t v_src = vld4q_f32(src);
1778                 float32x4x3_t v_dst;
1779                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
1780                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
1781                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
1782 
1783                 vst3q_f32(dst + i, v_dst);
1784             }
1785 
1786         for ( ; i < n; i += 3, src += scn)
1787         {
1788             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
1789             float Cr = (src[bidx^2] - Y)*C3 + delta;
1790             float Cb = (src[bidx] - Y)*C4 + delta;
1791             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1792         }
1793     }
1794     int srccn, blueIdx;
1795     float coeffs[5];
1796     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
1797 };
1798 
1799 #elif CV_SSE2
1800 
1801 template <>
1802 struct RGB2YCrCb_f<float>
1803 {
1804     typedef float channel_type;
1805 
RGB2YCrCb_fcv::RGB2YCrCb_f1806     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
1807         srccn(_srccn), blueIdx(_blueIdx)
1808     {
1809         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
1810         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1811         if (blueIdx==0)
1812             std::swap(coeffs[0], coeffs[2]);
1813 
1814         v_c0 = _mm_set1_ps(coeffs[0]);
1815         v_c1 = _mm_set1_ps(coeffs[1]);
1816         v_c2 = _mm_set1_ps(coeffs[2]);
1817         v_c3 = _mm_set1_ps(coeffs[3]);
1818         v_c4 = _mm_set1_ps(coeffs[4]);
1819         v_delta = _mm_set1_ps(ColorChannel<float>::half());
1820 
1821         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
1822     }
1823 
processcv::RGB2YCrCb_f1824     void process(__m128 v_r, __m128 v_g, __m128 v_b,
1825                  __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const
1826     {
1827         v_y = _mm_mul_ps(v_r, v_c0);
1828         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1));
1829         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2));
1830 
1831         v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta);
1832         v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta);
1833     }
1834 
operator ()cv::RGB2YCrCb_f1835     void operator()(const float * src, float * dst, int n) const
1836     {
1837         int scn = srccn, bidx = blueIdx, i = 0;
1838         const float delta = ColorChannel<float>::half();
1839         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1840         n *= 3;
1841 
1842         if (haveSIMD)
1843         {
1844             for ( ; i <= n - 24; i += 24, src += 8 * scn)
1845             {
1846                 __m128 v_r0 = _mm_loadu_ps(src);
1847                 __m128 v_r1 = _mm_loadu_ps(src + 4);
1848                 __m128 v_g0 = _mm_loadu_ps(src + 8);
1849                 __m128 v_g1 = _mm_loadu_ps(src + 12);
1850                 __m128 v_b0 = _mm_loadu_ps(src + 16);
1851                 __m128 v_b1 = _mm_loadu_ps(src + 20);
1852 
1853                 if (scn == 4)
1854                 {
1855                     __m128 v_a0 = _mm_loadu_ps(src + 24);
1856                     __m128 v_a1 = _mm_loadu_ps(src + 28);
1857                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
1858                                         v_b0, v_b1, v_a0, v_a1);
1859                 }
1860                 else
1861                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
1862 
1863                 __m128 v_y0, v_cr0, v_cb0;
1864                 process(v_r0, v_g0, v_b0,
1865                         v_y0, v_cr0, v_cb0);
1866 
1867                 __m128 v_y1, v_cr1, v_cb1;
1868                 process(v_r1, v_g1, v_b1,
1869                         v_y1, v_cr1, v_cb1);
1870 
1871                 _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
1872 
1873                 _mm_storeu_ps(dst + i, v_y0);
1874                 _mm_storeu_ps(dst + i + 4, v_y1);
1875                 _mm_storeu_ps(dst + i + 8, v_cr0);
1876                 _mm_storeu_ps(dst + i + 12, v_cr1);
1877                 _mm_storeu_ps(dst + i + 16, v_cb0);
1878                 _mm_storeu_ps(dst + i + 20, v_cb1);
1879             }
1880         }
1881 
1882         for ( ; i < n; i += 3, src += scn)
1883         {
1884             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
1885             float Cr = (src[bidx^2] - Y)*C3 + delta;
1886             float Cb = (src[bidx] - Y)*C4 + delta;
1887             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
1888         }
1889     }
1890     int srccn, blueIdx;
1891     float coeffs[5];
1892     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
1893     bool haveSIMD;
1894 };
1895 
1896 #endif
1897 
1898 template<typename _Tp> struct RGB2YCrCb_i
1899 {
1900     typedef _Tp channel_type;
1901 
RGB2YCrCb_icv::RGB2YCrCb_i1902     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1903         : srccn(_srccn), blueIdx(_blueIdx)
1904     {
1905         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1906         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1907         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
1908     }
operator ()cv::RGB2YCrCb_i1909     void operator()(const _Tp* src, _Tp* dst, int n) const
1910     {
1911         int scn = srccn, bidx = blueIdx;
1912         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1913         int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
1914         n *= 3;
1915         for(int i = 0; i < n; i += 3, src += scn)
1916         {
1917             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
1918             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
1919             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
1920             dst[i] = saturate_cast<_Tp>(Y);
1921             dst[i+1] = saturate_cast<_Tp>(Cr);
1922             dst[i+2] = saturate_cast<_Tp>(Cb);
1923         }
1924     }
1925     int srccn, blueIdx;
1926     int coeffs[5];
1927 };
1928 
1929 #if CV_NEON
1930 
1931 template <>
1932 struct RGB2YCrCb_i<uchar>
1933 {
1934     typedef uchar channel_type;
1935 
RGB2YCrCb_icv::RGB2YCrCb_i1936     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
1937         : srccn(_srccn), blueIdx(_blueIdx)
1938     {
1939         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
1940         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
1941         if (blueIdx==0)
1942             std::swap(coeffs[0], coeffs[2]);
1943 
1944         v_c0 = vdup_n_s16(coeffs[0]);
1945         v_c1 = vdup_n_s16(coeffs[1]);
1946         v_c2 = vdup_n_s16(coeffs[2]);
1947         v_c3 = vdupq_n_s32(coeffs[3]);
1948         v_c4 = vdupq_n_s32(coeffs[4]);
1949         v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
1950         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
1951     }
1952 
operator ()cv::RGB2YCrCb_i1953     void operator()(const uchar * src, uchar * dst, int n) const
1954     {
1955         int scn = srccn, bidx = blueIdx, i = 0;
1956         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
1957         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
1958         n *= 3;
1959 
1960         for ( ; i <= n - 24; i += 24, src += scn * 8)
1961         {
1962             uint8x8x3_t v_dst;
1963             int16x8x3_t v_src16;
1964 
1965             if (scn == 3)
1966             {
1967                 uint8x8x3_t v_src = vld3_u8(src);
1968                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1969                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1970                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1971             }
1972             else
1973             {
1974                 uint8x8x4_t v_src = vld4_u8(src);
1975                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
1976                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
1977                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
1978             }
1979 
1980             int16x4x3_t v_src0;
1981             v_src0.val[0] = vget_low_s16(v_src16.val[0]);
1982             v_src0.val[1] = vget_low_s16(v_src16.val[1]);
1983             v_src0.val[2] = vget_low_s16(v_src16.val[2]);
1984 
1985             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1986             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
1987             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
1988             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
1989             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
1990             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
1991 
1992             v_src0.val[0] = vget_high_s16(v_src16.val[0]);
1993             v_src0.val[1] = vget_high_s16(v_src16.val[1]);
1994             v_src0.val[2] = vget_high_s16(v_src16.val[2]);
1995 
1996             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
1997             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
1998             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
1999             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
2000             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
2001             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
2002 
2003             v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
2004             v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
2005             v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
2006 
2007             vst3_u8(dst + i, v_dst);
2008         }
2009 
2010         for ( ; i < n; i += 3, src += scn)
2011         {
2012             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2013             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2014             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2015             dst[i] = saturate_cast<uchar>(Y);
2016             dst[i+1] = saturate_cast<uchar>(Cr);
2017             dst[i+2] = saturate_cast<uchar>(Cb);
2018         }
2019     }
2020     int srccn, blueIdx, coeffs[5];
2021     int16x4_t v_c0, v_c1, v_c2;
2022     int32x4_t v_c3, v_c4, v_delta, v_delta2;
2023 };
2024 
2025 template <>
2026 struct RGB2YCrCb_i<ushort>
2027 {
2028     typedef ushort channel_type;
2029 
RGB2YCrCb_icv::RGB2YCrCb_i2030     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
2031         : srccn(_srccn), blueIdx(_blueIdx)
2032     {
2033         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
2034         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
2035         if (blueIdx==0)
2036             std::swap(coeffs[0], coeffs[2]);
2037 
2038         v_c0 = vdupq_n_s32(coeffs[0]);
2039         v_c1 = vdupq_n_s32(coeffs[1]);
2040         v_c2 = vdupq_n_s32(coeffs[2]);
2041         v_c3 = vdupq_n_s32(coeffs[3]);
2042         v_c4 = vdupq_n_s32(coeffs[4]);
2043         v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
2044         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
2045     }
2046 
operator ()cv::RGB2YCrCb_i2047     void operator()(const ushort * src, ushort * dst, int n) const
2048     {
2049         int scn = srccn, bidx = blueIdx, i = 0;
2050         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
2051         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
2052         n *= 3;
2053 
2054         for ( ; i <= n - 24; i += 24, src += scn * 8)
2055         {
2056             uint16x8x3_t v_src, v_dst;
2057             int32x4x3_t v_src0;
2058 
2059             if (scn == 3)
2060                 v_src = vld3q_u16(src);
2061             else
2062             {
2063                 uint16x8x4_t v_src_ = vld4q_u16(src);
2064                 v_src.val[0] = v_src_.val[0];
2065                 v_src.val[1] = v_src_.val[1];
2066                 v_src.val[2] = v_src_.val[2];
2067             }
2068 
2069             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
2070             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
2071             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
2072 
2073             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
2074             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
2075             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
2076             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
2077             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
2078             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
2079 
2080             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
2081             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
2082             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
2083 
2084             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
2085             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
2086             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
2087             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
2088             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
2089             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
2090 
2091             v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
2092             v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
2093             v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
2094 
2095             vst3q_u16(dst + i, v_dst);
2096         }
2097 
2098         for ( ; i <= n - 12; i += 12, src += scn * 4)
2099         {
2100             uint16x4x3_t v_dst;
2101             int32x4x3_t v_src0;
2102 
2103             if (scn == 3)
2104             {
2105                 uint16x4x3_t v_src = vld3_u16(src);
2106                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
2107                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
2108                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2109             }
2110             else
2111             {
2112                 uint16x4x4_t v_src = vld4_u16(src);
2113                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
2114                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
2115                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
2116             }
2117 
2118             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
2119             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
2120             int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
2121             v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
2122             int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
2123             v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
2124 
2125             v_dst.val[0] = vqmovun_s32(v_Y);
2126             v_dst.val[1] = vqmovun_s32(v_Cr);
2127             v_dst.val[2] = vqmovun_s32(v_Cb);
2128 
2129             vst3_u16(dst + i, v_dst);
2130         }
2131 
2132         for ( ; i < n; i += 3, src += scn)
2133         {
2134             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2135             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2136             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2137             dst[i] = saturate_cast<ushort>(Y);
2138             dst[i+1] = saturate_cast<ushort>(Cr);
2139             dst[i+2] = saturate_cast<ushort>(Cb);
2140         }
2141     }
2142     int srccn, blueIdx, coeffs[5];
2143     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
2144 };
2145 
2146 #elif CV_SSE4_1
2147 
2148 template <>
2149 struct RGB2YCrCb_i<uchar>
2150 {
2151     typedef uchar channel_type;
2152 
RGB2YCrCb_icv::RGB2YCrCb_i2153     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
2154         : srccn(_srccn), blueIdx(_blueIdx)
2155     {
2156         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
2157         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
2158         if (blueIdx==0)
2159             std::swap(coeffs[0], coeffs[2]);
2160 
2161         v_c0 = _mm_set1_epi32(coeffs[0]);
2162         v_c1 = _mm_set1_epi32(coeffs[1]);
2163         v_c2 = _mm_set1_epi32(coeffs[2]);
2164         v_c3 = _mm_set1_epi32(coeffs[3]);
2165         v_c4 = _mm_set1_epi32(coeffs[4]);
2166         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2167         v_delta = _mm_set1_epi32(ColorChannel<uchar>::half()*(1 << yuv_shift));
2168         v_delta = _mm_add_epi32(v_delta, v_delta2);
2169         v_zero = _mm_setzero_si128();
2170 
2171         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
2172     }
2173 
2174     // 16u x 8
processcv::RGB2YCrCb_i2175     void process(__m128i v_r, __m128i v_g, __m128i v_b,
2176                  __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
2177     {
2178         __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
2179         __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
2180         __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
2181 
2182         __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2183                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2184                                      _mm_mullo_epi32(v_b_p, v_c2)));
2185         v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
2186 
2187         __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
2188         __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
2189         v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
2190         v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
2191 
2192         v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
2193         v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
2194         v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
2195 
2196         __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2197                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2198                                      _mm_mullo_epi32(v_b_p, v_c2)));
2199         v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
2200 
2201         __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
2202         __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
2203         v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
2204         v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
2205 
2206         v_y = _mm_packs_epi32(v_y0, v_y1);
2207         v_cr = _mm_packs_epi32(v_cr0, v_cr1);
2208         v_cb = _mm_packs_epi32(v_cb0, v_cb1);
2209     }
2210 
operator ()cv::RGB2YCrCb_i2211     void operator()(const uchar * src, uchar * dst, int n) const
2212     {
2213         int scn = srccn, bidx = blueIdx, i = 0;
2214         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
2215         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
2216         n *= 3;
2217 
2218         if (haveSIMD)
2219         {
2220             for ( ; i <= n - 96; i += 96, src += scn * 32)
2221             {
2222                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
2223                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16));
2224                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32));
2225                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48));
2226                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
2227                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));
2228 
2229                 if (scn == 4)
2230                 {
2231                     __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96));
2232                     __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112));
2233                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1,
2234                                           v_b0, v_b1, v_a0, v_a1);
2235                 }
2236                 else
2237                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2238 
2239                 __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
2240                 process(_mm_unpacklo_epi8(v_r0, v_zero),
2241                         _mm_unpacklo_epi8(v_g0, v_zero),
2242                         _mm_unpacklo_epi8(v_b0, v_zero),
2243                         v_y0, v_cr0, v_cb0);
2244 
2245                 __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
2246                 process(_mm_unpackhi_epi8(v_r0, v_zero),
2247                         _mm_unpackhi_epi8(v_g0, v_zero),
2248                         _mm_unpackhi_epi8(v_b0, v_zero),
2249                         v_y1, v_cr1, v_cb1);
2250 
2251                 __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1);
2252                 __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1);
2253                 __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1);
2254 
2255                 process(_mm_unpacklo_epi8(v_r1, v_zero),
2256                         _mm_unpacklo_epi8(v_g1, v_zero),
2257                         _mm_unpacklo_epi8(v_b1, v_zero),
2258                         v_y0, v_cr0, v_cb0);
2259 
2260                 process(_mm_unpackhi_epi8(v_r1, v_zero),
2261                         _mm_unpackhi_epi8(v_g1, v_zero),
2262                         _mm_unpackhi_epi8(v_b1, v_zero),
2263                         v_y1, v_cr1, v_cb1);
2264 
2265                 __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1);
2266                 __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
2267                 __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);
2268 
2269                 _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
2270 
2271                 _mm_storeu_si128((__m128i *)(dst + i), v_y_0);
2272                 _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
2273                 _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0);
2274                 _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1);
2275                 _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0);
2276                 _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1);
2277             }
2278         }
2279 
2280         for ( ; i < n; i += 3, src += scn)
2281         {
2282             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2283             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2284             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2285             dst[i] = saturate_cast<uchar>(Y);
2286             dst[i+1] = saturate_cast<uchar>(Cr);
2287             dst[i+2] = saturate_cast<uchar>(Cb);
2288         }
2289     }
2290 
2291     int srccn, blueIdx, coeffs[5];
2292     __m128i v_c0, v_c1, v_c2;
2293     __m128i v_c3, v_c4, v_delta, v_delta2;
2294     __m128i v_zero;
2295     bool haveSIMD;
2296 };
2297 
2298 template <>
2299 struct RGB2YCrCb_i<ushort>
2300 {
2301     typedef ushort channel_type;
2302 
RGB2YCrCb_icv::RGB2YCrCb_i2303     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
2304         : srccn(_srccn), blueIdx(_blueIdx)
2305     {
2306         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
2307         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
2308         if (blueIdx==0)
2309             std::swap(coeffs[0], coeffs[2]);
2310 
2311         v_c0 = _mm_set1_epi32(coeffs[0]);
2312         v_c1 = _mm_set1_epi32(coeffs[1]);
2313         v_c2 = _mm_set1_epi32(coeffs[2]);
2314         v_c3 = _mm_set1_epi32(coeffs[3]);
2315         v_c4 = _mm_set1_epi32(coeffs[4]);
2316         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2317         v_delta = _mm_set1_epi32(ColorChannel<ushort>::half()*(1 << yuv_shift));
2318         v_delta = _mm_add_epi32(v_delta, v_delta2);
2319         v_zero = _mm_setzero_si128();
2320 
2321         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
2322     }
2323 
2324     // 16u x 8
processcv::RGB2YCrCb_i2325     void process(__m128i v_r, __m128i v_g, __m128i v_b,
2326                  __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
2327     {
2328         __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
2329         __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
2330         __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
2331 
2332         __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2333                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2334                                      _mm_mullo_epi32(v_b_p, v_c2)));
2335         v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
2336 
2337         __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
2338         __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
2339         v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
2340         v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
2341 
2342         v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
2343         v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
2344         v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
2345 
2346         __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
2347                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
2348                                      _mm_mullo_epi32(v_b_p, v_c2)));
2349         v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
2350 
2351         __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
2352         __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
2353         v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
2354         v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
2355 
2356         v_y = _mm_packus_epi32(v_y0, v_y1);
2357         v_cr = _mm_packus_epi32(v_cr0, v_cr1);
2358         v_cb = _mm_packus_epi32(v_cb0, v_cb1);
2359     }
2360 
operator ()cv::RGB2YCrCb_i2361     void operator()(const ushort * src, ushort * dst, int n) const
2362     {
2363         int scn = srccn, bidx = blueIdx, i = 0;
2364         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
2365         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
2366         n *= 3;
2367 
2368         if (haveSIMD)
2369         {
2370             for ( ; i <= n - 48; i += 48, src += scn * 16)
2371             {
2372                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
2373                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
2374                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
2375                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
2376                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
2377                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
2378 
2379                 if (scn == 4)
2380                 {
2381                     __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
2382                     __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
2383 
2384                     _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1,
2385                                            v_b0, v_b1, v_a0, v_a1);
2386                 }
2387                 else
2388                     _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2389 
2390                 __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
2391                 process(v_r0, v_g0, v_b0,
2392                         v_y0, v_cr0, v_cb0);
2393 
2394                 __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
2395                 process(v_r1, v_g1, v_b1,
2396                         v_y1, v_cr1, v_cb1);
2397 
2398                 _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
2399 
2400                 _mm_storeu_si128((__m128i *)(dst + i), v_y0);
2401                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
2402                 _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0);
2403                 _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1);
2404                 _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0);
2405                 _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1);
2406             }
2407         }
2408 
2409         for ( ; i < n; i += 3, src += scn)
2410         {
2411             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
2412             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
2413             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
2414             dst[i] = saturate_cast<ushort>(Y);
2415             dst[i+1] = saturate_cast<ushort>(Cr);
2416             dst[i+2] = saturate_cast<ushort>(Cb);
2417         }
2418     }
2419 
2420     int srccn, blueIdx, coeffs[5];
2421     __m128i v_c0, v_c1, v_c2;
2422     __m128i v_c3, v_c4, v_delta, v_delta2;
2423     __m128i v_zero;
2424     bool haveSIMD;
2425 };
2426 
2427 #endif // CV_SSE4_1
2428 
2429 template<typename _Tp> struct YCrCb2RGB_f
2430 {
2431     typedef _Tp channel_type;
2432 
YCrCb2RGB_fcv::YCrCb2RGB_f2433     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2434         : dstcn(_dstcn), blueIdx(_blueIdx)
2435     {
2436         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
2437         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2438     }
operator ()cv::YCrCb2RGB_f2439     void operator()(const _Tp* src, _Tp* dst, int n) const
2440     {
2441         int dcn = dstcn, bidx = blueIdx;
2442         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
2443         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2444         n *= 3;
2445         for(int i = 0; i < n; i += 3, dst += dcn)
2446         {
2447             _Tp Y = src[i];
2448             _Tp Cr = src[i+1];
2449             _Tp Cb = src[i+2];
2450 
2451             _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
2452             _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
2453             _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
2454 
2455             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
2456             if( dcn == 4 )
2457                 dst[3] = alpha;
2458         }
2459     }
2460     int dstcn, blueIdx;
2461     float coeffs[4];
2462 };
2463 
2464 #if CV_NEON
2465 
// NEON specialization of the floating-point YCrCb -> RGB(A)/BGR(A) converter.
// Four pixels are converted per vector iteration; leftovers use the scalar loop.
template <>
struct YCrCb2RGB_f<float>
{
    typedef float channel_type;

    // _coeffs: four weights {Cr->R, Cr->G, Cb->G, Cb->B}, or a null pointer to
    // use the built-in table.
    YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
        : dstcn(_dstcn), blueIdx(_blueIdx)
    {
        static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
        const float* const table = _coeffs ? _coeffs : coeffs0;
        for (int k = 0; k < 4; ++k)
            coeffs[k] = table[k];

        // broadcast the constants used by the vector loop
        v_c0 = vdupq_n_f32(coeffs[0]);
        v_c1 = vdupq_n_f32(coeffs[1]);
        v_c2 = vdupq_n_f32(coeffs[2]);
        v_c3 = vdupq_n_f32(coeffs[3]);
        v_delta = vdupq_n_f32(ColorChannel<float>::half());
        v_alpha = vdupq_n_f32(ColorChannel<float>::max());
    }

    // Converts n pixels; src advances by 3 floats per pixel, dst by dstcn.
    void operator()(const float* src, float* dst, int n) const
    {
        const int dcn = dstcn, bidx = blueIdx;
        const int ridx = bidx ^ 2;
        const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
        const float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        const int len = n * 3;
        int i = 0;

        // A 3-channel destination takes 12 floats per step; every other
        // channel count goes through the 4-plane store (16 floats per step).
        const bool threePlanes = dcn == 3;
        const int dstStep = threePlanes ? 12 : 16;

        for ( ; i <= len - 12; i += 12, dst += dstStep)
        {
            const float32x4x3_t v_src = vld3q_f32(src + i);
            const float32x4_t v_Y = v_src.val[0];
            const float32x4_t v_dCr = vsubq_f32(v_src.val[1], v_delta);
            const float32x4_t v_dCb = vsubq_f32(v_src.val[2], v_delta);

            const float32x4_t v_b = vmlaq_f32(v_Y, v_dCb, v_c3);
            const float32x4_t v_g = vaddq_f32(vmlaq_f32(vmulq_f32(v_dCb, v_c2), v_dCr, v_c1), v_Y);
            const float32x4_t v_r = vmlaq_f32(v_Y, v_dCr, v_c0);

            if (threePlanes)
            {
                float32x4x3_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[ridx] = v_r;
                vst3q_f32(dst, v_dst);
            }
            else
            {
                float32x4x4_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[ridx] = v_r;
                v_dst.val[3] = v_alpha;
                vst4q_f32(dst, v_dst);
            }
        }

        // scalar tail
        while (i < len)
        {
            const float Y = src[i], Cr = src[i+1], Cb = src[i+2];

            const float b = Y + (Cb - delta)*C3;
            const float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
            const float r = Y + (Cr - delta)*C0;

            dst[bidx] = b;
            dst[1] = g;
            dst[ridx] = r;
            if (dcn == 4)
                dst[3] = alpha;

            i += 3;
            dst += dcn;
        }
    }

    int dstcn, blueIdx;
    float coeffs[4];
    float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
};
2536 
2537 #elif CV_SSE2
2538 
2539 template <>
2540 struct YCrCb2RGB_f<float>
2541 {
2542     typedef float channel_type;
2543 
YCrCb2RGB_fcv::YCrCb2RGB_f2544     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
2545         : dstcn(_dstcn), blueIdx(_blueIdx)
2546     {
2547         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
2548         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2549 
2550         v_c0 = _mm_set1_ps(coeffs[0]);
2551         v_c1 = _mm_set1_ps(coeffs[1]);
2552         v_c2 = _mm_set1_ps(coeffs[2]);
2553         v_c3 = _mm_set1_ps(coeffs[3]);
2554         v_delta = _mm_set1_ps(ColorChannel<float>::half());
2555         v_alpha = _mm_set1_ps(ColorChannel<float>::max());
2556 
2557         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
2558     }
2559 
processcv::YCrCb2RGB_f2560     void process(__m128 v_y, __m128 v_cr, __m128 v_cb,
2561                  __m128 & v_r, __m128 & v_g, __m128 & v_b) const
2562     {
2563         v_cb = _mm_sub_ps(v_cb, v_delta);
2564         v_cr = _mm_sub_ps(v_cr, v_delta);
2565 
2566         v_b = _mm_mul_ps(v_cb, v_c3);
2567         v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1));
2568         v_r = _mm_mul_ps(v_cr, v_c0);
2569 
2570         v_b = _mm_add_ps(v_b, v_y);
2571         v_g = _mm_add_ps(v_g, v_y);
2572         v_r = _mm_add_ps(v_r, v_y);
2573 
2574         if (blueIdx == 0)
2575             std::swap(v_b, v_r);
2576     }
2577 
operator ()cv::YCrCb2RGB_f2578     void operator()(const float* src, float* dst, int n) const
2579     {
2580         int dcn = dstcn, bidx = blueIdx, i = 0;
2581         const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
2582         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2583         n *= 3;
2584 
2585         if (haveSIMD)
2586         {
2587             for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
2588             {
2589                 __m128 v_y0 = _mm_loadu_ps(src + i);
2590                 __m128 v_y1 = _mm_loadu_ps(src + i + 4);
2591                 __m128 v_cr0 = _mm_loadu_ps(src + i + 8);
2592                 __m128 v_cr1 = _mm_loadu_ps(src + i + 12);
2593                 __m128 v_cb0 = _mm_loadu_ps(src + i + 16);
2594                 __m128 v_cb1 = _mm_loadu_ps(src + i + 20);
2595 
2596                 _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
2597 
2598                 __m128 v_r0, v_g0, v_b0;
2599                 process(v_y0, v_cr0, v_cb0,
2600                         v_r0, v_g0, v_b0);
2601 
2602                 __m128 v_r1, v_g1, v_b1;
2603                 process(v_y1, v_cr1, v_cb1,
2604                         v_r1, v_g1, v_b1);
2605 
2606                 __m128 v_a0 = v_alpha, v_a1 = v_alpha;
2607 
2608                 if (dcn == 3)
2609                     _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
2610                 else
2611                     _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1,
2612                                       v_b0, v_b1, v_a0, v_a1);
2613 
2614                 _mm_storeu_ps(dst, v_r0);
2615                 _mm_storeu_ps(dst + 4, v_r1);
2616                 _mm_storeu_ps(dst + 8, v_g0);
2617                 _mm_storeu_ps(dst + 12, v_g1);
2618                 _mm_storeu_ps(dst + 16, v_b0);
2619                 _mm_storeu_ps(dst + 20, v_b1);
2620 
2621                 if (dcn == 4)
2622                 {
2623                     _mm_storeu_ps(dst + 24, v_a0);
2624                     _mm_storeu_ps(dst + 28, v_a1);
2625                 }
2626             }
2627         }
2628 
2629         for ( ; i < n; i += 3, dst += dcn)
2630         {
2631             float Y = src[i], Cr = src[i+1], Cb = src[i+2];
2632 
2633             float b = Y + (Cb - delta)*C3;
2634             float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
2635             float r = Y + (Cr - delta)*C0;
2636 
2637             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
2638             if( dcn == 4 )
2639                 dst[3] = alpha;
2640         }
2641     }
2642     int dstcn, blueIdx;
2643     float coeffs[4];
2644 
2645     __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
2646     bool haveSIMD;
2647 };
2648 
2649 #endif
2650 
2651 template<typename _Tp> struct YCrCb2RGB_i
2652 {
2653     typedef _Tp channel_type;
2654 
YCrCb2RGB_icv::YCrCb2RGB_i2655     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2656         : dstcn(_dstcn), blueIdx(_blueIdx)
2657     {
2658         static const int coeffs0[] = {22987, -11698, -5636, 29049};
2659         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2660     }
2661 
operator ()cv::YCrCb2RGB_i2662     void operator()(const _Tp* src, _Tp* dst, int n) const
2663     {
2664         int dcn = dstcn, bidx = blueIdx;
2665         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
2666         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
2667         n *= 3;
2668         for(int i = 0; i < n; i += 3, dst += dcn)
2669         {
2670             _Tp Y = src[i];
2671             _Tp Cr = src[i+1];
2672             _Tp Cb = src[i+2];
2673 
2674             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
2675             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
2676             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
2677 
2678             dst[bidx] = saturate_cast<_Tp>(b);
2679             dst[1] = saturate_cast<_Tp>(g);
2680             dst[bidx^2] = saturate_cast<_Tp>(r);
2681             if( dcn == 4 )
2682                 dst[3] = alpha;
2683         }
2684     }
2685     int dstcn, blueIdx;
2686     int coeffs[4];
2687 };
2688 
2689 #if CV_NEON
2690 
// NEON specialization of the fixed-point YCrCb -> RGB(A)/BGR(A) converter for
// 8-bit channels. Eight pixels are converted per vector iteration; leftovers
// use the scalar loop.
template <>
struct YCrCb2RGB_i<uchar>
{
    typedef uchar channel_type;

    // _dstcn   - channels per destination pixel (3 selects the 3-plane store,
    //            anything else the 4-plane store with a constant last plane)
    // _blueIdx - index of the blue channel in the destination pixel
    // _coeffs  - four fixed-point weights {Cr->R, Cr->G, Cb->G, Cb->B}, or a
    //            null pointer to use the built-in table
    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
        : dstcn(_dstcn), blueIdx(_blueIdx)
    {
        static const int coeffs0[] = {22987, -11698, -5636, 29049};
        memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));

        v_c0 = vdupq_n_s32(coeffs[0]);
        v_c1 = vdupq_n_s32(coeffs[1]);
        v_c2 = vdupq_n_s32(coeffs[2]);
        v_c3 = vdupq_n_s32(coeffs[3]);
        v_delta = vdup_n_s16(ColorChannel<uchar>::half());
        // rounding term added before the right shift by yuv_shift
        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
    }

    // Converts n pixels; src advances by 3 bytes per pixel, dst by dstcn.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        n *= 3;

        for ( ; i <= n - 24; i += 24, dst += dcn * 8)
        {
            // split into Y / Cr / Cb planes and widen to signed 16 bits
            uint8x8x3_t v_src = vld3_u8(src + i);
            int16x8x3_t v_src16;
            v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
            v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
            v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));

            // lower four pixels, computed in 32-bit lanes
            int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
                      v_Cr = vget_low_s16(v_src16.val[1]),
                      v_Cb = vget_low_s16(v_src16.val[2]);

            int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
            v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
            int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
            v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
            int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
            v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);

            // upper four pixels
            v_Y = vget_high_s16(v_src16.val[0]);
            v_Cr = vget_high_s16(v_src16.val[1]);
            v_Cb = vget_high_s16(v_src16.val[2]);

            int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
            v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
            int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
            v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
            int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
            v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);

            // Narrow 32 -> 16 -> 8 bits with saturation at both steps, so the
            // result is clamped to [0, 255] for any coefficient values, the
            // same as saturate_cast<uchar>() in the scalar loop below. A
            // non-saturating first step would wrap values outside the 16-bit
            // range instead of clamping them.
            uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_b0), vqmovn_s32(v_b1)));
            uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_g0), vqmovn_s32(v_g1)));
            uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_r0), vqmovn_s32(v_r1)));

            if (dcn == 3)
            {
                uint8x8x3_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[bidx^2] = v_r;
                vst3_u8(dst, v_dst);
            }
            else
            {
                uint8x8x4_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[bidx^2] = v_r;
                v_dst.val[3] = v_alpha;
                vst4_u8(dst, v_dst);
            }
        }

        // scalar tail
        for ( ; i < n; i += 3, dst += dcn)
        {
            uchar Y = src[i];
            uchar Cr = src[i+1];
            uchar Cb = src[i+2];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);

            dst[bidx] = saturate_cast<uchar>(b);
            dst[1] = saturate_cast<uchar>(g);
            dst[bidx^2] = saturate_cast<uchar>(r);
            if( dcn == 4 )
                dst[3] = alpha;
        }
    }
    int dstcn, blueIdx;
    int coeffs[4];

    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
    int16x4_t v_delta;
    uint8x8_t v_alpha;
};
2795 
// NEON specialization of the fixed-point YCrCb -> RGB(A)/BGR(A) converter for
// 16-bit unsigned channels. Pixels are converted eight at a time, then four
// at a time, and finally one at a time in the scalar loop.
template <>
struct YCrCb2RGB_i<ushort>
{
    typedef ushort channel_type;

    // _coeffs: four fixed-point weights {Cr->R, Cr->G, Cb->G, Cb->B}, or a
    // null pointer to use the built-in table.
    YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
        : dstcn(_dstcn), blueIdx(_blueIdx)
    {
        static const int coeffs0[] = {22987, -11698, -5636, 29049};
        const int* const table = _coeffs ? _coeffs : coeffs0;
        for (int k = 0; k < 4; ++k)
            coeffs[k] = table[k];

        v_c0 = vdupq_n_s32(coeffs[0]);
        v_c1 = vdupq_n_s32(coeffs[1]);
        v_c2 = vdupq_n_s32(coeffs[2]);
        v_c3 = vdupq_n_s32(coeffs[3]);
        v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
        // rounding term added before the right shift by yuv_shift
        v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
        v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
        v_alpha2 = vget_low_u16(v_alpha);
    }

    // Zero-extends four 16-bit values into signed 32-bit lanes.
    static int32x4_t widen(uint16x4_t v_half)
    {
        return vreinterpretq_s32_u32(vmovl_u16(v_half));
    }

    // Converts four pixels held in 32-bit lanes. Outputs are not yet
    // saturated; the caller narrows them.
    void convertQuad(const int32x4_t & v_Y, const int32x4_t & v_Cr, const int32x4_t & v_Cb,
                     int32x4_t & v_b, int32x4_t & v_g, int32x4_t & v_r) const
    {
        const int32x4_t v_dCr = vsubq_s32(v_Cr, v_delta);
        const int32x4_t v_dCb = vsubq_s32(v_Cb, v_delta);

        const int32x4_t v_bw = vmulq_s32(v_c3, v_dCb);
        const int32x4_t v_gw = vmlaq_s32(vmulq_s32(v_dCr, v_c1), v_dCb, v_c2);
        const int32x4_t v_rw = vmulq_s32(v_c0, v_dCr);

        v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_bw, v_delta2), yuv_shift), v_Y);
        v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_gw, v_delta2), yuv_shift), v_Y);
        v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_rw, v_delta2), yuv_shift), v_Y);
    }

    // Converts n pixels; src advances by 3 elements per pixel, dst by dstcn.
    void operator()(const ushort* src, ushort* dst, int n) const
    {
        const int dcn = dstcn, bidx = blueIdx;
        const int ridx = bidx ^ 2;
        const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
        const int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        const int len = n * 3;
        int i = 0;

        // eight pixels per step
        for ( ; i <= len - 24; i += 24, dst += dcn * 8)
        {
            const uint16x8x3_t v_src = vld3q_u16(src + i);

            int32x4_t v_b0, v_g0, v_r0;
            convertQuad(widen(vget_low_u16(v_src.val[0])),
                        widen(vget_low_u16(v_src.val[1])),
                        widen(vget_low_u16(v_src.val[2])),
                        v_b0, v_g0, v_r0);

            int32x4_t v_b1, v_g1, v_r1;
            convertQuad(widen(vget_high_u16(v_src.val[0])),
                        widen(vget_high_u16(v_src.val[1])),
                        widen(vget_high_u16(v_src.val[2])),
                        v_b1, v_g1, v_r1);

            // saturating narrow back to unsigned 16 bits
            const uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
            const uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
            const uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));

            if (dcn == 3)
            {
                uint16x8x3_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[ridx] = v_r;
                vst3q_u16(dst, v_dst);
            }
            else
            {
                uint16x8x4_t v_dst;
                v_dst.val[bidx] = v_b;
                v_dst.val[1] = v_g;
                v_dst.val[ridx] = v_r;
                v_dst.val[3] = v_alpha;
                vst4q_u16(dst, v_dst);
            }
        }

        // four pixels per step
        for ( ; i <= len - 12; i += 12, dst += dcn * 4)
        {
            const uint16x4x3_t v_src = vld3_u16(src + i);

            int32x4_t v_b, v_g, v_r;
            convertQuad(widen(v_src.val[0]), widen(v_src.val[1]), widen(v_src.val[2]),
                        v_b, v_g, v_r);

            const uint16x4_t v_bd = vqmovun_s32(v_b);
            const uint16x4_t v_gd = vqmovun_s32(v_g);
            const uint16x4_t v_rd = vqmovun_s32(v_r);

            if (dcn == 3)
            {
                uint16x4x3_t v_dst;
                v_dst.val[bidx] = v_bd;
                v_dst.val[1] = v_gd;
                v_dst.val[ridx] = v_rd;
                vst3_u16(dst, v_dst);
            }
            else
            {
                uint16x4x4_t v_dst;
                v_dst.val[bidx] = v_bd;
                v_dst.val[1] = v_gd;
                v_dst.val[ridx] = v_rd;
                v_dst.val[3] = v_alpha2;
                vst4_u16(dst, v_dst);
            }
        }

        // scalar tail
        while (i < len)
        {
            const ushort Y = src[i], Cr = src[i+1], Cb = src[i+2];

            const int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            const int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
            const int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);

            dst[bidx] = saturate_cast<ushort>(b);
            dst[1] = saturate_cast<ushort>(g);
            dst[ridx] = saturate_cast<ushort>(r);
            if (dcn == 4)
                dst[3] = alpha;

            i += 3;
            dst += dcn;
        }
    }

    int dstcn, blueIdx;
    int coeffs[4];

    int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
    uint16x8_t v_alpha;
    uint16x4_t v_alpha2;
};
2935 
2936 #elif CV_SSE2
2937 
2938 template <>
2939 struct YCrCb2RGB_i<uchar>
2940 {
2941     typedef uchar channel_type;
2942 
YCrCb2RGB_icv::YCrCb2RGB_i2943     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
2944         : dstcn(_dstcn), blueIdx(_blueIdx)
2945     {
2946         static const int coeffs0[] = {22987, -11698, -5636, 29049};
2947         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
2948 
2949         v_c0 = _mm_set1_epi16((short)coeffs[0]);
2950         v_c1 = _mm_set1_epi16((short)coeffs[1]);
2951         v_c2 = _mm_set1_epi16((short)coeffs[2]);
2952         v_c3 = _mm_set1_epi16((short)coeffs[3]);
2953         v_delta = _mm_set1_epi16(ColorChannel<uchar>::half());
2954         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
2955         v_zero = _mm_setzero_si128();
2956 
2957         uchar alpha = ColorChannel<uchar>::max();
2958         v_alpha = _mm_set1_epi8(*(char *)&alpha);
2959 
2960         useSSE = coeffs[0] <= std::numeric_limits<short>::max();
2961         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
2962     }
2963 
2964     // 16s x 8
    // Fixed-point Y/Cr/Cb -> R/G/B conversion of eight pixels held as 16-bit lanes.
    //
    // The inputs are passed by value and modified locally; the results are returned
    // through v_r, v_g and v_b as eight signed 16-bit lanes each (saturated by
    // _mm_packs_epi32). The arithmetic follows the scalar tail loop of operator():
    //   b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift)
    //   g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift)
    //   r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift)
    // NOTE(review): v_delta, v_delta2, v_zero and v_c0..v_c3 are initialized outside
    // this method; presumably they broadcast the chroma offset, the rounding term of
    // CV_DESCALE, zero and coeffs[0..3] respectively - confirm in the constructor.
    void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
                 __m128i & v_r, __m128i & v_g, __m128i & v_b) const
    {
        // Subtract v_delta from both chroma inputs.
        v_cr = _mm_sub_epi16(v_cr, v_delta);
        v_cb = _mm_sub_epi16(v_cb, v_delta);

        // Widen the four low Y lanes to 32 bits by interleaving them with v_zero.
        __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero);

        // Low and high 16-bit halves of each signed 16x16 product. Interleaving a
        // mullo/mulhi pair (below) reassembles the full 32-bit products.
        __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3);
        __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2);
        __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1);
        __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0);

        __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3);
        __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2);
        __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1);
        __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0);

        // Lanes 0..3: add v_delta2 and shift right arithmetically by yuv_shift.
        __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
        __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2),
                                                                  _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
                                      yuv_shift);
        __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);

        // Add the widened luma to every channel.
        v_r0 = _mm_add_epi32(v_r0, v_y_p);
        v_g0 = _mm_add_epi32(v_g0, v_y_p);
        v_b0 = _mm_add_epi32(v_b0, v_y_p);

        // Same computation for lanes 4..7, using the high halves.
        v_y_p = _mm_unpackhi_epi16(v_y, v_zero);

        __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
        __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2),
                                                                  _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
                                      yuv_shift);
        __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);

        v_r1 = _mm_add_epi32(v_r1, v_y_p);
        v_g1 = _mm_add_epi32(v_g1, v_y_p);
        v_b1 = _mm_add_epi32(v_b1, v_y_p);

        // Narrow both halves back to eight 16-bit lanes with signed saturation.
        v_r = _mm_packs_epi32(v_r0, v_r1);
        v_g = _mm_packs_epi32(v_g0, v_g1);
        v_b = _mm_packs_epi32(v_b0, v_b1);
    }
3009 
    // Converts n pixels of packed 3-byte Y/Cr/Cb data in src into dstcn-channel
    // output in dst. The blue result is written at channel index blueIdx, the red
    // result at blueIdx^2 and green at index 1; when dstcn == 4 the fourth channel
    // is filled with ColorChannel<uchar>::max().
    //
    // The vector loop handles 32 pixels per iteration when both haveSIMD and useSSE
    // are set; the scalar loop handles whatever remains (or everything otherwise).
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int dcn = dstcn, bidx = blueIdx, i = 0;
        const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
        // From here on n counts source bytes (three per pixel) rather than pixels.
        n *= 3;

        if (haveSIMD && useSSE)
        {
            // 96 source bytes = 32 pixels per iteration; dst advances by 32 pixels.
            for ( ; i <= n - 96; i += 96, dst += dcn * 32)
            {
                __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i));
                __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16));
                __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32));
                __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48));
                __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
                __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));

                // Helper defined elsewhere; presumably turns the six registers of
                // interleaved bytes into planar Y, Cr and Cb registers in place.
                _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);

                // First 16 pixels: the low and high 8 bytes are widened to 16-bit
                // lanes (interleaved with v_zero) and converted separately.
                __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
                process(_mm_unpacklo_epi8(v_y0, v_zero),
                        _mm_unpacklo_epi8(v_cr0, v_zero),
                        _mm_unpacklo_epi8(v_cb0, v_zero),
                        v_r_0, v_g_0, v_b_0);

                __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero;
                process(_mm_unpackhi_epi8(v_y0, v_zero),
                        _mm_unpackhi_epi8(v_cr0, v_zero),
                        _mm_unpackhi_epi8(v_cb0, v_zero),
                        v_r_1, v_g_1, v_b_1);

                // Narrow to bytes with unsigned saturation.
                __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1);
                __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1);
                __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1);

                // Second 16 pixels; the 16-bit temporaries are reused.
                process(_mm_unpacklo_epi8(v_y1, v_zero),
                        _mm_unpacklo_epi8(v_cr1, v_zero),
                        _mm_unpacklo_epi8(v_cb1, v_zero),
                        v_r_0, v_g_0, v_b_0);

                process(_mm_unpackhi_epi8(v_y1, v_zero),
                        _mm_unpackhi_epi8(v_cr1, v_zero),
                        _mm_unpackhi_epi8(v_cb1, v_zero),
                        v_r_1, v_g_1, v_b_1);

                __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1);
                __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1);
                __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1);

                // The v_r* registers are interleaved as the first output channel;
                // swap so that blue comes first when bidx == 0 (matches dst[bidx]
                // in the scalar loop).
                if (bidx == 0)
                {
                    std::swap(v_r0, v_b0);
                    std::swap(v_r1, v_b1);
                }

                __m128i v_a0 = v_alpha, v_a1 = v_alpha;

                // Helper defined elsewhere; presumably converts the planar
                // registers back to interleaved pixel order in place.
                if (dcn == 3)
                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
                else
                    _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1,
                                        v_b0, v_b1, v_a0, v_a1);

                _mm_storeu_si128((__m128i *)(dst), v_r0);
                _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
                _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
                _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
                _mm_storeu_si128((__m128i *)(dst + 64), v_b0);
                _mm_storeu_si128((__m128i *)(dst + 80), v_b1);

                // Four-channel output occupies 128 bytes per 32 pixels.
                if (dcn == 4)
                {
                    _mm_storeu_si128((__m128i *)(dst + 96), v_a0);
                    _mm_storeu_si128((__m128i *)(dst + 112), v_a1);
                }
            }
        }

        // Scalar path: one pixel per iteration.
        for ( ; i < n; i += 3, dst += dcn)
        {
            uchar Y = src[i];
            uchar Cr = src[i+1];
            uchar Cb = src[i+2];

            int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
            int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
            int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);

            dst[bidx] = saturate_cast<uchar>(b);
            dst[1] = saturate_cast<uchar>(g);
            dst[bidx^2] = saturate_cast<uchar>(r);
            if( dcn == 4 )
                dst[3] = alpha;
        }
    }
3106     int dstcn, blueIdx;
3107     int coeffs[4];
3108     bool useSSE, haveSIMD;
3109 
3110     __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
3111     __m128i v_delta, v_alpha, v_zero;
3112 };
3113 
3114 #endif // CV_SSE2
3115 
3116 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
3117 
// Default 3x3 matrix for the RGB -> XYZ functors, stored row by row: rows 0, 1
// and 2 produce X, Y and Z. RGB2XYZ_f copies it when no custom coefficients are
// supplied and exchanges the first and third column of every row when
// blueIdx == 0, so the columns here are presumably ordered R, G, B.
// NOTE(review): the name suggests sRGB primaries with a D65 white point; the
// values themselves are not derived anywhere in this file section.
static const float sRGB2XYZ_D65[] =
{
    0.412453f, 0.357580f, 0.180423f,
    0.212671f, 0.715160f, 0.072169f,
    0.019334f, 0.119193f, 0.950227f
};
3124 
// Default 3x3 matrix for the XYZ -> RGB functors, stored row by row; each row is
// applied to an (X, Y, Z) triple. XYZ2RGB_f copies it when no custom coefficients
// are supplied, writes the row 0 result to the first output channel and exchanges
// rows 0 and 2 when blueIdx == 0, so the rows here presumably yield R, G, B.
// NOTE(review): the name suggests the inverse of the sRGB/D65 matrix above;
// that relationship is not checked anywhere in this file section.
static const float XYZ2sRGB_D65[] =
{
    3.240479f, -1.53715f, -0.498535f,
    -0.969256f, 1.875991f, 0.041556f,
    0.055648f, -0.204043f, 1.057311f
};
3131 
3132 template<typename _Tp> struct RGB2XYZ_f
3133 {
3134     typedef _Tp channel_type;
3135 
RGB2XYZ_fcv::RGB2XYZ_f3136     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3137     {
3138         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
3139         if(blueIdx == 0)
3140         {
3141             std::swap(coeffs[0], coeffs[2]);
3142             std::swap(coeffs[3], coeffs[5]);
3143             std::swap(coeffs[6], coeffs[8]);
3144         }
3145     }
operator ()cv::RGB2XYZ_f3146     void operator()(const _Tp* src, _Tp* dst, int n) const
3147     {
3148         int scn = srccn;
3149         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3150               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3151               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3152 
3153         n *= 3;
3154         for(int i = 0; i < n; i += 3, src += scn)
3155         {
3156             _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
3157             _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
3158             _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
3159             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
3160         }
3161     }
3162     int srccn;
3163     float coeffs[9];
3164 };
3165 
3166 #if CV_NEON
3167 
3168 template <>
3169 struct RGB2XYZ_f<float>
3170 {
3171     typedef float channel_type;
3172 
RGB2XYZ_fcv::RGB2XYZ_f3173     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3174     {
3175         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
3176         if(blueIdx == 0)
3177         {
3178             std::swap(coeffs[0], coeffs[2]);
3179             std::swap(coeffs[3], coeffs[5]);
3180             std::swap(coeffs[6], coeffs[8]);
3181         }
3182 
3183         v_c0 = vdupq_n_f32(coeffs[0]);
3184         v_c1 = vdupq_n_f32(coeffs[1]);
3185         v_c2 = vdupq_n_f32(coeffs[2]);
3186         v_c3 = vdupq_n_f32(coeffs[3]);
3187         v_c4 = vdupq_n_f32(coeffs[4]);
3188         v_c5 = vdupq_n_f32(coeffs[5]);
3189         v_c6 = vdupq_n_f32(coeffs[6]);
3190         v_c7 = vdupq_n_f32(coeffs[7]);
3191         v_c8 = vdupq_n_f32(coeffs[8]);
3192     }
3193 
operator ()cv::RGB2XYZ_f3194     void operator()(const float* src, float* dst, int n) const
3195     {
3196         int scn = srccn, i = 0;
3197         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3198               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3199               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3200 
3201         n *= 3;
3202 
3203         if (scn == 3)
3204             for ( ; i <= n - 12; i += 12, src += 12)
3205             {
3206                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
3207                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
3208                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
3209                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
3210                 vst3q_f32(dst + i, v_dst);
3211             }
3212         else
3213             for ( ; i <= n - 12; i += 12, src += 16)
3214             {
3215                 float32x4x4_t v_src = vld4q_f32(src);
3216                 float32x4x3_t v_dst;
3217                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
3218                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
3219                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
3220                 vst3q_f32(dst + i, v_dst);
3221             }
3222 
3223         for ( ; i < n; i += 3, src += scn)
3224         {
3225             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
3226             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
3227             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
3228             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
3229         }
3230     }
3231 
3232     int srccn;
3233     float coeffs[9];
3234     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3235 };
3236 
3237 #elif CV_SSE2
3238 
3239 template <>
3240 struct RGB2XYZ_f<float>
3241 {
3242     typedef float channel_type;
3243 
RGB2XYZ_fcv::RGB2XYZ_f3244     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3245     {
3246         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
3247         if(blueIdx == 0)
3248         {
3249             std::swap(coeffs[0], coeffs[2]);
3250             std::swap(coeffs[3], coeffs[5]);
3251             std::swap(coeffs[6], coeffs[8]);
3252         }
3253 
3254         v_c0 = _mm_set1_ps(coeffs[0]);
3255         v_c1 = _mm_set1_ps(coeffs[1]);
3256         v_c2 = _mm_set1_ps(coeffs[2]);
3257         v_c3 = _mm_set1_ps(coeffs[3]);
3258         v_c4 = _mm_set1_ps(coeffs[4]);
3259         v_c5 = _mm_set1_ps(coeffs[5]);
3260         v_c6 = _mm_set1_ps(coeffs[6]);
3261         v_c7 = _mm_set1_ps(coeffs[7]);
3262         v_c8 = _mm_set1_ps(coeffs[8]);
3263 
3264         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
3265     }
3266 
processcv::RGB2XYZ_f3267     void process(__m128 v_r, __m128 v_g, __m128 v_b,
3268                  __m128 & v_x, __m128 & v_y, __m128 & v_z) const
3269     {
3270         v_x = _mm_mul_ps(v_r, v_c0);
3271         v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1));
3272         v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2));
3273 
3274         v_y = _mm_mul_ps(v_r, v_c3);
3275         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4));
3276         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5));
3277 
3278         v_z = _mm_mul_ps(v_r, v_c6);
3279         v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7));
3280         v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8));
3281     }
3282 
operator ()cv::RGB2XYZ_f3283     void operator()(const float* src, float* dst, int n) const
3284     {
3285         int scn = srccn, i = 0;
3286         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3287               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3288               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3289 
3290         n *= 3;
3291 
3292         if (haveSIMD)
3293         {
3294             for ( ; i <= n - 24; i += 24, src += 8 * scn)
3295             {
3296                 __m128 v_r0 = _mm_loadu_ps(src);
3297                 __m128 v_r1 = _mm_loadu_ps(src + 4);
3298                 __m128 v_g0 = _mm_loadu_ps(src + 8);
3299                 __m128 v_g1 = _mm_loadu_ps(src + 12);
3300                 __m128 v_b0 = _mm_loadu_ps(src + 16);
3301                 __m128 v_b1 = _mm_loadu_ps(src + 20);
3302 
3303                 if (scn == 4)
3304                 {
3305                     __m128 v_a0 = _mm_loadu_ps(src + 24);
3306                     __m128 v_a1 = _mm_loadu_ps(src + 28);
3307 
3308                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
3309                                         v_b0, v_b1, v_a0, v_a1);
3310                 }
3311                 else
3312                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
3313 
3314                 __m128 v_x0, v_y0, v_z0;
3315                 process(v_r0, v_g0, v_b0,
3316                         v_x0, v_y0, v_z0);
3317 
3318                 __m128 v_x1, v_y1, v_z1;
3319                 process(v_r1, v_g1, v_b1,
3320                         v_x1, v_y1, v_z1);
3321 
3322                 _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
3323 
3324                 _mm_storeu_ps(dst + i, v_x0);
3325                 _mm_storeu_ps(dst + i + 4, v_x1);
3326                 _mm_storeu_ps(dst + i + 8, v_y0);
3327                 _mm_storeu_ps(dst + i + 12, v_y1);
3328                 _mm_storeu_ps(dst + i + 16, v_z0);
3329                 _mm_storeu_ps(dst + i + 20, v_z1);
3330             }
3331         }
3332 
3333         for ( ; i < n; i += 3, src += scn)
3334         {
3335             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
3336             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
3337             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
3338             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
3339         }
3340     }
3341 
3342     int srccn;
3343     float coeffs[9];
3344     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3345     bool haveSIMD;
3346 };
3347 
3348 
3349 #endif
3350 
3351 template<typename _Tp> struct RGB2XYZ_i
3352 {
3353     typedef _Tp channel_type;
3354 
RGB2XYZ_icv::RGB2XYZ_i3355     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3356     {
3357         static const int coeffs0[] =
3358         {
3359             1689,    1465,    739,
3360             871,     2929,    296,
3361             79,      488,     3892
3362         };
3363         for( int i = 0; i < 9; i++ )
3364             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3365         if(blueIdx == 0)
3366         {
3367             std::swap(coeffs[0], coeffs[2]);
3368             std::swap(coeffs[3], coeffs[5]);
3369             std::swap(coeffs[6], coeffs[8]);
3370         }
3371     }
operator ()cv::RGB2XYZ_i3372     void operator()(const _Tp* src, _Tp* dst, int n) const
3373     {
3374         int scn = srccn;
3375         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3376             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3377             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3378         n *= 3;
3379 
3380         for(int i = 0; i < n; i += 3, src += scn)
3381         {
3382             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
3383             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
3384             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
3385             dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
3386             dst[i+2] = saturate_cast<_Tp>(Z);
3387         }
3388     }
3389     int srccn;
3390     int coeffs[9];
3391 };
3392 
3393 #if CV_NEON
3394 
3395 template <>
3396 struct RGB2XYZ_i<uchar>
3397 {
3398     typedef uchar channel_type;
3399 
RGB2XYZ_icv::RGB2XYZ_i3400     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3401     {
3402         static const int coeffs0[] =
3403         {
3404             1689,    1465,    739,
3405             871,     2929,    296,
3406             79,      488,     3892
3407         };
3408         for( int i = 0; i < 9; i++ )
3409             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3410         if(blueIdx == 0)
3411         {
3412             std::swap(coeffs[0], coeffs[2]);
3413             std::swap(coeffs[3], coeffs[5]);
3414             std::swap(coeffs[6], coeffs[8]);
3415         }
3416 
3417         v_c0 = vdup_n_u16(coeffs[0]);
3418         v_c1 = vdup_n_u16(coeffs[1]);
3419         v_c2 = vdup_n_u16(coeffs[2]);
3420         v_c3 = vdup_n_u16(coeffs[3]);
3421         v_c4 = vdup_n_u16(coeffs[4]);
3422         v_c5 = vdup_n_u16(coeffs[5]);
3423         v_c6 = vdup_n_u16(coeffs[6]);
3424         v_c7 = vdup_n_u16(coeffs[7]);
3425         v_c8 = vdup_n_u16(coeffs[8]);
3426         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
3427     }
operator ()cv::RGB2XYZ_i3428     void operator()(const uchar * src, uchar * dst, int n) const
3429     {
3430         int scn = srccn, i = 0;
3431         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3432             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3433             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3434         n *= 3;
3435 
3436         for ( ; i <= n - 24; i += 24, src += scn * 8)
3437         {
3438             uint8x8x3_t v_dst;
3439             uint16x8x3_t v_src16;
3440 
3441             if (scn == 3)
3442             {
3443                 uint8x8x3_t v_src = vld3_u8(src);
3444                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
3445                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
3446                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
3447             }
3448             else
3449             {
3450                 uint8x8x4_t v_src = vld4_u8(src);
3451                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
3452                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
3453                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
3454             }
3455 
3456             uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
3457                        v_s1 = vget_low_u16(v_src16.val[1]),
3458                        v_s2 = vget_low_u16(v_src16.val[2]);
3459 
3460             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3461             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3462             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3463             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
3464             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
3465             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
3466 
3467             v_s0 = vget_high_u16(v_src16.val[0]),
3468             v_s1 = vget_high_u16(v_src16.val[1]),
3469             v_s2 = vget_high_u16(v_src16.val[2]);
3470 
3471             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3472             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3473             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3474             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
3475             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
3476             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
3477 
3478             v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
3479             v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
3480             v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
3481 
3482             vst3_u8(dst + i, v_dst);
3483         }
3484 
3485         for ( ; i < n; i += 3, src += scn)
3486         {
3487             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
3488             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
3489             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
3490             dst[i] = saturate_cast<uchar>(X);
3491             dst[i+1] = saturate_cast<uchar>(Y);
3492             dst[i+2] = saturate_cast<uchar>(Z);
3493         }
3494     }
3495 
3496     int srccn, coeffs[9];
3497     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3498     uint32x4_t v_delta;
3499 };
3500 
3501 template <>
3502 struct RGB2XYZ_i<ushort>
3503 {
3504     typedef ushort channel_type;
3505 
RGB2XYZ_icv::RGB2XYZ_i3506     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
3507     {
3508         static const int coeffs0[] =
3509         {
3510             1689,    1465,    739,
3511             871,     2929,    296,
3512             79,      488,     3892
3513         };
3514         for( int i = 0; i < 9; i++ )
3515             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3516         if(blueIdx == 0)
3517         {
3518             std::swap(coeffs[0], coeffs[2]);
3519             std::swap(coeffs[3], coeffs[5]);
3520             std::swap(coeffs[6], coeffs[8]);
3521         }
3522 
3523         v_c0 = vdup_n_u16(coeffs[0]);
3524         v_c1 = vdup_n_u16(coeffs[1]);
3525         v_c2 = vdup_n_u16(coeffs[2]);
3526         v_c3 = vdup_n_u16(coeffs[3]);
3527         v_c4 = vdup_n_u16(coeffs[4]);
3528         v_c5 = vdup_n_u16(coeffs[5]);
3529         v_c6 = vdup_n_u16(coeffs[6]);
3530         v_c7 = vdup_n_u16(coeffs[7]);
3531         v_c8 = vdup_n_u16(coeffs[8]);
3532         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
3533     }
3534 
operator ()cv::RGB2XYZ_i3535     void operator()(const ushort * src, ushort * dst, int n) const
3536     {
3537         int scn = srccn, i = 0;
3538         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3539             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3540             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3541         n *= 3;
3542 
3543         for ( ; i <= n - 24; i += 24, src += scn * 8)
3544         {
3545             uint16x8x3_t v_src, v_dst;
3546 
3547             if (scn == 3)
3548                 v_src = vld3q_u16(src);
3549             else
3550             {
3551                 uint16x8x4_t v_src4 = vld4q_u16(src);
3552                 v_src.val[0] = v_src4.val[0];
3553                 v_src.val[1] = v_src4.val[1];
3554                 v_src.val[2] = v_src4.val[2];
3555             }
3556 
3557             uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
3558                        v_s1 = vget_low_u16(v_src.val[1]),
3559                        v_s2 = vget_low_u16(v_src.val[2]);
3560 
3561             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3562             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3563             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3564             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
3565             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
3566             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
3567 
3568             v_s0 = vget_high_u16(v_src.val[0]),
3569             v_s1 = vget_high_u16(v_src.val[1]),
3570             v_s2 = vget_high_u16(v_src.val[2]);
3571 
3572             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3573             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3574             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3575             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
3576             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
3577             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
3578 
3579             v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
3580             v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
3581             v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
3582 
3583             vst3q_u16(dst + i, v_dst);
3584         }
3585 
3586         for ( ; i <= n - 12; i += 12, src += scn * 4)
3587         {
3588             uint16x4x3_t v_dst;
3589             uint16x4_t v_s0, v_s1, v_s2;
3590 
3591             if (scn == 3)
3592             {
3593                 uint16x4x3_t v_src = vld3_u16(src);
3594                 v_s0 = v_src.val[0];
3595                 v_s1 = v_src.val[1];
3596                 v_s2 = v_src.val[2];
3597             }
3598             else
3599             {
3600                 uint16x4x4_t v_src = vld4_u16(src);
3601                 v_s0 = v_src.val[0];
3602                 v_s1 = v_src.val[1];
3603                 v_s2 = v_src.val[2];
3604             }
3605 
3606             uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3607             uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3608             uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3609 
3610             v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
3611             v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
3612             v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
3613 
3614             vst3_u16(dst + i, v_dst);
3615         }
3616 
3617         for ( ; i < n; i += 3, src += scn)
3618         {
3619             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
3620             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
3621             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
3622             dst[i] = saturate_cast<ushort>(X);
3623             dst[i+1] = saturate_cast<ushort>(Y);
3624             dst[i+2] = saturate_cast<ushort>(Z);
3625         }
3626     }
3627 
3628     int srccn, coeffs[9];
3629     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3630     uint32x4_t v_delta;
3631 };
3632 
3633 #endif
3634 
3635 template<typename _Tp> struct XYZ2RGB_f
3636 {
3637     typedef _Tp channel_type;
3638 
XYZ2RGB_fcv::XYZ2RGB_f3639     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
3640     : dstcn(_dstcn), blueIdx(_blueIdx)
3641     {
3642         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
3643         if(blueIdx == 0)
3644         {
3645             std::swap(coeffs[0], coeffs[6]);
3646             std::swap(coeffs[1], coeffs[7]);
3647             std::swap(coeffs[2], coeffs[8]);
3648         }
3649     }
3650 
operator ()cv::XYZ2RGB_f3651     void operator()(const _Tp* src, _Tp* dst, int n) const
3652     {
3653         int dcn = dstcn;
3654         _Tp alpha = ColorChannel<_Tp>::max();
3655         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3656               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3657               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3658         n *= 3;
3659         for(int i = 0; i < n; i += 3, dst += dcn)
3660         {
3661             _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
3662             _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
3663             _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
3664             dst[0] = B; dst[1] = G; dst[2] = R;
3665             if( dcn == 4 )
3666                 dst[3] = alpha;
3667         }
3668     }
3669     int dstcn, blueIdx;
3670     float coeffs[9];
3671 };
3672 
3673 #if CV_SSE2
3674 
3675 template <>
3676 struct XYZ2RGB_f<float>
3677 {
3678     typedef float channel_type;
3679 
XYZ2RGB_fcv::XYZ2RGB_f3680     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
3681     : dstcn(_dstcn), blueIdx(_blueIdx)
3682     {
3683         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
3684         if(blueIdx == 0)
3685         {
3686             std::swap(coeffs[0], coeffs[6]);
3687             std::swap(coeffs[1], coeffs[7]);
3688             std::swap(coeffs[2], coeffs[8]);
3689         }
3690 
3691         v_c0 = _mm_set1_ps(coeffs[0]);
3692         v_c1 = _mm_set1_ps(coeffs[1]);
3693         v_c2 = _mm_set1_ps(coeffs[2]);
3694         v_c3 = _mm_set1_ps(coeffs[3]);
3695         v_c4 = _mm_set1_ps(coeffs[4]);
3696         v_c5 = _mm_set1_ps(coeffs[5]);
3697         v_c6 = _mm_set1_ps(coeffs[6]);
3698         v_c7 = _mm_set1_ps(coeffs[7]);
3699         v_c8 = _mm_set1_ps(coeffs[8]);
3700 
3701         v_alpha = _mm_set1_ps(ColorChannel<float>::max());
3702 
3703         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
3704     }
3705 
processcv::XYZ2RGB_f3706     void process(__m128 v_x, __m128 v_y, __m128 v_z,
3707                  __m128 & v_r, __m128 & v_g, __m128 & v_b) const
3708     {
3709         v_b = _mm_mul_ps(v_x, v_c0);
3710         v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1));
3711         v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2));
3712 
3713         v_g = _mm_mul_ps(v_x, v_c3);
3714         v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4));
3715         v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5));
3716 
3717         v_r = _mm_mul_ps(v_x, v_c6);
3718         v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7));
3719         v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8));
3720     }
3721 
operator ()cv::XYZ2RGB_f3722     void operator()(const float* src, float* dst, int n) const
3723     {
3724         int dcn = dstcn;
3725         float alpha = ColorChannel<float>::max();
3726         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3727               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3728               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3729         n *= 3;
3730         int i = 0;
3731 
3732         if (haveSIMD)
3733         {
3734             for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
3735             {
3736                 __m128 v_x0 = _mm_loadu_ps(src + i);
3737                 __m128 v_x1 = _mm_loadu_ps(src + i + 4);
3738                 __m128 v_y0 = _mm_loadu_ps(src + i + 8);
3739                 __m128 v_y1 = _mm_loadu_ps(src + i + 12);
3740                 __m128 v_z0 = _mm_loadu_ps(src + i + 16);
3741                 __m128 v_z1 = _mm_loadu_ps(src + i + 20);
3742 
3743                 _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
3744 
3745                 __m128 v_r0, v_g0, v_b0;
3746                 process(v_x0, v_y0, v_z0,
3747                         v_r0, v_g0, v_b0);
3748 
3749                 __m128 v_r1, v_g1, v_b1;
3750                 process(v_x1, v_y1, v_z1,
3751                         v_r1, v_g1, v_b1);
3752 
3753                 __m128 v_a0 = v_alpha, v_a1 = v_alpha;
3754 
3755                 if (dcn == 4)
3756                     _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1,
3757                                       v_r0, v_r1, v_a0, v_a1);
3758                 else
3759                     _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
3760 
3761                 _mm_storeu_ps(dst, v_b0);
3762                 _mm_storeu_ps(dst + 4, v_b1);
3763                 _mm_storeu_ps(dst + 8, v_g0);
3764                 _mm_storeu_ps(dst + 12, v_g1);
3765                 _mm_storeu_ps(dst + 16, v_r0);
3766                 _mm_storeu_ps(dst + 20, v_r1);
3767 
3768                 if (dcn == 4)
3769                 {
3770                     _mm_storeu_ps(dst + 24, v_a0);
3771                     _mm_storeu_ps(dst + 28, v_a1);
3772                 }
3773             }
3774 
3775         }
3776 
3777         for( ; i < n; i += 3, dst += dcn)
3778         {
3779             float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2;
3780             float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5;
3781             float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8;
3782             dst[0] = B; dst[1] = G; dst[2] = R;
3783             if( dcn == 4 )
3784                 dst[3] = alpha;
3785         }
3786     }
3787     int dstcn, blueIdx;
3788     float coeffs[9];
3789 
3790     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3791     __m128 v_alpha;
3792     bool haveSIMD;
3793 };
3794 
3795 #endif // CV_SSE2
3796 
3797 
3798 template<typename _Tp> struct XYZ2RGB_i
3799 {
3800     typedef _Tp channel_type;
3801 
XYZ2RGB_icv::XYZ2RGB_i3802     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
3803     : dstcn(_dstcn), blueIdx(_blueIdx)
3804     {
3805         static const int coeffs0[] =
3806         {
3807             13273,  -6296,  -2042,
3808             -3970,   7684,    170,
3809               228,   -836,   4331
3810         };
3811         for(int i = 0; i < 9; i++)
3812             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3813 
3814         if(blueIdx == 0)
3815         {
3816             std::swap(coeffs[0], coeffs[6]);
3817             std::swap(coeffs[1], coeffs[7]);
3818             std::swap(coeffs[2], coeffs[8]);
3819         }
3820     }
operator ()cv::XYZ2RGB_i3821     void operator()(const _Tp* src, _Tp* dst, int n) const
3822     {
3823         int dcn = dstcn;
3824         _Tp alpha = ColorChannel<_Tp>::max();
3825         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3826             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3827             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3828         n *= 3;
3829         for(int i = 0; i < n; i += 3, dst += dcn)
3830         {
3831             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
3832             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
3833             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
3834             dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
3835             dst[2] = saturate_cast<_Tp>(R);
3836             if( dcn == 4 )
3837                 dst[3] = alpha;
3838         }
3839     }
3840     int dstcn, blueIdx;
3841     int coeffs[9];
3842 };
3843 
3844 #if CV_NEON
3845 
3846 template <>
3847 struct XYZ2RGB_i<uchar>
3848 {
3849     typedef uchar channel_type;
3850 
XYZ2RGB_icv::XYZ2RGB_i3851     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
3852     : dstcn(_dstcn), blueIdx(_blueIdx)
3853     {
3854         static const int coeffs0[] =
3855         {
3856             13273,  -6296,  -2042,
3857             -3970,   7684,    170,
3858               228,   -836,   4331
3859         };
3860         for(int i = 0; i < 9; i++)
3861             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3862 
3863         if(blueIdx == 0)
3864         {
3865             std::swap(coeffs[0], coeffs[6]);
3866             std::swap(coeffs[1], coeffs[7]);
3867             std::swap(coeffs[2], coeffs[8]);
3868         }
3869 
3870         v_c0 = vdup_n_s16(coeffs[0]);
3871         v_c1 = vdup_n_s16(coeffs[1]);
3872         v_c2 = vdup_n_s16(coeffs[2]);
3873         v_c3 = vdup_n_s16(coeffs[3]);
3874         v_c4 = vdup_n_s16(coeffs[4]);
3875         v_c5 = vdup_n_s16(coeffs[5]);
3876         v_c6 = vdup_n_s16(coeffs[6]);
3877         v_c7 = vdup_n_s16(coeffs[7]);
3878         v_c8 = vdup_n_s16(coeffs[8]);
3879         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
3880         v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
3881     }
3882 
operator ()cv::XYZ2RGB_i3883     void operator()(const uchar* src, uchar* dst, int n) const
3884     {
3885         int dcn = dstcn, i = 0;
3886         uchar alpha = ColorChannel<uchar>::max();
3887         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
3888             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
3889             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
3890         n *= 3;
3891 
3892         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
3893         {
3894             uint8x8x3_t v_src = vld3_u8(src + i);
3895             int16x8x3_t v_src16;
3896             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
3897             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
3898             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
3899 
3900             int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
3901                        v_s1 = vget_low_s16(v_src16.val[1]),
3902                        v_s2 = vget_low_s16(v_src16.val[2]);
3903 
3904             int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3905             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3906             int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3907             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
3908             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
3909             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
3910 
3911             v_s0 = vget_high_s16(v_src16.val[0]),
3912             v_s1 = vget_high_s16(v_src16.val[1]),
3913             v_s2 = vget_high_s16(v_src16.val[2]);
3914 
3915             int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
3916             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
3917             int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
3918             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
3919             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
3920             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
3921 
3922             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
3923             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
3924             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
3925 
3926             if (dcn == 3)
3927             {
3928                 uint8x8x3_t v_dst;
3929                 v_dst.val[0] = v_b;
3930                 v_dst.val[1] = v_g;
3931                 v_dst.val[2] = v_r;
3932                 vst3_u8(dst, v_dst);
3933             }
3934             else
3935             {
3936                 uint8x8x4_t v_dst;
3937                 v_dst.val[0] = v_b;
3938                 v_dst.val[1] = v_g;
3939                 v_dst.val[2] = v_r;
3940                 v_dst.val[3] = v_alpha;
3941                 vst4_u8(dst, v_dst);
3942             }
3943         }
3944 
3945         for ( ; i < n; i += 3, dst += dcn)
3946         {
3947             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
3948             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
3949             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
3950             dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
3951             dst[2] = saturate_cast<uchar>(R);
3952             if( dcn == 4 )
3953                 dst[3] = alpha;
3954         }
3955     }
3956     int dstcn, blueIdx;
3957     int coeffs[9];
3958 
3959     int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
3960     uint8x8_t v_alpha;
3961     int32x4_t v_delta;
3962 };
3963 
3964 template <>
3965 struct XYZ2RGB_i<ushort>
3966 {
3967     typedef ushort channel_type;
3968 
XYZ2RGB_icv::XYZ2RGB_i3969     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
3970     : dstcn(_dstcn), blueIdx(_blueIdx)
3971     {
3972         static const int coeffs0[] =
3973         {
3974             13273,  -6296,  -2042,
3975             -3970,   7684,    170,
3976               228,   -836,   4331
3977         };
3978         for(int i = 0; i < 9; i++)
3979             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
3980 
3981         if(blueIdx == 0)
3982         {
3983             std::swap(coeffs[0], coeffs[6]);
3984             std::swap(coeffs[1], coeffs[7]);
3985             std::swap(coeffs[2], coeffs[8]);
3986         }
3987 
3988         v_c0 = vdupq_n_s32(coeffs[0]);
3989         v_c1 = vdupq_n_s32(coeffs[1]);
3990         v_c2 = vdupq_n_s32(coeffs[2]);
3991         v_c3 = vdupq_n_s32(coeffs[3]);
3992         v_c4 = vdupq_n_s32(coeffs[4]);
3993         v_c5 = vdupq_n_s32(coeffs[5]);
3994         v_c6 = vdupq_n_s32(coeffs[6]);
3995         v_c7 = vdupq_n_s32(coeffs[7]);
3996         v_c8 = vdupq_n_s32(coeffs[8]);
3997         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
3998         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
3999         v_alpha2 = vget_low_u16(v_alpha);
4000     }
4001 
operator ()cv::XYZ2RGB_i4002     void operator()(const ushort* src, ushort* dst, int n) const
4003     {
4004         int dcn = dstcn, i = 0;
4005         ushort alpha = ColorChannel<ushort>::max();
4006         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
4007             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
4008             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
4009         n *= 3;
4010 
4011         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
4012         {
4013             uint16x8x3_t v_src = vld3q_u16(src + i);
4014             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
4015                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
4016                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
4017 
4018             int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4019             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4020             int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4021             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
4022             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
4023             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
4024 
4025             v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
4026             v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
4027             v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
4028 
4029             int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4030             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4031             int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4032             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
4033             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
4034             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
4035 
4036             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
4037             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
4038             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
4039 
4040             if (dcn == 3)
4041             {
4042                 uint16x8x3_t v_dst;
4043                 v_dst.val[0] = v_b;
4044                 v_dst.val[1] = v_g;
4045                 v_dst.val[2] = v_r;
4046                 vst3q_u16(dst, v_dst);
4047             }
4048             else
4049             {
4050                 uint16x8x4_t v_dst;
4051                 v_dst.val[0] = v_b;
4052                 v_dst.val[1] = v_g;
4053                 v_dst.val[2] = v_r;
4054                 v_dst.val[3] = v_alpha;
4055                 vst4q_u16(dst, v_dst);
4056             }
4057         }
4058 
4059         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
4060         {
4061             uint16x4x3_t v_src = vld3_u16(src + i);
4062             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
4063                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
4064                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
4065 
4066             int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
4067             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
4068             int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
4069             v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
4070             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
4071             v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
4072 
4073             uint16x4_t v_b = vqmovun_s32(v_X);
4074             uint16x4_t v_g = vqmovun_s32(v_Y);
4075             uint16x4_t v_r = vqmovun_s32(v_Z);
4076 
4077             if (dcn == 3)
4078             {
4079                 uint16x4x3_t v_dst;
4080                 v_dst.val[0] = v_b;
4081                 v_dst.val[1] = v_g;
4082                 v_dst.val[2] = v_r;
4083                 vst3_u16(dst, v_dst);
4084             }
4085             else
4086             {
4087                 uint16x4x4_t v_dst;
4088                 v_dst.val[0] = v_b;
4089                 v_dst.val[1] = v_g;
4090                 v_dst.val[2] = v_r;
4091                 v_dst.val[3] = v_alpha2;
4092                 vst4_u16(dst, v_dst);
4093             }
4094         }
4095 
4096         for ( ; i < n; i += 3, dst += dcn)
4097         {
4098             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
4099             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
4100             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
4101             dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
4102             dst[2] = saturate_cast<ushort>(R);
4103             if( dcn == 4 )
4104                 dst[3] = alpha;
4105         }
4106     }
4107     int dstcn, blueIdx;
4108     int coeffs[9];
4109 
4110     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
4111     uint16x4_t v_alpha2;
4112     uint16x8_t v_alpha;
4113 };
4114 
4115 #endif
4116 
4117 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
4118 
4119 
/**
 * 8-bit RGB/BGR -> HSV conversion functor using fixed-point arithmetic.
 *
 * srccn   - number of channels per source pixel; the source pointer advances
 *           by this many elements per pixel and only three of them are read.
 * blueIdx - index of the blue channel inside a source pixel; red is read from
 *           index blueIdx^2 (0 <-> 2) and green always from index 1.
 * hrange  - range of the output hue channel; must be 180 or 256.
 */
struct RGB2HSV_b
{
    typedef uchar channel_type;

    // Stores the parameters and asserts that hrange is 180 or 256.
    RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
    {
        CV_Assert( hrange == 180 || hrange == 256 );
    }

    // Converts n pixels from src to dst; dst always receives three values
    // (H, S, V) per pixel.
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, bidx = blueIdx, scn = srccn;
        const int hsv_shift = 12;   // fractional bits of the division tables

        // Reciprocal tables shared by all instances, indexed by the divisor:
        //   sdiv_table[i]    ~ (255 << hsv_shift) / i        (saturation)
        //   hdiv_tableNNN[i] ~ (NNN << hsv_shift) / (6 * i)  (hue)
        // Entry 0 is 0, so a zero divisor yields a zero result.
        static int sdiv_table[256];
        static int hdiv_table180[256];
        static int hdiv_table256[256];
        // NOTE(review): lazy initialisation is guarded only by this volatile
        // flag, with no lock or atomic; concurrent first calls may both fill
        // the tables (with identical values) - confirm this is acceptable.
        static volatile bool initialized = false;

        int hr = hrange;
        const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
        n *= 3;   // from here on n counts destination elements

        if( !initialized )
        {
            sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
            for( i = 1; i < 256; i++ )
            {
                sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
                hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
                hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
            }
            initialized = true;
        }

        // i indexes dst (3 per pixel) while src advances by scn per pixel.
        for( i = 0; i < n; i += 3, src += scn )
        {
            int b = src[bidx], g = src[1], r = src[bidx^2];
            int h, s, v = b;
            int vmin = b, diff;
            int vr, vg;

            // v = max(b, g, r), vmin = min(b, g, r)
            // (presumably what the CV_CALC_*_8U macros do; they are defined elsewhere).
            CV_CALC_MAX_8U( v, g );
            CV_CALC_MAX_8U( v, r );
            CV_CALC_MIN_8U( vmin, g );
            CV_CALC_MIN_8U( vmin, r );

            diff = v - vmin;
            // All-ones masks (-1) when the maximum equals r / g, otherwise 0;
            // used below to select the hue formula without branching.
            vr = v == r ? -1 : 0;
            vg = v == g ? -1 : 0;

            // s = round(diff * 255 / v) in fixed point.
            s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
            // Hue numerator: (g - b) if max is r, else (b - r + 2*diff) if
            // max is g, else (r - g + 4*diff).
            h = (vr & (g - b)) +
                (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
            // Scale by hrange / (6 * diff) with rounding.
            h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
            // Wrap negative hues into [0, hrange).
            h += h < 0 ? hr : 0;

            dst[i] = saturate_cast<uchar>(h);
            dst[i+1] = (uchar)s;
            dst[i+2] = (uchar)v;
        }
    }

    int srccn, blueIdx, hrange;
};
4186 
4187 
/**
 * Floating-point RGB/BGR -> HSV conversion functor.
 *
 * srccn   - channels per source pixel (stride of the input; three are read).
 * blueIdx - index of blue in a source pixel; red is taken from blueIdx^2,
 *           green from index 1.
 * hrange  - the hue, computed in degrees [0, 360), is multiplied by hrange/360.
 */
struct RGB2HSV_f
{
    typedef float channel_type;

    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n pixels; dst receives H, S, V (three floats) per pixel.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);

        for( int px = 0; px < n; px++, src += scn, dst += 3 )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            // Value is the largest component, vmin the smallest.
            const float v = std::max(std::max(r, g), b);
            const float vmin = std::min(std::min(r, g), b);
            const float range = v - vmin;

            // FLT_EPSILON keeps both divisions finite for black / gray input.
            const float s = range/(float)(fabs(v) + FLT_EPSILON);
            const float step = (float)(60./(range + FLT_EPSILON));

            // The hue sector is chosen by which component is the maximum.
            float h;
            if( v == r )
                h = (g - b)*step;
            else if( v == g )
                h = (b - r)*step + 120.f;
            else
                h = (r - g)*step + 240.f;

            if( h < 0 )
                h += 360.f;

            dst[0] = h*hscale;
            dst[1] = s;
            dst[2] = v;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
4235 
4236 
4237 struct HSV2RGB_f
4238 {
4239     typedef float channel_type;
4240 
HSV2RGB_fcv::HSV2RGB_f4241     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
4242     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
4243 
operator ()cv::HSV2RGB_f4244     void operator()(const float* src, float* dst, int n) const
4245     {
4246         int i, bidx = blueIdx, dcn = dstcn;
4247         float _hscale = hscale;
4248         float alpha = ColorChannel<float>::max();
4249         n *= 3;
4250 
4251         for( i = 0; i < n; i += 3, dst += dcn )
4252         {
4253             float h = src[i], s = src[i+1], v = src[i+2];
4254             float b, g, r;
4255 
4256             if( s == 0 )
4257                 b = g = r = v;
4258             else
4259             {
4260                 static const int sector_data[][3]=
4261                     {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
4262                 float tab[4];
4263                 int sector;
4264                 h *= _hscale;
4265                 if( h < 0 )
4266                     do h += 6; while( h < 0 );
4267                 else if( h >= 6 )
4268                     do h -= 6; while( h >= 6 );
4269                 sector = cvFloor(h);
4270                 h -= sector;
4271                 if( (unsigned)sector >= 6u )
4272                 {
4273                     sector = 0;
4274                     h = 0.f;
4275                 }
4276 
4277                 tab[0] = v;
4278                 tab[1] = v*(1.f - s);
4279                 tab[2] = v*(1.f - s*h);
4280                 tab[3] = v*(1.f - s*(1.f - h));
4281 
4282                 b = tab[sector_data[sector][0]];
4283                 g = tab[sector_data[sector][1]];
4284                 r = tab[sector_data[sector][2]];
4285             }
4286 
4287             dst[bidx] = b;
4288             dst[1] = g;
4289             dst[bidx^2] = r;
4290             if( dcn == 4 )
4291                 dst[3] = alpha;
4292         }
4293     }
4294 
4295     int dstcn, blueIdx;
4296     float hscale;
4297 };
4298 
4299 
/**
 * 8-bit HSV -> RGB/BGR conversion functor.
 *
 * Works in chunks of at most BLOCK_SIZE pixels: the 8-bit input is expanded
 * to floats (H kept as is, S and V multiplied by 1/255), converted in place
 * by the float functor HSV2RGB_f, and the result is multiplied by 255 and
 * saturated back to uchar. When dstcn == 4 an alpha channel equal to the
 * maximum uchar channel value is appended to every output pixel.
 */
struct HSV2RGB_b
{
    typedef uchar channel_type;

    // The inner float converter is always created with three output
    // channels; the optional alpha channel is added by this wrapper.
    HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
    : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
    {
        #if CV_NEON
        v_scale_inv = vdupq_n_f32(1.f/255.f);
        v_scale = vdupq_n_f32(255.f);
        v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
        #elif CV_SSE2
        v_scale_inv = _mm_set1_ps(1.f/255.f);
        v_scale = _mm_set1_ps(255.0f);
        v_zero = _mm_setzero_si128();
        haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
        #endif
    }

    #if CV_SSE2
    // 16s x 8
    // Expands eight pixels, given as three registers of 16-bit values, to
    // floats and stores them interleaved (24 floats) at buf. The first
    // channel is left unscaled, the other two are multiplied by 1/255.
    // buf is written with aligned stores, so it must be 16-byte aligned.
    void process(__m128i v_r, __m128i v_g, __m128i v_b,
                 float * buf) const
    {
        // Zero-extend 16 -> 32 bit and convert to float, low then high half.
        __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
        __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
        __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));

        __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
        __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
        __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));

        v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
        v_b0 = _mm_mul_ps(v_b0, v_scale_inv);

        v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
        v_b1 = _mm_mul_ps(v_b1, v_scale_inv);

        _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

        _mm_store_ps(buf, v_r0);
        _mm_store_ps(buf + 4, v_r1);
        _mm_store_ps(buf + 8, v_g0);
        _mm_store_ps(buf + 12, v_g1);
        _mm_store_ps(buf + 16, v_b0);
        _mm_store_ps(buf + 20, v_b1);
    }
    #endif

    // Converts n pixels from src (3 channels) to dst (dstcn channels).
    void operator()(const uchar* src, uchar* dst, int n) const
    {
        int i, j, dcn = dstcn;
        uchar alpha = ColorChannel<uchar>::max();
        // Scratch buffer for one chunk; aligned for the SSE aligned loads/stores.
        float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];

        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
        {
            int dn = std::min(n - i, (int)BLOCK_SIZE);   // pixels in this chunk
            j = 0;

            // Stage 1: uchar -> float into buf (j counts elements, 3 per pixel).
            #if CV_NEON
            // Eight pixels per iteration.
            for ( ; j <= (dn - 8) * 3; j += 24)
            {
                uint8x8x3_t v_src = vld3_u8(src + j);
                uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
                           v_t1 = vmovl_u8(v_src.val[1]),
                           v_t2 = vmovl_u8(v_src.val[2]);

                float32x4x3_t v_dst;
                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j, v_dst);

                v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
                v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
                v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
                vst3q_f32(buf + j + 12, v_dst);
            }
            #elif CV_SSE2
            if (haveSIMD)
            {
                // 32 pixels (96 bytes) per iteration, expanded in four groups
                // of eight pixels (24 floats each).
                for ( ; j <= (dn - 32) * 3; j += 96)
                {
                    __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
                    __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
                    __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
                    __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
                    __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
                    __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));

                    _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);

                    process(_mm_unpacklo_epi8(v_r0, v_zero),
                            _mm_unpacklo_epi8(v_g0, v_zero),
                            _mm_unpacklo_epi8(v_b0, v_zero),
                            buf + j);

                    process(_mm_unpackhi_epi8(v_r0, v_zero),
                            _mm_unpackhi_epi8(v_g0, v_zero),
                            _mm_unpackhi_epi8(v_b0, v_zero),
                            buf + j + 24);

                    process(_mm_unpacklo_epi8(v_r1, v_zero),
                            _mm_unpacklo_epi8(v_g1, v_zero),
                            _mm_unpacklo_epi8(v_b1, v_zero),
                            buf + j + 48);

                    process(_mm_unpackhi_epi8(v_r1, v_zero),
                            _mm_unpackhi_epi8(v_g1, v_zero),
                            _mm_unpackhi_epi8(v_b1, v_zero),
                            buf + j + 72);
                }
            }
            #endif

            // Scalar expansion of whatever the vector code left over.
            for( ; j < dn*3; j += 3 )
            {
                buf[j] = src[j];
                buf[j+1] = src[j+1]*(1.f/255.f);
                buf[j+2] = src[j+2]*(1.f/255.f);
            }
            // Stage 2: float HSV -> float colour, in place.
            cvt(buf, buf, dn);

            // Stage 3: float -> uchar with scaling by 255 and saturation.
            j = 0;
            #if CV_NEON
            for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
            {
                float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
                uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
                uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
                uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
                                                           vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));

                if (dcn == 4)
                {
                    uint8x8x4_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    v_dst.val[3] = v_alpha;
                    vst4_u8(dst, v_dst);
                }
                else
                {
                    uint8x8x3_t v_dst;
                    v_dst.val[0] = v_dst0;
                    v_dst.val[1] = v_dst1;
                    v_dst.val[2] = v_dst2;
                    vst3_u8(dst, v_dst);
                }
            }
            #elif CV_SSE2
            // Only the 3-channel layout is vectorised: buf and dst then have
            // the same element order, so 16 floats map to 16 output bytes.
            if (dcn == 3 && haveSIMD)
            {
                for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
                {
                    __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
                    __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
                    __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
                    __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);

                    __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
                                                     _mm_cvtps_epi32(v_src1));
                    __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
                                                     _mm_cvtps_epi32(v_src3));

                    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
                }

                // The loop advances 16 elements at a time and may stop in the
                // middle of a pixel; step back to the pixel boundary so the
                // scalar loop below rewrites that pixel completely.
                int jr = j % 3;
                if (jr)
                    dst -= jr, j -= jr;
            }
            #endif

            // Scalar tail (and the whole chunk when no vector path applied).
            for( ; j < dn*3; j += 3, dst += dcn )
            {
                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
                dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
                dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
                if( dcn == 4 )
                    dst[3] = alpha;
            }
        }
    }

    int dstcn;
    HSV2RGB_f cvt;   // float converter doing the actual colour math
    #if CV_NEON
    float32x4_t v_scale, v_scale_inv;
    uint8x8_t v_alpha;
    #elif CV_SSE2
    __m128 v_scale_inv, v_scale;
    __m128i v_zero;
    bool haveSIMD;
    #endif
};
4500 
4501 
4502 ///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
4503 
/**
 * Floating-point RGB/BGR -> HLS conversion functor.
 *
 * srccn   - channels per source pixel (stride of the input; three are read).
 * blueIdx - index of blue in a source pixel; red is taken from blueIdx^2,
 *           green from index 1.
 * hrange  - the hue, computed in degrees [0, 360), is multiplied by hrange/360.
 */
struct RGB2HLS_f
{
    typedef float channel_type;

    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}

    // Converts n pixels; dst receives H, L, S (three floats) per pixel.
    void operator()(const float* src, float* dst, int n) const
    {
        const int bidx = blueIdx, scn = srccn;
        const float hscale = hrange*(1.f/360.f);

        for( int px = 0; px < n; px++, src += scn, dst += 3 )
        {
            const float b = src[bidx], g = src[1], r = src[bidx^2];

            const float vmax = std::max(std::max(r, g), b);
            const float vmin = std::min(std::min(r, g), b);
            const float range = vmax - vmin;
            const float l = (vmax + vmin)*0.5f;

            // Hue and saturation stay zero for (near-)gray pixels.
            float h = 0.f, s = 0.f;

            if( range > FLT_EPSILON )
            {
                if( l < 0.5f )
                    s = range/(vmax + vmin);
                else
                    s = range/(2 - vmax - vmin);

                const float step = 60.f/range;

                // The hue sector is chosen by which component is the maximum.
                if( vmax == r )
                    h = (g - b)*step;
                else if( vmax == g )
                    h = (b - r)*step + 120.f;
                else
                    h = (r - g)*step + 240.f;

                if( h < 0.f )
                    h += 360.f;
            }

            dst[0] = h*hscale;
            dst[1] = l;
            dst[2] = s;
        }
    }

    int srccn, blueIdx;
    float hrange;
};
4556 
4557 
4558 struct RGB2HLS_b
4559 {
4560     typedef uchar channel_type;
4561 
RGB2HLS_bcv::RGB2HLS_b4562     RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
4563     : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
4564     {
4565         #if CV_NEON
4566         v_scale_inv = vdupq_n_f32(1.f/255.f);
4567         v_scale = vdupq_n_f32(255.f);
4568         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4569         #elif CV_SSE2
4570         v_scale_inv = _mm_set1_ps(1.f/255.f);
4571         v_scale = _mm_set1_ps(255.f);
4572         v_zero = _mm_setzero_si128();
4573         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
4574         #endif
4575     }
4576 
4577     #if CV_SSE2
processcv::RGB2HLS_b4578     void process(const float * buf,
4579                  __m128i & v_h, __m128i & v_l, __m128i & v_s) const
4580     {
4581         __m128 v_h0f = _mm_load_ps(buf);
4582         __m128 v_h1f = _mm_load_ps(buf + 4);
4583         __m128 v_l0f = _mm_load_ps(buf + 8);
4584         __m128 v_l1f = _mm_load_ps(buf + 12);
4585         __m128 v_s0f = _mm_load_ps(buf + 16);
4586         __m128 v_s1f = _mm_load_ps(buf + 20);
4587 
4588         _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f);
4589 
4590         v_l0f = _mm_mul_ps(v_l0f, v_scale);
4591         v_l1f = _mm_mul_ps(v_l1f, v_scale);
4592         v_s0f = _mm_mul_ps(v_s0f, v_scale);
4593         v_s1f = _mm_mul_ps(v_s1f, v_scale);
4594 
4595         v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f));
4596         v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
4597         v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f));
4598     }
4599     #endif
4600 
operator ()cv::RGB2HLS_b4601     void operator()(const uchar* src, uchar* dst, int n) const
4602     {
4603         int i, j, scn = srccn;
4604         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
4605 
4606         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
4607         {
4608             int dn = std::min(n - i, (int)BLOCK_SIZE);
4609             j = 0;
4610 
4611             #if CV_NEON
4612             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
4613             {
4614                 uint16x8_t v_t0, v_t1, v_t2;
4615 
4616                 if (scn == 3)
4617                 {
4618                     uint8x8x3_t v_src = vld3_u8(src);
4619                     v_t0 = vmovl_u8(v_src.val[0]);
4620                     v_t1 = vmovl_u8(v_src.val[1]);
4621                     v_t2 = vmovl_u8(v_src.val[2]);
4622                 }
4623                 else
4624                 {
4625                     uint8x8x4_t v_src = vld4_u8(src);
4626                     v_t0 = vmovl_u8(v_src.val[0]);
4627                     v_t1 = vmovl_u8(v_src.val[1]);
4628                     v_t2 = vmovl_u8(v_src.val[2]);
4629                 }
4630 
4631                 float32x4x3_t v_dst;
4632                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
4633                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4634                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4635                 vst3q_f32(buf + j, v_dst);
4636 
4637                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
4638                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4639                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4640                 vst3q_f32(buf + j + 12, v_dst);
4641             }
4642             #elif CV_SSE2
4643             if (scn == 3 && haveSIMD)
4644             {
4645                 for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
4646                 {
4647                     __m128i v_src = _mm_loadu_si128((__m128i const *)src);
4648 
4649                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
4650                     _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
4651                     _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
4652 
4653                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
4654                     _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
4655                     _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
4656                 }
4657 
4658                 int jr = j % 3;
4659                 if (jr)
4660                     src -= jr, j -= jr;
4661             }
4662             #endif
4663             for( ; j < dn*3; j += 3, src += scn )
4664             {
4665                 buf[j] = src[0]*(1.f/255.f);
4666                 buf[j+1] = src[1]*(1.f/255.f);
4667                 buf[j+2] = src[2]*(1.f/255.f);
4668             }
4669             cvt(buf, buf, dn);
4670 
4671             j = 0;
4672             #if CV_NEON
4673             for ( ; j <= (dn - 8) * 3; j += 24)
4674             {
4675                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4676 
4677                 uint8x8x3_t v_dst;
4678                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
4679                                                        vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
4680                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4681                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4682                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4683                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4684                 vst3_u8(dst + j, v_dst);
4685             }
4686             #elif CV_SSE2
4687             if (haveSIMD)
4688             {
4689                 for ( ; j <= (dn - 32) * 3; j += 96)
4690                 {
4691                     __m128i v_h_0, v_l_0, v_s_0;
4692                     process(buf + j,
4693                             v_h_0, v_l_0, v_s_0);
4694 
4695                     __m128i v_h_1, v_l_1, v_s_1;
4696                     process(buf + j + 24,
4697                             v_h_1, v_l_1, v_s_1);
4698 
4699                     __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1);
4700                     __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
4701                     __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1);
4702 
4703                     process(buf + j + 48,
4704                             v_h_0, v_l_0, v_s_0);
4705 
4706                     process(buf + j + 72,
4707                             v_h_1, v_l_1, v_s_1);
4708 
4709                     __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1);
4710                     __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
4711                     __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);
4712 
4713                     _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
4714 
4715                     _mm_storeu_si128((__m128i *)(dst + j), v_h0);
4716                     _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
4717                     _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0);
4718                     _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1);
4719                     _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0);
4720                     _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1);
4721                 }
4722             }
4723             #endif
4724             for( ; j < dn*3; j += 3 )
4725             {
4726                 dst[j] = saturate_cast<uchar>(buf[j]);
4727                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
4728                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
4729             }
4730         }
4731     }
4732 
4733     int srccn;
4734     RGB2HLS_f cvt;
4735     #if CV_NEON
4736     float32x4_t v_scale, v_scale_inv;
4737     uint8x8_t v_alpha;
4738     #elif CV_SSE2
4739     __m128 v_scale, v_scale_inv;
4740     __m128i v_zero;
4741     bool haveSIMD;
4742     #endif
4743 };
4744 
4745 
4746 struct HLS2RGB_f
4747 {
4748     typedef float channel_type;
4749 
HLS2RGB_fcv::HLS2RGB_f4750     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
4751     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
4752 
operator ()cv::HLS2RGB_f4753     void operator()(const float* src, float* dst, int n) const
4754     {
4755         int i, bidx = blueIdx, dcn = dstcn;
4756         float _hscale = hscale;
4757         float alpha = ColorChannel<float>::max();
4758         n *= 3;
4759 
4760         for( i = 0; i < n; i += 3, dst += dcn )
4761         {
4762             float h = src[i], l = src[i+1], s = src[i+2];
4763             float b, g, r;
4764 
4765             if( s == 0 )
4766                 b = g = r = l;
4767             else
4768             {
4769                 static const int sector_data[][3]=
4770                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
4771                 float tab[4];
4772                 int sector;
4773 
4774                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
4775                 float p1 = 2*l - p2;
4776 
4777                 h *= _hscale;
4778                 if( h < 0 )
4779                     do h += 6; while( h < 0 );
4780                 else if( h >= 6 )
4781                     do h -= 6; while( h >= 6 );
4782 
4783                 assert( 0 <= h && h < 6 );
4784                 sector = cvFloor(h);
4785                 h -= sector;
4786 
4787                 tab[0] = p2;
4788                 tab[1] = p1;
4789                 tab[2] = p1 + (p2 - p1)*(1-h);
4790                 tab[3] = p1 + (p2 - p1)*h;
4791 
4792                 b = tab[sector_data[sector][0]];
4793                 g = tab[sector_data[sector][1]];
4794                 r = tab[sector_data[sector][2]];
4795             }
4796 
4797             dst[bidx] = b;
4798             dst[1] = g;
4799             dst[bidx^2] = r;
4800             if( dcn == 4 )
4801                 dst[3] = alpha;
4802         }
4803     }
4804 
4805     int dstcn, blueIdx;
4806     float hscale;
4807 };
4808 
4809 
4810 struct HLS2RGB_b
4811 {
4812     typedef uchar channel_type;
4813 
HLS2RGB_bcv::HLS2RGB_b4814     HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
4815     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
4816     {
4817         #if CV_NEON
4818         v_scale_inv = vdupq_n_f32(1.f/255.f);
4819         v_scale = vdupq_n_f32(255.f);
4820         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
4821         #elif CV_SSE2
4822         v_scale_inv = _mm_set1_ps(1.f/255.f);
4823         v_scale = _mm_set1_ps(255.f);
4824         v_zero = _mm_setzero_si128();
4825         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
4826         #endif
4827     }
4828 
4829     #if CV_SSE2
4830     // 16s x 8
processcv::HLS2RGB_b4831     void process(__m128i v_r, __m128i v_g, __m128i v_b,
4832                  float * buf) const
4833     {
4834         __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
4835         __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
4836         __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
4837 
4838         __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
4839         __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
4840         __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
4841 
4842         v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
4843         v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
4844 
4845         v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
4846         v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
4847 
4848         _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
4849 
4850         _mm_store_ps(buf, v_r0);
4851         _mm_store_ps(buf + 4, v_r1);
4852         _mm_store_ps(buf + 8, v_g0);
4853         _mm_store_ps(buf + 12, v_g1);
4854         _mm_store_ps(buf + 16, v_b0);
4855         _mm_store_ps(buf + 20, v_b1);
4856     }
4857     #endif
4858 
operator ()cv::HLS2RGB_b4859     void operator()(const uchar* src, uchar* dst, int n) const
4860     {
4861         int i, j, dcn = dstcn;
4862         uchar alpha = ColorChannel<uchar>::max();
4863         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
4864 
4865         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
4866         {
4867             int dn = std::min(n - i, (int)BLOCK_SIZE);
4868             j = 0;
4869 
4870             #if CV_NEON
4871             for ( ; j <= (dn - 8) * 3; j += 24)
4872             {
4873                 uint8x8x3_t v_src = vld3_u8(src + j);
4874                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
4875                            v_t1 = vmovl_u8(v_src.val[1]),
4876                            v_t2 = vmovl_u8(v_src.val[2]);
4877 
4878                 float32x4x3_t v_dst;
4879                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
4880                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
4881                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
4882                 vst3q_f32(buf + j, v_dst);
4883 
4884                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
4885                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
4886                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
4887                 vst3q_f32(buf + j + 12, v_dst);
4888             }
4889             #elif CV_SSE2
4890             if (haveSIMD)
4891             {
4892                 for ( ; j <= (dn - 32) * 3; j += 96)
4893                 {
4894                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
4895                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
4896                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
4897                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
4898                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
4899                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
4900 
4901                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
4902 
4903                     process(_mm_unpacklo_epi8(v_r0, v_zero),
4904                             _mm_unpacklo_epi8(v_g0, v_zero),
4905                             _mm_unpacklo_epi8(v_b0, v_zero),
4906                             buf + j);
4907 
4908                     process(_mm_unpackhi_epi8(v_r0, v_zero),
4909                             _mm_unpackhi_epi8(v_g0, v_zero),
4910                             _mm_unpackhi_epi8(v_b0, v_zero),
4911                             buf + j + 24);
4912 
4913                     process(_mm_unpacklo_epi8(v_r1, v_zero),
4914                             _mm_unpacklo_epi8(v_g1, v_zero),
4915                             _mm_unpacklo_epi8(v_b1, v_zero),
4916                             buf + j + 48);
4917 
4918                     process(_mm_unpackhi_epi8(v_r1, v_zero),
4919                             _mm_unpackhi_epi8(v_g1, v_zero),
4920                             _mm_unpackhi_epi8(v_b1, v_zero),
4921                             buf + j + 72);
4922                 }
4923             }
4924             #endif
4925             for( ; j < dn*3; j += 3 )
4926             {
4927                 buf[j] = src[j];
4928                 buf[j+1] = src[j+1]*(1.f/255.f);
4929                 buf[j+2] = src[j+2]*(1.f/255.f);
4930             }
4931             cvt(buf, buf, dn);
4932 
4933             j = 0;
4934             #if CV_NEON
4935             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
4936             {
4937                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
4938                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
4939                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
4940                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
4941                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
4942                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
4943                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
4944 
4945                 if (dcn == 4)
4946                 {
4947                     uint8x8x4_t v_dst;
4948                     v_dst.val[0] = v_dst0;
4949                     v_dst.val[1] = v_dst1;
4950                     v_dst.val[2] = v_dst2;
4951                     v_dst.val[3] = v_alpha;
4952                     vst4_u8(dst, v_dst);
4953                 }
4954                 else
4955                 {
4956                     uint8x8x3_t v_dst;
4957                     v_dst.val[0] = v_dst0;
4958                     v_dst.val[1] = v_dst1;
4959                     v_dst.val[2] = v_dst2;
4960                     vst3_u8(dst, v_dst);
4961                 }
4962             }
4963             #elif CV_SSE2
4964             if (dcn == 3 && haveSIMD)
4965             {
4966                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
4967                 {
4968                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
4969                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
4970                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
4971                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
4972 
4973                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
4974                                                      _mm_cvtps_epi32(v_src1));
4975                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
4976                                                      _mm_cvtps_epi32(v_src3));
4977 
4978                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
4979                 }
4980 
4981                 int jr = j % 3;
4982                 if (jr)
4983                     dst -= jr, j -= jr;
4984             }
4985             #endif
4986 
4987             for( ; j < dn*3; j += 3, dst += dcn )
4988             {
4989                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
4990                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
4991                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
4992                 if( dcn == 4 )
4993                     dst[3] = alpha;
4994             }
4995         }
4996     }
4997 
4998     int dstcn;
4999     HLS2RGB_f cvt;
5000     #if CV_NEON
5001     float32x4_t v_scale, v_scale_inv;
5002     uint8x8_t v_alpha;
5003     #elif CV_SSE2
5004     __m128 v_scale, v_scale_inv;
5005     __m128i v_zero;
5006     bool haveSIMD;
5007     #endif
5008 };
5009 
5010 
5011 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
5012 
5013 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
5014 
5015 enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
5016 static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
5017 static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
5018 
5019 static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
5020 static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
5021 
5022 static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
5023 #undef lab_shift
5024 #define lab_shift xyz_shift
5025 #define gamma_shift 3
5026 #define lab_shift2 (lab_shift + gamma_shift)
5027 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
5028 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
5029 
initLabTabs()5030 static void initLabTabs()
5031 {
5032     static bool initialized = false;
5033     if(!initialized)
5034     {
5035         float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
5036         int i;
5037         for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
5038         {
5039             float x = i*scale;
5040             f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
5041         }
5042         splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
5043 
5044         scale = 1.f/GammaTabScale;
5045         for(i = 0; i <= GAMMA_TAB_SIZE; i++)
5046         {
5047             float x = i*scale;
5048             g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4);
5049             ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055);
5050         }
5051         splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
5052         splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
5053 
5054         for(i = 0; i < 256; i++)
5055         {
5056             float x = i*(1.f/255.f);
5057             sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4)));
5058             linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
5059         }
5060 
5061         for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
5062         {
5063             float x = i*(1.f/(255.f*(1 << gamma_shift)));
5064             LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
5065         }
5066         initialized = true;
5067     }
5068 }
5069 
5070 struct RGB2Lab_b
5071 {
5072     typedef uchar channel_type;
5073 
RGB2Lab_bcv::RGB2Lab_b5074     RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
5075               const float* _whitept, bool _srgb)
5076     : srccn(_srccn), srgb(_srgb)
5077     {
5078         static volatile int _3 = 3;
5079         initLabTabs();
5080 
5081         if (!_coeffs)
5082             _coeffs = sRGB2XYZ_D65;
5083         if (!_whitept)
5084             _whitept = D65;
5085 
5086         float scale[] =
5087         {
5088             (1 << lab_shift)/_whitept[0],
5089             (float)(1 << lab_shift),
5090             (1 << lab_shift)/_whitept[2]
5091         };
5092 
5093         for( int i = 0; i < _3; i++ )
5094         {
5095             coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
5096             coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
5097             coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
5098 
5099             CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5100                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
5101         }
5102     }
5103 
operator ()cv::RGB2Lab_b5104     void operator()(const uchar* src, uchar* dst, int n) const
5105     {
5106         const int Lscale = (116*255+50)/100;
5107         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
5108         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
5109         int i, scn = srccn;
5110         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5111             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5112             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5113         n *= 3;
5114 
5115         for( i = 0; i < n; i += 3, src += scn )
5116         {
5117             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
5118             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
5119             int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
5120             int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
5121 
5122             int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
5123             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
5124             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
5125 
5126             dst[i] = saturate_cast<uchar>(L);
5127             dst[i+1] = saturate_cast<uchar>(a);
5128             dst[i+2] = saturate_cast<uchar>(b);
5129         }
5130     }
5131 
5132     int srccn;
5133     int coeffs[9];
5134     bool srgb;
5135 };
5136 
5137 
// Clamps a floating-point expression to [0, 1].
// The argument and the whole expansion are parenthesised, and the expansion carries
// no trailing semicolon, so clip() can be used inside a larger expression.
// The argument may be evaluated more than once: do not pass expressions with side effects.
#define clip(value) \
    ((value) < 0.0f ? 0.0f : (value) > 1.0f ? 1.0f : (value))
5140 
5141 struct RGB2Lab_f
5142 {
5143     typedef float channel_type;
5144 
RGB2Lab_fcv::RGB2Lab_f5145     RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
5146               const float* _whitept, bool _srgb)
5147     : srccn(_srccn), srgb(_srgb)
5148     {
5149         volatile int _3 = 3;
5150         initLabTabs();
5151 
5152         if (!_coeffs)
5153             _coeffs = sRGB2XYZ_D65;
5154         if (!_whitept)
5155             _whitept = D65;
5156 
5157         float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
5158 
5159         for( int i = 0; i < _3; i++ )
5160         {
5161             int j = i * 3;
5162             coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
5163             coeffs[j + 1] = _coeffs[j + 1] * scale[i];
5164             coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
5165 
5166             CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
5167                        coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
5168         }
5169     }
5170 
operator ()cv::RGB2Lab_f5171     void operator()(const float* src, float* dst, int n) const
5172     {
5173         int i, scn = srccn;
5174         float gscale = GammaTabScale;
5175         const float* gammaTab = srgb ? sRGBGammaTab : 0;
5176         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5177               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5178               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5179         n *= 3;
5180 
5181         static const float _1_3 = 1.0f / 3.0f;
5182         static const float _a = 16.0f / 116.0f;
5183         for (i = 0; i < n; i += 3, src += scn )
5184         {
5185             float R = clip(src[0]);
5186             float G = clip(src[1]);
5187             float B = clip(src[2]);
5188 
5189             if (gammaTab)
5190             {
5191                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
5192                 G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
5193                 B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
5194             }
5195             float X = R*C0 + G*C1 + B*C2;
5196             float Y = R*C3 + G*C4 + B*C5;
5197             float Z = R*C6 + G*C7 + B*C8;
5198 
5199             float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a);
5200             float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a);
5201             float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a);
5202 
5203             float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
5204             float a = 500.f * (FX - FY);
5205             float b = 200.f * (FY - FZ);
5206 
5207             dst[i] = L;
5208             dst[i + 1] = a;
5209             dst[i + 2] = b;
5210         }
5211     }
5212 
5213     int srccn;
5214     float coeffs[9];
5215     bool srgb;
5216 };
5217 
5218 struct Lab2RGB_f
5219 {
5220     typedef float channel_type;
5221 
Lab2RGB_fcv::Lab2RGB_f5222     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
5223               const float* _whitept, bool _srgb )
5224     : dstcn(_dstcn), srgb(_srgb)
5225     {
5226         initLabTabs();
5227 
5228         if(!_coeffs)
5229             _coeffs = XYZ2sRGB_D65;
5230         if(!_whitept)
5231             _whitept = D65;
5232 
5233         for( int i = 0; i < 3; i++ )
5234         {
5235             coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
5236             coeffs[i+3] = _coeffs[i+3]*_whitept[i];
5237             coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
5238         }
5239     }
5240 
operator ()cv::Lab2RGB_f5241     void operator()(const float* src, float* dst, int n) const
5242     {
5243         int i, dcn = dstcn;
5244         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
5245         float gscale = GammaTabScale;
5246         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5247         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5248         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5249         float alpha = ColorChannel<float>::max();
5250         n *= 3;
5251 
5252         static const float lThresh = 0.008856f * 903.3f;
5253         static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
5254         for (i = 0; i < n; i += 3, dst += dcn)
5255         {
5256             float li = src[i];
5257             float ai = src[i + 1];
5258             float bi = src[i + 2];
5259 
5260             float y, fy;
5261             if (li <= lThresh)
5262             {
5263                 y = li / 903.3f;
5264                 fy = 7.787f * y + 16.0f / 116.0f;
5265             }
5266             else
5267             {
5268                 fy = (li + 16.0f) / 116.0f;
5269                 y = fy * fy * fy;
5270             }
5271 
5272             float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
5273 
5274             for (int j = 0; j < 2; j++)
5275                 if (fxz[j] <= fThresh)
5276                     fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
5277                 else
5278                     fxz[j] = fxz[j] * fxz[j] * fxz[j];
5279 
5280 
5281             float x = fxz[0], z = fxz[1];
5282             float ro = C0 * x + C1 * y + C2 * z;
5283             float go = C3 * x + C4 * y + C5 * z;
5284             float bo = C6 * x + C7 * y + C8 * z;
5285             ro = clip(ro);
5286             go = clip(go);
5287             bo = clip(bo);
5288 
5289             if (gammaTab)
5290             {
5291                 ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
5292                 go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
5293                 bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
5294             }
5295 
5296             dst[0] = ro, dst[1] = go, dst[2] = bo;
5297             if( dcn == 4 )
5298                 dst[3] = alpha;
5299         }
5300     }
5301 
5302     int dstcn;
5303     float coeffs[9];
5304     bool srgb;
5305 };
5306 
5307 #undef clip
5308 
5309 struct Lab2RGB_b
5310 {
5311     typedef uchar channel_type;
5312 
Lab2RGB_bcv::Lab2RGB_b5313     Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
5314                const float* _whitept, bool _srgb )
5315     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
5316     {
5317         #if CV_NEON
5318         v_scale_inv = vdupq_n_f32(100.f/255.f);
5319         v_scale = vdupq_n_f32(255.f);
5320         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
5321         v_128 = vdupq_n_f32(128.0f);
5322         #elif CV_SSE2
5323         v_scale_inv = _mm_set1_ps(100.f/255.f);
5324         v_scale = _mm_set1_ps(255.f);
5325         v_128 = _mm_set1_ps(128.0f);
5326         v_zero = _mm_setzero_si128();
5327         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
5328         #endif
5329     }
5330 
5331     #if CV_SSE2
5332     // 16s x 8
processcv::Lab2RGB_b5333     void process(__m128i v_r, __m128i v_g, __m128i v_b,
5334                  float * buf) const
5335     {
5336         __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
5337         __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
5338         __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
5339 
5340         __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
5341         __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
5342         __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
5343 
5344         v_r0 = _mm_mul_ps(v_r0, v_scale_inv);
5345         v_r1 = _mm_mul_ps(v_r1, v_scale_inv);
5346 
5347         v_g0 = _mm_sub_ps(v_g0, v_128);
5348         v_g1 = _mm_sub_ps(v_g1, v_128);
5349         v_b0 = _mm_sub_ps(v_b0, v_128);
5350         v_b1 = _mm_sub_ps(v_b1, v_128);
5351 
5352         _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
5353 
5354         _mm_store_ps(buf, v_r0);
5355         _mm_store_ps(buf + 4, v_r1);
5356         _mm_store_ps(buf + 8, v_g0);
5357         _mm_store_ps(buf + 12, v_g1);
5358         _mm_store_ps(buf + 16, v_b0);
5359         _mm_store_ps(buf + 20, v_b1);
5360     }
5361     #endif
5362 
operator ()cv::Lab2RGB_b5363     void operator()(const uchar* src, uchar* dst, int n) const
5364     {
5365         int i, j, dcn = dstcn;
5366         uchar alpha = ColorChannel<uchar>::max();
5367         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
5368 
5369         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
5370         {
5371             int dn = std::min(n - i, (int)BLOCK_SIZE);
5372             j = 0;
5373 
5374             #if CV_NEON
5375             for ( ; j <= (dn - 8) * 3; j += 24)
5376             {
5377                 uint8x8x3_t v_src = vld3_u8(src + j);
5378                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
5379                            v_t1 = vmovl_u8(v_src.val[1]),
5380                            v_t2 = vmovl_u8(v_src.val[2]);
5381 
5382                 float32x4x3_t v_dst;
5383                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
5384                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
5385                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
5386                 vst3q_f32(buf + j, v_dst);
5387 
5388                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
5389                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
5390                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
5391                 vst3q_f32(buf + j + 12, v_dst);
5392             }
5393             #elif CV_SSE2
5394             if (haveSIMD)
5395             {
5396                 for ( ; j <= (dn - 32) * 3; j += 96)
5397                 {
5398                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
5399                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
5400                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
5401                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
5402                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
5403                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
5404 
5405                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
5406 
5407                     process(_mm_unpacklo_epi8(v_r0, v_zero),
5408                             _mm_unpacklo_epi8(v_g0, v_zero),
5409                             _mm_unpacklo_epi8(v_b0, v_zero),
5410                             buf + j);
5411 
5412                     process(_mm_unpackhi_epi8(v_r0, v_zero),
5413                             _mm_unpackhi_epi8(v_g0, v_zero),
5414                             _mm_unpackhi_epi8(v_b0, v_zero),
5415                             buf + j + 24);
5416 
5417                     process(_mm_unpacklo_epi8(v_r1, v_zero),
5418                             _mm_unpacklo_epi8(v_g1, v_zero),
5419                             _mm_unpacklo_epi8(v_b1, v_zero),
5420                             buf + j + 48);
5421 
5422                     process(_mm_unpackhi_epi8(v_r1, v_zero),
5423                             _mm_unpackhi_epi8(v_g1, v_zero),
5424                             _mm_unpackhi_epi8(v_b1, v_zero),
5425                             buf + j + 72);
5426                 }
5427             }
5428             #endif
5429 
5430             for( ; j < dn*3; j += 3 )
5431             {
5432                 buf[j] = src[j]*(100.f/255.f);
5433                 buf[j+1] = (float)(src[j+1] - 128);
5434                 buf[j+2] = (float)(src[j+2] - 128);
5435             }
5436             cvt(buf, buf, dn);
5437             j = 0;
5438 
5439             #if CV_NEON
5440             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
5441             {
5442                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
5443                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
5444                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
5445                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
5446                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
5447                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
5448                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
5449 
5450                 if (dcn == 4)
5451                 {
5452                     uint8x8x4_t v_dst;
5453                     v_dst.val[0] = v_dst0;
5454                     v_dst.val[1] = v_dst1;
5455                     v_dst.val[2] = v_dst2;
5456                     v_dst.val[3] = v_alpha;
5457                     vst4_u8(dst, v_dst);
5458                 }
5459                 else
5460                 {
5461                     uint8x8x3_t v_dst;
5462                     v_dst.val[0] = v_dst0;
5463                     v_dst.val[1] = v_dst1;
5464                     v_dst.val[2] = v_dst2;
5465                     vst3_u8(dst, v_dst);
5466                 }
5467             }
5468             #elif CV_SSE2
5469             if (dcn == 3 && haveSIMD)
5470             {
5471                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
5472                 {
5473                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
5474                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
5475                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
5476                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
5477 
5478                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
5479                                                      _mm_cvtps_epi32(v_src1));
5480                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
5481                                                      _mm_cvtps_epi32(v_src3));
5482 
5483                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
5484                 }
5485 
5486                 int jr = j % 3;
5487                 if (jr)
5488                     dst -= jr, j -= jr;
5489             }
5490             #endif
5491 
5492             for( ; j < dn*3; j += 3, dst += dcn )
5493             {
5494                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
5495                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
5496                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
5497                 if( dcn == 4 )
5498                     dst[3] = alpha;
5499             }
5500         }
5501     }
5502 
5503     int dstcn;
5504     Lab2RGB_f cvt;
5505 
5506     #if CV_NEON
5507     float32x4_t v_scale, v_scale_inv, v_128;
5508     uint8x8_t v_alpha;
5509     #elif CV_SSE2
5510     __m128 v_scale, v_scale_inv, v_128;
5511     __m128i v_zero;
5512     bool haveSIMD;
5513     #endif
5514 };
5515 
5516 
5517 ///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
5518 
5519 struct RGB2Luv_f
5520 {
5521     typedef float channel_type;
5522 
RGB2Luv_fcv::RGB2Luv_f5523     RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
5524                const float* whitept, bool _srgb )
5525     : srccn(_srccn), srgb(_srgb)
5526     {
5527         volatile int i;
5528         initLabTabs();
5529 
5530         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
5531         if(!whitept) whitept = D65;
5532 
5533         for( i = 0; i < 3; i++ )
5534         {
5535             coeffs[i*3] = _coeffs[i*3];
5536             coeffs[i*3+1] = _coeffs[i*3+1];
5537             coeffs[i*3+2] = _coeffs[i*3+2];
5538             if( blueIdx == 0 )
5539                 std::swap(coeffs[i*3], coeffs[i*3+2]);
5540             CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
5541                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
5542         }
5543 
5544         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
5545         un = 4*whitept[0]*d;
5546         vn = 9*whitept[1]*d;
5547 
5548         CV_Assert(whitept[1] == 1.f);
5549     }
5550 
operator ()cv::RGB2Luv_f5551     void operator()(const float* src, float* dst, int n) const
5552     {
5553         int i, scn = srccn;
5554         float gscale = GammaTabScale;
5555         const float* gammaTab = srgb ? sRGBGammaTab : 0;
5556         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5557               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5558               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5559         float _un = 13*un, _vn = 13*vn;
5560         n *= 3;
5561 
5562         for( i = 0; i < n; i += 3, src += scn )
5563         {
5564             float R = src[0], G = src[1], B = src[2];
5565             if( gammaTab )
5566             {
5567                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
5568                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
5569                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
5570             }
5571 
5572             float X = R*C0 + G*C1 + B*C2;
5573             float Y = R*C3 + G*C4 + B*C5;
5574             float Z = R*C6 + G*C7 + B*C8;
5575 
5576             float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
5577             L = 116.f*L - 16.f;
5578 
5579             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
5580             float u = L*(X*d - _un);
5581             float v = L*((9*0.25f)*Y*d - _vn);
5582 
5583             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
5584         }
5585     }
5586 
5587     int srccn;
5588     float coeffs[9], un, vn;
5589     bool srgb;
5590 };
5591 
5592 
5593 struct Luv2RGB_f
5594 {
5595     typedef float channel_type;
5596 
Luv2RGB_fcv::Luv2RGB_f5597     Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
5598               const float* whitept, bool _srgb )
5599     : dstcn(_dstcn), srgb(_srgb)
5600     {
5601         initLabTabs();
5602 
5603         if(!_coeffs) _coeffs = XYZ2sRGB_D65;
5604         if(!whitept) whitept = D65;
5605 
5606         for( int i = 0; i < 3; i++ )
5607         {
5608             coeffs[i+(blueIdx^2)*3] = _coeffs[i];
5609             coeffs[i+3] = _coeffs[i+3];
5610             coeffs[i+blueIdx*3] = _coeffs[i+6];
5611         }
5612 
5613         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
5614         un = 4*whitept[0]*d;
5615         vn = 9*whitept[1]*d;
5616 
5617         CV_Assert(whitept[1] == 1.f);
5618     }
5619 
operator ()cv::Luv2RGB_f5620     void operator()(const float* src, float* dst, int n) const
5621     {
5622         int i, dcn = dstcn;
5623         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
5624         float gscale = GammaTabScale;
5625         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
5626               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
5627               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
5628         float alpha = ColorChannel<float>::max();
5629         float _un = un, _vn = vn;
5630         n *= 3;
5631 
5632         for( i = 0; i < n; i += 3, dst += dcn )
5633         {
5634             float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
5635             Y = (L + 16.f) * (1.f/116.f);
5636             Y = Y*Y*Y;
5637             d = (1.f/13.f)/L;
5638             u = u*d + _un;
5639             v = v*d + _vn;
5640             float iv = 1.f/v;
5641             X = 2.25f * u * Y * iv ;
5642             Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
5643 
5644             float R = X*C0 + Y*C1 + Z*C2;
5645             float G = X*C3 + Y*C4 + Z*C5;
5646             float B = X*C6 + Y*C7 + Z*C8;
5647 
5648             R = std::min(std::max(R, 0.f), 1.f);
5649             G = std::min(std::max(G, 0.f), 1.f);
5650             B = std::min(std::max(B, 0.f), 1.f);
5651 
5652             if( gammaTab )
5653             {
5654                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
5655                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
5656                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
5657             }
5658 
5659             dst[0] = R; dst[1] = G; dst[2] = B;
5660             if( dcn == 4 )
5661                 dst[3] = alpha;
5662         }
5663     }
5664 
5665     int dstcn;
5666     float coeffs[9], un, vn;
5667     bool srgb;
5668 };
5669 
5670 
5671 struct RGB2Luv_b
5672 {
5673     typedef uchar channel_type;
5674 
RGB2Luv_bcv::RGB2Luv_b5675     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
5676                const float* _whitept, bool _srgb )
5677     : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb)
5678     {
5679         #if CV_NEON
5680         v_scale_inv = vdupq_n_f32(1.f/255.f);
5681         v_scale = vdupq_n_f32(2.55f);
5682         v_coeff1 = vdupq_n_f32(0.72033898305084743f);
5683         v_coeff2 = vdupq_n_f32(96.525423728813564f);
5684         v_coeff3 = vdupq_n_f32(0.9732824427480916f);
5685         v_coeff4 = vdupq_n_f32(136.259541984732824f);
5686         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
5687         #elif CV_SSE2
5688         v_zero = _mm_setzero_si128();
5689         v_scale_inv = _mm_set1_ps(1.f/255.f);
5690         v_scale = _mm_set1_ps(2.55f);
5691         v_coeff1 = _mm_set1_ps(0.72033898305084743f);
5692         v_coeff2 = _mm_set1_ps(96.525423728813564f);
5693         v_coeff3 = _mm_set1_ps(0.9732824427480916f);
5694         v_coeff4 = _mm_set1_ps(136.259541984732824f);
5695         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
5696         #endif
5697     }
5698 
5699     #if CV_SSE2
processcv::RGB2Luv_b5700     void process(const float * buf,
5701                  __m128i & v_l, __m128i & v_u, __m128i & v_v) const
5702     {
5703         __m128 v_l0f = _mm_load_ps(buf);
5704         __m128 v_l1f = _mm_load_ps(buf + 4);
5705         __m128 v_u0f = _mm_load_ps(buf + 8);
5706         __m128 v_u1f = _mm_load_ps(buf + 12);
5707         __m128 v_v0f = _mm_load_ps(buf + 16);
5708         __m128 v_v1f = _mm_load_ps(buf + 20);
5709 
5710         _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f);
5711 
5712         v_l0f = _mm_mul_ps(v_l0f, v_scale);
5713         v_l1f = _mm_mul_ps(v_l1f, v_scale);
5714         v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2);
5715         v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2);
5716         v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4);
5717         v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4);
5718 
5719         v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
5720         v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
5721         v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f));
5722     }
5723     #endif
5724 
operator ()cv::RGB2Luv_b5725     void operator()(const uchar* src, uchar* dst, int n) const
5726     {
5727         int i, j, scn = srccn;
5728         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
5729 
5730         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
5731         {
5732             int dn = std::min(n - i, (int)BLOCK_SIZE);
5733             j = 0;
5734 
5735             #if CV_NEON
5736             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
5737             {
5738                 uint16x8_t v_t0, v_t1, v_t2;
5739 
5740                 if (scn == 3)
5741                 {
5742                     uint8x8x3_t v_src = vld3_u8(src);
5743                     v_t0 = vmovl_u8(v_src.val[0]);
5744                     v_t1 = vmovl_u8(v_src.val[1]);
5745                     v_t2 = vmovl_u8(v_src.val[2]);
5746                 }
5747                 else
5748                 {
5749                     uint8x8x4_t v_src = vld4_u8(src);
5750                     v_t0 = vmovl_u8(v_src.val[0]);
5751                     v_t1 = vmovl_u8(v_src.val[1]);
5752                     v_t2 = vmovl_u8(v_src.val[2]);
5753                 }
5754 
5755                 float32x4x3_t v_dst;
5756                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
5757                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
5758                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
5759                 vst3q_f32(buf + j, v_dst);
5760 
5761                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
5762                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
5763                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
5764                 vst3q_f32(buf + j + 12, v_dst);
5765             }
5766             #elif CV_SSE2
5767             if (scn == 3 && haveSIMD)
5768             {
5769                 for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
5770                 {
5771                     __m128i v_src = _mm_loadu_si128((__m128i const *)src);
5772 
5773                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
5774                     _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
5775                     _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
5776 
5777                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
5778                     _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
5779                     _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
5780                 }
5781 
5782                 int jr = j % 3;
5783                 if (jr)
5784                     src -= jr, j -= jr;
5785             }
5786             #endif
5787             for( ; j < dn*3; j += 3, src += scn )
5788             {
5789                 buf[j] = src[0]*(1.f/255.f);
5790                 buf[j+1] = (float)(src[1]*(1.f/255.f));
5791                 buf[j+2] = (float)(src[2]*(1.f/255.f));
5792             }
5793             cvt(buf, buf, dn);
5794 
5795             j = 0;
5796             #if CV_NEON
5797             for ( ; j <= (dn - 8) * 3; j += 24)
5798             {
5799                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
5800 
5801                 uint8x8x3_t v_dst;
5802                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
5803                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
5804                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
5805                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
5806                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
5807                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));
5808 
5809                 vst3_u8(dst + j, v_dst);
5810             }
5811             #elif CV_SSE2
5812             if (haveSIMD)
5813             {
5814                 for ( ; j <= (dn - 32) * 3; j += 96)
5815                 {
5816                     __m128i v_l_0, v_u_0, v_v_0;
5817                     process(buf + j,
5818                             v_l_0, v_u_0, v_v_0);
5819 
5820                     __m128i v_l_1, v_u_1, v_v_1;
5821                     process(buf + j + 24,
5822                             v_l_1, v_u_1, v_v_1);
5823 
5824                     __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
5825                     __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1);
5826                     __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1);
5827 
5828                     process(buf + j + 48,
5829                             v_l_0, v_u_0, v_v_0);
5830 
5831                     process(buf + j + 72,
5832                             v_l_1, v_u_1, v_v_1);
5833 
5834                     __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
5835                     __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
5836                     __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);
5837 
5838                     _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
5839 
5840                     _mm_storeu_si128((__m128i *)(dst + j), v_l0);
5841                     _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
5842                     _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0);
5843                     _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1);
5844                     _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0);
5845                     _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1);
5846                 }
5847             }
5848             #endif
5849 
5850             for( ; j < dn*3; j += 3 )
5851             {
5852                 dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
5853                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
5854                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.9732824427480916f + 136.259541984732824f);
5855             }
5856         }
5857     }
5858 
5859     int srccn;
5860     RGB2Luv_f cvt;
5861 
5862     #if CV_NEON
5863     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
5864     uint8x8_t v_alpha;
5865     #elif CV_SSE2
5866     __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
5867     __m128i v_zero;
5868     bool haveSIMD;
5869     #endif
5870 };
5871 
5872 
5873 struct Luv2RGB_b
5874 {
5875     typedef uchar channel_type;
5876 
Luv2RGB_bcv::Luv2RGB_b5877     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
5878                const float* _whitept, bool _srgb )
5879     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
5880     {
5881         #if CV_NEON
5882         v_scale_inv = vdupq_n_f32(100.f/255.f);
5883         v_coeff1 = vdupq_n_f32(1.388235294117647f);
5884         v_coeff2 = vdupq_n_f32(1.027450980392157f);
5885         v_134 = vdupq_n_f32(134.f);
5886         v_140 = vdupq_n_f32(140.f);
5887         v_scale = vdupq_n_f32(255.f);
5888         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
5889         #elif CV_SSE2
5890         v_scale_inv = _mm_set1_ps(100.f/255.f);
5891         v_coeff1 = _mm_set1_ps(1.388235294117647f);
5892         v_coeff2 = _mm_set1_ps(1.027450980392157f);
5893         v_134 = _mm_set1_ps(134.f);
5894         v_140 = _mm_set1_ps(140.f);
5895         v_scale = _mm_set1_ps(255.f);
5896         v_zero = _mm_setzero_si128();
5897         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
5898         #endif
5899     }
5900 
5901     #if CV_SSE2
5902     // 16s x 8
processcv::Luv2RGB_b5903     void process(__m128i v_l, __m128i v_u, __m128i v_v,
5904                  float * buf) const
5905     {
5906         __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
5907         __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero));
5908         __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero));
5909 
5910         __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero));
5911         __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
5912         __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));
5913 
5914         v_l0 = _mm_mul_ps(v_l0, v_scale_inv);
5915         v_l1 = _mm_mul_ps(v_l1, v_scale_inv);
5916 
5917         v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134);
5918         v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134);
5919         v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
5920         v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);
5921 
5922         _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
5923 
5924         _mm_store_ps(buf, v_l0);
5925         _mm_store_ps(buf + 4, v_l1);
5926         _mm_store_ps(buf + 8, v_u0);
5927         _mm_store_ps(buf + 12, v_u1);
5928         _mm_store_ps(buf + 16, v_v0);
5929         _mm_store_ps(buf + 20, v_v1);
5930     }
5931     #endif
5932 
operator ()cv::Luv2RGB_b5933     void operator()(const uchar* src, uchar* dst, int n) const
5934     {
5935         int i, j, dcn = dstcn;
5936         uchar alpha = ColorChannel<uchar>::max();
5937         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
5938 
5939         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
5940         {
5941             int dn = std::min(n - i, (int)BLOCK_SIZE);
5942             j = 0;
5943 
5944             #if CV_NEON
5945             for ( ; j <= (dn - 8) * 3; j += 24)
5946             {
5947                 uint8x8x3_t v_src = vld3_u8(src + j);
5948                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
5949                            v_t1 = vmovl_u8(v_src.val[1]),
5950                            v_t2 = vmovl_u8(v_src.val[2]);
5951 
5952                 float32x4x3_t v_dst;
5953                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
5954                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134);
5955                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140);
5956                 vst3q_f32(buf + j, v_dst);
5957 
5958                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
5959                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134);
5960                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
5961                 vst3q_f32(buf + j + 12, v_dst);
5962             }
5963             #elif CV_SSE2
5964             if (haveSIMD)
5965             {
5966                 for ( ; j <= (dn - 32) * 3; j += 96)
5967                 {
5968                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
5969                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
5970                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
5971                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
5972                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
5973                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
5974 
5975                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
5976 
5977                     process(_mm_unpacklo_epi8(v_r0, v_zero),
5978                             _mm_unpacklo_epi8(v_g0, v_zero),
5979                             _mm_unpacklo_epi8(v_b0, v_zero),
5980                             buf + j);
5981 
5982                     process(_mm_unpackhi_epi8(v_r0, v_zero),
5983                             _mm_unpackhi_epi8(v_g0, v_zero),
5984                             _mm_unpackhi_epi8(v_b0, v_zero),
5985                             buf + j + 24);
5986 
5987                     process(_mm_unpacklo_epi8(v_r1, v_zero),
5988                             _mm_unpacklo_epi8(v_g1, v_zero),
5989                             _mm_unpacklo_epi8(v_b1, v_zero),
5990                             buf + j + 48);
5991 
5992                     process(_mm_unpackhi_epi8(v_r1, v_zero),
5993                             _mm_unpackhi_epi8(v_g1, v_zero),
5994                             _mm_unpackhi_epi8(v_b1, v_zero),
5995                             buf + j + 72);
5996                 }
5997             }
5998             #endif
5999             for( ; j < dn*3; j += 3 )
6000             {
6001                 buf[j] = src[j]*(100.f/255.f);
6002                 buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
6003                 buf[j+2] = (float)(src[j+2]*1.027450980392157f - 140.f);
6004             }
6005             cvt(buf, buf, dn);
6006 
6007             j = 0;
6008             #if CV_NEON
6009             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
6010             {
6011                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
6012                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
6013                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
6014                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
6015                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
6016                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
6017                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
6018 
6019                 if (dcn == 4)
6020                 {
6021                     uint8x8x4_t v_dst;
6022                     v_dst.val[0] = v_dst0;
6023                     v_dst.val[1] = v_dst1;
6024                     v_dst.val[2] = v_dst2;
6025                     v_dst.val[3] = v_alpha;
6026                     vst4_u8(dst, v_dst);
6027                 }
6028                 else
6029                 {
6030                     uint8x8x3_t v_dst;
6031                     v_dst.val[0] = v_dst0;
6032                     v_dst.val[1] = v_dst1;
6033                     v_dst.val[2] = v_dst2;
6034                     vst3_u8(dst, v_dst);
6035                 }
6036             }
6037             #elif CV_SSE2
6038             if (dcn == 3 && haveSIMD)
6039             {
6040                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
6041                 {
6042                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
6043                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
6044                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
6045                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
6046 
6047                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
6048                                                      _mm_cvtps_epi32(v_src1));
6049                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
6050                                                      _mm_cvtps_epi32(v_src3));
6051 
6052                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
6053                 }
6054 
6055                 int jr = j % 3;
6056                 if (jr)
6057                     dst -= jr, j -= jr;
6058             }
6059             #endif
6060 
6061             for( ; j < dn*3; j += 3, dst += dcn )
6062             {
6063                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
6064                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
6065                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
6066                 if( dcn == 4 )
6067                     dst[3] = alpha;
6068             }
6069         }
6070     }
6071 
6072     int dstcn;
6073     Luv2RGB_f cvt;
6074 
6075     #if CV_NEON
6076     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
6077     uint8x8_t v_alpha;
6078     #elif CV_SSE2
6079     __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
6080     __m128i v_zero;
6081     bool haveSIMD;
6082     #endif
6083 };
6084 
6085 
6086 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
6087 
// Fixed-point coefficients for YUV -> RGB conversion. Each value is the real
// coefficient scaled by 2^ITUR_BT_601_SHIFT (i.e. 2^20) and rounded to an integer.
const int ITUR_BT_601_CY = 1220542;   // ~1.164 : gain applied to (Y - 16)
const int ITUR_BT_601_CUB = 2116026;  // ~2.018 : (U - 128) contribution to B
const int ITUR_BT_601_CUG = -409993;  // ~-0.391: (U - 128) contribution to G
const int ITUR_BT_601_CVG = -852492;  // ~-0.813: (V - 128) contribution to G
const int ITUR_BT_601_CVR = 1673527;  // ~1.596 : (V - 128) contribution to R
const int ITUR_BT_601_SHIFT = 20;     // number of fractional bits in all coefficients

// Coefficients for RGB to YUV420p conversion
// (same 2^20 fixed-point scale; naming is C<source channel><destination channel>).
const int ITUR_BT_601_CRY =  269484;  // ~0.257 : R weight in Y
const int ITUR_BT_601_CGY =  528482;  // ~0.504 : G weight in Y
const int ITUR_BT_601_CBY =  102760;  // ~0.098 : B weight in Y
const int ITUR_BT_601_CRU = -155188;  // ~-0.148: R weight in U
const int ITUR_BT_601_CGU = -305135;  // ~-0.291: G weight in U
const int ITUR_BT_601_CBU =  460324;  // ~0.439 : B weight in U; also used as the R weight in V
const int ITUR_BT_601_CGV = -385875;  // ~-0.368: G weight in V
const int ITUR_BT_601_CBV = -74448;   // ~-0.071: B weight in V
6104 
6105 template<int bIdx, int uIdx>
6106 struct YUV420sp2RGB888Invoker : ParallelLoopBody
6107 {
6108     Mat* dst;
6109     const uchar* my1, *muv;
6110     int width, stride;
6111 
YUV420sp2RGB888Invokercv::YUV420sp2RGB888Invoker6112     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
6113         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
6114 
operator ()cv::YUV420sp2RGB888Invoker6115     void operator()(const Range& range) const
6116     {
6117         int rangeBegin = range.start * 2;
6118         int rangeEnd = range.end * 2;
6119 
6120         //R = 1.164(Y - 16) + 1.596(V - 128)
6121         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
6122         //B = 1.164(Y - 16)                  + 2.018(U - 128)
6123 
6124         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
6125         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
6126         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
6127 
6128         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
6129 
6130 #ifdef HAVE_TEGRA_OPTIMIZATION
6131         if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
6132             return;
6133 #endif
6134 
6135         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
6136         {
6137             uchar* row1 = dst->ptr<uchar>(j);
6138             uchar* row2 = dst->ptr<uchar>(j + 1);
6139             const uchar* y2 = y1 + stride;
6140 
6141             for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
6142             {
6143                 int u = int(uv[i + 0 + uIdx]) - 128;
6144                 int v = int(uv[i + 1 - uIdx]) - 128;
6145 
6146                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6147                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6148                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6149 
6150                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
6151                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6152                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6153                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6154 
6155                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
6156                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6157                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6158                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6159 
6160                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
6161                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6162                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6163                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6164 
6165                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
6166                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6167                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6168                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6169             }
6170         }
6171     }
6172 };
6173 
6174 template<int bIdx, int uIdx>
6175 struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
6176 {
6177     Mat* dst;
6178     const uchar* my1, *muv;
6179     int width, stride;
6180 
YUV420sp2RGBA8888Invokercv::YUV420sp2RGBA8888Invoker6181     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
6182         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
6183 
operator ()cv::YUV420sp2RGBA8888Invoker6184     void operator()(const Range& range) const
6185     {
6186         int rangeBegin = range.start * 2;
6187         int rangeEnd = range.end * 2;
6188 
6189         //R = 1.164(Y - 16) + 1.596(V - 128)
6190         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
6191         //B = 1.164(Y - 16)                  + 2.018(U - 128)
6192 
6193         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
6194         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
6195         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
6196 
6197         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
6198 
6199 #ifdef HAVE_TEGRA_OPTIMIZATION
6200         if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
6201             return;
6202 #endif
6203 
6204         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
6205         {
6206             uchar* row1 = dst->ptr<uchar>(j);
6207             uchar* row2 = dst->ptr<uchar>(j + 1);
6208             const uchar* y2 = y1 + stride;
6209 
6210             for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
6211             {
6212                 int u = int(uv[i + 0 + uIdx]) - 128;
6213                 int v = int(uv[i + 1 - uIdx]) - 128;
6214 
6215                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6216                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6217                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6218 
6219                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
6220                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6221                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6222                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6223                 row1[3]      = uchar(0xff);
6224 
6225                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
6226                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6227                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6228                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6229                 row1[7]      = uchar(0xff);
6230 
6231                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
6232                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6233                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6234                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6235                 row2[3]      = uchar(0xff);
6236 
6237                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
6238                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6239                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6240                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6241                 row2[7]      = uchar(0xff);
6242             }
6243         }
6244     }
6245 };
6246 
6247 template<int bIdx>
6248 struct YUV420p2RGB888Invoker : ParallelLoopBody
6249 {
6250     Mat* dst;
6251     const uchar* my1, *mu, *mv;
6252     int width, stride;
6253     int ustepIdx, vstepIdx;
6254 
YUV420p2RGB888Invokercv::YUV420p2RGB888Invoker6255     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
6256         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
6257 
operator ()cv::YUV420p2RGB888Invoker6258     void operator()(const Range& range) const
6259     {
6260         const int rangeBegin = range.start * 2;
6261         const int rangeEnd = range.end * 2;
6262 
6263         int uvsteps[2] = {width/2, stride - width/2};
6264         int usIdx = ustepIdx, vsIdx = vstepIdx;
6265 
6266         const uchar* y1 = my1 + rangeBegin * stride;
6267         const uchar* u1 = mu + (range.start / 2) * stride;
6268         const uchar* v1 = mv + (range.start / 2) * stride;
6269 
6270         if(range.start % 2 == 1)
6271         {
6272             u1 += uvsteps[(usIdx++) & 1];
6273             v1 += uvsteps[(vsIdx++) & 1];
6274         }
6275 
6276         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
6277         {
6278             uchar* row1 = dst->ptr<uchar>(j);
6279             uchar* row2 = dst->ptr<uchar>(j + 1);
6280             const uchar* y2 = y1 + stride;
6281 
6282             for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
6283             {
6284                 int u = int(u1[i]) - 128;
6285                 int v = int(v1[i]) - 128;
6286 
6287                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6288                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6289                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6290 
6291                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
6292                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6293                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6294                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6295 
6296                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6297                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6298                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6299                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6300 
6301                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
6302                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6303                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6304                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6305 
6306                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6307                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6308                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6309                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6310             }
6311         }
6312     }
6313 };
6314 
6315 template<int bIdx>
6316 struct YUV420p2RGBA8888Invoker : ParallelLoopBody
6317 {
6318     Mat* dst;
6319     const uchar* my1, *mu, *mv;
6320     int width, stride;
6321     int ustepIdx, vstepIdx;
6322 
YUV420p2RGBA8888Invokercv::YUV420p2RGBA8888Invoker6323     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
6324         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
6325 
operator ()cv::YUV420p2RGBA8888Invoker6326     void operator()(const Range& range) const
6327     {
6328         int rangeBegin = range.start * 2;
6329         int rangeEnd = range.end * 2;
6330 
6331         int uvsteps[2] = {width/2, stride - width/2};
6332         int usIdx = ustepIdx, vsIdx = vstepIdx;
6333 
6334         const uchar* y1 = my1 + rangeBegin * stride;
6335         const uchar* u1 = mu + (range.start / 2) * stride;
6336         const uchar* v1 = mv + (range.start / 2) * stride;
6337 
6338         if(range.start % 2 == 1)
6339         {
6340             u1 += uvsteps[(usIdx++) & 1];
6341             v1 += uvsteps[(vsIdx++) & 1];
6342         }
6343 
6344         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
6345         {
6346             uchar* row1 = dst->ptr<uchar>(j);
6347             uchar* row2 = dst->ptr<uchar>(j + 1);
6348             const uchar* y2 = y1 + stride;
6349 
6350             for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
6351             {
6352                 int u = int(u1[i]) - 128;
6353                 int v = int(v1[i]) - 128;
6354 
6355                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6356                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6357                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6358 
6359                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
6360                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6361                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6362                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6363                 row1[3]      = uchar(0xff);
6364 
6365                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6366                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6367                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6368                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6369                 row1[7]      = uchar(0xff);
6370 
6371                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
6372                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
6373                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
6374                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
6375                 row2[3]      = uchar(0xff);
6376 
6377                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
6378                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
6379                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
6380                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
6381                 row2[7]      = uchar(0xff);
6382             }
6383         }
6384     }
6385 };
6386 
// Threshold on the destination's total() element count: the YUV420 -> RGB(A)
// wrappers below call parallel_for_ at or above it and run the converter
// directly on the calling thread below it.
#define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
6388 
6389 template<int bIdx, int uIdx>
cvtYUV420sp2RGB(Mat & _dst,int _stride,const uchar * _y1,const uchar * _uv)6390 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
6391 {
6392     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
6393     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6394         parallel_for_(Range(0, _dst.rows/2), converter);
6395     else
6396         converter(Range(0, _dst.rows/2));
6397 }
6398 
6399 template<int bIdx, int uIdx>
cvtYUV420sp2RGBA(Mat & _dst,int _stride,const uchar * _y1,const uchar * _uv)6400 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
6401 {
6402     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
6403     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6404         parallel_for_(Range(0, _dst.rows/2), converter);
6405     else
6406         converter(Range(0, _dst.rows/2));
6407 }
6408 
6409 template<int bIdx>
cvtYUV420p2RGB(Mat & _dst,int _stride,const uchar * _y1,const uchar * _u,const uchar * _v,int ustepIdx,int vstepIdx)6410 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
6411 {
6412     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
6413     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6414         parallel_for_(Range(0, _dst.rows/2), converter);
6415     else
6416         converter(Range(0, _dst.rows/2));
6417 }
6418 
6419 template<int bIdx>
cvtYUV420p2RGBA(Mat & _dst,int _stride,const uchar * _y1,const uchar * _u,const uchar * _v,int ustepIdx,int vstepIdx)6420 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
6421 {
6422     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
6423     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
6424         parallel_for_(Range(0, _dst.rows/2), converter);
6425     else
6426         converter(Range(0, _dst.rows/2));
6427 }
6428 
6429 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
6430 
6431 template<int bIdx>
6432 struct RGB888toYUV420pInvoker: public ParallelLoopBody
6433 {
RGB888toYUV420pInvokercv::RGB888toYUV420pInvoker6434     RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
6435         : src_(src),
6436           dst_(dst),
6437           uIdx_(uIdx) { }
6438 
operator ()cv::RGB888toYUV420pInvoker6439     void operator()(const Range& rowRange) const
6440     {
6441         const int w = src_.cols;
6442         const int h = src_.rows;
6443 
6444         const int cn = src_.channels();
6445         for( int i = rowRange.start; i < rowRange.end; i++ )
6446         {
6447             const uchar* row0 = src_.ptr<uchar>(2 * i);
6448             const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
6449 
6450             uchar* y = dst_->ptr<uchar>(2*i);
6451             uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
6452             uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
6453             if( uIdx_ == 2 ) std::swap(u, v);
6454 
6455             for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
6456             {
6457                 int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
6458                 int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
6459                 int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
6460                 int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
6461 
6462                 const int shifted16 = (16 << ITUR_BT_601_SHIFT);
6463                 const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
6464                 int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
6465                 int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
6466                 int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
6467                 int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
6468 
6469                 y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
6470                 y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
6471                 y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
6472                 y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
6473 
6474                 const int shifted128 = (128 << ITUR_BT_601_SHIFT);
6475                 int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
6476                 int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
6477 
6478                 u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
6479                 v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
6480             }
6481         }
6482     }
6483 
isFitcv::RGB888toYUV420pInvoker6484     static bool isFit( const Mat& src )
6485     {
6486         return (src.total() >= 320*240);
6487     }
6488 
6489 private:
6490     RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
6491 
6492     const Mat& src_;
6493     Mat* const dst_;
6494     const int uIdx_;
6495 };
6496 
6497 template<int bIdx, int uIdx>
cvtRGBtoYUV420p(const Mat & src,Mat & dst)6498 static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
6499 {
6500     RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
6501     if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
6502         parallel_for_(Range(0, src.rows/2), colorConverter);
6503     else
6504         colorConverter(Range(0, src.rows/2));
6505 }
6506 
6507 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
6508 
6509 template<int bIdx, int uIdx, int yIdx>
6510 struct YUV422toRGB888Invoker : ParallelLoopBody
6511 {
6512     Mat* dst;
6513     const uchar* src;
6514     int width, stride;
6515 
YUV422toRGB888Invokercv::YUV422toRGB888Invoker6516     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
6517         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
6518 
operator ()cv::YUV422toRGB888Invoker6519     void operator()(const Range& range) const
6520     {
6521         int rangeBegin = range.start;
6522         int rangeEnd = range.end;
6523 
6524         const int uidx = 1 - yIdx + uIdx * 2;
6525         const int vidx = (2 + uidx) % 4;
6526         const uchar* yuv_src = src + rangeBegin * stride;
6527 
6528         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
6529         {
6530             uchar* row = dst->ptr<uchar>(j);
6531 
6532             for (int i = 0; i < 2 * width; i += 4, row += 6)
6533             {
6534                 int u = int(yuv_src[i + uidx]) - 128;
6535                 int v = int(yuv_src[i + vidx]) - 128;
6536 
6537                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6538                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6539                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6540 
6541                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
6542                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6543                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6544                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6545 
6546                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
6547                 row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6548                 row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6549                 row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6550             }
6551         }
6552     }
6553 };
6554 
6555 template<int bIdx, int uIdx, int yIdx>
6556 struct YUV422toRGBA8888Invoker : ParallelLoopBody
6557 {
6558     Mat* dst;
6559     const uchar* src;
6560     int width, stride;
6561 
YUV422toRGBA8888Invokercv::YUV422toRGBA8888Invoker6562     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
6563         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
6564 
operator ()cv::YUV422toRGBA8888Invoker6565     void operator()(const Range& range) const
6566     {
6567         int rangeBegin = range.start;
6568         int rangeEnd = range.end;
6569 
6570         const int uidx = 1 - yIdx + uIdx * 2;
6571         const int vidx = (2 + uidx) % 4;
6572         const uchar* yuv_src = src + rangeBegin * stride;
6573 
6574         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
6575         {
6576             uchar* row = dst->ptr<uchar>(j);
6577 
6578             for (int i = 0; i < 2 * width; i += 4, row += 8)
6579             {
6580                 int u = int(yuv_src[i + uidx]) - 128;
6581                 int v = int(yuv_src[i + vidx]) - 128;
6582 
6583                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
6584                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
6585                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
6586 
6587                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
6588                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
6589                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
6590                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
6591                 row[3]      = uchar(0xff);
6592 
6593                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
6594                 row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
6595                 row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
6596                 row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
6597                 row[7]      = uchar(0xff);
6598             }
6599         }
6600     }
6601 };
6602 
// Threshold on the destination's total() element count: the YUV422 -> RGB(A)
// wrappers below call parallel_for_ at or above it and run the converter
// directly on the calling thread below it.
#define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
6604 
6605 template<int bIdx, int uIdx, int yIdx>
cvtYUV422toRGB(Mat & _dst,int _stride,const uchar * _yuv)6606 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
6607 {
6608     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
6609     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
6610         parallel_for_(Range(0, _dst.rows), converter);
6611     else
6612         converter(Range(0, _dst.rows));
6613 }
6614 
6615 template<int bIdx, int uIdx, int yIdx>
cvtYUV422toRGBA(Mat & _dst,int _stride,const uchar * _yuv)6616 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
6617 {
6618     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
6619     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
6620         parallel_for_(Range(0, _dst.rows), converter);
6621     else
6622         converter(Range(0, _dst.rows));
6623 }
6624 
6625 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
6626 
6627 template<typename _Tp>
6628 struct RGBA2mRGBA
6629 {
6630     typedef _Tp channel_type;
6631 
operator ()cv::RGBA2mRGBA6632     void operator()(const _Tp* src, _Tp* dst, int n) const
6633     {
6634         _Tp max_val  = ColorChannel<_Tp>::max();
6635         _Tp half_val = ColorChannel<_Tp>::half();
6636         for( int i = 0; i < n; i++ )
6637         {
6638             _Tp v0 = *src++;
6639             _Tp v1 = *src++;
6640             _Tp v2 = *src++;
6641             _Tp v3 = *src++;
6642 
6643             *dst++ = (v0 * v3 + half_val) / max_val;
6644             *dst++ = (v1 * v3 + half_val) / max_val;
6645             *dst++ = (v2 * v3 + half_val) / max_val;
6646             *dst++ = v3;
6647         }
6648     }
6649 };
6650 
6651 
6652 template<typename _Tp>
6653 struct mRGBA2RGBA
6654 {
6655     typedef _Tp channel_type;
6656 
operator ()cv::mRGBA2RGBA6657     void operator()(const _Tp* src, _Tp* dst, int n) const
6658     {
6659         _Tp max_val = ColorChannel<_Tp>::max();
6660         for( int i = 0; i < n; i++ )
6661         {
6662             _Tp v0 = *src++;
6663             _Tp v1 = *src++;
6664             _Tp v2 = *src++;
6665             _Tp v3 = *src++;
6666             _Tp v3_half = v3 / 2;
6667 
6668             *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
6669             *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
6670             *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
6671             *dst++ = v3;
6672         }
6673     }
6674 };
6675 
6676 #ifdef HAVE_OPENCL
6677 
ocl_cvtColor(InputArray _src,OutputArray _dst,int code,int dcn)6678 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
6679 {
6680     bool ok = false;
6681     UMat src = _src.getUMat(), dst;
6682     Size sz = src.size(), dstSz = sz;
6683     int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
6684     int dims = 2, stripeSize = 1;
6685     ocl::Kernel k;
6686 
6687     if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
6688         return false;
6689 
6690     ocl::Device dev = ocl::Device::getDefault();
6691     int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
6692     int pxPerWIx = 1;
6693 
6694     size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
6695     cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
6696                              depth, scn, pxPerWIy);
6697 
6698     switch (code)
6699     {
6700     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
6701     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
6702     {
6703         CV_Assert(scn == 3 || scn == 4);
6704         dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
6705         bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
6706         k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
6707                  opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
6708                         reverse ? "REVERSE" : "ORDER"));
6709         break;
6710     }
6711     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
6712     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
6713     {
6714         dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
6715         CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
6716         bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
6717             code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
6718         int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
6719             code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
6720         k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
6721                  opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
6722         break;
6723     }
6724     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
6725     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
6726     {
6727         CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
6728         bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
6729             code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
6730         int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
6731             code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
6732         dcn = 2;
6733         k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
6734                  opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
6735         break;
6736     }
6737     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
6738     {
6739         CV_Assert(scn == 2 && depth == CV_8U);
6740         dcn = 1;
6741         int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
6742         k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
6743                  opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
6744         break;
6745     }
6746     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
6747     {
6748         CV_Assert(scn == 1 && depth == CV_8U);
6749         dcn = 2;
6750         int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
6751         k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
6752                  opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
6753         break;
6754     }
6755     case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
6756     case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
6757     {
6758         CV_Assert(scn == 3 || scn == 4);
6759         bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
6760         dcn = 1;
6761         k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
6762                  opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
6763                                bidx, stripeSize));
6764         globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
6765         break;
6766     }
6767     case COLOR_GRAY2BGR:
6768     case COLOR_GRAY2BGRA:
6769     {
6770         CV_Assert(scn == 1);
6771         dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
6772         k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
6773                  opts + format("-D bidx=0 -D dcn=%d", dcn));
6774         break;
6775     }
6776     case COLOR_BGR2YUV:
6777     case COLOR_RGB2YUV:
6778     {
6779         CV_Assert(scn == 3 || scn == 4);
6780         bidx = code == COLOR_RGB2YUV ? 0 : 2;
6781         dcn = 3;
6782         k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
6783                  opts + format("-D dcn=3 -D bidx=%d", bidx));
6784         break;
6785     }
6786     case COLOR_YUV2BGR:
6787     case COLOR_YUV2RGB:
6788     {
6789         if(dcn < 0) dcn = 3;
6790         CV_Assert(dcn == 3 || dcn == 4);
6791         bidx = code == COLOR_YUV2RGB ? 0 : 2;
6792         k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
6793                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
6794         break;
6795     }
6796     case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
6797     case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
6798     {
6799         CV_Assert( scn == 1 );
6800         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6801         dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
6802                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
6803         bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
6804                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
6805         uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
6806                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
6807 
6808         dstSz = Size(sz.width, sz.height * 2 / 3);
6809         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
6810         k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
6811                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
6812         break;
6813     }
6814     case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
6815     case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
6816     {
6817         CV_Assert( scn == 1 );
6818         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6819         dcn  = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
6820                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
6821         bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
6822                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
6823         uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
6824                code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
6825 
6826         dstSz = Size(sz.width, sz.height * 2 / 3);
6827         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
6828         k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
6829                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
6830                  src.isContinuous() ? " -D SRC_CONT" : ""));
6831         break;
6832     }
6833     case COLOR_YUV2GRAY_420:
6834     {
6835         if (dcn <= 0) dcn = 1;
6836 
6837         CV_Assert( dcn == 1 );
6838         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
6839 
6840         dstSz = Size(sz.width, sz.height * 2 / 3);
6841         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6842         dst = _dst.getUMat();
6843 
6844         src.rowRange(0, dstSz.height).copyTo(dst);
6845         return true;
6846     }
6847     case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
6848     case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
6849     {
6850         if (dcn <= 0) dcn = 1;
6851         bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
6852                code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
6853         uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
6854                code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
6855 
6856         CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
6857         CV_Assert( dcn == 1 );
6858         CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
6859 
6860         dstSz = Size(sz.width, sz.height / 2 * 3);
6861         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
6862         dst = _dst.getUMat();
6863 
6864         if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
6865             dst.step % 4 == 0 && dst.offset % 4 == 0)
6866         {
6867             pxPerWIx = 2;
6868         }
6869         globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
6870 
6871         k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
6872                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
6873         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
6874         return k.run(2, globalsize, NULL, false);
6875     }
6876     case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
6877     case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
6878     case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
6879     {
6880         if (dcn <= 0)
6881             dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
6882                    code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
6883 
6884         bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
6885                 code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
6886         yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
6887         uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
6888                 code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
6889         uidx = 1 - yidx + uidx;
6890 
6891         CV_Assert( dcn == 3 || dcn == 4 );
6892         CV_Assert( scn == 2 && depth == CV_8U );
6893 
6894         k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
6895                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
6896                                 src.offset % 4 == 0 && src.step % 4 == 0 ? " -D USE_OPTIMIZED_LOAD" : ""));
6897         break;
6898     }
6899     case COLOR_BGR2YCrCb:
6900     case COLOR_RGB2YCrCb:
6901     {
6902         CV_Assert(scn == 3 || scn == 4);
6903         bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
6904         dcn = 3;
6905         k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
6906                  opts + format("-D dcn=3 -D bidx=%d", bidx));
6907         break;
6908     }
6909     case COLOR_YCrCb2BGR:
6910     case COLOR_YCrCb2RGB:
6911     {
6912         if( dcn <= 0 )
6913             dcn = 3;
6914         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
6915         bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
6916         k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
6917                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
6918         break;
6919     }
6920     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
6921     {
6922         CV_Assert(scn == 3 || scn == 4);
6923         bidx = code == COLOR_BGR2XYZ ? 0 : 2;
6924 
6925         UMat c;
6926         if (depth == CV_32F)
6927         {
6928             float coeffs[] =
6929             {
6930                 0.412453f, 0.357580f, 0.180423f,
6931                 0.212671f, 0.715160f, 0.072169f,
6932                 0.019334f, 0.119193f, 0.950227f
6933             };
6934             if (bidx == 0)
6935             {
6936                 std::swap(coeffs[0], coeffs[2]);
6937                 std::swap(coeffs[3], coeffs[5]);
6938                 std::swap(coeffs[6], coeffs[8]);
6939             }
6940             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
6941         }
6942         else
6943         {
6944             int coeffs[] =
6945             {
6946                 1689,    1465,    739,
6947                 871,     2929,    296,
6948                 79,      488,     3892
6949             };
6950             if (bidx == 0)
6951             {
6952                 std::swap(coeffs[0], coeffs[2]);
6953                 std::swap(coeffs[3], coeffs[5]);
6954                 std::swap(coeffs[6], coeffs[8]);
6955             }
6956             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
6957         }
6958 
6959         _dst.create(dstSz, CV_MAKETYPE(depth, 3));
6960         dst = _dst.getUMat();
6961 
6962         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
6963                  opts + format("-D dcn=3 -D bidx=%d", bidx));
6964         if (k.empty())
6965             return false;
6966         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
6967         return k.run(2, globalsize, 0, false);
6968     }
6969     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
6970     {
6971         if (dcn <= 0)
6972             dcn = 3;
6973         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
6974         bidx = code == COLOR_XYZ2BGR ? 0 : 2;
6975 
6976         UMat c;
6977         if (depth == CV_32F)
6978         {
6979             float coeffs[] =
6980             {
6981                 3.240479f, -1.53715f, -0.498535f,
6982                 -0.969256f, 1.875991f, 0.041556f,
6983                 0.055648f, -0.204043f, 1.057311f
6984             };
6985             if (bidx == 0)
6986             {
6987                 std::swap(coeffs[0], coeffs[6]);
6988                 std::swap(coeffs[1], coeffs[7]);
6989                 std::swap(coeffs[2], coeffs[8]);
6990             }
6991             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
6992         }
6993         else
6994         {
6995             int coeffs[] =
6996             {
6997                 13273,  -6296,  -2042,
6998                 -3970,   7684,    170,
6999                   228,   -836,   4331
7000             };
7001             if (bidx == 0)
7002             {
7003                 std::swap(coeffs[0], coeffs[6]);
7004                 std::swap(coeffs[1], coeffs[7]);
7005                 std::swap(coeffs[2], coeffs[8]);
7006             }
7007             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
7008         }
7009 
7010         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7011         dst = _dst.getUMat();
7012 
7013         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
7014                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
7015         if (k.empty())
7016             return false;
7017         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
7018         return k.run(2, globalsize, 0, false);
7019     }
7020     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
7021     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
7022     {
7023         CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
7024         bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
7025             code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
7026         int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
7027             code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
7028         bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
7029         String kernelName = String("RGB2") + (is_hsv ? "HSV" : "HLS");
7030         dcn = 3;
7031 
7032         if (is_hsv && depth == CV_8U)
7033         {
7034             static UMat sdiv_data;
7035             static UMat hdiv_data180;
7036             static UMat hdiv_data256;
7037             static int sdiv_table[256];
7038             static int hdiv_table180[256];
7039             static int hdiv_table256[256];
7040             static volatile bool initialized180 = false, initialized256 = false;
7041             volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
7042 
7043             if (!initialized)
7044             {
7045                 int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
7046                 UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
7047 
7048                 sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
7049 
7050                 int v = 255 << hsv_shift;
7051                 if (!initialized180 && !initialized256)
7052                 {
7053                     for(int i = 1; i < 256; i++ )
7054                         sdiv_table[i] = saturate_cast<int>(v/(1.*i));
7055                     Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
7056                 }
7057 
7058                 v = hrange << hsv_shift;
7059                 for (int i = 1; i < 256; i++ )
7060                     hdiv_table[i] = saturate_cast<int>(v/(6.*i));
7061 
7062                 Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
7063                 initialized = true;
7064             }
7065 
7066             _dst.create(dstSz, CV_8UC3);
7067             dst = _dst.getUMat();
7068 
7069             k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
7070                      opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
7071                                    hrange, bidx));
7072             if (k.empty())
7073                 return false;
7074 
7075             k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
7076                    ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
7077                                                                        ocl::KernelArg::PtrReadOnly(hdiv_data180));
7078 
7079             return k.run(2, globalsize, NULL, false);
7080         }
7081         else
7082             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
7083                      opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
7084                                    hrange*(1.f/360.f), bidx));
7085         break;
7086     }
7087     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
7088     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
7089     {
7090         if (dcn <= 0)
7091             dcn = 3;
7092         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
7093         bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
7094             code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
7095         int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
7096             code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
7097         bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
7098                 code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
7099 
7100         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
7101         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
7102                  opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
7103                                dcn, bidx, hrange, 6.f/hrange));
7104         break;
7105     }
7106     case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
7107     {
7108         CV_Assert(scn == 4 && depth == CV_8U);
7109         dcn = 4;
7110 
7111         k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
7112                  opts + "-D dcn=4 -D bidx=3");
7113         break;
7114     }
7115     case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
7116     case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
7117     {
7118         CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
7119 
7120         bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab || code == CV_BGR2Luv || code == CV_LBGR2Luv ? 0 : 2;
7121         bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_RGB2Luv || code == CV_BGR2Luv;
7122         bool lab = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab;
7123         float un, vn;
7124         dcn = 3;
7125 
7126         k.create(format("BGR2%s", lab ? "Lab" : "Luv").c_str(),
7127                  ocl::imgproc::cvtcolor_oclsrc,
7128                  opts + format("-D dcn=%d -D bidx=%d%s",
7129                                dcn, bidx, srgb ? " -D SRGB" : ""));
7130         if (k.empty())
7131             return false;
7132 
7133         initLabTabs();
7134 
7135         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7136         dst = _dst.getUMat();
7137 
7138         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
7139                 dstarg = ocl::KernelArg::WriteOnly(dst);
7140 
7141         if (depth == CV_8U && lab)
7142         {
7143             static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
7144 
7145             if (srgb && usRGBGammaTab.empty())
7146                 Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
7147             else if (ulinearGammaTab.empty())
7148                 Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
7149             if (uLabCbrtTab.empty())
7150                 Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
7151 
7152             {
7153                 int coeffs[9];
7154                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
7155                 const float scale[] =
7156                 {
7157                     (1 << lab_shift)/_whitept[0],
7158                     (float)(1 << lab_shift),
7159                     (1 << lab_shift)/_whitept[2]
7160                 };
7161 
7162                 for (int i = 0; i < 3; i++ )
7163                 {
7164                     coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
7165                     coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
7166                     coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
7167 
7168                     CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
7169                               coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
7170                 }
7171                 Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
7172             }
7173 
7174             const int Lscale = (116*255+50)/100;
7175             const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
7176 
7177             k.args(srcarg, dstarg,
7178                    ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
7179                    ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
7180                    Lscale, Lshift);
7181         }
7182         else
7183         {
7184             static UMat usRGBGammaTab, ucoeffs, uLabCbrtTab;
7185 
7186             if (srgb && usRGBGammaTab.empty())
7187                 Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
7188             if (!lab && uLabCbrtTab.empty())
7189                 Mat(1, LAB_CBRT_TAB_SIZE * 4, CV_32FC1, LabCbrtTab).copyTo(uLabCbrtTab);
7190 
7191             {
7192                 float coeffs[9];
7193                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
7194                 float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
7195 
7196                 for (int i = 0; i < 3; i++)
7197                 {
7198                     int j = i * 3;
7199                     coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1);
7200                     coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1);
7201                     coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1);
7202 
7203                     CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
7204                                coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) );
7205                 }
7206 
7207                 float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
7208                 un = 13*4*_whitept[0]*d;
7209                 vn = 13*9*_whitept[1]*d;
7210 
7211                 Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
7212             }
7213 
7214             float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
7215             ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
7216 
7217             if (lab)
7218             {
7219                 if (srgb)
7220                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
7221                            ucoeffsarg, _1_3, _a);
7222                 else
7223                     k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
7224             }
7225             else
7226             {
7227                 ocl::KernelArg LabCbrtTabarg = ocl::KernelArg::PtrReadOnly(uLabCbrtTab);
7228                 if (srgb)
7229                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
7230                            LabCbrtTabarg, ucoeffsarg, un, vn);
7231                 else
7232                     k.args(srcarg, dstarg, LabCbrtTabarg, ucoeffsarg, un, vn);
7233             }
7234         }
7235 
7236         return k.run(dims, globalsize, NULL, false);
7237     }
7238     case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
7239     case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
7240     {
7241         if( dcn <= 0 )
7242             dcn = 3;
7243         CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
7244 
7245         bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR || code == CV_Luv2BGR || code == CV_Luv2LBGR ? 0 : 2;
7246         bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Luv2BGR || code == CV_Luv2RGB;
7247         bool lab = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB;
7248         float un, vn;
7249 
7250         k.create(format("%s2BGR", lab ? "Lab" : "Luv").c_str(),
7251                  ocl::imgproc::cvtcolor_oclsrc,
7252                  opts + format("-D dcn=%d -D bidx=%d%s",
7253                                dcn, bidx, srgb ? " -D SRGB" : ""));
7254         if (k.empty())
7255             return false;
7256 
7257         initLabTabs();
7258         static UMat ucoeffs, usRGBInvGammaTab;
7259 
7260         if (srgb && usRGBInvGammaTab.empty())
7261             Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
7262 
7263         {
7264             float coeffs[9];
7265             const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
7266 
7267             for( int i = 0; i < 3; i++ )
7268             {
7269                 coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1);
7270                 coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1);
7271                 coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1);
7272             }
7273 
7274             float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
7275             un = 4*_whitept[0]*d;
7276             vn = 9*_whitept[1]*d;
7277 
7278             Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
7279         }
7280 
7281         _dst.create(sz, CV_MAKETYPE(depth, dcn));
7282         dst = _dst.getUMat();
7283 
7284         float lThresh = 0.008856f * 903.3f;
7285         float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
7286 
7287         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
7288                 dstarg = ocl::KernelArg::WriteOnly(dst),
7289                 coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
7290 
7291         if (lab)
7292         {
7293             if (srgb)
7294                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
7295                        coeffsarg, lThresh, fThresh);
7296             else
7297                 k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
7298         }
7299         else
7300         {
7301             if (srgb)
7302                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
7303                        coeffsarg, un, vn);
7304             else
7305                 k.args(srcarg, dstarg, coeffsarg, un, vn);
7306         }
7307 
7308         return k.run(dims, globalsize, NULL, false);
7309     }
7310     default:
7311         break;
7312     }
7313 
7314     if( !k.empty() )
7315     {
7316         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
7317         dst = _dst.getUMat();
7318         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
7319         ok = k.run(dims, globalsize, NULL, false);
7320     }
7321     return ok;
7322 }
7323 
7324 #endif
7325 
7326 }//namespace cv
7327 
7328 //////////////////////////////////////////////////////////////////////////////////////////
7329 //                                   The main function                                  //
7330 //////////////////////////////////////////////////////////////////////////////////////////
7331 
cvtColor(InputArray _src,OutputArray _dst,int code,int dcn)7332 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
7333 {
7334     int stype = _src.type();
7335     int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
7336 
7337     CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() && !(depth == CV_8U && (code == CV_Luv2BGR || code == CV_Luv2RGB)),
7338                 ocl_cvtColor(_src, _dst, code, dcn) )
7339 
7340     Mat src = _src.getMat(), dst;
7341     Size sz = src.size();
7342 
7343     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
7344 
7345     switch( code )
7346     {
7347         case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
7348         case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
7349             CV_Assert( scn == 3 || scn == 4 );
7350             dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
7351             bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
7352 
7353             _dst.create( sz, CV_MAKETYPE(depth, dcn));
7354             dst = _dst.getMat();
7355 
7356 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7357             CV_IPP_CHECK()
7358             {
7359                 if( code == CV_BGR2BGRA)
7360                 {
7361                     if ( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
7362                     {
7363                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7364                         return;
7365                     }
7366                     setIppErrorStatus();
7367                 }
7368                 else if( code == CV_BGRA2BGR )
7369                 {
7370                     if ( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
7371                     {
7372                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7373                         return;
7374                     }
7375                     setIppErrorStatus();
7376                 }
7377                 else if( code == CV_BGR2RGBA )
7378                 {
7379                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
7380                     {
7381                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7382                         return;
7383                     }
7384                     setIppErrorStatus();
7385                 }
7386                 else if( code == CV_RGBA2BGR )
7387                 {
7388                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
7389                     {
7390                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7391                         return;
7392                     }
7393                     setIppErrorStatus();
7394                 }
7395                 else if( code == CV_RGB2BGR )
7396                 {
7397                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
7398                     {
7399                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7400                         return;
7401                     }
7402                     setIppErrorStatus();
7403                 }
7404 #if IPP_VERSION_X100 >= 801
7405                 else if( code == CV_RGBA2BGRA )
7406                 {
7407                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
7408                     {
7409                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7410                         return;
7411                     }
7412                     setIppErrorStatus();
7413                 }
7414 #endif
7415             }
7416 #endif
7417 
7418             if( depth == CV_8U )
7419             {
7420 #ifdef HAVE_TEGRA_OPTIMIZATION
7421                 if(tegra::useTegra() && tegra::cvtBGR2RGB(src, dst, bidx))
7422                     break;
7423 #endif
7424                 CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
7425             }
7426             else if( depth == CV_16U )
7427                 CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
7428             else
7429                 CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
7430             break;
7431 
7432         case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
7433         case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
7434             CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
7435             _dst.create(sz, CV_8UC2);
7436             dst = _dst.getMat();
7437 
7438 #if defined(HAVE_IPP) && 0 // breaks OCL accuracy tests
7439             CV_IPP_CHECK()
7440             {
7441                 CV_SUPPRESS_DEPRECATED_START
7442 
7443                 if (code == CV_BGR2BGR565 && scn == 3)
7444                 {
7445                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R)))
7446                     {
7447                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7448                         return;
7449                     }
7450                     setIppErrorStatus();
7451                 }
7452                 else if (code == CV_BGRA2BGR565 && scn == 4)
7453                 {
7454                     if (CvtColorIPPLoopCopy(src, dst,
7455                                             IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7456                                             (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth)))
7457                     {
7458                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7459                         return;
7460                     }
7461                     setIppErrorStatus();
7462                 }
7463                 else if (code == CV_RGB2BGR565 && scn == 3)
7464                 {
7465                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
7466                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
7467                     {
7468                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7469                         return;
7470                     }
7471                     setIppErrorStatus();
7472                 }
7473                 else if (code == CV_RGBA2BGR565 && scn == 4)
7474                 {
7475                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7476                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
7477                     {
7478                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7479                         return;
7480                     }
7481                     setIppErrorStatus();
7482                 }
7483                 CV_SUPPRESS_DEPRECATED_END
7484             }
7485 #endif
7486 
7487 #ifdef HAVE_TEGRA_OPTIMIZATION
7488             if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
7489                 if(tegra::useTegra() && tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
7490                     break;
7491 #endif
7492 
7493             CvtColorLoop(src, dst, RGB2RGB5x5(scn,
7494                       code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
7495                       code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
7496                       code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
7497                       code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
7498                                               ));
7499             break;
7500 
7501         case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
7502         case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
7503             if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
7504             CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
7505             _dst.create(sz, CV_MAKETYPE(depth, dcn));
7506             dst = _dst.getMat();
7507 
7508 #ifdef HAVE_IPP
7509             CV_IPP_CHECK()
7510             {
7511                 CV_SUPPRESS_DEPRECATED_START
7512                 if (code == CV_BGR5652BGR && dcn == 3)
7513                 {
7514                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R)))
7515                     {
7516                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7517                         return;
7518                     }
7519                     setIppErrorStatus();
7520                 }
7521                 else if (code == CV_BGR5652RGB && dcn == 3)
7522                 {
7523                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7524                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
7525                     {
7526                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7527                         return;
7528                     }
7529                     setIppErrorStatus();
7530                 }
7531                 else if (code == CV_BGR5652BGRA && dcn == 4)
7532                 {
7533                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7534                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
7535                     {
7536                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7537                         return;
7538                     }
7539                     setIppErrorStatus();
7540                 }
7541                 else if (code == CV_BGR5652RGBA && dcn == 4)
7542                 {
7543                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
7544                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
7545                     {
7546                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7547                         return;
7548                     }
7549                     setIppErrorStatus();
7550                 }
7551                 CV_SUPPRESS_DEPRECATED_END
7552             }
7553 #endif
7554 
7555             CvtColorLoop(src, dst, RGB5x52RGB(dcn,
7556                       code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
7557                       code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
7558                       code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
7559                       code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
7560                       ));
7561             break;
7562 
7563         case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
7564             CV_Assert( scn == 3 || scn == 4 );
7565             _dst.create(sz, CV_MAKETYPE(depth, 1));
7566             dst = _dst.getMat();
7567 
7568 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7569             CV_IPP_CHECK()
7570             {
7571                 if( code == CV_BGR2GRAY && depth == CV_32F )
7572                 {
7573                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
7574                     {
7575                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7576                         return;
7577                     }
7578                     setIppErrorStatus();
7579                 }
7580                 else if( code == CV_RGB2GRAY && depth == CV_32F )
7581                 {
7582                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
7583                     {
7584                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7585                         return;
7586                     }
7587                     setIppErrorStatus();
7588                 }
7589                 else if( code == CV_BGRA2GRAY && depth == CV_32F )
7590                 {
7591                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
7592                     {
7593                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7594                         return;
7595                     }
7596                     setIppErrorStatus();
7597                 }
7598                 else if( code == CV_RGBA2GRAY && depth == CV_32F )
7599                 {
7600                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
7601                     {
7602                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7603                         return;
7604                     }
7605                     setIppErrorStatus();
7606                 }
7607             }
7608 #endif
7609 
7610             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
7611 
7612             if( depth == CV_8U )
7613             {
7614 #ifdef HAVE_TEGRA_OPTIMIZATION
7615                 if(tegra::useTegra() && tegra::cvtRGB2Gray(src, dst, bidx))
7616                     break;
7617 #endif
7618                 CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
7619             }
7620             else if( depth == CV_16U )
7621                 CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
7622             else
7623                 CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
7624             break;
7625 
7626         case CV_BGR5652GRAY: case CV_BGR5552GRAY:
7627             CV_Assert( scn == 2 && depth == CV_8U );
7628             _dst.create(sz, CV_8UC1);
7629             dst = _dst.getMat();
7630 
7631             CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
7632             break;
7633 
7634         case CV_GRAY2BGR: case CV_GRAY2BGRA:
7635             if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
7636             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
7637             _dst.create(sz, CV_MAKETYPE(depth, dcn));
7638             dst = _dst.getMat();
7639 
7640 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7641             CV_IPP_CHECK()
7642             {
7643                 if( code == CV_GRAY2BGR )
7644                 {
7645                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRFunctor(ippiCopyP3C3RTab[depth])) )
7646                     {
7647                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7648                         return;
7649                     }
7650                     setIppErrorStatus();
7651                 }
7652                 else if( code == CV_GRAY2BGRA )
7653                 {
7654                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRAFunctor(ippiCopyP3C3RTab[depth], ippiSwapChannelsC3C4RTab[depth], depth)) )
7655                     {
7656                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7657                         return;
7658                     }
7659                     setIppErrorStatus();
7660                 }
7661             }
7662 #endif
7663 
7664 
7665             if( depth == CV_8U )
7666             {
7667 #ifdef HAVE_TEGRA_OPTIMIZATION
7668                 if(tegra::useTegra() && tegra::cvtGray2RGB(src, dst))
7669                     break;
7670 #endif
7671                 CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
7672             }
7673             else if( depth == CV_16U )
7674                 CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
7675             else
7676                 CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
7677             break;
7678 
7679         case CV_GRAY2BGR565: case CV_GRAY2BGR555:
7680             CV_Assert( scn == 1 && depth == CV_8U );
7681             _dst.create(sz, CV_8UC2);
7682             dst = _dst.getMat();
7683 
7684             CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
7685             break;
7686 
7687         case CV_BGR2YCrCb: case CV_RGB2YCrCb:
7688         case CV_BGR2YUV: case CV_RGB2YUV:
7689             {
7690             CV_Assert( scn == 3 || scn == 4 );
7691             bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
7692             static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
7693             static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
7694             const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
7695             const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
7696 
7697             _dst.create(sz, CV_MAKETYPE(depth, 3));
7698             dst = _dst.getMat();
7699 
7700 #if defined HAVE_IPP && 0
7701             CV_IPP_CHECK()
7702             {
7703                 if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U)
7704                 {
7705                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
7706                     {
7707                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7708                         return;
7709                     }
7710                     setIppErrorStatus();
7711                 }
7712                 else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U)
7713                 {
7714                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
7715                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
7716                     {
7717                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7718                         return;
7719                     }
7720                     setIppErrorStatus();
7721                 }
7722                 else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U)
7723                 {
7724                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7725                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
7726                     {
7727                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7728                         return;
7729                     }
7730                     setIppErrorStatus();
7731                 }
7732                 else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U)
7733                 {
7734                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
7735                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
7736                     {
7737                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7738                         return;
7739                     }
7740                     setIppErrorStatus();
7741                 }
7742             }
7743 #endif
7744 
7745             if( depth == CV_8U )
7746             {
7747 #ifdef HAVE_TEGRA_OPTIMIZATION
7748                 if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::useTegra() && tegra::cvtRGB2YCrCb(src, dst, bidx))
7749                     break;
7750 #endif
7751                 CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
7752             }
7753             else if( depth == CV_16U )
7754                 CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
7755             else
7756                 CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
7757             }
7758             break;
7759 
7760         case CV_YCrCb2BGR: case CV_YCrCb2RGB:
7761         case CV_YUV2BGR: case CV_YUV2RGB:
7762             {
7763             if( dcn <= 0 ) dcn = 3;
7764             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
7765             bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
7766             static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
7767             static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
7768             const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
7769             const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
7770 
7771             _dst.create(sz, CV_MAKETYPE(depth, dcn));
7772             dst = _dst.getMat();
7773 
7774 #if defined HAVE_IPP && 0
7775             CV_IPP_CHECK()
7776             {
7777                 if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U)
7778                 {
7779                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
7780                     {
7781                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7782                         return;
7783                     }
7784                     setIppErrorStatus();
7785                 }
7786                 else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U)
7787                 {
7788                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7789                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
7790                     {
7791                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7792                         return;
7793                     }
7794                     setIppErrorStatus();
7795                 }
7796                 else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U)
7797                 {
7798                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7799                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
7800                     {
7801                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7802                         return;
7803                     }
7804                     setIppErrorStatus();
7805                 }
7806                 else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U)
7807                 {
7808                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
7809                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
7810                     {
7811                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7812                         return;
7813                     }
7814                     setIppErrorStatus();
7815                 }
7816             }
7817 #endif
7818 
7819             if( depth == CV_8U )
7820                 CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
7821             else if( depth == CV_16U )
7822                 CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
7823             else
7824                 CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
7825             }
7826             break;
7827 
7828         case CV_BGR2XYZ: case CV_RGB2XYZ:
7829             CV_Assert( scn == 3 || scn == 4 );
7830             bidx = code == CV_BGR2XYZ ? 0 : 2;
7831 
7832             _dst.create(sz, CV_MAKETYPE(depth, 3));
7833             dst = _dst.getMat();
7834 
7835 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7836             CV_IPP_CHECK()
7837             {
7838                 if( code == CV_BGR2XYZ && scn == 3 && depth != CV_32F )
7839                 {
7840                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
7841                     {
7842                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7843                         return;
7844                     }
7845                     setIppErrorStatus();
7846                 }
7847                 else if( code == CV_BGR2XYZ && scn == 4 && depth != CV_32F )
7848                 {
7849                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
7850                     {
7851                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7852                         return;
7853                     }
7854                     setIppErrorStatus();
7855                 }
7856                 else if( code == CV_RGB2XYZ && scn == 3 && depth != CV_32F )
7857                 {
7858                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2XYZTab[depth])) )
7859                     {
7860                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7861                         return;
7862                     }
7863                     setIppErrorStatus();
7864                 }
7865                 else if( code == CV_RGB2XYZ && scn == 4 && depth != CV_32F )
7866                 {
7867                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 0, 1, 2, depth)) )
7868                     {
7869                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7870                         return;
7871                     }
7872                     setIppErrorStatus();
7873                 }
7874             }
7875 #endif
7876 
7877             if( depth == CV_8U )
7878                 CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
7879             else if( depth == CV_16U )
7880                 CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
7881             else
7882                 CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
7883             break;
7884 
7885         case CV_XYZ2BGR: case CV_XYZ2RGB:
7886             if( dcn <= 0 ) dcn = 3;
7887             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
7888             bidx = code == CV_XYZ2BGR ? 0 : 2;
7889 
7890             _dst.create(sz, CV_MAKETYPE(depth, dcn));
7891             dst = _dst.getMat();
7892 
7893 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7894             CV_IPP_CHECK()
7895             {
7896                 if( code == CV_XYZ2BGR && dcn == 3 && depth != CV_32F )
7897                 {
7898                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
7899                     {
7900                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7901                         return;
7902                     }
7903                     setIppErrorStatus();
7904                 }
7905                 else if( code == CV_XYZ2BGR && dcn == 4 && depth != CV_32F )
7906                 {
7907                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
7908                     {
7909                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7910                         return;
7911                     }
7912                     setIppErrorStatus();
7913                 }
7914                 if( code == CV_XYZ2RGB && dcn == 3 && depth != CV_32F )
7915                 {
7916                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiXYZ2RGBTab[depth])) )
7917                     {
7918                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7919                         return;
7920                     }
7921                     setIppErrorStatus();
7922                 }
7923                 else if( code == CV_XYZ2RGB && dcn == 4 && depth != CV_32F )
7924                 {
7925                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
7926                     {
7927                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7928                         return;
7929                     }
7930                     setIppErrorStatus();
7931                 }
7932             }
7933 #endif
7934 
7935             if( depth == CV_8U )
7936                 CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
7937             else if( depth == CV_16U )
7938                 CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
7939             else
7940                 CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
7941             break;
7942 
7943         case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
7944         case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
7945             {
7946             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
7947             bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
7948                 code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
7949             int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
7950                 code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
7951 
7952             _dst.create(sz, CV_MAKETYPE(depth, 3));
7953             dst = _dst.getMat();
7954 
7955 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
7956             CV_IPP_CHECK()
7957             {
7958                 if( depth == CV_8U || depth == CV_16U )
7959                 {
7960 #if 0 // breaks OCL accuracy tests
7961                     if( code == CV_BGR2HSV_FULL && scn == 3 )
7962                     {
7963                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
7964                         {
7965                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7966                             return;
7967                         }
7968                         setIppErrorStatus();
7969                     }
7970                     else if( code == CV_BGR2HSV_FULL && scn == 4 )
7971                     {
7972                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
7973                         {
7974                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7975                             return;
7976                         }
7977                         setIppErrorStatus();
7978                     }
7979                     else if( code == CV_RGB2HSV_FULL && scn == 4 )
7980                     {
7981                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
7982                         {
7983                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7984                             return;
7985                         }
7986                         setIppErrorStatus();
7987                     } else
7988 #endif
7989                     if( code == CV_RGB2HSV_FULL && scn == 3 && depth == CV_16U )
7990                     {
7991                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HSVTab[depth])) )
7992                         {
7993                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
7994                             return;
7995                         }
7996                         setIppErrorStatus();
7997                     }
7998                     else if( code == CV_BGR2HLS_FULL && scn == 3 )
7999                     {
8000                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
8001                         {
8002                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8003                             return;
8004                         }
8005                         setIppErrorStatus();
8006                     }
8007                     else if( code == CV_BGR2HLS_FULL && scn == 4 )
8008                     {
8009                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
8010                         {
8011                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8012                             return;
8013                         }
8014                         setIppErrorStatus();
8015                     }
8016                     else if( code == CV_RGB2HLS_FULL && scn == 3 )
8017                     {
8018                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
8019                         {
8020                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8021                             return;
8022                         }
8023                         setIppErrorStatus();
8024                     }
8025                     else if( code == CV_RGB2HLS_FULL && scn == 4 )
8026                     {
8027                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
8028                         {
8029                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8030                             return;
8031                         }
8032                         setIppErrorStatus();
8033                     }
8034                 }
8035             }
8036 #endif
8037 
8038             if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
8039                 code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
8040             {
8041 #ifdef HAVE_TEGRA_OPTIMIZATION
8042                 if(tegra::useTegra() && tegra::cvtRGB2HSV(src, dst, bidx, hrange))
8043                     break;
8044 #endif
8045                 if( depth == CV_8U )
8046                     CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
8047                 else
8048                     CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
8049             }
8050             else
8051             {
8052                 if( depth == CV_8U )
8053                     CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
8054                 else
8055                     CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
8056             }
8057             }
8058             break;
8059 
8060         case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
8061         case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
8062             {
8063             if( dcn <= 0 ) dcn = 3;
8064             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
8065             bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
8066                 code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
8067             int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
8068                 code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
8069 
8070             _dst.create(sz, CV_MAKETYPE(depth, dcn));
8071             dst = _dst.getMat();
8072 
8073 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
8074             CV_IPP_CHECK()
8075             {
8076                 if( depth == CV_8U || depth == CV_16U )
8077                 {
8078                     if( code == CV_HSV2BGR_FULL && dcn == 3 )
8079                     {
8080                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8081                         {
8082                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8083                             return;
8084                         }
8085                         setIppErrorStatus();
8086                     }
8087                     else if( code == CV_HSV2BGR_FULL && dcn == 4 )
8088                     {
8089                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8090                         {
8091                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8092                             return;
8093                         }
8094                         setIppErrorStatus();
8095                     }
8096                     else if( code == CV_HSV2RGB_FULL && dcn == 3 )
8097                     {
8098                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
8099                         {
8100                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8101                             return;
8102                         }
8103                         setIppErrorStatus();
8104                     }
8105                     else if( code == CV_HSV2RGB_FULL && dcn == 4 )
8106                     {
8107                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8108                         {
8109                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8110                             return;
8111                         }
8112                         setIppErrorStatus();
8113                     }
8114                     else if( code == CV_HLS2BGR_FULL && dcn == 3 )
8115                     {
8116                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8117                         {
8118                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8119                             return;
8120                         }
8121                         setIppErrorStatus();
8122                     }
8123                     else if( code == CV_HLS2BGR_FULL && dcn == 4 )
8124                     {
8125                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8126                         {
8127                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8128                             return;
8129                         }
8130                         setIppErrorStatus();
8131                     }
8132                     else if( code == CV_HLS2RGB_FULL && dcn == 3 )
8133                     {
8134                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
8135                         {
8136                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8137                             return;
8138                         }
8139                         setIppErrorStatus();
8140                     }
8141                     else if( code == CV_HLS2RGB_FULL && dcn == 4 )
8142                     {
8143                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8144                         {
8145                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8146                             return;
8147                         }
8148                         setIppErrorStatus();
8149                     }
8150                 }
8151             }
8152 #endif
8153 
8154             if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
8155                 code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
8156             {
8157                 if( depth == CV_8U )
8158                     CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
8159                 else
8160                     CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
8161             }
8162             else
8163             {
8164                 if( depth == CV_8U )
8165                     CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
8166                 else
8167                     CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
8168             }
8169             }
8170             break;
8171 
8172         case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
8173         case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
8174             {
8175             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
8176             bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
8177                    code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
8178             bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
8179                         code == CV_BGR2Luv || code == CV_RGB2Luv;
8180 
8181             _dst.create(sz, CV_MAKETYPE(depth, 3));
8182             dst = _dst.getMat();
8183 
8184 #if defined HAVE_IPP && 0
8185             CV_IPP_CHECK()
8186             {
8187                 if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U)
8188                 {
8189                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R)))
8190                     {
8191                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8192                         return;
8193                     }
8194                     setIppErrorStatus();
8195                 }
8196                 else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U)
8197                 {
8198                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8199                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth)))
8200                     {
8201                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8202                         return;
8203                     }
8204                     setIppErrorStatus();
8205                 }
8206                 else
8207                 if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) // slower than OpenCV
8208                 {
8209                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
8210                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
8211                     {
8212                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8213                         return;
8214                     }
8215                     setIppErrorStatus();
8216                 }
8217                 else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) // slower than OpenCV
8218                 {
8219                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8220                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
8221                     {
8222                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8223                         return;
8224                     }
8225                     setIppErrorStatus();
8226                 }
8227                 else if (code == CV_LRGB2Luv && scn == 3)
8228                 {
8229                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth])))
8230                     {
8231                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8232                         return;
8233                     }
8234                     setIppErrorStatus();
8235                 }
8236                 else if (code == CV_LRGB2Luv && scn == 4)
8237                 {
8238                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8239                                                                            ippiRGBToLUVTab[depth], 0, 1, 2, depth)))
8240                     {
8241                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8242                         return;
8243                     }
8244                     setIppErrorStatus();
8245                 }
8246                 else if (code == CV_LBGR2Luv && scn == 3)
8247                 {
8248                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
8249                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
8250                     {
8251                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8252                         return;
8253                     }
8254                     setIppErrorStatus();
8255                 }
8256                 else if (code == CV_LBGR2Luv && scn == 4)
8257                 {
8258                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
8259                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
8260                     {
8261                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8262                         return;
8263                     }
8264                     setIppErrorStatus();
8265                 }
8266             }
8267 #endif
8268 
8269             if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
8270                 code == CV_LBGR2Lab || code == CV_LRGB2Lab )
8271             {
8272                 if( depth == CV_8U )
8273                     CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
8274                 else
8275                     CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
8276             }
8277             else
8278             {
8279                 if( depth == CV_8U )
8280                     CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
8281                 else
8282                     CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
8283             }
8284             }
8285             break;
8286 
8287         case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
8288         case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
8289             {
8290             if( dcn <= 0 ) dcn = 3;
8291             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
8292             bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
8293                    code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
8294             bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
8295                     code == CV_Luv2BGR || code == CV_Luv2RGB;
8296 
8297             _dst.create(sz, CV_MAKETYPE(depth, dcn));
8298             dst = _dst.getMat();
8299 
8300 #if defined HAVE_IPP && 0
8301             CV_IPP_CHECK()
8302             {
8303                 if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U)
8304                 {
8305                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) )
8306                     {
8307                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8308                         return;
8309                     }
8310                     setIppErrorStatus();
8311                 }
8312                 else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U )
8313                 {
8314                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8315                                         ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8316                     {
8317                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8318                         return;
8319                     }
8320                     setIppErrorStatus();
8321                 }
8322                 if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U )
8323                 {
8324                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8325                                                                                ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8326                     {
8327                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8328                         return;
8329                     }
8330                     setIppErrorStatus();
8331                 }
8332                 else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U )
8333                 {
8334                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
8335                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8336                     {
8337                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8338                         return;
8339                     }
8340                     setIppErrorStatus();
8341                 }
8342                 if( code == CV_Luv2LRGB && dcn == 3 )
8343                 {
8344                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) )
8345                         return;
8346                 }
8347                 else if( code == CV_Luv2LRGB && dcn == 4 )
8348                 {
8349                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8350                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
8351                     {
8352                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8353                         return;
8354                     }
8355                 }
8356                 if( code == CV_Luv2LBGR && dcn == 3 )
8357                 {
8358                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8359                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
8360                     {
8361                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8362                         return;
8363                     }
8364                 }
8365                 else if( code == CV_Luv2LBGR && dcn == 4 )
8366                 {
8367                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
8368                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
8369                     {
8370                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8371                         return;
8372                     }
8373                 }
8374             }
8375 #endif
8376 
8377             if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
8378                 code == CV_Lab2LBGR || code == CV_Lab2LRGB )
8379             {
8380                 if( depth == CV_8U )
8381                     CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
8382                 else
8383                     CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
8384             }
8385             else
8386             {
8387                 if( depth == CV_8U )
8388                     CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
8389                 else
8390                     CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
8391             }
8392             }
8393             break;
8394 
8395         case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
8396         case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
8397         case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
8398         case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
8399             demosaicing(src, _dst, code, dcn);
8400             break;
8401 
8402         case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
8403         case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
8404             {
8405                 // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
8406                 // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
8407 
8408                 if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
8409                 const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
8410                 const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
8411 
8412                 CV_Assert( dcn == 3 || dcn == 4 );
8413                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8414 
8415                 Size dstSz(sz.width, sz.height * 2 / 3);
8416                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8417                 dst = _dst.getMat();
8418 
8419                 int srcstep = (int)src.step;
8420                 const uchar* y = src.ptr();
8421                 const uchar* uv = y + srcstep * dstSz.height;
8422 
8423                 switch(dcn*100 + bIdx * 10 + uIdx)
8424                 {
8425                     case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
8426                     case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
8427                     case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
8428                     case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
8429                     case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
8430                     case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
8431                     case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
8432                     case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
8433                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8434                 };
8435             }
8436             break;
8437         case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
8438         case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
8439             {
8440                 //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
8441                 //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
8442 
8443                 if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
8444                 const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
8445                 const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
8446 
8447                 CV_Assert( dcn == 3 || dcn == 4 );
8448                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8449 
8450                 Size dstSz(sz.width, sz.height * 2 / 3);
8451                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8452                 dst = _dst.getMat();
8453 
8454                 int srcstep = (int)src.step;
8455                 const uchar* y = src.ptr();
8456                 const uchar* u = y + srcstep * dstSz.height;
8457                 const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
8458 
8459                 int ustepIdx = 0;
8460                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
8461 
8462                 if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
8463 
8464                 switch(dcn*10 + bIdx)
8465                 {
8466                     case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8467                     case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8468                     case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8469                     case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
8470                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8471                 };
8472             }
8473             break;
8474         case CV_YUV2GRAY_420:
8475             {
8476                 if (dcn <= 0) dcn = 1;
8477 
8478                 CV_Assert( dcn == 1 );
8479                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
8480 
8481                 Size dstSz(sz.width, sz.height * 2 / 3);
8482                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8483                 dst = _dst.getMat();
8484 #if defined HAVE_IPP
8485                 CV_IPP_CHECK()
8486                 {
8487                     if (ippStsNoErr == ippiCopy_8u_C1R(src.data, (int)src.step, dst.data, (int)dst.step,
8488                             ippiSize(dstSz.width, dstSz.height)))
8489                     {
8490                         CV_IMPL_ADD(CV_IMPL_IPP);
8491                         return;
8492                     }
8493                     setIppErrorStatus();
8494                 }
8495 #endif
8496                 src(Range(0, dstSz.height), Range::all()).copyTo(dst);
8497             }
8498             break;
8499         case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
8500         case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
8501             {
8502                 if (dcn <= 0) dcn = 1;
8503                 const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
8504                 const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
8505 
8506                 CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
8507                 CV_Assert( dcn == 1 );
8508                 CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
8509 
8510                 Size dstSz(sz.width, sz.height / 2 * 3);
8511                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
8512                 dst = _dst.getMat();
8513 
8514                 switch(bIdx + uIdx*10)
8515                 {
8516                     case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
8517                     case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
8518                     case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
8519                     case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
8520                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8521                 };
8522             }
8523             break;
8524         case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
8525         case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
8526         case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
8527             {
8528                 //http://www.fourcc.org/yuv.php#UYVY
8529                 //http://www.fourcc.org/yuv.php#YUY2
8530                 //http://www.fourcc.org/yuv.php#YVYU
8531 
8532                 if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
8533                 const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
8534                 const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
8535                 const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
8536 
8537                 CV_Assert( dcn == 3 || dcn == 4 );
8538                 CV_Assert( scn == 2 && depth == CV_8U );
8539 
8540                 _dst.create(sz, CV_8UC(dcn));
8541                 dst = _dst.getMat();
8542 
8543                 switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
8544                 {
8545                     case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8546                     case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8547                     case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8548                     case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8549                     case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8550                     case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8551                     case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8552                     case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8553                     case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8554                     case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8555                     case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8556                     case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8557                     case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8558                     case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8559                     case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
8560                     case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
8561                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
8562                 };
8563             }
8564             break;
8565         case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
8566             {
8567                 if (dcn <= 0) dcn = 1;
8568 
8569                 CV_Assert( dcn == 1 );
8570                 CV_Assert( scn == 2 && depth == CV_8U );
8571 
8572                 extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
8573             }
8574             break;
8575         case CV_RGBA2mRGBA:
8576             {
8577                 if (dcn <= 0) dcn = 4;
8578                 CV_Assert( scn == 4 && dcn == 4 );
8579 
8580                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
8581                 dst = _dst.getMat();
8582 
8583                 if( depth == CV_8U )
8584                 {
8585 #if defined(HAVE_IPP)
8586                     CV_IPP_CHECK()
8587                     {
8588                         if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
8589                         {
8590                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
8591                             return;
8592                         }
8593                         setIppErrorStatus();
8594                     }
8595 #endif
8596                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
8597                 }
8598                 else
8599                 {
8600                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
8601                 }
8602             }
8603             break;
8604         case CV_mRGBA2RGBA:
8605             {
8606                 if (dcn <= 0) dcn = 4;
8607                 CV_Assert( scn == 4 && dcn == 4 );
8608 
8609                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
8610                 dst = _dst.getMat();
8611 
8612                 if( depth == CV_8U )
8613                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
8614                 else
8615                 {
8616                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
8617                 }
8618             }
8619             break;
8620         default:
8621             CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
8622     }
8623 }
8624 
8625 CV_IMPL void
cvCvtColor(const CvArr * srcarr,CvArr * dstarr,int code)8626 cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )
8627 {
8628     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
8629     CV_Assert( src.depth() == dst.depth() );
8630 
8631     cv::cvtColor(src, dst, code, dst.channels());
8632     CV_Assert( dst.data == dst0.data );
8633 }
8634 
8635 
8636 /* End of file. */
8637