/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

#if IPP_VERSION_X100 >= 701
#define USE_IPP_SEP_FILTERS 1
#else
#undef USE_IPP_SEP_FILTERS
#endif
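
// Note: IPP_VERSION_X100 encodes the linked Intel IPP version as an integer
// (e.g. 701 for IPP 7.1), so the guard above enables the IPP separable-filter
// path only for sufficiently recent IPP releases.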

namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( rowFilter && columnFilter );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType1,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}
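
/*
  Usage sketch (illustrative only; createSeparableLinearFilter is assumed to
  be the separable-filter factory defined elsewhere in this module):

      Mat kx = getGaussianKernel(5, -1, CV_32F), ky = kx;
      Ptr<FilterEngine> f = createSeparableLinearFilter(CV_8UC1, CV_8UC1, kx, ky);
      Mat src = ..., dst(src.size(), CV_8UC1);
      f->apply(src, dst);

  init() only records the configuration; the row buffer, ring buffer and
  border tables are allocated lazily in start(), so one engine can be reused
  for images of different sizes.
*/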

#define VEC_ALIGN CV_MALLOC_ALIGN

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
        roi.x + roi.width <= wholeSize.width &&
        roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( columnFilter )
        columnFilter->reset();
    if( filter2D )
        filter2D->reset();

    return startY;
}
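
/*
  Worked example of the geometry above (a sketch; assume a 5x5 kernel with the
  centered anchor (2,2) and an ROI starting at the image origin): dx1 = dx2 = 2,
  i.e. two border columns are synthesized on each side of every row;
  startY = max(roi.y - 2, 0) = 0 is the first source row that will be fetched
  and endY = min(roi.y + roi.height + 2, wholeSize.height) is one past the
  last, so up to ksize.height - 1 rows beyond the ROI are read to prime the
  ring buffer before the first output row can be emitted.
*/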


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
        srcRoi.width >= 0 && srcRoi.height >= 0 &&
        srcRoi.x + srcRoi.width <= src.cols &&
        srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wsz(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wsz, ofs );
    start( wsz, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}


int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                            wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );
    return dy;
}
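
/*
  Streaming usage sketch (an illustration of how proceed() can be driven with
  arbitrary horizontal slices; apply() below does the same in a single call):

      const uchar* sptr = ...;   // first input row, as returned by start()
      uchar* dptr = ...;         // first destination row
      for( int remaining = fe.remainingInputRows(); remaining > 0; )
      {
          int chunk = std::min(remaining, 32);   // hypothetical slice height
          int outRows = fe.proceed(sptr, (int)src.step, chunk,
                                   dptr, (int)dst.step);
          sptr += chunk*src.step;
          dptr += outRows*dst.step;
          remaining -= chunk;
      }

  proceed() buffers rows until a full vertical kernel aperture is available,
  so early calls may legitimately report 0 output rows.
*/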


void FilterEngine::apply(const Mat& src, Mat& dst,
    const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
        dstOfs.x + srcRoi.width <= dst.cols &&
        dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.ptr() + y*src.step + srcRoi.x*src.elemSize(),
             (int)src.step, endY - startY,
             dst.ptr(dstOfs.y) +
             dstOfs.x*dst.elemSize(), (int)dst.step );
}
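
/*
  A minimal end-to-end sketch of apply() (assuming an engine configured for
  matching src/dst types):

      Rect roi(16, 16, 128, 128);
      fe.apply(src, dst, roi, Point(0, 0), false);

  With isolated == false, start() calls src.locateROI(), so real pixels of the
  parent image outside `roi` are used wherever they exist and border
  interpolation only kicks in at the edges of the whole image.
*/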

}

/****************************************************************************************\
*                                 Separable linear filter                                *
\****************************************************************************************/

int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = kernel.ptr<double>();
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
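
/*
  Examples of the classification above (straightforward consequences of the
  checks, not extra API): a normalized 3x1 box kernel (1/3, 1/3, 1/3) with
  anchor (1,0) reports KERNEL_SMOOTH | KERNEL_SYMMETRICAL; the derivative
  kernel (-1, 0, 1) reports KERNEL_ASYMMETRICAL | KERNEL_INTEGER, losing
  KERNEL_SMOOTH because of the negative coefficient.
*/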


namespace cv
{

struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};
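
// The *NoVec structs above are the scalar stand-ins for the vectorized
// operators below: returning 0 from operator() tells the calling row/column
// filter that no pixels were handled by the SIMD path, so its generic C++
// loop must process the whole row.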


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = kernel.ptr<int>();
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};
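
/*
  Scalar equivalent of RowVec_8u32s::operator() (a reference sketch of what
  the SSE2 loops above compute, 16 or 4 pixels at a time, using mullo/mulhi
  16-bit products recombined into 32-bit sums):

      for( int i = 0; i < width*cn; i++ )
      {
          int s = 0;
          for( int k = 0; k < _ksize; k++ )
              s += _kx[k] * _src[i + k*cn];
          dst[i] = s;
      }

  The smallValues guard exists because the 16-bit multiply trick is only
  exact when every kernel coefficient fits in a short.
*/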


struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};


struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
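
/*
  SymmColumnVec_32s8u consumes the 32-bit fixed-point rows produced by the
  8u->32s row filters above. The constructor rescales the integer kernel by
  1/2^bits back to floating point, so the symmetric branch computes roughly

      dst[i] = saturate_cast<uchar>( ky[0]*S0[i]
                                     + sum_k ky[k]*(Sup_k[i] + Sdn_k[i]) + delta );

  with Sup_k/Sdn_k denoting the buffered rows k steps above/below the anchor
  row; the asymmetric branch uses differences instead of sums and skips the
  center term.
*/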


struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
                    __m128i x0, x1;
                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
                                       _mm_load_si128((__m128i*)(S0 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
                                       _mm_load_si128((__m128i*)(S0 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
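
/*
  The ky[0] == 2 / ky[0] == -2 fast paths above stay entirely in integer
  arithmetic because those coefficient patterns correspond to the common
  fixed-point smoothing ((1,2,1)) and second-derivative ((1,-2,1)) kernels;
  delta is pre-rounded into d4 once, so each pixel costs only adds, subtracts
  and one pack to 16-bit.
*/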
1204 
1205 
1206 /////////////////////////////////////// 16s //////////////////////////////////
1207 
1208 struct RowVec_16s32f
1209 {
RowVec_16s32fcv::RowVec_16s32f1210     RowVec_16s32f() {}
RowVec_16s32fcv::RowVec_16s32f1211     RowVec_16s32f( const Mat& _kernel )
1212     {
1213         kernel = _kernel;
1214         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
1215     }
1216 
operator ()cv::RowVec_16s32f1217     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
1218     {
1219         if( !sse2_supported )
1220             return 0;
1221 
1222         int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
1223         float* dst = (float*)_dst;
1224         const float* _kx = kernel.ptr<float>();
1225         width *= cn;
1226 
1227         for( ; i <= width - 8; i += 8 )
1228         {
1229             const short* src = (const short*)_src + i;
1230             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
1231             for( k = 0; k < _ksize; k++, src += cn )
1232             {
1233                 f = _mm_load_ss(_kx+k);
1234                 f = _mm_shuffle_ps(f, f, 0);
1235 
1236                 __m128i x0i = _mm_loadu_si128((const __m128i*)src);
1237                 __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
1238                 x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
1239                 x0 = _mm_cvtepi32_ps(x0i);
1240                 x1 = _mm_cvtepi32_ps(x1i);
1241                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1242                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1243             }
1244             _mm_store_ps(dst + i, s0);
1245             _mm_store_ps(dst + i + 4, s1);
1246         }
1247         return i;
1248     }
1249 
1250     Mat kernel;
1251     bool sse2_supported;
1252 };
1253 
1254 
struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !sse2_supported )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
    bool sse2_supported;
};


/////////////////////////////////////// 32f //////////////////////////////////

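// Generic vectorized row filter for CV_32F data. The IPP row-pipeline branch
// below is compiled out ("&& 0"), so only the plain SSE loop is active.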
struct RowVec_32f
{
    RowVec_32f()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
    }

    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
#if defined USE_IPP_SEP_FILTERS && 0
        bufsz = -1;
#endif
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
#if defined USE_IPP_SEP_FILTERS && 0
        CV_IPP_CHECK()
        {
            int ret = ippiOperator(_src, _dst, width, cn);
            if (ret > 0)
                return ret;
        }
#endif
        int _ksize = kernel.rows + kernel.cols - 1;
        const float* src0 = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();

        if( !haveSSE )
            return 0;

        int i = 0, k;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const float* src = src0 + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool haveSSE;
#if defined USE_IPP_SEP_FILTERS && 0
private:
    mutable int bufsz;
    int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int _ksize = kernel.rows + kernel.cols - 1;
        if ((1 != cn && 3 != cn) || width < _ksize*8)
            return 0;

        const float* src = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;

        IppiSize roisz = { width, 1 };
        if( bufsz < 0 )
        {
            if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
                (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
                return 0;
        }
        AutoBuffer<uchar> buf(bufsz + 64);
        uchar* bufptr = alignPtr((uchar*)buf, 32);
        int step = (int)(width*sizeof(dst[0])*cn);
        float borderValue[] = {0.f, 0.f, 0.f};
        // Here is the trick: IPP needs a border type and extrapolates the row itself,
        // but the row has already been extrapolated by the caller. So we pass anchor=0
        // and ignore the right tail of the results, which is incorrect there.
        if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue[0], bufptr) < 0) ||
            (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue, bufptr) < 0))
        {
            setIppErrorStatus();
            return 0;
        }
        CV_IMPL_ADD(CV_IMPL_IPP);
        return width - _ksize + 1;
    }
#endif
};

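// Specialized row filter for small (3- and 5-tap) CV_32F kernels. The common
// (1, 2, 1), (1, -2, 1) and (1, 0, -2, 0, 1) coefficient patterns get
// dedicated multiply-free branches.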
struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = kernel.ptr<float>() + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};

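// Vectorized symmetric/asymmetric column filter with CV_32F input and output;
// same row-folding scheme as SymmColumnVec_32f16s, but results are stored as
// floats without packing.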
struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                _mm_storeu_ps(dst + i, s0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

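// Specialized 3-tap column filter for CV_32F data; the (1, 2, 1), (1, -2, 1)
// and (-1, 0, 1)-style kernels avoid per-element multiplications entirely.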
struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};


/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

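// Non-separable 2D filter for CV_8U input and output. The fixed-point kernel
// is converted back to floats scaled by 2^-_bits, and only the non-zero
// coefficients gathered by preprocess2DKernel are visited per pixel.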
struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};

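// Same scheme as FilterVec_8u, but the result is saturation-packed to CV_16S
// instead of CV_8U.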
struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};

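// Non-separable 2D filter for CV_32F data: a plain multiply-accumulate over
// the non-zero kernel coefficients, 16 (then 4) pixels per iteration.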
struct FilterVec_32f
{
    FilterVec_32f() {}
    FilterVec_32f(const Mat& _kernel, int, double _delta)
    {
        delta = (float)_delta;
        std::vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};


#elif CV_NEON

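// NEON counterpart of the small symmetric row filter: CV_8U source rows,
// CV_32S accumulators. Only kernels whose integer coefficients fit into
// 16 bits are vectorized (see smallValues); everything else returns 0 so the
// caller falls back to the scalar path.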
struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                {
                    uint16x8_t zq = vdupq_n_u16(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        uint16x8_t y0, y1, y2;
                        y0 = vaddl_u8(x0, x2);
                        y1 = vshll_n_u8(x1, 1);
                        y2 = vaddq_u16(y0, y1);

                        // interleave the 16-bit sums with zeros to form
                        // little-endian 32-bit results in dst
                        uint16x8x2_t str;
                        str.val[0] = y2; str.val[1] = zq;
                        vst2q_u16( (uint16_t *) (dst + i), str );
                    }
                }
                else if( kx[0] == -2 && kx[1] == 1 )
                    return 0;
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx, k32, 0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0, y1;
                        int32x4_t y2, y3;
                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                        y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
                        y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
                        y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
                        y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);

                        vst1q_s32((int32_t *)(dst + i), y2);
                        vst1q_s32((int32_t *)(dst + i + 4), y3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    return 0;
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx, k32, 0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
                    k32 = vld1q_lane_s32(kx + 2, k32, 2);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2, x3, x4;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0, y1;
                        int32x4_t accl, acch;
                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                        accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
                        accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
                        acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
                        acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);

                        int16x8_t y2;
                        x3 = vld1_u8( (uint8_t *) (src - cn*2) );
                        x4 = vld1_u8( (uint8_t *) (src + cn*2) );
                        y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
                        accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
                        acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);

                        vst1q_s32((int32_t *)(dst + i), accl);
                        vst1q_s32((int32_t *)(dst + i + 4), acch);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                {
                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0;
                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                vreinterpretq_s16_u16(vaddl_u8(x0, z)));

                        vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
                        vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
                    }
                }
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0;
                        int32x4_t y1, y2;
                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                            vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                        y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
                        y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);

                        vst1q_s32((int32_t *)(dst + i), y1);
                        vst1q_s32((int32_t *)(dst + i + 4), y2);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                int32x4_t k32 = vdupq_n_s32(0);
                k32 = vld1q_lane_s32(kx + 1, k32, 1);
                k32 = vld1q_lane_s32(kx + 2, k32, 2);

                int16x4_t k = vqmovn_s32(k32);

                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    uint8x8_t x0, x1;
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src + cn) );

                    int32x4_t accl, acch;
                    int16x8_t y0;
                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                        vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                    accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
                    acch = vmull_lane_s16(vget_high_s16(y0), k, 1);

                    uint8x8_t x2, x3;
                    x2 = vld1_u8( (uint8_t *) (src - cn*2) );
                    x3 = vld1_u8( (uint8_t *) (src + cn*2) );

                    int16x8_t y1;
                    y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
                        vreinterpretq_s16_u16(vaddl_u8(x2, z)));
                    accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
                    acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);

                    vst1q_s32((int32_t *)(dst + i), accl);
                    vst1q_s32((int32_t *)(dst + i + 4), acch);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};

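// NEON symmetric column filter: CV_32S intermediate rows in, CV_8U out. The
// fixed-point kernel is rescaled to float, accumulated with vmlaq, and the
// result is narrowed to unsigned 8 bits with saturation.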
struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int _ksize = kernel.rows + kernel.cols - 1;
        int ksize2 = _ksize / 2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S, *S2;

        float32x4_t d4 = vdupq_n_f32(delta);

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;

            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky, k32, 0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t accl, acch;
                float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;

                S = src[0] + i;

                f0l = vcvtq_f32_s32( vld1q_s32(S) );
                f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );

                S = src[1] + i;
                S2 = src[-1] + i;

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, f0l, k32, 0);
                acch = vmlaq_lane_f32(acch, f0h, k32, 0);
                accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                uint8x8_t u8;
                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));

                vst1_u8((uint8_t *)(dst + i), u8);
            }
        }
        else
        {
            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t accl, acch;
                float32x4_t f1l, f1h, f2l, f2h;

                S = src[1] + i;
                S2 = src[-1] + i;

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                uint8x8_t u8;
                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));

                vst1_u8((uint8_t *)(dst + i), u8);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

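// NEON 3-tap column filter: CV_32S rows in, CV_16S out, with dedicated
// branches for the (1, 2, 1), (1, -2, 1) and (3, 10, 3) coefficient patterns.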
struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        float32x4_t df4 = vdupq_n_f32(delta);
        int32x4_t d4 = vcvtq_s32_f32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vaddq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    int16x4_t t;
                    t = vqmovn_s32(y3);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vsubq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    int16x4_t t;
                    t = vqmovn_s32(y3);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else if( ky[0] == 10 && ky[1] == 3 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2, x3;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    int32x4_t y0;
                    y0 = vmlaq_n_s32(d4, x1, 10);
                    y0 = vmlaq_n_s32(y0, x3, 3);

                    int16x4_t t;
                    t = vqmovn_s32(y0);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else
            {
                float32x2_t k32 = vdup_n_f32(0);
                k32 = vld1_lane_f32(ky, k32, 0);
                k32 = vld1_lane_f32(ky + 1, k32, 1);

                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2, x3, x4;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    float32x4_t s0, s1, s2;
                    s0 = vcvtq_f32_s32(x1);
                    s1 = vcvtq_f32_s32(x3);
                    s2 = vmlaq_lane_f32(df4, s0, k32, 0);
                    s2 = vmlaq_lane_f32(s2, s1, k32, 1);
2711 
2712                     x4 = vcvtq_s32_f32(s2);
2713 
2714                     int16x4_t x5;
2715                     x5 = vqmovn_s32(x4);
2716 
2717                     vst1_s16((int16_t *)(dst + i), x5);
2718                 }
2719             }
2720         }
2721         else
2722         {
2723             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
2724             {
2725                 if( ky[1] < 0 )
2726                     std::swap(S0, S2);
2727                 for( ; i <= width - 4; i += 4 )
2728                 {
2729                     int32x4_t x0, x1;
2730                     x0 = vld1q_s32((int32_t const *)(S0 + i));
2731                     x1 = vld1q_s32((int32_t const *)(S2 + i));
2732 
2733                     int32x4_t y0, y1;
2734                     y0 = vsubq_s32(x1, x0);
2735                     y1 = vqaddq_s32(y0, d4);
2736 
2737                     int16x4_t t;
2738                     t = vqmovn_s32(y1);
2739 
2740                     vst1_s16((int16_t *)(dst + i), t);
2741                 }
2742             }
2743             else
2744             {
2745                 float32x2_t k32 = vdup_n_f32(0);
2746                 k32 = vld1_lane_f32(ky + 1, k32, 1);
2747 
2748                 for( ; i <= width - 4; i += 4 )
2749                 {
2750                     int32x4_t x0, x1, x2, x3;
2751                     x0 = vld1q_s32((int32_t const *)(S0 + i));
2752                     x1 = vld1q_s32((int32_t const *)(S2 + i));
2753 
2754                     x2 = vsubq_s32(x1, x0);
2755 
2756                     float32x4_t s0, s1;
2757                     s0 = vcvtq_f32_s32(x2);
2758                     s1 = vmlaq_lane_f32(df4, s0, k32, 1);
2759 
2760                     x3 = vcvtq_s32_f32(s1);
2761 
2762                     int16x4_t x4;
2763                     x4 = vqmovn_s32(x3);
2764 
2765                     vst1_s16((int16_t *)(dst + i), x4);
2766                 }
2767             }
2768         }
2769 
2770         return i;
2771     }
2772 
2773     int symmetryType;
2774     float delta;
2775     Mat kernel;
2776 };
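// Note the fixed-point handshake in the constructor above: the int rows coming
// in were produced with taps pre-multiplied by (1 << _bits), so the column taps
// and delta are divided back down by the same factor. The fast paths match the
// common fixed kernels ky = {1,2,1} (smoothing), {1,-2,1} (second derivative)
// and {3,10,3} (Scharr); the generic symmetric branch reduces to the scalar form
/*
    dst[i] = saturate_cast<short>( ky[0]*S1[i] + ky[1]*(S0[i] + S2[i]) + delta );
*/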
2777 
2778 
2779 struct SymmColumnVec_32f16s
2780 {
2781     SymmColumnVec_32f16s() { symmetryType=0; }
2782     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
2783     {
2784         symmetryType = _symmetryType;
2785         kernel = _kernel;
2786         delta = (float)_delta;
2787         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
2788         neon_supported = checkHardwareSupport(CV_CPU_NEON);
2789     }
2790 
2791     int operator()(const uchar** _src, uchar* _dst, int width) const
2792     {
2793         if( !neon_supported )
2794             return 0;
2795 
2796         int _ksize = kernel.rows + kernel.cols - 1;
2797         int ksize2 = _ksize / 2;
2798         const float* ky = kernel.ptr<float>() + ksize2;
2799         int i = 0, k;
2800         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2801         const float** src = (const float**)_src;
2802         const float *S, *S2;
2803         short* dst = (short*)_dst;
2804 
2805         float32x4_t d4 = vdupq_n_f32(delta);
2806 
2807         if( symmetrical )
2808         {
2809             if( _ksize == 1 )
2810                 return 0;
2811 
2812 
2813             float32x2_t k32;
2814             k32 = vdup_n_f32(0);
2815             k32 = vld1_lane_f32(ky, k32, 0);
2816             k32 = vld1_lane_f32(ky + 1, k32, 1);
2817 
2818             for( ; i <= width - 8; i += 8 )
2819             {
2820                 float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
2821                 float32x4_t accl, acch;
2822 
2823                 S = src[0] + i;
2824 
2825                 x0l = vld1q_f32(S);
2826                 x0h = vld1q_f32(S + 4);
2827 
2828                 S = src[1] + i;
2829                 S2 = src[-1] + i;
2830 
2831                 x1l = vld1q_f32(S);
2832                 x1h = vld1q_f32(S + 4);
2833                 x2l = vld1q_f32(S2);
2834                 x2h = vld1q_f32(S2 + 4);
2835 
2836                 accl = acch = d4;
2837                 accl = vmlaq_lane_f32(accl, x0l, k32, 0);
2838                 acch = vmlaq_lane_f32(acch, x0h, k32, 0);
2839                 accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
2840                 acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
2841 
2842                 for( k = 2; k <= ksize2; k++ )
2843                 {
2844                     S = src[k] + i;
2845                     S2 = src[-k] + i;
2846 
2847                     float32x4_t x3l, x3h, x4l, x4h;
2848                     x3l = vld1q_f32(S);
2849                     x3h = vld1q_f32(S + 4);
2850                     x4l = vld1q_f32(S2);
2851                     x4h = vld1q_f32(S2 + 4);
2852 
2853                     accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
2854                     acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
2855                 }
2856 
2857                 int32x4_t s32l, s32h;
2858                 s32l = vcvtq_s32_f32(accl);
2859                 s32h = vcvtq_s32_f32(acch);
2860 
2861                 int16x4_t s16l, s16h;
2862                 s16l = vqmovn_s32(s32l);
2863                 s16h = vqmovn_s32(s32h);
2864 
2865                 vst1_s16((int16_t *)(dst + i), s16l);
2866                 vst1_s16((int16_t *)(dst + i + 4), s16h);
2867             }
2868         }
2869         else
2870         {
2871             float32x2_t k32;
2872             k32 = vdup_n_f32(0);
2873             k32 = vld1_lane_f32(ky + 1, k32, 1);
2874 
2875             for( ; i <= width - 8; i += 8 )
2876             {
2877                 float32x4_t x1l, x1h, x2l, x2h;
2878                 float32x4_t accl, acch;
2879 
2880                 S = src[1] + i;
2881                 S2 = src[-1] + i;
2882 
2883                 x1l = vld1q_f32(S);
2884                 x1h = vld1q_f32(S + 4);
2885                 x2l = vld1q_f32(S2);
2886                 x2h = vld1q_f32(S2 + 4);
2887 
2888                 accl = acch = d4;
2889                 accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
2890                 acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
2891 
2892                 for( k = 2; k <= ksize2; k++ )
2893                 {
2894                     S = src[k] + i;
2895                     S2 = src[-k] + i;
2896 
2897                     float32x4_t x3l, x3h, x4l, x4h;
2898                     x3l = vld1q_f32(S);
2899                     x3h = vld1q_f32(S + 4);
2900                     x4l = vld1q_f32(S2);
2901                     x4h = vld1q_f32(S2 + 4);
2902 
2903                     accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
2904                     acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
2905                 }
2906 
2907                 int32x4_t s32l, s32h;
2908                 s32l = vcvtq_s32_f32(accl);
2909                 s32h = vcvtq_s32_f32(acch);
2910 
2911                 int16x4_t s16l, s16h;
2912                 s16l = vqmovn_s32(s32l);
2913                 s16h = vqmovn_s32(s32h);
2914 
2915                 vst1_s16((int16_t *)(dst + i), s16l);
2916                 vst1_s16((int16_t *)(dst + i + 4), s16h);
2917             }
2918         }
2919 
2920         return i;
2921     }
2922 
2923     int symmetryType;
2924     float delta;
2925     Mat kernel;
2926     bool neon_supported;
2927 };
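// The symmetric/antisymmetric split above halves the multiplications: for a
// symmetrical kernel
//   dst[i] = delta + ky[0]*src[0][i] + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i])
// while for an antisymmetrical kernel the center term vanishes and the sums
// become differences, e.g. in scalar form (assuming float rows):
/*
    float s = delta;
    for( int k = 1; k <= ksize2; k++ )
        s += ky[k]*(src[k][i] - src[-k][i]);
*/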
2928 
2929 
2930 struct SymmRowSmallVec_32f
2931 {
2932     SymmRowSmallVec_32f() {}
2933     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
2934     {
2935         kernel = _kernel;
2936         symmetryType = _symmetryType;
2937     }
2938 
2939     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
2940     {
2941         if( !checkHardwareSupport(CV_CPU_NEON) )
2942             return 0;
2943 
2944         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
2945         float* dst = (float*)_dst;
2946         const float* src = (const float*)_src + (_ksize/2)*cn;
2947         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
2948         const float* kx = kernel.ptr<float>() + _ksize/2;
2949         width *= cn;
2950 
2951         if( symmetrical )
2952         {
2953             if( _ksize == 1 )
2954                 return 0;
2955             if( _ksize == 3 )
2956             {
2957                 if( kx[0] == 2 && kx[1] == 1 )
2958                     return 0;
2959                 else if( kx[0] == -2 && kx[1] == 1 )
2960                     return 0;
2961                 else
2962                 {
2963                     return 0;
2964                 }
2965             }
2966             else if( _ksize == 5 )
2967             {
2968                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
2969                     return 0;
2970                 else
2971                 {
2972                     float32x2_t k0, k1;
2973                     k0 = k1 = vdup_n_f32(0);
2974                     k0 = vld1_lane_f32(kx + 0, k0, 0);
2975                     k0 = vld1_lane_f32(kx + 1, k0, 1);
2976                     k1 = vld1_lane_f32(kx + 2, k1, 0);
2977 
2978                     for( ; i <= width - 4; i += 4, src += 4 )
2979                     {
2980                         float32x4_t x0, x1, x2, x3, x4;
2981                         x0 = vld1q_f32(src);
2982                         x1 = vld1q_f32(src - cn);
2983                         x2 = vld1q_f32(src + cn);
2984                         x3 = vld1q_f32(src - cn*2);
2985                         x4 = vld1q_f32(src + cn*2);
2986 
2987                         float32x4_t y0;
2988                         y0 = vmulq_lane_f32(x0, k0, 0);
2989                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
2990                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
2991 
2992                         vst1q_f32(dst + i, y0);
2993                     }
2994                 }
2995             }
2996         }
2997         else
2998         {
2999             if( _ksize == 3 )
3000             {
3001                 if( kx[0] == 0 && kx[1] == 1 )
3002                     return 0;
3003                 else
3004                 {
3005                     return 0;
3006                 }
3007             }
3008             else if( _ksize == 5 )
3009             {
3010                 float32x2_t k;
3011                 k = vdup_n_f32(0);
3012                 k = vld1_lane_f32(kx + 1, k, 0);
3013                 k = vld1_lane_f32(kx + 2, k, 1);
3014 
3015                 for( ; i <= width - 4; i += 4, src += 4 )
3016                 {
3017                     float32x4_t x0, x1, x2, x3;
3018                     x0 = vld1q_f32(src - cn);
3019                     x1 = vld1q_f32(src + cn);
3020                     x2 = vld1q_f32(src - cn*2);
3021                     x3 = vld1q_f32(src + cn*2);
3022 
3023                     float32x4_t y0;
3024                     y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
3025                     y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
3026 
3027                     vst1q_f32(dst + i, y0);
3028                 }
3029             }
3030         }
3031 
3032         return i;
3033     }
3034 
3035     Mat kernel;
3036     int symmetryType;
3037 };
3038 
3039 
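// The combinations below have no NEON specialization, so they fall back to the
// *NoVec placeholders; their operator() returns 0 ("no elements processed"),
// which makes the scalar tail loops in RowFilter/ColumnFilter/Filter2D below
// handle the entire row.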
3040 typedef RowNoVec RowVec_8u32s;
3041 typedef RowNoVec RowVec_16s32f;
3042 typedef RowNoVec RowVec_32f;
3043 typedef ColumnNoVec SymmColumnVec_32f;
3044 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
3045 typedef FilterNoVec FilterVec_8u;
3046 typedef FilterNoVec FilterVec_8u16s;
3047 typedef FilterNoVec FilterVec_32f;
3048 
3049 
3050 #else
3051 
3052 typedef RowNoVec RowVec_8u32s;
3053 typedef RowNoVec RowVec_16s32f;
3054 typedef RowNoVec RowVec_32f;
3055 typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
3056 typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
3057 typedef ColumnNoVec SymmColumnVec_32s8u;
3058 typedef ColumnNoVec SymmColumnVec_32f16s;
3059 typedef ColumnNoVec SymmColumnVec_32f;
3060 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
3061 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
3062 typedef FilterNoVec FilterVec_8u;
3063 typedef FilterNoVec FilterVec_8u16s;
3064 typedef FilterNoVec FilterVec_32f;
3065 
3066 #endif
3067 
3068 
3069 template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
3070 {
3071     RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
3072     {
3073         if( _kernel.isContinuous() )
3074             kernel = _kernel;
3075         else
3076             _kernel.copyTo(kernel);
3077         anchor = _anchor;
3078         ksize = kernel.rows + kernel.cols - 1;
3079         CV_Assert( kernel.type() == DataType<DT>::type &&
3080                    (kernel.rows == 1 || kernel.cols == 1));
3081         vecOp = _vecOp;
3082     }
3083 
3084     void operator()(const uchar* src, uchar* dst, int width, int cn)
3085     {
3086         int _ksize = ksize;
3087         const DT* kx = kernel.ptr<DT>();
3088         const ST* S;
3089         DT* D = (DT*)dst;
3090         int i, k;
3091 
3092         i = vecOp(src, dst, width, cn);
3093         width *= cn;
3094         #if CV_ENABLE_UNROLLED
3095         for( ; i <= width - 4; i += 4 )
3096         {
3097             S = (const ST*)src + i;
3098             DT f = kx[0];
3099             DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];
3100 
3101             for( k = 1; k < _ksize; k++ )
3102             {
3103                 S += cn;
3104                 f = kx[k];
3105                 s0 += f*S[0]; s1 += f*S[1];
3106                 s2 += f*S[2]; s3 += f*S[3];
3107             }
3108 
3109             D[i] = s0; D[i+1] = s1;
3110             D[i+2] = s2; D[i+3] = s3;
3111         }
3112         #endif
3113         for( ; i < width; i++ )
3114         {
3115             S = (const ST*)src + i;
3116             DT s0 = kx[0]*S[0];
3117             for( k = 1; k < _ksize; k++ )
3118             {
3119                 S += cn;
3120                 s0 += kx[k]*S[0];
3121             }
3122             D[i] = s0;
3123         }
3124     }
3125 
3126     Mat kernel;
3127     VecOp vecOp;
3128 };
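// RowFilter shows the vecOp contract used throughout this file: the vectorized
// functor may process any prefix of the row and returns the index it stopped
// at; the scalar loops finish the remainder. The scalar math itself is a plain
// 1-D correlation across interleaved channels. A minimal sketch, assuming
// float data and a hypothetical 3-tap kernel:
/*
    // src points at the sample touched by the leftmost tap
    for( int i = 0; i < width*cn; i++ )
        dst[i] = kx[0]*src[i] + kx[1]*src[i + cn] + kx[2]*src[i + 2*cn];
*/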
3129 
3130 
3131 template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
3132     public RowFilter<ST, DT, VecOp>
3133 {
3134     SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
3135                         const VecOp& _vecOp = VecOp())
3136         : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
3137     {
3138         symmetryType = _symmetryType;
3139         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
3140     }
3141 
3142     void operator()(const uchar* src, uchar* dst, int width, int cn)
3143     {
3144         int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
3145         const DT* kx = this->kernel.template ptr<DT>() + ksize2;
3146         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
3147         DT* D = (DT*)dst;
3148         int i = this->vecOp(src, dst, width, cn), j, k;
3149         const ST* S = (const ST*)src + i + ksize2n;
3150         width *= cn;
3151 
3152         if( symmetrical )
3153         {
3154             if( this->ksize == 1 && kx[0] == 1 )
3155             {
3156                 for( ; i <= width - 2; i += 2 )
3157                 {
3158                     DT s0 = S[i], s1 = S[i+1];
3159                     D[i] = s0; D[i+1] = s1;
3160                 }
3161                 S += i;
3162             }
3163             else if( this->ksize == 3 )
3164             {
3165                 if( kx[0] == 2 && kx[1] == 1 )
3166                     for( ; i <= width - 2; i += 2, S += 2 )
3167                     {
3168                         DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
3169                         D[i] = s0; D[i+1] = s1;
3170                     }
3171                 else if( kx[0] == -2 && kx[1] == 1 )
3172                     for( ; i <= width - 2; i += 2, S += 2 )
3173                     {
3174                         DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
3175                         D[i] = s0; D[i+1] = s1;
3176                     }
3177                 else
3178                 {
3179                     DT k0 = kx[0], k1 = kx[1];
3180                     for( ; i <= width - 2; i += 2, S += 2 )
3181                     {
3182                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
3183                         D[i] = s0; D[i+1] = s1;
3184                     }
3185                 }
3186             }
3187             else if( this->ksize == 5 )
3188             {
3189                 DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
3190                 if( k0 == -2 && k1 == 0 && k2 == 1 )
3191                     for( ; i <= width - 2; i += 2, S += 2 )
3192                     {
3193                         DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
3194                         DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
3195                         D[i] = s0; D[i+1] = s1;
3196                     }
3197                 else
3198                     for( ; i <= width - 2; i += 2, S += 2 )
3199                     {
3200                         DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
3201                         DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
3202                         D[i] = s0; D[i+1] = s1;
3203                     }
3204             }
3205 
3206             for( ; i < width; i++, S++ )
3207             {
3208                 DT s0 = kx[0]*S[0];
3209                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
3210                     s0 += kx[k]*(S[j] + S[-j]);
3211                 D[i] = s0;
3212             }
3213         }
3214         else
3215         {
3216             if( this->ksize == 3 )
3217             {
3218                 if( kx[0] == 0 && kx[1] == 1 )
3219                     for( ; i <= width - 2; i += 2, S += 2 )
3220                     {
3221                         DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
3222                         D[i] = s0; D[i+1] = s1;
3223                     }
3224                 else
3225                 {
3226                     DT k1 = kx[1];
3227                     for( ; i <= width - 2; i += 2, S += 2 )
3228                     {
3229                         DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
3230                         D[i] = s0; D[i+1] = s1;
3231                     }
3232                 }
3233             }
3234             else if( this->ksize == 5 )
3235             {
3236                 DT k1 = kx[1], k2 = kx[2];
3237                 for( ; i <= width - 2; i += 2, S += 2 )
3238                 {
3239                     DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
3240                     DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
3241                     D[i] = s0; D[i+1] = s1;
3242                 }
3243             }
3244 
3245             for( ; i < width; i++, S++ )
3246             {
3247                 DT s0 = kx[0]*S[0];
3248                 for( k = 1, j = cn; k <= ksize2; k++, j += cn )
3249                     s0 += kx[k]*(S[j] - S[-j]);
3250                 D[i] = s0;
3251             }
3252         }
3253     }
3254 
3255     int symmetryType;
3256 };
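// The generic tails above exploit symmetry the same way the NEON code does:
//   D[i] = kx[0]*S[0] + sum_{k=1..ksize2} kx[k]*(S[k*cn] +/- S[-k*cn]),
// so a symmetric 5-tap kernel {k2, k1, k0, k1, k2} needs only 3 multiplications
// per output instead of 5:
/*
    D[i] = k0*S[0] + k1*(S[-cn] + S[cn]) + k2*(S[-2*cn] + S[2*cn]);
*/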
3257 
3258 
3259 template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
3260 {
3261     typedef typename CastOp::type1 ST;
3262     typedef typename CastOp::rtype DT;
3263 
3264     ColumnFilter( const Mat& _kernel, int _anchor,
3265         double _delta, const CastOp& _castOp=CastOp(),
3266         const VecOp& _vecOp=VecOp() )
3267     {
3268         if( _kernel.isContinuous() )
3269             kernel = _kernel;
3270         else
3271             _kernel.copyTo(kernel);
3272         anchor = _anchor;
3273         ksize = kernel.rows + kernel.cols - 1;
3274         delta = saturate_cast<ST>(_delta);
3275         castOp0 = _castOp;
3276         vecOp = _vecOp;
3277         CV_Assert( kernel.type() == DataType<ST>::type &&
3278                    (kernel.rows == 1 || kernel.cols == 1));
3279     }
3280 
3281     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
3282     {
3283         const ST* ky = kernel.template ptr<ST>();
3284         ST _delta = delta;
3285         int _ksize = ksize;
3286         int i, k;
3287         CastOp castOp = castOp0;
3288 
3289         for( ; count--; dst += dststep, src++ )
3290         {
3291             DT* D = (DT*)dst;
3292             i = vecOp(src, dst, width);
3293             #if CV_ENABLE_UNROLLED
3294             for( ; i <= width - 4; i += 4 )
3295             {
3296                 ST f = ky[0];
3297                 const ST* S = (const ST*)src[0] + i;
3298                 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
3299                     s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
3300 
3301                 for( k = 1; k < _ksize; k++ )
3302                 {
3303                     S = (const ST*)src[k] + i; f = ky[k];
3304                     s0 += f*S[0]; s1 += f*S[1];
3305                     s2 += f*S[2]; s3 += f*S[3];
3306                 }
3307 
3308                 D[i] = castOp(s0); D[i+1] = castOp(s1);
3309                 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
3310             }
3311             #endif
3312             for( ; i < width; i++ )
3313             {
3314                 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
3315                 for( k = 1; k < _ksize; k++ )
3316                     s0 += ky[k]*((const ST*)src[k])[i];
3317                 D[i] = castOp(s0);
3318             }
3319         }
3320     }
3321 
3322     Mat kernel;
3323     CastOp castOp0;
3324     VecOp vecOp;
3325     ST delta;
3326 };
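// Unlike the row filters, the column filters never see a contiguous 2-D buffer:
// src is an array of pointers to already row-filtered lines, advanced by one
// line per output row, so the vertical pass is again a 1-D correlation indexed
// through src[k]. One output row with a 3-tap kernel reduces to:
/*
    for( int i = 0; i < width; i++ )
        D[i] = castOp( ky[0]*((const ST*)src[0])[i] +
                       ky[1]*((const ST*)src[1])[i] +
                       ky[2]*((const ST*)src[2])[i] + delta );
*/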
3327 
3328 
3329 template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
3330 {
3331     typedef typename CastOp::type1 ST;
3332     typedef typename CastOp::rtype DT;
3333 
3334     SymmColumnFilter( const Mat& _kernel, int _anchor,
3335         double _delta, int _symmetryType,
3336         const CastOp& _castOp=CastOp(),
3337         const VecOp& _vecOp=VecOp())
3338         : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
3339     {
3340         symmetryType = _symmetryType;
3341         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
3342     }
3343 
3344     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
3345     {
3346         int ksize2 = this->ksize/2;
3347         const ST* ky = this->kernel.template ptr<ST>() + ksize2;
3348         int i, k;
3349         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
3350         ST _delta = this->delta;
3351         CastOp castOp = this->castOp0;
3352         src += ksize2;
3353 
3354         if( symmetrical )
3355         {
3356             for( ; count--; dst += dststep, src++ )
3357             {
3358                 DT* D = (DT*)dst;
3359                 i = (this->vecOp)(src, dst, width);
3360                 #if CV_ENABLE_UNROLLED
3361                 for( ; i <= width - 4; i += 4 )
3362                 {
3363                     ST f = ky[0];
3364                     const ST* S = (const ST*)src[0] + i, *S2;
3365                     ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
3366                         s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;
3367 
3368                     for( k = 1; k <= ksize2; k++ )
3369                     {
3370                         S = (const ST*)src[k] + i;
3371                         S2 = (const ST*)src[-k] + i;
3372                         f = ky[k];
3373                         s0 += f*(S[0] + S2[0]);
3374                         s1 += f*(S[1] + S2[1]);
3375                         s2 += f*(S[2] + S2[2]);
3376                         s3 += f*(S[3] + S2[3]);
3377                     }
3378 
3379                     D[i] = castOp(s0); D[i+1] = castOp(s1);
3380                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
3381                 }
3382                 #endif
3383                 for( ; i < width; i++ )
3384                 {
3385                     ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
3386                     for( k = 1; k <= ksize2; k++ )
3387                         s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
3388                     D[i] = castOp(s0);
3389                 }
3390             }
3391         }
3392         else
3393         {
3394             for( ; count--; dst += dststep, src++ )
3395             {
3396                 DT* D = (DT*)dst;
3397                 i = this->vecOp(src, dst, width);
3398                 #if CV_ENABLE_UNROLLED
3399                 for( ; i <= width - 4; i += 4 )
3400                 {
3401                     ST f = ky[0];
3402                     const ST *S, *S2;
3403                     ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
3404 
3405                     for( k = 1; k <= ksize2; k++ )
3406                     {
3407                         S = (const ST*)src[k] + i;
3408                         S2 = (const ST*)src[-k] + i;
3409                         f = ky[k];
3410                         s0 += f*(S[0] - S2[0]);
3411                         s1 += f*(S[1] - S2[1]);
3412                         s2 += f*(S[2] - S2[2]);
3413                         s3 += f*(S[3] - S2[3]);
3414                     }
3415 
3416                     D[i] = castOp(s0); D[i+1] = castOp(s1);
3417                     D[i+2] = castOp(s2); D[i+3] = castOp(s3);
3418                 }
3419                 #endif
3420                 for( ; i < width; i++ )
3421                 {
3422                     ST s0 = _delta;
3423                     for( k = 1; k <= ksize2; k++ )
3424                         s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
3425                     D[i] = castOp(s0);
3426                 }
3427             }
3428         }
3429     }
3430 
3431     int symmetryType;
3432 };
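// In the antisymmetrical branch above, s0 starts at _delta and the k == 0 term
// is skipped entirely; this is valid because an antisymmetrical kernel must
// have a zero center tap (ky[0] == -ky[0] implies ky[0] == 0). Each remaining
// tap pairs the rows above and below the center as ky[k]*(src[k][i] - src[-k][i]).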
3433 
3434 
3435 template<class CastOp, class VecOp>
3436 struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
3437 {
3438     typedef typename CastOp::type1 ST;
3439     typedef typename CastOp::rtype DT;
3440 
3441     SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
3442                            double _delta, int _symmetryType,
3443                            const CastOp& _castOp=CastOp(),
3444                            const VecOp& _vecOp=VecOp())
3445         : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
3446     {
3447         CV_Assert( this->ksize == 3 );
3448     }
3449 
3450     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
3451     {
3452         int ksize2 = this->ksize/2;
3453         const ST* ky = this->kernel.template ptr<ST>() + ksize2;
3454         int i;
3455         bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
3456         bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
3457         bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
3458         bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
3459         ST f0 = ky[0], f1 = ky[1];
3460         ST _delta = this->delta;
3461         CastOp castOp = this->castOp0;
3462         src += ksize2;
3463 
3464         for( ; count--; dst += dststep, src++ )
3465         {
3466             DT* D = (DT*)dst;
3467             i = (this->vecOp)(src, dst, width);
3468             const ST* S0 = (const ST*)src[-1];
3469             const ST* S1 = (const ST*)src[0];
3470             const ST* S2 = (const ST*)src[1];
3471 
3472             if( symmetrical )
3473             {
3474                 if( is_1_2_1 )
3475                 {
3476                     #if CV_ENABLE_UNROLLED
3477                     for( ; i <= width - 4; i += 4 )
3478                     {
3479                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
3480                         ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
3481                         D[i] = castOp(s0);
3482                         D[i+1] = castOp(s1);
3483 
3484                         s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
3485                         s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
3486                         D[i+2] = castOp(s0);
3487                         D[i+3] = castOp(s1);
3488                     }
3489                     #endif
3490                     for( ; i < width; i ++ )
3491                     {
3492                         ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
3493                         D[i] = castOp(s0);
3494                     }
3495                 }
3496                 else if( is_1_m2_1 )
3497                 {
3498                     #if CV_ENABLE_UNROLLED
3499                     for( ; i <= width - 4; i += 4 )
3500                     {
3501                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
3502                         ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
3503                         D[i] = castOp(s0);
3504                         D[i+1] = castOp(s1);
3505 
3506                         s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
3507                         s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
3508                         D[i+2] = castOp(s0);
3509                         D[i+3] = castOp(s1);
3510                     }
3511                     #endif
3512                     for( ; i < width; i ++ )
3513                     {
3514                         ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
3515                         D[i] = castOp(s0);
3516                     }
3517                 }
3518                 else
3519                 {
3520                     #if CV_ENABLE_UNROLLED
3521                     for( ; i <= width - 4; i += 4 )
3522                     {
3523                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
3524                         ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
3525                         D[i] = castOp(s0);
3526                         D[i+1] = castOp(s1);
3527 
3528                         s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
3529                         s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
3530                         D[i+2] = castOp(s0);
3531                         D[i+3] = castOp(s1);
3532                     }
3533                     #endif
3534                     for( ; i < width; i ++ )
3535                     {
3536                         ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
3537                         D[i] = castOp(s0);
3538                     }
3539                 }
3540             }
3541             else
3542             {
3543                 if( is_m1_0_1 )
3544                 {
3545                     if( f1 < 0 )
3546                         std::swap(S0, S2);
3547                     #if CV_ENABLE_UNROLLED
3548                     for( ; i <= width - 4; i += 4 )
3549                     {
3550                         ST s0 = S2[i] - S0[i] + _delta;
3551                         ST s1 = S2[i+1] - S0[i+1] + _delta;
3552                         D[i] = castOp(s0);
3553                         D[i+1] = castOp(s1);
3554 
3555                         s0 = S2[i+2] - S0[i+2] + _delta;
3556                         s1 = S2[i+3] - S0[i+3] + _delta;
3557                         D[i+2] = castOp(s0);
3558                         D[i+3] = castOp(s1);
3559                     }
3560                     #endif
3561                     for( ; i < width; i ++ )
3562                     {
3563                         ST s0 = S2[i] - S0[i] + _delta;
3564                         D[i] = castOp(s0);
3565                     }
3566                     if( f1 < 0 )
3567                         std::swap(S0, S2);
3568                 }
3569                 else
3570                 {
3571                     #if CV_ENABLE_UNROLLED
3572                     for( ; i <= width - 4; i += 4 )
3573                     {
3574                         ST s0 = (S2[i] - S0[i])*f1 + _delta;
3575                         ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
3576                         D[i] = castOp(s0);
3577                         D[i+1] = castOp(s1);
3578 
3579                         s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
3580                         s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
3581                         D[i+2] = castOp(s0);
3582                         D[i+3] = castOp(s1);
3583                     }
3584                     #endif
3585                     for( ; i < width; i++ )
3586                         D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
3587                 }
3588             }
3589         }
3590     }
3591 };
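// The is_m1_0_1 branch above folds the [-1 0 1] and [1 0 -1] kernels into one
// code path: when f1 < 0 it swaps the S0/S2 row pointers so the body can always
// compute S2[i] - S0[i], then swaps them back. In scalar form this is simply:
/*
    ST s0 = (f1 > 0 ? S2[i] - S0[i] : S0[i] - S2[i]) + _delta;
*/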
3592 
3593 template<typename ST, typename DT> struct Cast
3594 {
3595     typedef ST type1;
3596     typedef DT rtype;
3597 
3598     DT operator()(ST val) const { return saturate_cast<DT>(val); }
3599 };
3600 
3601 template<typename ST, typename DT, int bits> struct FixedPtCast
3602 {
3603     typedef ST type1;
3604     typedef DT rtype;
3605     enum { SHIFT = bits, DELTA = 1 << (bits-1) };
3606 
3607     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
3608 };
3609 
3610 template<typename ST, typename DT> struct FixedPtCastEx
3611 {
3612     typedef ST type1;
3613     typedef DT rtype;
3614 
3615     FixedPtCastEx() : SHIFT(0), DELTA(0) {}
3616     FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
3617     DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
3618     int SHIFT, DELTA;
3619 };
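// FixedPtCast/FixedPtCastEx turn the fixed-point accumulator back into pixel
// range with round-to-nearest: adding DELTA = 1 << (bits-1) before the shift
// rounds instead of truncating. E.g. with bits == 8 (scale 1/256):
/*
    (200 + 128) >> 8 == 1   // 200/256 ~ 0.78 rounds to 1
     200        >> 8 == 0   // plain truncation would lose it
*/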
3620 
3621 }
3622 
3623 cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
3624                                                    InputArray _kernel, int anchor,
3625                                                    int symmetryType )
3626 {
3627     Mat kernel = _kernel.getMat();
3628     int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
3629     int cn = CV_MAT_CN(srcType);
3630     CV_Assert( cn == CV_MAT_CN(bufType) &&
3631         ddepth >= std::max(sdepth, CV_32S) &&
3632         kernel.type() == ddepth );
3633     int ksize = kernel.rows + kernel.cols - 1;
3634 
3635     if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
3636     {
3637         if( sdepth == CV_8U && ddepth == CV_32S )
3638             return makePtr<SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s> >
3639                 (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType));
3640         if( sdepth == CV_32F && ddepth == CV_32F )
3641             return makePtr<SymmRowSmallFilter<float, float, SymmRowSmallVec_32f> >
3642                 (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType));
3643     }
3644 
3645     if( sdepth == CV_8U && ddepth == CV_32S )
3646         return makePtr<RowFilter<uchar, int, RowVec_8u32s> >
3647             (kernel, anchor, RowVec_8u32s(kernel));
3648     if( sdepth == CV_8U && ddepth == CV_32F )
3649         return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor);
3650     if( sdepth == CV_8U && ddepth == CV_64F )
3651         return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor);
3652     if( sdepth == CV_16U && ddepth == CV_32F )
3653         return makePtr<RowFilter<ushort, float, RowNoVec> >(kernel, anchor);
3654     if( sdepth == CV_16U && ddepth == CV_64F )
3655         return makePtr<RowFilter<ushort, double, RowNoVec> >(kernel, anchor);
3656     if( sdepth == CV_16S && ddepth == CV_32F )
3657         return makePtr<RowFilter<short, float, RowVec_16s32f> >
3658                                   (kernel, anchor, RowVec_16s32f(kernel));
3659     if( sdepth == CV_16S && ddepth == CV_64F )
3660         return makePtr<RowFilter<short, double, RowNoVec> >(kernel, anchor);
3661     if( sdepth == CV_32F && ddepth == CV_32F )
3662         return makePtr<RowFilter<float, float, RowVec_32f> >
3663             (kernel, anchor, RowVec_32f(kernel));
3664     if( sdepth == CV_32F && ddepth == CV_64F )
3665         return makePtr<RowFilter<float, double, RowNoVec> >(kernel, anchor);
3666     if( sdepth == CV_64F && ddepth == CV_64F )
3667         return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor);
3668 
3669     CV_Error_( CV_StsNotImplemented,
3670         ("Unsupported combination of source format (=%d), and buffer format (=%d)",
3671         srcType, bufType));
3672 
3673     return Ptr<BaseRowFilter>();
3674 }
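// A minimal usage sketch for the row-filter factory above (illustrative only;
// rowWithBorder, outRow and width are hypothetical variables):
/*
    Mat kx = (Mat_<float>(1, 3) << 0.25f, 0.5f, 0.25f);
    Ptr<BaseRowFilter> rf = getLinearRowFilter(CV_32F, CV_32F, kx, 1,
                                               getKernelType(kx, Point(1, 0)));
    // rowWithBorder must already contain the ksize-1 extra border samples
    (*rf)((const uchar*)rowWithBorder, (uchar*)outRow, width, 1);
*/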
3675 
3676 
3677 cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
3678                                              InputArray _kernel, int anchor,
3679                                              int symmetryType, double delta,
3680                                              int bits )
3681 {
3682     Mat kernel = _kernel.getMat();
3683     int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
3684     int cn = CV_MAT_CN(dstType);
3685     CV_Assert( cn == CV_MAT_CN(bufType) &&
3686         sdepth >= std::max(ddepth, CV_32S) &&
3687         kernel.type() == sdepth );
3688 
3689     if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
3690     {
3691         if( ddepth == CV_8U && sdepth == CV_32S )
3692             return makePtr<ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec> >
3693             (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits));
3694         if( ddepth == CV_8U && sdepth == CV_32F )
3695             return makePtr<ColumnFilter<Cast<float, uchar>, ColumnNoVec> >(kernel, anchor, delta);
3696         if( ddepth == CV_8U && sdepth == CV_64F )
3697             return makePtr<ColumnFilter<Cast<double, uchar>, ColumnNoVec> >(kernel, anchor, delta);
3698         if( ddepth == CV_16U && sdepth == CV_32F )
3699             return makePtr<ColumnFilter<Cast<float, ushort>, ColumnNoVec> >(kernel, anchor, delta);
3700         if( ddepth == CV_16U && sdepth == CV_64F )
3701             return makePtr<ColumnFilter<Cast<double, ushort>, ColumnNoVec> >(kernel, anchor, delta);
3702         if( ddepth == CV_16S && sdepth == CV_32F )
3703             return makePtr<ColumnFilter<Cast<float, short>, ColumnNoVec> >(kernel, anchor, delta);
3704         if( ddepth == CV_16S && sdepth == CV_64F )
3705             return makePtr<ColumnFilter<Cast<double, short>, ColumnNoVec> >(kernel, anchor, delta);
3706         if( ddepth == CV_32F && sdepth == CV_32F )
3707             return makePtr<ColumnFilter<Cast<float, float>, ColumnNoVec> >(kernel, anchor, delta);
3708         if( ddepth == CV_64F && sdepth == CV_64F )
3709             return makePtr<ColumnFilter<Cast<double, double>, ColumnNoVec> >(kernel, anchor, delta);
3710     }
3711     else
3712     {
3713         int ksize = kernel.rows + kernel.cols - 1;
3714         if( ksize == 3 )
3715         {
3716             if( ddepth == CV_8U && sdepth == CV_32S )
3717                 return makePtr<SymmColumnSmallFilter<
3718                     FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
3719                     (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
3720                     SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
3721             if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
3722                 return makePtr<SymmColumnSmallFilter<Cast<int, short>,
3723                     SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType,
3724                         Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta));
3725             if( ddepth == CV_32F && sdepth == CV_32F )
3726                 return makePtr<SymmColumnSmallFilter<
3727                     Cast<float, float>,SymmColumnSmallVec_32f> >
3728                     (kernel, anchor, delta, symmetryType, Cast<float, float>(),
3729                     SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta));
3730         }
3731         if( ddepth == CV_8U && sdepth == CV_32S )
3732             return makePtr<SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
3733                 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
3734                 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
3735         if( ddepth == CV_8U && sdepth == CV_32F )
3736             return makePtr<SymmColumnFilter<Cast<float, uchar>, ColumnNoVec> >
3737                 (kernel, anchor, delta, symmetryType);
3738         if( ddepth == CV_8U && sdepth == CV_64F )
3739             return makePtr<SymmColumnFilter<Cast<double, uchar>, ColumnNoVec> >
3740                 (kernel, anchor, delta, symmetryType);
3741         if( ddepth == CV_16U && sdepth == CV_32F )
3742             return makePtr<SymmColumnFilter<Cast<float, ushort>, ColumnNoVec> >
3743                 (kernel, anchor, delta, symmetryType);
3744         if( ddepth == CV_16U && sdepth == CV_64F )
3745             return makePtr<SymmColumnFilter<Cast<double, ushort>, ColumnNoVec> >
3746                 (kernel, anchor, delta, symmetryType);
3747         if( ddepth == CV_16S && sdepth == CV_32S )
3748             return makePtr<SymmColumnFilter<Cast<int, short>, ColumnNoVec> >
3749                 (kernel, anchor, delta, symmetryType);
3750         if( ddepth == CV_16S && sdepth == CV_32F )
3751             return makePtr<SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s> >
3752                  (kernel, anchor, delta, symmetryType, Cast<float, short>(),
3753                   SymmColumnVec_32f16s(kernel, symmetryType, 0, delta));
3754         if( ddepth == CV_16S && sdepth == CV_64F )
3755             return makePtr<SymmColumnFilter<Cast<double, short>, ColumnNoVec> >
3756                 (kernel, anchor, delta, symmetryType);
3757         if( ddepth == CV_32F && sdepth == CV_32F )
3758             return makePtr<SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f> >
3759                 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
3760                 SymmColumnVec_32f(kernel, symmetryType, 0, delta));
3761         if( ddepth == CV_64F && sdepth == CV_64F )
3762             return makePtr<SymmColumnFilter<Cast<double, double>, ColumnNoVec> >
3763                 (kernel, anchor, delta, symmetryType);
3764     }
3765 
3766     CV_Error_( CV_StsNotImplemented,
3767         ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
3768         bufType, dstType));
3769 
3770     return Ptr<BaseColumnFilter>();
3771 }
3772 
3773 
3774 cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
3775     int _srcType, int _dstType,
3776     InputArray __rowKernel, InputArray __columnKernel,
3777     Point _anchor, double _delta,
3778     int _rowBorderType, int _columnBorderType,
3779     const Scalar& _borderValue )
3780 {
3781     Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
3782     _srcType = CV_MAT_TYPE(_srcType);
3783     _dstType = CV_MAT_TYPE(_dstType);
3784     int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
3785     int cn = CV_MAT_CN(_srcType);
3786     CV_Assert( cn == CV_MAT_CN(_dstType) );
3787     int rsize = _rowKernel.rows + _rowKernel.cols - 1;
3788     int csize = _columnKernel.rows + _columnKernel.cols - 1;
3789     if( _anchor.x < 0 )
3790         _anchor.x = rsize/2;
3791     if( _anchor.y < 0 )
3792         _anchor.y = csize/2;
3793     int rtype = getKernelType(_rowKernel,
3794         _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
3795     int ctype = getKernelType(_columnKernel,
3796         _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
3797     Mat rowKernel, columnKernel;
3798 
3799     int bdepth = std::max(CV_32F,std::max(sdepth, ddepth));
3800     int bits = 0;
3801 
3802     if( sdepth == CV_8U &&
3803         ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
3804           ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
3805           ddepth == CV_8U) ||
3806          ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
3807           (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
3808           (rtype & ctype & KERNEL_INTEGER) &&
3809           ddepth == CV_16S)) )
3810     {
3811         bdepth = CV_32S;
3812         bits = ddepth == CV_8U ? 8 : 0;
3813         _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
3814         _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
3815         bits *= 2;
3816         _delta *= (1 << bits);
3817     }
3818     else
3819     {
3820         if( _rowKernel.type() != bdepth )
3821             _rowKernel.convertTo( rowKernel, bdepth );
3822         else
3823             rowKernel = _rowKernel;
3824         if( _columnKernel.type() != bdepth )
3825             _columnKernel.convertTo( columnKernel, bdepth );
3826         else
3827             columnKernel = _columnKernel;
3828     }
3829 
3830     int _bufType = CV_MAKETYPE(bdepth, cn);
3831     Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
3832         _srcType, _bufType, rowKernel, _anchor.x, rtype);
3833     Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
3834         _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );
3835 
3836     return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(), _rowFilter, _columnFilter,
3837         _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
3838 }
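// End-to-end usage sketch (illustrative; this is essentially the path the
// non-OpenCL sepFilter2D takes): build the engine once, then let it handle
// border handling and the intermediate row buffer.
/*
    Mat src = imread("input.png"), dst(src.size(), src.type());
    Mat kc = getGaussianKernel(5, -1, CV_32F);       // 5x1 column kernel
    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(), dst.type(),
                                                      kc.t(), kc, Point(-1,-1),
                                                      0, BORDER_DEFAULT);
    f->apply(src, dst);
*/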
3839 
3840 
3841 /****************************************************************************************\
3842 *                               Non-separable linear filter                              *
3843 \****************************************************************************************/
3844 
3845 namespace cv
3846 {
3847 
3848 void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs )
3849 {
3850     int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
3851     if(nz == 0)
3852         nz = 1;
3853     CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
3854     coords.resize(nz);
3855     coeffs.resize(nz*getElemSize(ktype));
3856     uchar* _coeffs = &coeffs[0];
3857 
3858     for( i = k = 0; i < kernel.rows; i++ )
3859     {
3860         const uchar* krow = kernel.ptr(i);
3861         for( j = 0; j < kernel.cols; j++ )
3862         {
3863             if( ktype == CV_8U )
3864             {
3865                 uchar val = krow[j];
3866                 if( val == 0 )
3867                     continue;
3868                 coords[k] = Point(j,i);
3869                 _coeffs[k++] = val;
3870             }
3871             else if( ktype == CV_32S )
3872             {
3873                 int val = ((const int*)krow)[j];
3874                 if( val == 0 )
3875                     continue;
3876                 coords[k] = Point(j,i);
3877                 ((int*)_coeffs)[k++] = val;
3878             }
3879             else if( ktype == CV_32F )
3880             {
3881                 float val = ((const float*)krow)[j];
3882                 if( val == 0 )
3883                     continue;
3884                 coords[k] = Point(j,i);
3885                 ((float*)_coeffs)[k++] = val;
3886             }
3887             else
3888             {
3889                 double val = ((const double*)krow)[j];
3890                 if( val == 0 )
3891                     continue;
3892                 coords[k] = Point(j,i);
3893                 ((double*)_coeffs)[k++] = val;
3894             }
3895         }
3896     }
3897 }
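// preprocess2DKernel flattens a 2-D kernel into a sparse tap list: coords[k]
// holds the (x, y) offset of the k-th non-zero tap and coeffs stores its value
// in the kernel's own element type. For a 3x3 cross-shaped CV_32F kernel
//     0 1 0
//     1 1 1   ->   coords = {(1,0), (0,1), (1,1), (2,1), (1,2)}, coeffs = five 1.0f
//     0 1 0
// Filter2D below then visits only those nz taps per output pixel.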
3898 
3899 
3900 template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
3901 {
3902     typedef typename CastOp::type1 KT;
3903     typedef typename CastOp::rtype DT;
3904 
3905     Filter2D( const Mat& _kernel, Point _anchor,
3906         double _delta, const CastOp& _castOp=CastOp(),
3907         const VecOp& _vecOp=VecOp() )
3908     {
3909         anchor = _anchor;
3910         ksize = _kernel.size();
3911         delta = saturate_cast<KT>(_delta);
3912         castOp0 = _castOp;
3913         vecOp = _vecOp;
3914         CV_Assert( _kernel.type() == DataType<KT>::type );
3915         preprocess2DKernel( _kernel, coords, coeffs );
3916         ptrs.resize( coords.size() );
3917     }
3918 
3919     void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
3920     {
3921         KT _delta = delta;
3922         const Point* pt = &coords[0];
3923         const KT* kf = (const KT*)&coeffs[0];
3924         const ST** kp = (const ST**)&ptrs[0];
3925         int i, k, nz = (int)coords.size();
3926         CastOp castOp = castOp0;
3927 
3928         width *= cn;
3929         for( ; count > 0; count--, dst += dststep, src++ )
3930         {
3931             DT* D = (DT*)dst;
3932 
3933             for( k = 0; k < nz; k++ )
3934                 kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;
3935 
3936             i = vecOp((const uchar**)kp, dst, width);
3937             #if CV_ENABLE_UNROLLED
3938             for( ; i <= width - 4; i += 4 )
3939             {
3940                 KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;
3941 
3942                 for( k = 0; k < nz; k++ )
3943                 {
3944                     const ST* sptr = kp[k] + i;
3945                     KT f = kf[k];
3946                     s0 += f*sptr[0];
3947                     s1 += f*sptr[1];
3948                     s2 += f*sptr[2];
3949                     s3 += f*sptr[3];
3950                 }
3951 
3952                 D[i] = castOp(s0); D[i+1] = castOp(s1);
3953                 D[i+2] = castOp(s2); D[i+3] = castOp(s3);
3954             }
3955             #endif
3956             for( ; i < width; i++ )
3957             {
3958                 KT s0 = _delta;
3959                 for( k = 0; k < nz; k++ )
3960                     s0 += kf[k]*kp[k][i];
3961                 D[i] = castOp(s0);
3962             }
3963         }
3964     }
3965 
3966     std::vector<Point> coords;
3967     std::vector<uchar> coeffs;
3968     std::vector<uchar*> ptrs;
3969     KT delta;
3970     CastOp castOp0;
3971     VecOp vecOp;
3972 };
3973 
3974 #ifdef HAVE_OPENCL
3975 
3976 #define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
3977 #define ROUNDUP(sz, n)      ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
3978 
3979 // Prepare the kernel: transpose it and store each transposed row twice (+ alignment padding). Returns the size of an aligned row.
3980 // Sample:
3981 //        a b c
3982 // Input: d e f
3983 //        g h i
3984 // Output (the last two zeros in each row are alignment padding):
3985 // a d g a d g 0 0
3986 // b e h b e h 0 0
3987 // c f i c f i 0 0
3988 template <typename T>
3989 static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
3990 {
3991     Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
3992     int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
3993     data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
3994     for (int x = 0; x < kernel.cols; x++)
3995     {
3996         for (int y = 0; y < kernel.rows; y++)
3997         {
3998             data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
3999             data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
4000         }
4001     }
4002     return size_y_aligned;
4003 }
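// Worked example for the layout above: a 3x3 kernel gives kernel.rows * 2 == 6
// doubled rows and ROUNDUP(6, 4) == 8, so each transposed column is stored as
// "a d g a d g 0 0" with a stride of 8 elements, and the function returns 8.
// The row duplication presumably lets the OpenCL kernel read a cyclically
// shifted kernel window without a modulo on every access.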
4004 
4005 static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
4006                    InputArray _kernel, Point anchor,
4007                    double delta, int borderType )
4008 {
4009     int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
4010     ddepth = ddepth < 0 ? sdepth : ddepth;
4011     int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
4012             wtype = CV_MAKE_TYPE(wdepth, cn);
4013     if (cn > 4)
4014         return false;
4015 
4016     Size ksize = _kernel.size();
4017     if (anchor.x < 0)
4018         anchor.x = ksize.width / 2;
4019     if (anchor.y < 0)
4020         anchor.y = ksize.height / 2;
4021 
4022     bool isolated = (borderType & BORDER_ISOLATED) != 0;
4023     borderType &= ~BORDER_ISOLATED;
4024     const cv::ocl::Device &device = cv::ocl::Device::getDefault();
4025     bool doubleSupport = device.doubleFPConfig() > 0;
4026     if (wdepth == CV_64F && !doubleSupport)
4027         return false;
4028 
4029     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
4030                                        "BORDER_WRAP", "BORDER_REFLECT_101" };
4031 
4032     cv::Mat kernelMat = _kernel.getMat();
4033     cv::Size sz = _src.size(), wholeSize;
4034     size_t globalsize[2] = { sz.width, sz.height };
4035     size_t localsize_general[2] = {0, 1};
4036     size_t* localsize = NULL;
4037 
4038     ocl::Kernel k;
4039     UMat src = _src.getUMat();
4040     if (!isolated)
4041     {
4042         Point ofs;
4043         src.locateROI(wholeSize, ofs);
4044     }
4045 
4046     size_t tryWorkItems = device.maxWorkGroupSize();
4047     if (device.isIntel() && 128 < tryWorkItems)
4048         tryWorkItems = 128;
4049     char cvt[2][40];

    // For smaller filter kernels, there is a special kernel that is more
    // efficient than the general one.
    UMat kernelDataUMat;
    if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
        ((ksize.width < 5 && ksize.height < 5) ||
        (ksize.width == 5 && ksize.height == 5 && cn == 1)))
    {
        kernelMat = kernelMat.reshape(0, 1);
        String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
        int h = isolated ? sz.height : wholeSize.height;
        int w = isolated ? sz.width : wholeSize.width;

        if (w < ksize.width || h < ksize.height)
            return false;

        // Figure out what vector size to use for loading the pixels.
        int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4;
        int pxLoadVecSize = cn * pxLoadNumPixels;

        // Figure out how many pixels per work item to compute in X and Y
        // directions.  Too many and we run out of registers.
        int pxPerWorkItemX = 1;
        int pxPerWorkItemY = 1;
        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
        {
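            // Choose the largest of 8/4/2 (else 1) that divides the row width
            // evenly, and 2 or 1 for the height, so no per-item tail handling
            // is needed when the global size is divided below.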
            pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        }
        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
        {
            pxPerWorkItemX = sz.width % 2 ? 1 : 2;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        }
        globalsize[0] = sz.width / pxPerWorkItemX;
        globalsize[1] = sz.height / pxPerWorkItemY;

        // Need some padding in the private array for pixels
        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);

        // Make the global size a nice round number so the runtime can pick
        // from reasonable choices for the workgroup size
        const int wgRound = 256;
        globalsize[0] = ROUNDUP(globalsize[0], wgRound);

        char build_options[1024];
        sprintf(build_options, "-D cn=%d "
                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                "-D convertToWT=%s -D convertToDstT=%s %s",
                cn, anchor.x, anchor.y, ksize.width, ksize.height,
                pxLoadVecSize, pxLoadNumPixels,
                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());

        if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options))
            return false;
    }
    else
    {
        localsize = localsize_general;
        std::vector<float> kernelMatDataFloat;
        int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
        String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);

        for ( ; ; )
        {
            size_t BLOCK_SIZE = tryWorkItems;
            while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
                BLOCK_SIZE /= 2;

            if ((size_t)ksize.width > BLOCK_SIZE)
                return false;

            int requiredTop = anchor.y;
            int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x;
            int requiredBottom = ksize.height - 1 - anchor.y;
            int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x;
            int h = isolated ? sz.height : wholeSize.height;
            int w = isolated ? sz.width : wholeSize.width;
            bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;

            if ((w < ksize.width) || (h < ksize.height))
                return false;

            String opts = format("-D LOCAL_SIZE=%d -D cn=%d "
                                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                                 "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
                                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                                 "-D convertToWT=%s -D convertToDstT=%s",
                                 (int)BLOCK_SIZE, cn, anchor.x, anchor.y,
                                 ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
                                 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
                                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));

            localsize[0] = BLOCK_SIZE;
            globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
            globalsize[1] = sz.height;

            if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))
                return false;

            size_t kernelWorkGroupSize = k.workGroupSize();
            if (localsize[0] <= kernelWorkGroupSize)
                break;
            if (BLOCK_SIZE < kernelWorkGroupSize)
                return false;
            tryWorkItems = kernelWorkGroupSize;
        }
    }

    _dst.create(sz, dtype);
    UMat dst = _dst.getUMat();

    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
    int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
    int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
           srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);

    return k.run(2, globalsize, localsize, false);
}

const int shift_bits = 8;
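// Fixed-point scheme used by the integer-arithmetic separable path: both 1-D
// kernels are pre-scaled by 2^shift_bits (e.g. a tap of 0.25 becomes 64), and
// the OpenCL kernels are built with SHIFT_BITS=2*shift_bits so the final pass
// can undo the scaling contributed by both the row and the column kernel.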

static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
                               int borderType, int ddepth, bool fast8uc1, bool int_arithm)
{
    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    Size bufSize = buf.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

#ifdef ANDROID
    size_t localsize[2] = {16, 10};
#else
    size_t localsize[2] = {16, 16};
#endif

    size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
    if (fast8uc1)
        globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];

    int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;

    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
        * const btype = borderMap[borderType & ~BORDER_ISOLATED];

    bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
    extra_extrapolation |= src.rows < radiusY;
    extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
    extra_extrapolation |= src.cols < radiusX;

    char cvt[40];
    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
                                          radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
                                          extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                          isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                          ocl::typeToStr(type), ocl::typeToStr(buf_type),
                                          ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
                                          ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelX, bdepth);

    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    String kernelName("row_filter");
    if (fast8uc1)
        kernelName += "_C1_D0";

    ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
                  build_options);
    if (k.empty())
        return false;

    if (fast8uc1)
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
               buf.cols, buf.rows, radiusY);
    else
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);

    return k.run(2, globalsize, localsize, false);
}

static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
{
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (dst.depth() == CV_64F && !doubleSupport)
        return false;

#ifdef ANDROID
    size_t localsize[2] = { 16, 10 };
#else
    size_t localsize[2] = { 16, 16 };
#endif
    size_t globalsize[2] = { 0, 0 };

    int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
    Size sz = dst.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];

    char cvt[40];
    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
                                          " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
                                          anchor, (int)localsize[0], (int)localsize[1], cn,
                                          ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
                                          ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
                                          ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
                                          2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelY, bdepth);

    ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
                  build_options);
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, globalsize, localsize, false);
}

const int optimizedSepFilterLocalWidth  = 16;
const int optimizedSepFilterLocalHeight = 8;

static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
                                       Mat row_kernel, Mat col_kernel,
                                       double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
{
    Size size = _src.size(), wholeSize;
    Point origin;
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
            dtype = CV_MAKE_TYPE(ddepth, cn);
    size_t src_step = _src.step(), src_offset = _src.offset();
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
            !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
              borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
              borderType == BORDER_REFLECT_101))
        return false;

    size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};

    char cvt[2][40];
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };

    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
                             " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
                             (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
                             ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
                             ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth),
                             cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : "");

    ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    _dst.create(size, dtype);
    UMat dst = _dst.getUMat();

    int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
    int src_offset_y = static_cast<int>(src_offset / src_step);

    src.locateROI(wholeSize, origin);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, gt2, lt2, false);
}

static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    const ocl::Device & d = ocl::Device::getDefault();
    Size imgSize = _src.size();

    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if (cn > 4)
        return false;

    Mat kernelX = _kernelX.getMat().reshape(1, 1);
    if (kernelX.cols % 2 != 1)
        return false;
    Mat kernelY = _kernelY.getMat().reshape(1, 1);
    if (kernelY.cols % 2 != 1)
        return false;

    if (ddepth < 0)
        ddepth = sdepth;

    if (anchor.x < 0)
        anchor.x = kernelX.cols >> 1;
    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

    int rtype = getKernelType(kernelX,
        kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
    int ctype = getKernelType(kernelY,
        kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));

    int bdepth = CV_32F;
    bool int_arithm = false;
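    // 8U -> 8U smoothing with symmetric kernels can run in integer arithmetic:
    // scale both kernels by 2^shift_bits, rounding in-place on Intel devices
    // and converting to CV_32S elsewhere (see the fixed-point note above).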
    if( sdepth == CV_8U && ddepth == CV_8U &&
        rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
        ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
    {
        if (ocl::Device::getDefault().isIntel())
        {
            for (int i=0; i<kernelX.cols; i++)
                kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
            if (kernelX.data != kernelY.data)
                for (int i=0; i<kernelX.cols; i++)
                    kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
        } else
        {
            bdepth = CV_32S;
            kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
            kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
        }
        int_arithm = true;
    }

    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
                imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
                imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
                anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
                (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
                                           borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)

    UMat src = _src.getUMat();
    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
            src.cols % 4 == 0 && src.step % 4 == 0;
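    // The vectorized uchar4 row path needs the ROI x-offset, the width and the
    // row step to all be multiples of 4 so every load stays 4-byte aligned.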

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
}

#endif

}

cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
                                InputArray filter_kernel, Point anchor,
                                double delta, int bits)
{
    Mat _kernel = filter_kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u> >
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s> >
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta));*/

    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return makePtr<Filter2D<uchar, Cast<float, uchar>, FilterVec_8u> >
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return makePtr<Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_16S )
        return makePtr<Filter2D<uchar, Cast<float, short>, FilterVec_8u16s> >
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return makePtr<Filter2D<uchar,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<Filter2D<uchar,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16U && ddepth == CV_16U )
        return makePtr<Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_32F )
        return makePtr<Filter2D<ushort,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<Filter2D<ushort,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16S && ddepth == CV_16S )
        return makePtr<Filter2D<short,
            Cast<float, short>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_32F )
        return makePtr<Filter2D<short,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<Filter2D<short,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_32F && ddepth == CV_32F )
        return makePtr<Filter2D<float, Cast<float, float>, FilterVec_32f> >
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<Filter2D<double,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>();
}


cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
                                              InputArray filter_kernel,
                                              Point _anchor, double _delta,
                                              int _rowBorderType, int _columnBorderType,
                                              const Scalar& _borderValue )
{
    Mat _kernel = filter_kernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int bits = 0;

    /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }*/

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return makePtr<FilterEngine>(_filter2D, Ptr<BaseRowFilter>(),
        Ptr<BaseColumnFilter>(), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue );
}
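
/* Illustrative use of the engine (a sketch, not part of the library; the
   file name is only a placeholder):

       Mat kernel = (Mat_<float>(3, 3) << 0, -1,  0,
                                         -1,  5, -1,
                                          0, -1,  0);   // simple sharpening kernel
       Mat src = imread("input.jpg"), dst(src.size(), src.type());
       Ptr<FilterEngine> fe = createLinearFilter(src.type(), src.type(), kernel);
       fe->apply(src, dst);   // roughly what filter2D(src, dst, -1, kernel) does
                              // for kernels below the DFT size threshold
*/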


void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor0,
                   double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType))

    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

#if CV_SSE2
    int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50;
#else
    int dft_filter_size = 50;
#endif
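    // Kernel area at or above dft_filter_size selects the DFT-based
    // crossCorr() branch further below; smaller kernels fall through to the
    // direct FilterEngine path (after the IPP/Tegra shortcuts are tried).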

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    Point anchor = normalizeAnchor(anchor0, kernel.size());

#if IPP_VERSION_X100 > 0 && !defined HAVE_IPP_ICV_ONLY
    CV_IPP_CHECK()
    {
        typedef IppStatus (CV_STDCALL * ippiFilterBorder)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize dstRoiSize,
                                                          IppiBorderType border, const void * borderValue,
                                                          const IppiFilterBorderSpec* pSpec, Ipp8u* pBuffer);

        int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
                ktype = kernel.type(), kdepth = CV_MAT_DEPTH(ktype);
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        Point ippAnchor(kernel.cols >> 1, kernel.rows >> 1);
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        IppiBorderType ippBorderType = ippiGetBorderType(borderTypeNI);

        if (borderTypeNI == BORDER_CONSTANT || borderTypeNI == BORDER_REPLICATE)
        {
            ippiFilterBorder ippFunc =
                stype == CV_8UC1 ? (ippiFilterBorder)ippiFilterBorder_8u_C1R :
                stype == CV_8UC3 ? (ippiFilterBorder)ippiFilterBorder_8u_C3R :
                stype == CV_8UC4 ? (ippiFilterBorder)ippiFilterBorder_8u_C4R :
                stype == CV_16UC1 ? (ippiFilterBorder)ippiFilterBorder_16u_C1R :
                stype == CV_16UC3 ? (ippiFilterBorder)ippiFilterBorder_16u_C3R :
                stype == CV_16UC4 ? (ippiFilterBorder)ippiFilterBorder_16u_C4R :
                stype == CV_16SC1 ? (ippiFilterBorder)ippiFilterBorder_16s_C1R :
                stype == CV_16SC3 ? (ippiFilterBorder)ippiFilterBorder_16s_C3R :
                stype == CV_16SC4 ? (ippiFilterBorder)ippiFilterBorder_16s_C4R :
                stype == CV_32FC1 ? (ippiFilterBorder)ippiFilterBorder_32f_C1R :
                stype == CV_32FC3 ? (ippiFilterBorder)ippiFilterBorder_32f_C3R :
                stype == CV_32FC4 ? (ippiFilterBorder)ippiFilterBorder_32f_C4R : 0;

            if (sdepth == ddepth && (ktype == CV_16SC1 || ktype == CV_32FC1) &&
                    ippFunc && (int)ippBorderType >= 0 && (!src.isSubmatrix() || isolated) &&
                    std::fabs(delta - 0) < DBL_EPSILON && ippAnchor == anchor && dst.data != src.data)
            {
                IppiSize kernelSize = { kernel.cols, kernel.rows }, dstRoiSize = { dst.cols, dst.rows };
                IppDataType dataType = ippiGetDataType(ddepth), kernelType = ippiGetDataType(kdepth);
                Ipp32s specSize = 0, bufsize = 0;
                IppStatus status = (IppStatus)-1;

                if ((status = ippiFilterBorderGetSize(kernelSize, dstRoiSize, dataType, kernelType, cn, &specSize, &bufsize)) >= 0)
                {
                    IppiFilterBorderSpec * spec = (IppiFilterBorderSpec *)ippMalloc(specSize);
                    Ipp8u * buffer = ippsMalloc_8u(bufsize);
                    Ipp32f borderValue[4] = { 0, 0, 0, 0 };

                    Mat reversedKernel;
                    flip(kernel, reversedKernel, -1);

                    if ((kdepth == CV_32F && (status = ippiFilterBorderInit_32f((const Ipp32f *)reversedKernel.data, kernelSize,
                            dataType, cn, ippRndFinancial, spec)) >= 0 ) ||
                        (kdepth == CV_16S && (status = ippiFilterBorderInit_16s((const Ipp16s *)reversedKernel.data,
                            kernelSize, 0, dataType, cn, ippRndFinancial, spec)) >= 0))
                    {
                        status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize,
                                         ippBorderType, borderValue, spec, buffer);
                    }

                    ippsFree(buffer);
                    ippsFree(spec);
                }

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::useTegra() && tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
        return;
#endif

    if( kernel.cols*kernel.rows >= dft_filter_size )
    {
        Mat temp;
        // crossCorr doesn't accept non-zero delta with multiple channels
        if( src.channels() != 1 && delta != 0 )
        {
            // The semantics of filter2D require that the delta be applied
            // as floating-point math.  So we need an intermediate Mat
            // with a float datatype.  If the dest is already floats,
            // we just use that.
            int corrDepth = dst.depth();
            if( (dst.depth() == CV_32F || dst.depth() == CV_64F) &&
                src.data != dst.data )
            {
                temp = dst;
            }
            else
            {
                corrDepth = dst.depth() == CV_64F ? CV_64F : CV_32F;
                temp.create( dst.size(), CV_MAKETYPE(corrDepth, dst.channels()) );
            }
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(corrDepth, src.channels()),
                       anchor, 0, borderType );
            add( temp, delta, temp );
            if ( temp.data != dst.data )
            {
                temp.convertTo( dst, dst.type() );
            }
        }
        else
        {
            if( src.data != dst.data )
                temp = dst;
            else
                temp.create(dst.size(), dst.type());
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(ddepth, src.channels()),
                       anchor, delta, borderType );
            if( temp.data != dst.data )
                temp.copyTo(dst);
        }
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}


void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))

    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}


CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}

/* End of file. */