/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

/****************************************************************************************\
                                    Base Image Filter
\****************************************************************************************/

#if IPP_VERSION_X100 >= 701
#define USE_IPP_SEP_FILTERS 1
#else
#undef USE_IPP_SEP_FILTERS
#endif

namespace cv
{

BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; }
BaseRowFilter::~BaseRowFilter() {}

BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; }
BaseColumnFilter::~BaseColumnFilter() {}
void BaseColumnFilter::reset() {}

BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); }
BaseFilter::~BaseFilter() {}
void BaseFilter::reset() {}

FilterEngine::FilterEngine()
{
    srcType = dstType = bufType = -1;
    rowBorderType = columnBorderType = BORDER_REPLICATE;
    bufStep = startY = startY0 = endY = rowCount = dstY = 0;
    maxWidth = 0;

    wholeSize = Size(-1,-1);
}


FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D,
                            const Ptr<BaseRowFilter>& _rowFilter,
                            const Ptr<BaseColumnFilter>& _columnFilter,
                            int _srcType, int _dstType, int _bufType,
                            int _rowBorderType, int _columnBorderType,
                            const Scalar& _borderValue )
{
    init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType,
         _rowBorderType, _columnBorderType, _borderValue);
}

FilterEngine::~FilterEngine()
{
}


void FilterEngine::init( const Ptr<BaseFilter>& _filter2D,
                         const Ptr<BaseRowFilter>& _rowFilter,
                         const Ptr<BaseColumnFilter>& _columnFilter,
                         int _srcType, int _dstType, int _bufType,
                         int _rowBorderType, int _columnBorderType,
                         const Scalar& _borderValue )
{
    _srcType = CV_MAT_TYPE(_srcType);
    _bufType = CV_MAT_TYPE(_bufType);
    _dstType = CV_MAT_TYPE(_dstType);

    srcType = _srcType;
    int srcElemSize = (int)getElemSize(srcType);
    dstType = _dstType;
    bufType = _bufType;

    filter2D = _filter2D;
    rowFilter = _rowFilter;
    columnFilter = _columnFilter;

    if( _columnBorderType < 0 )
        _columnBorderType = _rowBorderType;

    rowBorderType = _rowBorderType;
    columnBorderType = _columnBorderType;

    CV_Assert( columnBorderType != BORDER_WRAP );

    if( isSeparable() )
    {
        CV_Assert( rowFilter && columnFilter );
        ksize = Size(rowFilter->ksize, columnFilter->ksize);
        anchor = Point(rowFilter->anchor, columnFilter->anchor);
    }
    else
    {
        CV_Assert( bufType == srcType );
        ksize = filter2D->ksize;
        anchor = filter2D->anchor;
    }

    CV_Assert( 0 <= anchor.x && anchor.x < ksize.width &&
               0 <= anchor.y && anchor.y < ksize.height );

    borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1);
    int borderLength = std::max(ksize.width - 1, 1);
    borderTab.resize(borderLength*borderElemSize);

    maxWidth = bufStep = 0;
    constBorderRow.clear();

    if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT )
    {
        constBorderValue.resize(srcElemSize*borderLength);
        int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4));
        scalarToRawData(_borderValue, &constBorderValue[0], srcType1,
                        borderLength*CV_MAT_CN(srcType));
    }

    wholeSize = Size(-1,-1);
}

#define VEC_ALIGN CV_MALLOC_ALIGN

int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows)
{
    int i, j;

    wholeSize = _wholeSize;
    roi = _roi;
    CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 &&
               roi.x + roi.width <= wholeSize.width &&
               roi.y + roi.height <= wholeSize.height );

    int esz = (int)getElemSize(srcType);
    int bufElemSize = (int)getElemSize(bufType);
    const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0;

    if( _maxBufRows < 0 )
        _maxBufRows = ksize.height + 3;
    _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1);

    if( maxWidth < roi.width || _maxBufRows != (int)rows.size() )
    {
        rows.resize(_maxBufRows);
        maxWidth = std::max(maxWidth, roi.width);
        int cn = CV_MAT_CN(srcType);
        srcRow.resize(esz*(maxWidth + ksize.width - 1));
        if( columnBorderType == BORDER_CONSTANT )
        {
            constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN));
            uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst;
            int n = (int)constBorderValue.size(), N;
            N = (maxWidth + ksize.width - 1)*esz;
            tdst = isSeparable() ? &srcRow[0] : dst;

            for( i = 0; i < N; i += n )
            {
                n = std::min( n, N - i );
                for(j = 0; j < n; j++)
                    tdst[i+j] = constVal[j];
            }

            if( isSeparable() )
                (*rowFilter)(&srcRow[0], dst, maxWidth, cn);
        }

        int maxBufStep = bufElemSize*(int)alignSize(maxWidth +
            (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN);
        ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN);
    }

    // adjust bufstep so that the used part of the ring buffer stays compact in memory
    bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16);

    dx1 = std::max(anchor.x - roi.x, 0);
    dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0);

    // recompute border tables
    if( dx1 > 0 || dx2 > 0 )
    {
        if( rowBorderType == BORDER_CONSTANT )
        {
            int nr = isSeparable() ? 1 : (int)rows.size();
            for( i = 0; i < nr; i++ )
            {
                uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i;
                memcpy( dst, constVal, dx1*esz );
                memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz );
            }
        }
        else
        {
            int xofs1 = std::min(roi.x, anchor.x) - roi.x;

            int btab_esz = borderElemSize, wholeWidth = wholeSize.width;
            int* btab = (int*)&borderTab[0];

            for( i = 0; i < dx1; i++ )
            {
                int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[i*btab_esz + j] = p0 + j;
            }

            for( i = 0; i < dx2; i++ )
            {
                int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz;
                for( j = 0; j < btab_esz; j++ )
                    btab[(i + dx1)*btab_esz + j] = p0 + j;
            }
        }
    }

    rowCount = dstY = 0;
    startY = startY0 = std::max(roi.y - anchor.y, 0);
    endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height);
    if( columnFilter )
        columnFilter->reset();
    if( filter2D )
        filter2D->reset();

    return startY;
}


int FilterEngine::start(const Mat& src, const Rect& _srcRoi,
                        bool isolated, int maxBufRows)
{
    Rect srcRoi = _srcRoi;

    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 &&
               srcRoi.width >= 0 && srcRoi.height >= 0 &&
               srcRoi.x + srcRoi.width <= src.cols &&
               srcRoi.y + srcRoi.height <= src.rows );

    Point ofs;
    Size wsz(src.cols, src.rows);
    if( !isolated )
        src.locateROI( wsz, ofs );
    start( wsz, srcRoi + ofs, maxBufRows );

    return startY - ofs.y;
}


int FilterEngine::remainingInputRows() const
{
    return endY - startY - rowCount;
}

int FilterEngine::remainingOutputRows() const
{
    return roi.height - dstY;
}

int FilterEngine::proceed( const uchar* src, int srcstep, int count,
                           uchar* dst, int dststep )
{
    CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 );

    const int *btab = &borderTab[0];
    int esz = (int)getElemSize(srcType), btab_esz = borderElemSize;
    uchar** brows = &rows[0];
    int bufRows = (int)rows.size();
    int cn = CV_MAT_CN(bufType);
    int width = roi.width, kwidth = ksize.width;
    int kheight = ksize.height, ay = anchor.y;
    int _dx1 = dx1, _dx2 = dx2;
    int width1 = roi.width + kwidth - 1;
    int xofs1 = std::min(roi.x, anchor.x);
    bool isSep = isSeparable();
    bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT;
    int dy = 0, i = 0;

    src -= xofs1*esz;
    count = std::min(count, remainingInputRows());

    CV_Assert( src && dst && count > 0 );

    for(;; dst += dststep*i, dy += i)
    {
        int dcount = bufRows - ay - startY - rowCount + roi.y;
        dcount = dcount > 0 ? dcount : bufRows - kheight + 1;
        dcount = std::min(dcount, count);
        count -= dcount;
        for( ; dcount-- > 0; src += srcstep )
        {
            int bi = (startY - startY0 + rowCount) % bufRows;
            uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            uchar* row = isSep ? &srcRow[0] : brow;

            if( ++rowCount > bufRows )
            {
                --rowCount;
                ++startY;
            }

            memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz );

            if( makeBorder )
            {
                if( btab_esz*(int)sizeof(int) == esz )
                {
                    const int* isrc = (const int*)src;
                    int* irow = (int*)row;

                    for( i = 0; i < _dx1*btab_esz; i++ )
                        irow[i] = isrc[btab[i]];
                    for( i = 0; i < _dx2*btab_esz; i++ )
                        irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]];
                }
                else
                {
                    for( i = 0; i < _dx1*esz; i++ )
                        row[i] = src[btab[i]];
                    for( i = 0; i < _dx2*esz; i++ )
                        row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]];
                }
            }

            if( isSep )
                (*rowFilter)(row, brow, width, CV_MAT_CN(srcType));
        }

        int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1));
        for( i = 0; i < max_i; i++ )
        {
            int srcY = borderInterpolate(dstY + dy + i + roi.y - ay,
                                         wholeSize.height, columnBorderType);
            if( srcY < 0 ) // can happen only with constant border type
                brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN);
            else
            {
                CV_Assert( srcY >= startY );
                if( srcY >= startY + rowCount )
                    break;
                int bi = (srcY - startY0) % bufRows;
                brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep;
            }
        }
        if( i < kheight )
            break;
        i -= kheight - 1;
        if( isSeparable() )
            (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn);
        else
            (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn);
    }

    dstY += dy;
    CV_Assert( dstY <= roi.height );
    return dy;
}
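
// FilterEngine is normally driven through apply() below, which feeds the whole
// ROI to proceed() in a single call. The start()/proceed() pair can also be
// used directly to stream a large image through the filter in horizontal
// strips. A minimal sketch of that protocol (illustrative only, hence the
// "#if 0"; it assumes "engine" was initialized with srcType == src.type() and
// dstType == dst.type(), dst is preallocated, and start() is called with its
// default ROI/isolated/maxBufRows arguments from the header):
#if 0
static void filterInStrips( FilterEngine& engine, const Mat& src, Mat& dst )
{
    int y = engine.start(src);  // index of the first source row the engine consumes
    const int stripRows = 32;   // hypothetical strip height
    int dstY = 0;
    while( engine.remainingInputRows() > 0 )
    {
        int count = std::min(stripRows, engine.remainingInputRows());
        dstY += engine.proceed( src.ptr(y), (int)src.step, count,
                                dst.ptr(dstY), (int)dst.step );
        y += count;
    }
}
#endif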


void FilterEngine::apply(const Mat& src, Mat& dst,
                         const Rect& _srcRoi, Point dstOfs, bool isolated)
{
    CV_Assert( src.type() == srcType && dst.type() == dstType );

    Rect srcRoi = _srcRoi;
    if( srcRoi == Rect(0,0,-1,-1) )
        srcRoi = Rect(0,0,src.cols,src.rows);

    if( srcRoi.area() == 0 )
        return;

    CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 &&
               dstOfs.x + srcRoi.width <= dst.cols &&
               dstOfs.y + srcRoi.height <= dst.rows );

    int y = start(src, srcRoi, isolated);
    proceed( src.ptr() + y*src.step + srcRoi.x*src.elemSize(),
             (int)src.step, endY - startY,
             dst.ptr(dstOfs.y) +
             dstOfs.x*dst.elemSize(), (int)dst.step );
}
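
// Note on apply() semantics: the filtered srcRoi is written into dst starting
// at dstOfs, i.e. dst pixel (dstOfs.x + x, dstOfs.y + y) corresponds to src
// pixel (srcRoi.x + x, srcRoi.y + y). With isolated == false, start() locates
// the parent image via locateROI(), so real pixels outside srcRoi are used for
// the border area where they exist, instead of synthesized border values.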

}

/****************************************************************************************\
*                                Separable linear filter                                 *
\****************************************************************************************/

int cv::getKernelType(InputArray filter_kernel, Point anchor)
{
    Mat _kernel = filter_kernel.getMat();
    CV_Assert( _kernel.channels() == 1 );
    int i, sz = _kernel.rows*_kernel.cols;

    Mat kernel;
    _kernel.convertTo(kernel, CV_64F);

    const double* coeffs = kernel.ptr<double>();
    double sum = 0;
    int type = KERNEL_SMOOTH + KERNEL_INTEGER;
    if( (_kernel.rows == 1 || _kernel.cols == 1) &&
        anchor.x*2 + 1 == _kernel.cols &&
        anchor.y*2 + 1 == _kernel.rows )
        type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL);

    for( i = 0; i < sz; i++ )
    {
        double a = coeffs[i], b = coeffs[sz - i - 1];
        if( a != b )
            type &= ~KERNEL_SYMMETRICAL;
        if( a != -b )
            type &= ~KERNEL_ASYMMETRICAL;
        if( a < 0 )
            type &= ~KERNEL_SMOOTH;
        if( a != saturate_cast<int>(a) )
            type &= ~KERNEL_INTEGER;
        sum += a;
    }

    if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) )
        type &= ~KERNEL_SMOOTH;
    return type;
}
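
// A small usage sketch (illustrative only, kept out of the build): for the
// normalized smoothing kernel [0.25 0.5 0.25] with the anchor in the middle,
// getKernelType() reports a smooth, symmetrical, non-integer kernel.
#if 0
static void kernelTypeExample()
{
    float k[] = { 0.25f, 0.5f, 0.25f };
    cv::Mat kernel(1, 3, CV_32F, k);
    int type = cv::getKernelType(kernel, cv::Point(1, 0));
    CV_Assert( (type & cv::KERNEL_SYMMETRICAL) != 0 );  // k[i] == k[ksize-1-i]
    CV_Assert( (type & cv::KERNEL_SMOOTH) != 0 );       // non-negative, sums to 1
    CV_Assert( (type & cv::KERNEL_INTEGER) == 0 );      // 0.25 is not integral
}
#endif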


namespace cv
{

struct RowNoVec
{
    RowNoVec() {}
    RowNoVec(const Mat&) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct ColumnNoVec
{
    ColumnNoVec() {}
    ColumnNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct SymmRowSmallNoVec
{
    SymmRowSmallNoVec() {}
    SymmRowSmallNoVec(const Mat&, int) {}
    int operator()(const uchar*, uchar*, int, int) const { return 0; }
};

struct SymmColumnSmallNoVec
{
    SymmColumnSmallNoVec() {}
    SymmColumnSmallNoVec(const Mat&, int, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};

struct FilterNoVec
{
    FilterNoVec() {}
    FilterNoVec(const Mat&, int, double) {}
    int operator()(const uchar**, uchar*, int) const { return 0; }
};
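
// All of these row/column "vector operators" share one protocol: operator()
// processes as many leading elements as its SIMD path allows and returns that
// count; the caller finishes the remainder with scalar code. The No-Vec
// variants above return 0, i.e. "everything is the tail". A sketch of the
// calling side for a 32f row filter (illustrative only; the functor type and
// the scalar loop are hypothetical stand-ins for the real filter classes):
#if 0
template<class VecOp>
static void rowFilterWithVecOp( const float* src, float* dst, int width, int cn,
                                const float* kx, int ksize, const VecOp& vecOp )
{
    int i = vecOp((const uchar*)src, (uchar*)dst, width, cn); // SIMD head
    width *= cn;
    for( ; i < width; i++ )                                   // scalar tail
    {
        float s = 0;
        for( int k = 0; k < ksize; k++ )
            s += kx[k]*src[i + k*cn];
        dst[i] = s;
    }
}
#endif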


#if CV_SSE2

///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////

struct RowVec_8u32s
{
    RowVec_8u32s() { smallValues = false; }
    RowVec_8u32s( const Mat& _kernel )
    {
        kernel = _kernel;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        const int* _kx = kernel.ptr<int>();
        width *= cn;

        if( smallValues )
        {
            for( ; i <= width - 16; i += 16 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z;
                __m128i x0, x1, x2, x3;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_loadu_si128((const __m128i*)src);
                    x2 = _mm_unpackhi_epi8(x0, z);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x3 = _mm_mulhi_epi16(x2, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    x2 = _mm_mullo_epi16(x2, f);

                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                    s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1));
                    s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3));
                    s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3));
                }

                _mm_store_si128((__m128i*)(dst + i), s0);
                _mm_store_si128((__m128i*)(dst + i + 4), s1);
                _mm_store_si128((__m128i*)(dst + i + 8), s2);
                _mm_store_si128((__m128i*)(dst + i + 12), s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                const uchar* src = _src + i;
                __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1;

                for( k = 0; k < _ksize; k++, src += cn )
                {
                    f = _mm_cvtsi32_si128(_kx[k]);
                    f = _mm_shuffle_epi32(f, 0);
                    f = _mm_packs_epi32(f, f);

                    x0 = _mm_cvtsi32_si128(*(const int*)src);
                    x0 = _mm_unpacklo_epi8(x0, z);
                    x1 = _mm_mulhi_epi16(x0, f);
                    x0 = _mm_mullo_epi16(x0, f);
                    s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
                }
                _mm_store_si128((__m128i*)(dst + i), s0);
            }
        }
        return i;
    }

    Mat kernel;
    bool smallValues;
};


struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        __m128i z = _mm_setzero_si128();
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
                        y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
                        _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y0 = _mm_unpackhi_epi8(x0, z);
                        x0 = _mm_unpacklo_epi8(x0, z);
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);
                        y2 = _mm_unpackhi_epi8(x2, z);
                        x2 = _mm_unpacklo_epi8(x2, z);
                        x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
                        y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                            k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                            k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                    k0 = _mm_packs_epi32(k0, k0);
                    k1 = _mm_packs_epi32(k1, k1);
                    k2 = _mm_packs_epi32(k2, k2);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src - cn));
                        x1 = _mm_loadu_si128((__m128i*)src);
                        x2 = _mm_loadu_si128((__m128i*)(src + cn));
                        y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                        x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));
                        y1 = _mm_unpackhi_epi8(x1, z);
                        x1 = _mm_unpacklo_epi8(x1, z);

                        t1 = _mm_mulhi_epi16(x1, k0);
                        t0 = _mm_mullo_epi16(x1, k0);
                        x2 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(t0, t1);
                        z1 = _mm_unpackhi_epi16(t0, t1);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2));

                        t1 = _mm_mulhi_epi16(y1, k0);
                        t0 = _mm_mullo_epi16(y1, k0);
                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(t0, t1);
                        z3 = _mm_unpackhi_epi16(t0, t1);
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
                        x1 = _mm_loadu_si128((__m128i*)(src + cn*2));
                        y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        t1 = _mm_mulhi_epi16(y0, k2);
                        t0 = _mm_mullo_epi16(y0, k2);
                        y0 = _mm_mullo_epi16(y1, k2);
                        y1 = _mm_mulhi_epi16(y1, k2);
                        z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                        z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                        z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                        z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
                        _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
                        _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
                        _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
                    }
                else
                {
                    __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
                    k1 = _mm_packs_epi32(k1, k1);

                    for( ; i <= width - 16; i += 16, src += 16 )
                    {
                        __m128i x0, x1, y0, y1, z0, z1, z2, z3;
                        x0 = _mm_loadu_si128((__m128i*)(src + cn));
                        x1 = _mm_loadu_si128((__m128i*)(src - cn));
                        y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                        x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                        x1 = _mm_mulhi_epi16(x0, k1);
                        x0 = _mm_mullo_epi16(x0, k1);
                        z0 = _mm_unpacklo_epi16(x0, x1);
                        z1 = _mm_unpackhi_epi16(x0, x1);

                        y1 = _mm_mulhi_epi16(y0, k1);
                        y0 = _mm_mullo_epi16(y0, k1);
                        z2 = _mm_unpacklo_epi16(y0, y1);
                        z3 = _mm_unpackhi_epi16(y0, y1);
                        _mm_store_si128((__m128i*)(dst + i), z0);
                        _mm_store_si128((__m128i*)(dst + i + 4), z1);
                        _mm_store_si128((__m128i*)(dst + i + 8), z2);
                        _mm_store_si128((__m128i*)(dst + i + 12), z3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
                        k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
                        k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
                k0 = _mm_packs_epi32(k0, k0);
                k1 = _mm_packs_epi32(k1, k1);
                k2 = _mm_packs_epi32(k2, k2);

                for( ; i <= width - 16; i += 16, src += 16 )
                {
                    __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3;
                    x0 = _mm_loadu_si128((__m128i*)(src + cn));
                    x2 = _mm_loadu_si128((__m128i*)(src - cn));
                    y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z));
                    x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z));

                    x2 = _mm_mulhi_epi16(x0, k1);
                    x0 = _mm_mullo_epi16(x0, k1);
                    z0 = _mm_unpacklo_epi16(x0, x2);
                    z1 = _mm_unpackhi_epi16(x0, x2);
                    y1 = _mm_mulhi_epi16(y0, k1);
                    y0 = _mm_mullo_epi16(y0, k1);
                    z2 = _mm_unpacklo_epi16(y0, y1);
                    z3 = _mm_unpackhi_epi16(y0, y1);

                    x0 = _mm_loadu_si128((__m128i*)(src + cn*2));
                    x1 = _mm_loadu_si128((__m128i*)(src - cn*2));
                    y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
                    y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));

                    t1 = _mm_mulhi_epi16(y0, k2);
                    t0 = _mm_mullo_epi16(y0, k2);
                    y0 = _mm_mullo_epi16(y1, k2);
                    y1 = _mm_mulhi_epi16(y1, k2);
                    z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1));
                    z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1));
                    z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1));
                    z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1));

                    _mm_store_si128((__m128i*)(dst + i), z0);
                    _mm_store_si128((__m128i*)(dst + i + 4), z1);
                    _mm_store_si128((__m128i*)(dst + i + 8), z2);
                    _mm_store_si128((__m128i*)(dst + i + 12), z3);
                }
            }
        }

        src -= (_ksize/2)*cn;
        kx -= _ksize/2;
        for( ; i <= width - 4; i += 4, src += 4 )
        {
            __m128i f, s0 = z, x0, x1;

            for( k = j = 0; k < _ksize; k++, j += cn )
            {
                f = _mm_cvtsi32_si128(kx[k]);
                f = _mm_shuffle_epi32(f, 0);
                f = _mm_packs_epi32(f, f);

                x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
                x0 = _mm_unpacklo_epi8(x0, z);
                x1 = _mm_mulhi_epi16(x0, f);
                x0 = _mm_mullo_epi16(x0, f);
                s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1));
            }
            _mm_store_si128((__m128i*)(dst + i), s0);
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};


struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const __m128i *S, *S2;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128i x0, x1;
                S = (const __m128i*)(src[0] + i);
                s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
                s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
                s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128i x0;
                __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128i x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                    x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
                    x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
                }

                x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
                x0 = _mm_packus_epi16(x0, x1);
                _mm_storeu_si128((__m128i*)(dst + i), x0);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, s0 = d4;
                __m128i x0;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = (const __m128i*)(src[k] + i);
                    S2 = (const __m128i*)(src[-k] + i);
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
                }

                x0 = _mm_cvtps_epi32(s0);
                x0 = _mm_packs_epi32(x0, x0);
                x0 = _mm_packus_epi16(x0, x0);
                *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};
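
// Note on the fixed-point pipeline (informational): the 8u->32s row filters
// above accumulate with an integer kernel that is pre-scaled by 2^_bits, so
// the column stage folds that scale back out by converting the kernel to
// float with factor 1/(1 << _bits) and dividing delta the same way. For
// example, with _bits == 8 an integer tap of 64 in the scaled kernel stands
// for 64/256 = 0.25 in the original floating-point kernel.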


struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        __m128 df4 = _mm_set1_ps(delta);
        __m128i d4 = _mm_cvtps_epi32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_si128((__m128i*)(S0 + i));
                    s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S1 + i));
                    s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
                    s4 = _mm_load_si128((__m128i*)(S2 + i));
                    s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
                    s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
                    s0 = _mm_add_epi32(s0, d4);
                    s1 = _mm_add_epi32(s1, d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1;
                    s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
                    s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
                    __m128i x0, x1;
                    x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
                                       _mm_load_si128((__m128i*)(S2 + i)));
                    x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
                                       _mm_load_si128((__m128i*)(S2 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128i s0, s1, s2, s3;
                    s0 = _mm_load_si128((__m128i*)(S2 + i));
                    s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
                    s2 = _mm_load_si128((__m128i*)(S0 + i));
                    s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
                    s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
                    s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
                    _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = df4, s1 = df4;
                    __m128i x0, x1;
                    x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
                                       _mm_load_si128((__m128i*)(S0 + i)));
                    x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
                                       _mm_load_si128((__m128i*)(S0 + i + 4)));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
                    x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
                    _mm_storeu_si128((__m128i*)(dst + i), x0);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};


/////////////////////////////////////// 16s //////////////////////////////////

struct RowVec_16s32f
{
    RowVec_16s32f() {}
    RowVec_16s32f( const Mat& _kernel )
    {
        kernel = _kernel;
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !sse2_supported )
            return 0;

        int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const short* src = (const short*)_src + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                __m128i x0i = _mm_loadu_si128((const __m128i*)src);
                __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
                x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
                x0 = _mm_cvtepi32_ps(x0i);
                x1 = _mm_cvtepi32_ps(x1i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool sse2_supported;
};


struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !sse2_supported )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }
        else
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
                __m128 x0, x1;

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                __m128i s1i = _mm_cvtps_epi32(s1);
                __m128i s2i = _mm_cvtps_epi32(s2);
                __m128i s3i = _mm_cvtps_epi32(s3);

                _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
                _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f, x0, s0 = d4;

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                }

                __m128i s0i = _mm_cvtps_epi32(s0);
                _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
    bool sse2_supported;
};


/////////////////////////////////////// 32f //////////////////////////////////

struct RowVec_32f
{
    RowVec_32f()
    {
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
    }

    RowVec_32f( const Mat& _kernel )
    {
        kernel = _kernel;
        haveSSE = checkHardwareSupport(CV_CPU_SSE);
#if defined USE_IPP_SEP_FILTERS && 0
        bufsz = -1;
#endif
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
#if defined USE_IPP_SEP_FILTERS && 0
        CV_IPP_CHECK()
        {
            int ret = ippiOperator(_src, _dst, width, cn);
            if (ret > 0)
                return ret;
        }
#endif
        int _ksize = kernel.rows + kernel.cols - 1;
        const float* src0 = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();

        if( !haveSSE )
            return 0;

        int i = 0, k;
        width *= cn;

        for( ; i <= width - 8; i += 8 )
        {
            const float* src = src0 + i;
            __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
            for( k = 0; k < _ksize; k++, src += cn )
            {
                f = _mm_load_ss(_kx+k);
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_ps(src);
                x1 = _mm_loadu_ps(src + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
            }
            _mm_store_ps(dst + i, s0);
            _mm_store_ps(dst + i + 4, s1);
        }
        return i;
    }

    Mat kernel;
    bool haveSSE;
#if defined USE_IPP_SEP_FILTERS && 0
private:
    mutable int bufsz;
    int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int _ksize = kernel.rows + kernel.cols - 1;
        if ((1 != cn && 3 != cn) || width < _ksize*8)
            return 0;

        const float* src = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = (const float*)kernel.data;

        IppiSize roisz = { width, 1 };
        if( bufsz < 0 )
        {
            if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) ||
                (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0))
                return 0;
        }
        AutoBuffer<uchar> buf(bufsz + 64);
        uchar* bufptr = alignPtr((uchar*)buf, 32);
        int step = (int)(width*sizeof(dst[0])*cn);
        float borderValue[] = {0.f, 0.f, 0.f};
        // here is the trick. IPP needs border type and extrapolates the row. We did it already.
        // So we pass anchor=0 and ignore the right tail of results since they are incorrect there.
        if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue[0], bufptr) < 0) ||
            (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src, step, &dst, roisz, _kx, _ksize, 0,
                                                            ippBorderRepl, borderValue, bufptr) < 0))
        {
            setIppErrorStatus();
            return 0;
        }
        CV_IMPL_ADD(CV_IMPL_IPP);
        return width - _ksize + 1;
    }
#endif
};
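
// Note: the "#if defined USE_IPP_SEP_FILTERS && 0" guards above keep the IPP
// row-filtering path compiled out; the trailing "&& 0" unconditionally
// disables the branch while the code is kept around for reference.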


struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = kernel.ptr<float>() + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
                        y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn*2);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn*2);
                        y0 = _mm_loadu_ps(src - cn*2 + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn*2 + 4);
                        x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
                        y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x1, x2, y0, y1, y2;
                        x0 = _mm_loadu_ps(src - cn);
                        x1 = _mm_loadu_ps(src);
                        x2 = _mm_loadu_ps(src + cn);
                        y0 = _mm_loadu_ps(src - cn + 4);
                        y1 = _mm_loadu_ps(src + 4);
                        y2 = _mm_loadu_ps(src + cn + 4);

                        x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));

                        x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                        y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                        x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                        y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);
                        x0 = _mm_sub_ps(x0, x2);
                        y0 = _mm_sub_ps(y0, y2);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                else
                {
                    __m128 k1 = _mm_set1_ps(kx[1]);
                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        __m128 x0, x2, y0, y2;
                        x0 = _mm_loadu_ps(src + cn);
                        x2 = _mm_loadu_ps(src - cn);
                        y0 = _mm_loadu_ps(src + cn + 4);
                        y2 = _mm_loadu_ps(src - cn + 4);

                        x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                        y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
                        _mm_store_ps(dst + i, x0);
                        _mm_store_ps(dst + i + 4, y0);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    __m128 x0, x2, y0, y2;
                    x0 = _mm_loadu_ps(src + cn);
                    x2 = _mm_loadu_ps(src - cn);
                    y0 = _mm_loadu_ps(src + cn + 4);
                    y2 = _mm_loadu_ps(src - cn + 4);

                    x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
                    y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);

                    x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
                    y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
                    x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
                    y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));

                    _mm_store_ps(dst + i, x0);
                    _mm_store_ps(dst + i + 4, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};


struct SymmColumnVec_32f
{
    SymmColumnVec_32f() { symmetryType=0; }
    SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            for( ; i <= width - 16; i += 16 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 s0, s1, s2, s3;
                __m128 x0, x1;
                S = src[0] + i;
                s0 = _mm_load_ps(S);
                s1 = _mm_load_ps(S+4);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
                s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
                s2 = _mm_load_ps(S+8);
                s3 = _mm_load_ps(S+12);
                s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
                s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
                    x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
                    x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
                    x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
                    x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
                    s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
                    s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
                }

                _mm_storeu_ps(dst + i, s0);
                _mm_storeu_ps(dst + i + 4, s1);
                _mm_storeu_ps(dst + i + 8, s2);
                _mm_storeu_ps(dst + i + 12, s3);
            }

            for( ; i <= width - 4; i += 4 )
            {
                __m128 f = _mm_load_ss(ky);
                f = _mm_shuffle_ps(f, f, 0);
                __m128 x0, s0 = _mm_load_ps(src[0] + i);
                s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

                for( k = 1; k <= ksize2; k++ )
                {
                    f = _mm_load_ss(ky+k);
                    f = _mm_shuffle_ps(f, f, 0);
1776 S = src[k] + i;
1777 S2 = src[-k] + i;
1778 x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1779 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1780 }
1781
1782 _mm_storeu_ps(dst + i, s0);
1783 }
1784 }
1785 else
1786 {
1787 for( ; i <= width - 16; i += 16 )
1788 {
1789 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
1790 __m128 x0, x1;
1791 S = src[0] + i;
1792
1793 for( k = 1; k <= ksize2; k++ )
1794 {
1795 S = src[k] + i;
1796 S2 = src[-k] + i;
1797 f = _mm_load_ss(ky+k);
1798 f = _mm_shuffle_ps(f, f, 0);
1799 x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
1800 x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
1801 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1802 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
1803 x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
1804 x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
1805 s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
1806 s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
1807 }
1808
1809 _mm_storeu_ps(dst + i, s0);
1810 _mm_storeu_ps(dst + i + 4, s1);
1811 _mm_storeu_ps(dst + i + 8, s2);
1812 _mm_storeu_ps(dst + i + 12, s3);
1813 }
1814
1815 for( ; i <= width - 4; i += 4 )
1816 {
1817 __m128 f, x0, s0 = d4;
1818
1819 for( k = 1; k <= ksize2; k++ )
1820 {
1821 f = _mm_load_ss(ky+k);
1822 f = _mm_shuffle_ps(f, f, 0);
1823 x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
1824 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
1825 }
1826
1827 _mm_storeu_ps(dst + i, s0);
1828 }
1829 }
1830
1831 return i;
1832 }
1833
1834 int symmetryType;
1835 float delta;
1836 Mat kernel;
1837 };
1838
1839
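// SymmColumnSmallVec_32f: 3-tap specialization of the vertical pass, with
// S0/S1/S2 denoting the rows above/at/below the anchor. The ubiquitous
// (1,2,1) and (1,-2,1) kernels are handled without any multiplications.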
struct SymmColumnSmallVec_32f
{
    SymmColumnSmallVec_32f() { symmetryType=0; }
    SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        float* dst = (float*)_dst;
        __m128 d4 = _mm_set1_ps(delta);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3, s4, s5;
                    s0 = _mm_load_ps(S0 + i);
                    s1 = _mm_load_ps(S0 + i + 4);
                    s2 = _mm_load_ps(S1 + i);
                    s3 = _mm_load_ps(S1 + i + 4);
                    s4 = _mm_load_ps(S2 + i);
                    s5 = _mm_load_ps(S2 + i + 4);
                    s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
                    s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
                    s0 = _mm_add_ps(s0, d4);
                    s1 = _mm_add_ps(s1, d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, x0, x1;
                    s0 = _mm_load_ps(S1 + i);
                    s1 = _mm_load_ps(S1 + i + 4);
                    s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
                    s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
                    x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
                    x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0, s1, s2, s3;
                    s0 = _mm_load_ps(S2 + i);
                    s1 = _mm_load_ps(S2 + i + 4);
                    s2 = _mm_load_ps(S0 + i);
                    s3 = _mm_load_ps(S0 + i + 4);
                    s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
                    s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
            else
            {
                __m128 k1 = _mm_set1_ps(ky[1]);
                for( ; i <= width - 8; i += 8 )
                {
                    __m128 s0 = d4, s1 = d4, x0, x1;
                    x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
                    x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
                    s0 = _mm_add_ps(s0, _mm_mul_ps(x0, k1));
                    s1 = _mm_add_ps(s1, _mm_mul_ps(x1, k1));
                    _mm_storeu_ps(dst + i, s0);
                    _mm_storeu_ps(dst + i + 4, s1);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

/////////////////////////////// non-separable filters ///////////////////////////////

///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////

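// FilterVec_8u: SSE2 2D (non-separable) 8u->8u filter. preprocess2DKernel()
// has flattened the kernel into its nonzero taps (coeffs), and the caller
// supplies one source pointer per tap, so the inner loop is a plain dot
// product; sums are kept in float, delta-biased, and packed back to 8u
// with saturation.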
struct FilterVec_8u
{
    FilterVec_8u() {}
    FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            x0 = _mm_packus_epi16(x0, x1);
            _mm_storeu_si128((__m128i*)(dst + i), x0);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};

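// FilterVec_8u16s: same accumulation scheme as FilterVec_8u, but the results
// are packed to signed 16-bit (CV_16S) output instead of 8-bit.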
struct FilterVec_8u16s
{
    FilterVec_8u16s() {}
    FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
    {
        Mat kernel;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        std::vector<Point> coords;
        preprocess2DKernel(kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            __m128i x0, x1, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
                x1 = _mm_unpackhi_epi8(x0, z);
                x0 = _mm_unpacklo_epi8(x0, z);

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
                t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
            x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
            _mm_storeu_si128((__m128i*)(dst + i), x0);
            _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;
            __m128i x0, z = _mm_setzero_si128();

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);

                x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
                x0 = _mm_unpacklo_epi8(x0, z);
                t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }

            x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
            _mm_storel_epi64((__m128i*)(dst + i), x0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};

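// FilterVec_32f: float-to-float variant of the non-separable filter; no
// unpacking is needed, just a multiply-accumulate over the nonzero taps,
// 16 and then 4 elements per iteration.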
struct FilterVec_32f
{
    FilterVec_32f() {}
    FilterVec_32f(const Mat& _kernel, int, double _delta)
    {
        delta = (float)_delta;
        std::vector<Point> coords;
        preprocess2DKernel(_kernel, coords, coeffs);
        _nz = (int)coords.size();
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float* kf = (const float*)&coeffs[0];
        const float** src = (const float**)_src;
        float* dst = (float*)_dst;
        int i = 0, k, nz = _nz;
        __m128 d4 = _mm_set1_ps(delta);

        for( ; i <= width - 16; i += 16 )
        {
            __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0, t1;
                f = _mm_shuffle_ps(f, f, 0);
                const float* S = src[k] + i;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }

        for( ; i <= width - 4; i += 4 )
        {
            __m128 s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                __m128 f = _mm_load_ss(kf+k), t0;
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src[k] + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }

        return i;
    }

    int _nz;
    std::vector<uchar> coeffs;
    float delta;
};

#elif CV_NEON

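// NEON counterparts of the SSE path above. SymmRowSmallVec_8u32s vectorizes
// the horizontal pass for small (anti)symmetric integer kernels; it bails
// out (returns 0, i.e. "use the scalar loop") whenever a coefficient does
// not fit into 16 bits or a particular kernel shape has no NEON fast path.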
struct SymmRowSmallVec_8u32s
{
    SymmRowSmallVec_8u32s() { smallValues = false; }
    SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
        smallValues = true;
        int k, ksize = kernel.rows + kernel.cols - 1;
        for( k = 0; k < ksize; k++ )
        {
            int v = kernel.ptr<int>()[k];
            if( v < SHRT_MIN || v > SHRT_MAX )
            {
                smallValues = false;
                break;
            }
        }
    }

    int operator()(const uchar* src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        int* dst = (int*)_dst;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int* kx = kernel.ptr<int>() + _ksize/2;
        if( !smallValues )
            return 0;

        src += (_ksize/2)*cn;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                {
                    uint16x8_t zq = vdupq_n_u16(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        uint16x8_t y0, y1, y2;
                        y0 = vaddl_u8(x0, x2);
                        y1 = vshll_n_u8(x1, 1);
                        y2 = vaddq_u16(y0, y1);

                        uint16x8x2_t str;
                        str.val[0] = y2; str.val[1] = zq;
                        vst2q_u16( (uint16_t *) (dst + i), str );
                    }
                }
                else if( kx[0] == -2 && kx[1] == 1 )
                    return 0;
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx, k32, 0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0, y1;
                        int32x4_t y2, y3;
                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                        y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
                        y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
                        y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
                        y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);

                        vst1q_s32((int32_t *)(dst + i), y2);
                        vst1q_s32((int32_t *)(dst + i + 4), y3);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    return 0;
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx, k32, 0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);
                    k32 = vld1q_lane_s32(kx + 2, k32, 2);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1, x2, x3, x4;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src) );
                        x2 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0, y1;
                        int32x4_t accl, acch;
                        y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
                        y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
                        accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
                        accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
                        acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
                        acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);

                        int16x8_t y2;
                        x3 = vld1_u8( (uint8_t *) (src - cn*2) );
                        x4 = vld1_u8( (uint8_t *) (src + cn*2) );
                        y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
                        accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
                        acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);

                        vst1q_s32((int32_t *)(dst + i), accl);
                        vst1q_s32((int32_t *)(dst + i + 4), acch);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                {
                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0;
                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                       vreinterpretq_s16_u16(vaddl_u8(x0, z)));

                        vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
                        vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
                    }
                }
                else
                {
                    int32x4_t k32 = vdupq_n_s32(0);
                    k32 = vld1q_lane_s32(kx + 1, k32, 1);

                    int16x4_t k = vqmovn_s32(k32);

                    uint8x8_t z = vdup_n_u8(0);

                    for( ; i <= width - 8; i += 8, src += 8 )
                    {
                        uint8x8_t x0, x1;
                        x0 = vld1_u8( (uint8_t *) (src - cn) );
                        x1 = vld1_u8( (uint8_t *) (src + cn) );

                        int16x8_t y0;
                        int32x4_t y1, y2;
                        y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                       vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                        y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
                        y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);

                        vst1q_s32((int32_t *)(dst + i), y1);
                        vst1q_s32((int32_t *)(dst + i + 4), y2);
                    }
                }
            }
            else if( _ksize == 5 )
            {
                int32x4_t k32 = vdupq_n_s32(0);
                k32 = vld1q_lane_s32(kx + 1, k32, 1);
                k32 = vld1q_lane_s32(kx + 2, k32, 2);

                int16x4_t k = vqmovn_s32(k32);

                uint8x8_t z = vdup_n_u8(0);

                for( ; i <= width - 8; i += 8, src += 8 )
                {
                    uint8x8_t x0, x1;
                    x0 = vld1_u8( (uint8_t *) (src - cn) );
                    x1 = vld1_u8( (uint8_t *) (src + cn) );

                    int32x4_t accl, acch;
                    int16x8_t y0;
                    y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
                                   vreinterpretq_s16_u16(vaddl_u8(x0, z)));
                    accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
                    acch = vmull_lane_s16(vget_high_s16(y0), k, 1);

                    uint8x8_t x2, x3;
                    x2 = vld1_u8( (uint8_t *) (src - cn*2) );
                    x3 = vld1_u8( (uint8_t *) (src + cn*2) );

                    int16x8_t y1;
                    y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
                                   vreinterpretq_s16_u16(vaddl_u8(x2, z)));
                    accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
                    acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);

                    vst1q_s32((int32_t *)(dst + i), accl);
                    vst1q_s32((int32_t *)(dst + i + 4), acch);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
    bool smallValues;
};

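// SymmColumnVec_32s8u: NEON vertical pass from the 32s fixed-point buffer to
// 8u output. Rows are converted to float, accumulated with per-lane multiply-
// accumulate (vmlaq_lane_f32 / vmlaq_n_f32), then narrowed back with
// saturation via vqmovn_s32 + vqmovun_s16.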
struct SymmColumnVec_32s8u
{
    SymmColumnVec_32s8u() { symmetryType=0; }
    SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int _ksize = kernel.rows + kernel.cols - 1;
        int ksize2 = _ksize / 2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S, *S2;

        float32x4_t d4 = vdupq_n_f32(delta);

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;

            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky, k32, 0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t accl, acch;
                float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;

                S = src[0] + i;

                f0l = vcvtq_f32_s32( vld1q_s32(S) );
                f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );

                S = src[1] + i;
                S2 = src[-1] + i;

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, f0l, k32, 0);
                acch = vmlaq_lane_f32(acch, f0h, k32, 0);
                accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                uint8x8_t u8;
                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));

                vst1_u8((uint8_t *)(dst + i), u8);
            }
        }
        else
        {
            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t accl, acch;
                float32x4_t f1l, f1h, f2l, f2h;

                S = src[1] + i;
                S2 = src[-1] + i;

                f1l = vcvtq_f32_s32( vld1q_s32(S) );
                f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                f2l = vcvtq_f32_s32( vld1q_s32(S2) );
                f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t f3l, f3h, f4l, f4h;
                    f3l = vcvtq_f32_s32( vld1q_s32(S) );
                    f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
                    f4l = vcvtq_f32_s32( vld1q_s32(S2) );
                    f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );

                    accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                uint8x8_t u8;
                u8 = vqmovun_s16(vcombine_s16(s16l, s16h));

                vst1_u8((uint8_t *)(dst + i), u8);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

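// SymmColumnSmallVec_32s16s: 3-tap NEON vertical pass 32s->16s, with integer
// fast paths for the (1,2,1), (1,-2,1) and (3,10,3) coefficient sets (the
// last one presumably serving the Scharr operator's smoothing kernel).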
struct SymmColumnSmallVec_32s16s
{
    SymmColumnSmallVec_32s16s() { symmetryType=0; }
    SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
    {
        symmetryType = _symmetryType;
        _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
        delta = (float)(_delta/(1 << _bits));
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int ksize2 = (kernel.rows + kernel.cols - 1)/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const int** src = (const int**)_src;
        const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
        short* dst = (short*)_dst;
        float32x4_t df4 = vdupq_n_f32(delta);
        int32x4_t d4 = vcvtq_s32_f32(df4);

        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vaddq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    int16x4_t t;
                    t = vqmovn_s32(y3);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1, y2, y3;
                    y0 = vaddq_s32(x0, x2);
                    y1 = vqshlq_n_s32(x1, 1);
                    y2 = vsubq_s32(y0, y1);
                    y3 = vaddq_s32(y2, d4);

                    int16x4_t t;
                    t = vqmovn_s32(y3);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else if( ky[0] == 10 && ky[1] == 3 )
            {
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2, x3;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    int32x4_t y0;
                    y0 = vmlaq_n_s32(d4, x1, 10);
                    y0 = vmlaq_n_s32(y0, x3, 3);

                    int16x4_t t;
                    t = vqmovn_s32(y0);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else
            {
                float32x2_t k32 = vdup_n_f32(0);
                k32 = vld1_lane_f32(ky, k32, 0);
                k32 = vld1_lane_f32(ky + 1, k32, 1);

                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2, x3, x4;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S1 + i));
                    x2 = vld1q_s32((int32_t const *)(S2 + i));

                    x3 = vaddq_s32(x0, x2);

                    float32x4_t s0, s1, s2;
                    s0 = vcvtq_f32_s32(x1);
                    s1 = vcvtq_f32_s32(x3);
                    s2 = vmlaq_lane_f32(df4, s0, k32, 0);
                    s2 = vmlaq_lane_f32(s2, s1, k32, 1);

                    x4 = vcvtq_s32_f32(s2);

                    int16x4_t x5;
                    x5 = vqmovn_s32(x4);

                    vst1_s16((int16_t *)(dst + i), x5);
                }
            }
        }
        else
        {
            if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
            {
                if( ky[1] < 0 )
                    std::swap(S0, S2);
                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S2 + i));

                    int32x4_t y0, y1;
                    y0 = vsubq_s32(x1, x0);
                    y1 = vqaddq_s32(y0, d4);

                    int16x4_t t;
                    t = vqmovn_s32(y1);

                    vst1_s16((int16_t *)(dst + i), t);
                }
            }
            else
            {
                float32x2_t k32 = vdup_n_f32(0);
                k32 = vld1_lane_f32(ky + 1, k32, 1);

                for( ; i <= width - 4; i += 4 )
                {
                    int32x4_t x0, x1, x2, x3;
                    x0 = vld1q_s32((int32_t const *)(S0 + i));
                    x1 = vld1q_s32((int32_t const *)(S2 + i));

                    x2 = vsubq_s32(x1, x0);

                    float32x4_t s0, s1;
                    s0 = vcvtq_f32_s32(x2);
                    s1 = vmlaq_lane_f32(df4, s0, k32, 1);

                    x3 = vcvtq_s32_f32(s1);

                    int16x4_t x4;
                    x4 = vqmovn_s32(x3);

                    vst1_s16((int16_t *)(dst + i), x4);
                }
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
};

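// SymmColumnVec_32f16s: NEON vertical pass 32f->16s. Unlike the neighbouring
// functors it probes NEON support once in the constructor and caches the
// result in neon_supported rather than re-checking on every call.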
struct SymmColumnVec_32f16s
{
    SymmColumnVec_32f16s() { symmetryType=0; }
    SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
    {
        symmetryType = _symmetryType;
        kernel = _kernel;
        delta = (float)_delta;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
        neon_supported = checkHardwareSupport(CV_CPU_NEON);
    }

    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        if( !neon_supported )
            return 0;

        int _ksize = kernel.rows + kernel.cols - 1;
        int ksize2 = _ksize / 2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float** src = (const float**)_src;
        const float *S, *S2;
        short* dst = (short*)_dst;

        float32x4_t d4 = vdupq_n_f32(delta);

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;

            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky, k32, 0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
                float32x4_t accl, acch;

                S = src[0] + i;

                x0l = vld1q_f32(S);
                x0h = vld1q_f32(S + 4);

                S = src[1] + i;
                S2 = src[-1] + i;

                x1l = vld1q_f32(S);
                x1h = vld1q_f32(S + 4);
                x2l = vld1q_f32(S2);
                x2h = vld1q_f32(S2 + 4);

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, x0l, k32, 0);
                acch = vmlaq_lane_f32(acch, x0h, k32, 0);
                accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t x3l, x3h, x4l, x4h;
                    x3l = vld1q_f32(S);
                    x3h = vld1q_f32(S + 4);
                    x4l = vld1q_f32(S2);
                    x4h = vld1q_f32(S2 + 4);

                    accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                vst1_s16((int16_t *)(dst + i), s16l);
                vst1_s16((int16_t *)(dst + i + 4), s16h);
            }
        }
        else
        {
            float32x2_t k32;
            k32 = vdup_n_f32(0);
            k32 = vld1_lane_f32(ky + 1, k32, 1);

            for( ; i <= width - 8; i += 8 )
            {
                float32x4_t x1l, x1h, x2l, x2h;
                float32x4_t accl, acch;

                S = src[1] + i;
                S2 = src[-1] + i;

                x1l = vld1q_f32(S);
                x1h = vld1q_f32(S + 4);
                x2l = vld1q_f32(S2);
                x2h = vld1q_f32(S2 + 4);

                accl = acch = d4;
                accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
                acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);

                for( k = 2; k <= ksize2; k++ )
                {
                    S = src[k] + i;
                    S2 = src[-k] + i;

                    float32x4_t x3l, x3h, x4l, x4h;
                    x3l = vld1q_f32(S);
                    x3h = vld1q_f32(S + 4);
                    x4l = vld1q_f32(S2);
                    x4h = vld1q_f32(S2 + 4);

                    accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
                    acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
                }

                int32x4_t s32l, s32h;
                s32l = vcvtq_s32_f32(accl);
                s32h = vcvtq_s32_f32(acch);

                int16x4_t s16l, s16h;
                s16l = vqmovn_s32(s32l);
                s16h = vqmovn_s32(s32h);

                vst1_s16((int16_t *)(dst + i), s16l);
                vst1_s16((int16_t *)(dst + i + 4), s16h);
            }
        }

        return i;
    }

    int symmetryType;
    float delta;
    Mat kernel;
    bool neon_supported;
};

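// SymmRowSmallVec_32f: NEON horizontal pass for small symmetric 32f kernels.
// Note that most branches deliberately return 0 (scalar fallback); only the
// generic 5-tap symmetric and anti-symmetric shapes are vectorized here.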
struct SymmRowSmallVec_32f
{
    SymmRowSmallVec_32f() {}
    SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
    {
        kernel = _kernel;
        symmetryType = _symmetryType;
    }

    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        if( !checkHardwareSupport(CV_CPU_NEON) )
            return 0;

        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        const float* kx = kernel.ptr<float>() + _ksize/2;
        width *= cn;

        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    return 0;
                else if( kx[0] == -2 && kx[1] == 1 )
                    return 0;
                else
                {
                    return 0;
                }
            }
            else if( _ksize == 5 )
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                    return 0;
                else
                {
                    float32x2_t k0, k1;
                    k0 = k1 = vdup_n_f32(0);
                    k0 = vld1_lane_f32(kx + 0, k0, 0);
                    k0 = vld1_lane_f32(kx + 1, k0, 1);
                    k1 = vld1_lane_f32(kx + 2, k1, 0);

                    for( ; i <= width - 4; i += 4, src += 4 )
                    {
                        float32x4_t x0, x1, x2, x3, x4;
                        x0 = vld1q_f32(src);
                        x1 = vld1q_f32(src - cn);
                        x2 = vld1q_f32(src + cn);
                        x3 = vld1q_f32(src - cn*2);
                        x4 = vld1q_f32(src + cn*2);

                        float32x4_t y0;
                        y0 = vmulq_lane_f32(x0, k0, 0);
                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
                        y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);

                        vst1q_f32(dst + i, y0);
                    }
                }
            }
        }
        else
        {
            if( _ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    return 0;
                else
                {
                    return 0;
                }
            }
            else if( _ksize == 5 )
            {
                float32x2_t k;
                k = vdup_n_f32(0);
                k = vld1_lane_f32(kx + 1, k, 0);
                k = vld1_lane_f32(kx + 2, k, 1);

                for( ; i <= width - 4; i += 4, src += 4 )
                {
                    float32x4_t x0, x1, x2, x3;
                    x0 = vld1q_f32(src - cn);
                    x1 = vld1q_f32(src + cn);
                    x2 = vld1q_f32(src - cn*2);
                    x3 = vld1q_f32(src + cn*2);

                    float32x4_t y0;
                    y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
                    y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);

                    vst1q_f32(dst + i, y0);
                }
            }
        }

        return i;
    }

    Mat kernel;
    int symmetryType;
};

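// Vector operations that have no NEON implementation are mapped to the
// do-nothing *NoVec stubs, whose operator() returns 0 so that the scalar
// loops in the filter templates below do all of the work.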
typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;


#else

typedef RowNoVec RowVec_8u32s;
typedef RowNoVec RowVec_16s32f;
typedef RowNoVec RowVec_32f;
typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s;
typedef SymmRowSmallNoVec SymmRowSmallVec_32f;
typedef ColumnNoVec SymmColumnVec_32s8u;
typedef ColumnNoVec SymmColumnVec_32f16s;
typedef ColumnNoVec SymmColumnVec_32f;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s;
typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
typedef FilterNoVec FilterVec_8u;
typedef FilterNoVec FilterVec_8u16s;
typedef FilterNoVec FilterVec_32f;

#endif

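// RowFilter: generic horizontal convolution. Each output element is a dot
// product of the ksize kernel taps with source samples spaced cn apart;
// VecOp consumes the leading vectorizable chunk and the scalar loops
// (4x unrolled when CV_ENABLE_UNROLLED is set) handle the rest.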
template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter
{
    RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        CV_Assert( kernel.type() == DataType<DT>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
        vecOp = _vecOp;
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int _ksize = ksize;
        const DT* kx = kernel.ptr<DT>();
        const ST* S;
        DT* D = (DT*)dst;
        int i, k;

        i = vecOp(src, dst, width, cn);
        width *= cn;
#if CV_ENABLE_UNROLLED
        for( ; i <= width - 4; i += 4 )
        {
            S = (const ST*)src + i;
            DT f = kx[0];
            DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3];

            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                f = kx[k];
                s0 += f*S[0]; s1 += f*S[1];
                s2 += f*S[2]; s3 += f*S[3];
            }

            D[i] = s0; D[i+1] = s1;
            D[i+2] = s2; D[i+3] = s3;
        }
#endif
        for( ; i < width; i++ )
        {
            S = (const ST*)src + i;
            DT s0 = kx[0]*S[0];
            for( k = 1; k < _ksize; k++ )
            {
                S += cn;
                s0 += kx[k]*S[0];
            }
            D[i] = s0;
        }
    }

    Mat kernel;
    VecOp vecOp;
};

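// SymmRowSmallFilter: horizontal pass specialized for (anti)symmetric kernels
// with at most 5 taps. Mirrored taps are folded into one multiply, and the
// common derivative/smoothing shapes (1,2,1), (1,-2,1), (-1,0,1) and
// (1,0,-2,0,1) are computed without multiplications at all.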
template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter :
    public RowFilter<ST, DT, VecOp>
{
    SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType,
                        const VecOp& _vecOp = VecOp())
        : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 );
    }

    void operator()(const uchar* src, uchar* dst, int width, int cn)
    {
        int ksize2 = this->ksize/2, ksize2n = ksize2*cn;
        const DT* kx = this->kernel.template ptr<DT>() + ksize2;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        DT* D = (DT*)dst;
        int i = this->vecOp(src, dst, width, cn), j, k;
        const ST* S = (const ST*)src + i + ksize2n;
        width *= cn;

        if( symmetrical )
        {
            if( this->ksize == 1 && kx[0] == 1 )
            {
                for( ; i <= width - 2; i += 2 )
                {
                    DT s0 = S[i], s1 = S[i+1];
                    D[i] = s0; D[i+1] = s1;
                }
                S += i;
            }
            else if( this->ksize == 3 )
            {
                if( kx[0] == 2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else if( kx[0] == -2 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k0 = kx[0], k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k0 = kx[0], k1 = kx[1], k2 = kx[2];
                if( k0 == -2 && k1 == 0 && k2 == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = -2*S[0] + S[-cn*2] + S[cn*2];
                        DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2;
                        DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2;
                        D[i] = s0; D[i+1] = s1;
                    }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] + S[-j]);
                D[i] = s0;
            }
        }
        else
        {
            if( this->ksize == 3 )
            {
                if( kx[0] == 0 && kx[1] == 1 )
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn];
                        D[i] = s0; D[i+1] = s1;
                    }
                else
                {
                    DT k1 = kx[1];
                    for( ; i <= width - 2; i += 2, S += 2 )
                    {
                        DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1;
                        D[i] = s0; D[i+1] = s1;
                    }
                }
            }
            else if( this->ksize == 5 )
            {
                DT k1 = kx[1], k2 = kx[2];
                for( ; i <= width - 2; i += 2, S += 2 )
                {
                    DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2;
                    DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2;
                    D[i] = s0; D[i+1] = s1;
                }
            }

            for( ; i < width; i++, S++ )
            {
                DT s0 = kx[0]*S[0];
                for( k = 1, j = cn; k <= ksize2; k++, j += cn )
                    s0 += kx[k]*(S[j] - S[-j]);
                D[i] = s0;
            }
        }
    }

    int symmetryType;
};

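// ColumnFilter: generic vertical convolution over `count` consecutive output
// rows; src is a sliding window of row pointers into the intermediate buffer.
// delta is added once per output and CastOp converts (and saturates, or
// rescales from fixed point) the accumulator to the destination type.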
template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    ColumnFilter( const Mat& _kernel, int _anchor,
                  double _delta, const CastOp& _castOp=CastOp(),
                  const VecOp& _vecOp=VecOp() )
    {
        if( _kernel.isContinuous() )
            kernel = _kernel;
        else
            _kernel.copyTo(kernel);
        anchor = _anchor;
        ksize = kernel.rows + kernel.cols - 1;
        delta = saturate_cast<ST>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( kernel.type() == DataType<ST>::type &&
                   (kernel.rows == 1 || kernel.cols == 1));
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        const ST* ky = kernel.template ptr<ST>();
        ST _delta = delta;
        int _ksize = ksize;
        int i, k;
        CastOp castOp = castOp0;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = vecOp(src, dst, width);
#if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
            {
                ST f = ky[0];
                const ST* S = (const ST*)src[0] + i;
                ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                   s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                for( k = 1; k < _ksize; k++ )
                {
                    S = (const ST*)src[k] + i; f = ky[k];
                    s0 += f*S[0]; s1 += f*S[1];
                    s2 += f*S[2]; s3 += f*S[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }
#endif
            for( ; i < width; i++ )
            {
                ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                for( k = 1; k < _ksize; k++ )
                    s0 += ky[k]*((const ST*)src[k])[i];
                D[i] = castOp(s0);
            }
        }
    }

    Mat kernel;
    CastOp castOp0;
    VecOp vecOp;
    ST delta;
};

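// SymmColumnFilter: vertical convolution exploiting kernel symmetry: the
// mirrored rows src[k] and src[-k] are combined (added or subtracted) before
// the multiply, and for anti-symmetric kernels the zero center tap is
// skipped entirely.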
template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnFilter( const Mat& _kernel, int _anchor,
                      double _delta, int _symmetryType,
                      const CastOp& _castOp=CastOp(),
                      const VecOp& _vecOp=VecOp())
        : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp )
    {
        symmetryType = _symmetryType;
        CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = this->kernel.template ptr<ST>() + ksize2;
        int i, k;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        if( symmetrical )
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = (this->vecOp)(src, dst, width);
#if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[0];
                    const ST* S = (const ST*)src[0] + i, *S2;
                    ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta,
                       s2 = f*S[2] + _delta, s3 = f*S[3] + _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] + S2[0]);
                        s1 += f*(S[1] + S2[1]);
                        s2 += f*(S[2] + S2[2]);
                        s3 += f*(S[3] + S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }
#endif
                for( ; i < width; i++ )
                {
                    ST s0 = ky[0]*((const ST*)src[0])[i] + _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
        else
        {
            for( ; count--; dst += dststep, src++ )
            {
                DT* D = (DT*)dst;
                i = this->vecOp(src, dst, width);
#if CV_ENABLE_UNROLLED
                for( ; i <= width - 4; i += 4 )
                {
                    ST f = ky[0];
                    const ST *S, *S2;
                    ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                    for( k = 1; k <= ksize2; k++ )
                    {
                        S = (const ST*)src[k] + i;
                        S2 = (const ST*)src[-k] + i;
                        f = ky[k];
                        s0 += f*(S[0] - S2[0]);
                        s1 += f*(S[1] - S2[1]);
                        s2 += f*(S[2] - S2[2]);
                        s3 += f*(S[3] - S2[3]);
                    }

                    D[i] = castOp(s0); D[i+1] = castOp(s1);
                    D[i+2] = castOp(s2); D[i+3] = castOp(s3);
                }
#endif
                for( ; i < width; i++ )
                {
                    ST s0 = _delta;
                    for( k = 1; k <= ksize2; k++ )
                        s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]);
                    D[i] = castOp(s0);
                }
            }
        }
    }

    int symmetryType;
};

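// SymmColumnSmallFilter: 3-tap-only vertical specialization (enforced by the
// CV_Assert in the constructor). The is_1_2_1 / is_1_m2_1 / is_m1_0_1 flags
// compare the ST-typed taps, so the multiply-free fast paths apply to
// fixed-point and floating-point kernels alike.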
template<class CastOp, class VecOp>
struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp>
{
    typedef typename CastOp::type1 ST;
    typedef typename CastOp::rtype DT;

    SymmColumnSmallFilter( const Mat& _kernel, int _anchor,
                           double _delta, int _symmetryType,
                           const CastOp& _castOp=CastOp(),
                           const VecOp& _vecOp=VecOp())
        : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp )
    {
        CV_Assert( this->ksize == 3 );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width)
    {
        int ksize2 = this->ksize/2;
        const ST* ky = this->kernel.template ptr<ST>() + ksize2;
        int i;
        bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0;
        bool is_1_2_1 = ky[0] == 2 && ky[1] == 1;
        bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1;
        bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1);
        ST f0 = ky[0], f1 = ky[1];
        ST _delta = this->delta;
        CastOp castOp = this->castOp0;
        src += ksize2;

        for( ; count--; dst += dststep, src++ )
        {
            DT* D = (DT*)dst;
            i = (this->vecOp)(src, dst, width);
            const ST* S0 = (const ST*)src[-1];
            const ST* S1 = (const ST*)src[0];
            const ST* S2 = (const ST*)src[1];

            if( symmetrical )
            {
                if( is_1_2_1 )
                {
#if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
#endif
                    for( ; i < width; i++ )
                    {
                        ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta;
                        D[i] = castOp(s0);
                    }
                }
                else if( is_1_m2_1 )
                {
#if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta;
                        s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
#endif
                    for( ; i < width; i++ )
                    {
                        ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta;
                        D[i] = castOp(s0);
                    }
                }
                else
                {
#if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta;
                        s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
#endif
                    for( ; i < width; i++ )
                    {
                        ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta;
                        D[i] = castOp(s0);
                    }
                }
            }
            else
            {
                if( is_m1_0_1 )
                {
                    if( f1 < 0 )
                        std::swap(S0, S2);
#if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = S2[i] - S0[i] + _delta;
                        ST s1 = S2[i+1] - S0[i+1] + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = S2[i+2] - S0[i+2] + _delta;
                        s1 = S2[i+3] - S0[i+3] + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
#endif
                    for( ; i < width; i++ )
                    {
                        ST s0 = S2[i] - S0[i] + _delta;
                        D[i] = castOp(s0);
                    }
                    if( f1 < 0 )
                        std::swap(S0, S2);
                }
                else
                {
#if CV_ENABLE_UNROLLED
                    for( ; i <= width - 4; i += 4 )
                    {
                        ST s0 = (S2[i] - S0[i])*f1 + _delta;
                        ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta;
                        D[i] = castOp(s0);
                        D[i+1] = castOp(s1);

                        s0 = (S2[i+2] - S0[i+2])*f1 + _delta;
                        s1 = (S2[i+3] - S0[i+3])*f1 + _delta;
                        D[i+2] = castOp(s0);
                        D[i+3] = castOp(s1);
                    }
#endif
                    for( ; i < width; i++ )
                        D[i] = castOp((S2[i] - S0[i])*f1 + _delta);
                }
            }
        }
    }
};

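// Conversion policies used by the column filters: Cast saturates directly,
// while FixedPtCast / FixedPtCastEx add the rounding constant
// DELTA = 2^(bits-1) and shift right by `bits` to leave the fixed-point
// domain; FixedPtCastEx additionally supports bits == 0 (no shift).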
template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

template<typename ST, typename DT> struct FixedPtCastEx
{
    typedef ST type1;
    typedef DT rtype;

    FixedPtCastEx() : SHIFT(0), DELTA(0) {}
    FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {}
    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
    int SHIFT, DELTA;
};

}

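// Factory for the horizontal pass. (Anti)symmetric kernels with at most
// 5 taps get a SymmRowSmallFilter; everything else falls through to a plain
// RowFilter for the supported depth pairs. The buffer depth must be at least
// CV_32S so that 8u accumulation cannot overflow.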
cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType,
                                                   InputArray _kernel, int anchor,
                                                   int symmetryType )
{
    Mat kernel = _kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType);
    int cn = CV_MAT_CN(srcType);
    CV_Assert( cn == CV_MAT_CN(bufType) &&
               ddepth >= std::max(sdepth, CV_32S) &&
               kernel.type() == ddepth );
    int ksize = kernel.rows + kernel.cols - 1;

    if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 )
    {
        if( sdepth == CV_8U && ddepth == CV_32S )
            return makePtr<SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s> >
                (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType));
        if( sdepth == CV_32F && ddepth == CV_32F )
            return makePtr<SymmRowSmallFilter<float, float, SymmRowSmallVec_32f> >
                (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType));
    }

    if( sdepth == CV_8U && ddepth == CV_32S )
        return makePtr<RowFilter<uchar, int, RowVec_8u32s> >
            (kernel, anchor, RowVec_8u32s(kernel));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16U && ddepth == CV_32F )
        return makePtr<RowFilter<ushort, float, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<RowFilter<ushort, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_16S && ddepth == CV_32F )
        return makePtr<RowFilter<short, float, RowVec_16s32f> >
            (kernel, anchor, RowVec_16s32f(kernel));
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<RowFilter<short, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_32F && ddepth == CV_32F )
        return makePtr<RowFilter<float, float, RowVec_32f> >
            (kernel, anchor, RowVec_32f(kernel));
    if( sdepth == CV_32F && ddepth == CV_64F )
        return makePtr<RowFilter<float, double, RowNoVec> >(kernel, anchor);
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d), and buffer format (=%d)",
        srcType, bufType));

    return Ptr<BaseRowFilter>();
}

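// Factory for the vertical pass. Non-symmetric kernels get plain
// ColumnFilter instances; (anti)symmetric ones get SymmColumnFilter, with
// the 3-tap SymmColumnSmallFilter fast path where applicable. bits > 0
// selects the fixed-point cast used with the CV_32S intermediate buffer.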
getLinearColumnFilter(int bufType,int dstType,InputArray _kernel,int anchor,int symmetryType,double delta,int bits)3677 cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType,
3678 InputArray _kernel, int anchor,
3679 int symmetryType, double delta,
3680 int bits )
3681 {
3682 Mat kernel = _kernel.getMat();
3683 int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType);
3684 int cn = CV_MAT_CN(dstType);
3685 CV_Assert( cn == CV_MAT_CN(bufType) &&
3686 sdepth >= std::max(ddepth, CV_32S) &&
3687 kernel.type() == sdepth );
3688
3689 if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) )
3690 {
3691 if( ddepth == CV_8U && sdepth == CV_32S )
3692 return makePtr<ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec> >
3693 (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits));
3694 if( ddepth == CV_8U && sdepth == CV_32F )
3695 return makePtr<ColumnFilter<Cast<float, uchar>, ColumnNoVec> >(kernel, anchor, delta);
3696 if( ddepth == CV_8U && sdepth == CV_64F )
3697 return makePtr<ColumnFilter<Cast<double, uchar>, ColumnNoVec> >(kernel, anchor, delta);
3698 if( ddepth == CV_16U && sdepth == CV_32F )
3699 return makePtr<ColumnFilter<Cast<float, ushort>, ColumnNoVec> >(kernel, anchor, delta);
3700 if( ddepth == CV_16U && sdepth == CV_64F )
3701 return makePtr<ColumnFilter<Cast<double, ushort>, ColumnNoVec> >(kernel, anchor, delta);
3702 if( ddepth == CV_16S && sdepth == CV_32F )
3703 return makePtr<ColumnFilter<Cast<float, short>, ColumnNoVec> >(kernel, anchor, delta);
3704 if( ddepth == CV_16S && sdepth == CV_64F )
3705 return makePtr<ColumnFilter<Cast<double, short>, ColumnNoVec> >(kernel, anchor, delta);
3706 if( ddepth == CV_32F && sdepth == CV_32F )
3707 return makePtr<ColumnFilter<Cast<float, float>, ColumnNoVec> >(kernel, anchor, delta);
3708 if( ddepth == CV_64F && sdepth == CV_64F )
3709 return makePtr<ColumnFilter<Cast<double, double>, ColumnNoVec> >(kernel, anchor, delta);
3710 }
3711 else
3712 {
3713 int ksize = kernel.rows + kernel.cols - 1;
3714 if( ksize == 3 )
3715 {
3716 if( ddepth == CV_8U && sdepth == CV_32S )
3717 return makePtr<SymmColumnSmallFilter<
3718 FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
3719 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
3720 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
3721 if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 )
3722 return makePtr<SymmColumnSmallFilter<Cast<int, short>,
3723 SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType,
3724 Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta));
3725 if( ddepth == CV_32F && sdepth == CV_32F )
3726 return makePtr<SymmColumnSmallFilter<
3727 Cast<float, float>,SymmColumnSmallVec_32f> >
3728 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
3729 SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta));
3730 }
3731 if( ddepth == CV_8U && sdepth == CV_32S )
3732 return makePtr<SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> >
3733 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits),
3734 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta));
3735 if( ddepth == CV_8U && sdepth == CV_32F )
3736 return makePtr<SymmColumnFilter<Cast<float, uchar>, ColumnNoVec> >
3737 (kernel, anchor, delta, symmetryType);
3738 if( ddepth == CV_8U && sdepth == CV_64F )
3739 return makePtr<SymmColumnFilter<Cast<double, uchar>, ColumnNoVec> >
3740 (kernel, anchor, delta, symmetryType);
3741 if( ddepth == CV_16U && sdepth == CV_32F )
3742 return makePtr<SymmColumnFilter<Cast<float, ushort>, ColumnNoVec> >
3743 (kernel, anchor, delta, symmetryType);
3744 if( ddepth == CV_16U && sdepth == CV_64F )
3745 return makePtr<SymmColumnFilter<Cast<double, ushort>, ColumnNoVec> >
3746 (kernel, anchor, delta, symmetryType);
3747 if( ddepth == CV_16S && sdepth == CV_32S )
3748 return makePtr<SymmColumnFilter<Cast<int, short>, ColumnNoVec> >
3749 (kernel, anchor, delta, symmetryType);
3750 if( ddepth == CV_16S && sdepth == CV_32F )
3751 return makePtr<SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s> >
3752 (kernel, anchor, delta, symmetryType, Cast<float, short>(),
3753 SymmColumnVec_32f16s(kernel, symmetryType, 0, delta));
3754 if( ddepth == CV_16S && sdepth == CV_64F )
3755 return makePtr<SymmColumnFilter<Cast<double, short>, ColumnNoVec> >
3756 (kernel, anchor, delta, symmetryType);
3757 if( ddepth == CV_32F && sdepth == CV_32F )
3758 return makePtr<SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f> >
3759 (kernel, anchor, delta, symmetryType, Cast<float, float>(),
3760 SymmColumnVec_32f(kernel, symmetryType, 0, delta));
3761 if( ddepth == CV_64F && sdepth == CV_64F )
3762 return makePtr<SymmColumnFilter<Cast<double, double>, ColumnNoVec> >
3763 (kernel, anchor, delta, symmetryType);
3764 }
3765
3766 CV_Error_( CV_StsNotImplemented,
3767 ("Unsupported combination of buffer format (=%d), and destination format (=%d)",
3768 bufType, dstType));
3769
3770 return Ptr<BaseColumnFilter>();
3771 }
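
/*
    Usage sketch (illustrative only): the matching column filter for the row
    filter sketched above, casting the CV_32F buffer back to 8-bit output.
    delta = 0 and bits = 0 are assumed since no fixed-point scaling is used.

        Mat colKernel = (Mat_<float>(1, 3) << 0.25f, 0.5f, 0.25f);
        int ctype = getKernelType(colKernel, Point(1, 0));
        Ptr<BaseColumnFilter> colFilter =
            getLinearColumnFilter(CV_32FC1, CV_8UC1, colKernel, 1, ctype, 0, 0);
*/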


cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter(
    int _srcType, int _dstType,
    InputArray __rowKernel, InputArray __columnKernel,
    Point _anchor, double _delta,
    int _rowBorderType, int _columnBorderType,
    const Scalar& _borderValue )
{
    Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );
    int rsize = _rowKernel.rows + _rowKernel.cols - 1;
    int csize = _columnKernel.rows + _columnKernel.cols - 1;
    if( _anchor.x < 0 )
        _anchor.x = rsize/2;
    if( _anchor.y < 0 )
        _anchor.y = csize/2;
    int rtype = getKernelType(_rowKernel,
        _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x));
    int ctype = getKernelType(_columnKernel,
        _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y));
    Mat rowKernel, columnKernel;

    int bdepth = std::max(CV_32F, std::max(sdepth, ddepth));
    int bits = 0;

    if( sdepth == CV_8U &&
        ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
          ddepth == CV_8U) ||
         ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) &&
          (rtype & ctype & KERNEL_INTEGER) &&
          ddepth == CV_16S)) )
    {
        bdepth = CV_32S;
        bits = ddepth == CV_8U ? 8 : 0;
        _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits );
        _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits );
        bits *= 2;
        _delta *= (1 << bits);
    }
    else
    {
        if( _rowKernel.type() != bdepth )
            _rowKernel.convertTo( rowKernel, bdepth );
        else
            rowKernel = _rowKernel;
        if( _columnKernel.type() != bdepth )
            _columnKernel.convertTo( columnKernel, bdepth );
        else
            columnKernel = _columnKernel;
    }

    int _bufType = CV_MAKETYPE(bdepth, cn);
    Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter(
        _srcType, _bufType, rowKernel, _anchor.x, rtype);
    Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter(
        _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits );

    return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(), _rowFilter, _columnFilter,
        _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue ));
}
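
/*
    Usage sketch (illustrative only): a complete separable 3x3 smoothing pass
    driven through the FilterEngine interface, mirroring what sepFilter2D()
    does further below. The image size and contents are assumed.

        Mat src(480, 640, CV_8UC1, Scalar(128)), dst(src.size(), CV_8UC1);
        Mat k = (Mat_<float>(1, 3) << 0.25f, 0.5f, 0.25f);
        Ptr<FilterEngine> f = createSeparableLinearFilter(CV_8UC1, CV_8UC1, k, k);
        f->apply(src, dst, Rect(0, 0, -1, -1), Point(), false);
*/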


/****************************************************************************************\
*                               Non-separable linear filter                              *
\****************************************************************************************/

namespace cv
{

void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs )
{
    int i, j, k, nz = countNonZero(kernel), ktype = kernel.type();
    if( nz == 0 )
        nz = 1;
    CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F );
    coords.resize(nz);
    coeffs.resize(nz*getElemSize(ktype));
    uchar* _coeffs = &coeffs[0];

    for( i = k = 0; i < kernel.rows; i++ )
    {
        const uchar* krow = kernel.ptr(i);
        for( j = 0; j < kernel.cols; j++ )
        {
            if( ktype == CV_8U )
            {
                uchar val = krow[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                _coeffs[k++] = val;
            }
            else if( ktype == CV_32S )
            {
                int val = ((const int*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((int*)_coeffs)[k++] = val;
            }
            else if( ktype == CV_32F )
            {
                float val = ((const float*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((float*)_coeffs)[k++] = val;
            }
            else
            {
                double val = ((const double*)krow)[j];
                if( val == 0 )
                    continue;
                coords[k] = Point(j,i);
                ((double*)_coeffs)[k++] = val;
            }
        }
    }
}
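
/*
    For example, the 3x3 CV_32F Laplacian kernel
        0  1  0
        1 -4  1
        0  1  0
    has nz == 5, so preprocess2DKernel() fills (scanning row by row)
        coords = { (1,0), (0,1), (1,1), (2,1), (1,2) }   // (x, y) of non-zeros
        coeffs = { 1, 1, -4, 1, 1 }                      // viewed as float[5]
*/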


template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter
{
    typedef typename CastOp::type1 KT;
    typedef typename CastOp::rtype DT;

    Filter2D( const Mat& _kernel, Point _anchor,
              double _delta, const CastOp& _castOp=CastOp(),
              const VecOp& _vecOp=VecOp() )
    {
        anchor = _anchor;
        ksize = _kernel.size();
        delta = saturate_cast<KT>(_delta);
        castOp0 = _castOp;
        vecOp = _vecOp;
        CV_Assert( _kernel.type() == DataType<KT>::type );
        preprocess2DKernel( _kernel, coords, coeffs );
        ptrs.resize( coords.size() );
    }

    void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn)
    {
        KT _delta = delta;
        const Point* pt = &coords[0];
        const KT* kf = (const KT*)&coeffs[0];
        const ST** kp = (const ST**)&ptrs[0];
        int i, k, nz = (int)coords.size();
        CastOp castOp = castOp0;

        width *= cn;
        for( ; count > 0; count--, dst += dststep, src++ )
        {
            DT* D = (DT*)dst;

            for( k = 0; k < nz; k++ )
                kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn;

            i = vecOp((const uchar**)kp, dst, width);
            #if CV_ENABLE_UNROLLED
            for( ; i <= width - 4; i += 4 )
            {
                KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta;

                for( k = 0; k < nz; k++ )
                {
                    const ST* sptr = kp[k] + i;
                    KT f = kf[k];
                    s0 += f*sptr[0];
                    s1 += f*sptr[1];
                    s2 += f*sptr[2];
                    s3 += f*sptr[3];
                }

                D[i] = castOp(s0); D[i+1] = castOp(s1);
                D[i+2] = castOp(s2); D[i+3] = castOp(s3);
            }
            #endif
            for( ; i < width; i++ )
            {
                KT s0 = _delta;
                for( k = 0; k < nz; k++ )
                    s0 += kf[k]*kp[k][i];
                D[i] = castOp(s0);
            }
        }
    }

    std::vector<Point> coords;
    std::vector<uchar> coeffs;
    std::vector<uchar*> ptrs;
    KT delta;
    CastOp castOp0;
    VecOp vecOp;
};
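
/*
    For reference, each output pixel produced by Filter2D::operator() above is

        D[i] = castOp( delta + sum_{k=0..nz-1} kf[k] * kp[k][i] )

    where kp[k] is the source row selected by pt[k].y, shifted by pt[k].x*cn,
    i.e. a dense 2D correlation evaluated only over the non-zero kernel taps
    collected by preprocess2DKernel().
*/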

#ifdef HAVE_OPENCL

#define DIVUP(total, grain) (((total) + (grain) - 1) / (grain))
#define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n)))
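
// For example, DIVUP(10, 4) == 3 and ROUNDUP(10, 4) == 12; both are used below
// to pad global work sizes up to a whole number of workgroups.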

// Prepare the kernel: transpose it and double each row (+ alignment).
// Returns the size of an aligned row.
// Sample:
//        a b c
// Input: d e f
//        g h i
// Output (the two trailing zeros in each row are the alignment padding):
//  a d g a d g 0 0
//  b e h b e h 0 0
//  c f i c f i 0 0
template <typename T>
static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel)
{
    Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value);
    int size_y_aligned = ROUNDUP(kernel.rows * 2, 4);
    data.clear(); data.resize(size_y_aligned * kernel.cols, 0);
    for (int x = 0; x < kernel.cols; x++)
    {
        for (int y = 0; y < kernel.rows; y++)
        {
            data[x * size_y_aligned + y] = _kernel.at<T>(y, x);
            data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x);
        }
    }
    return size_y_aligned;
}
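
/*
    Usage sketch (illustrative only): for a 3x3 kernel the helper returns
    ROUNDUP(3 * 2, 4) == 8, and "data" ends up holding 3 aligned rows of
    8 floats laid out as in the sample above. "kernelMat" stands for any
    small 2D kernel Mat.

        std::vector<float> data;
        int alignedRowSize = _prepareKernelFilter2D<float>(data, kernelMat);
*/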

static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
                          InputArray _kernel, Point anchor,
                          double delta, int borderType )
{
    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    ddepth = ddepth < 0 ? sdepth : ddepth;
    int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F),
        wtype = CV_MAKE_TYPE(wdepth, cn);
    if (cn > 4)
        return false;

    Size ksize = _kernel.size();
    if (anchor.x < 0)
        anchor.x = ksize.width / 2;
    if (anchor.y < 0)
        anchor.y = ksize.height / 2;

    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    borderType &= ~BORDER_ISOLATED;
    const cv::ocl::Device &device = cv::ocl::Device::getDefault();
    bool doubleSupport = device.doubleFPConfig() > 0;
    if (wdepth == CV_64F && !doubleSupport)
        return false;

    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT",
                                       "BORDER_WRAP", "BORDER_REFLECT_101" };

    cv::Mat kernelMat = _kernel.getMat();
    cv::Size sz = _src.size(), wholeSize;
    size_t globalsize[2] = { sz.width, sz.height };
    size_t localsize_general[2] = {0, 1};
    size_t* localsize = NULL;

    ocl::Kernel k;
    UMat src = _src.getUMat();
    if (!isolated)
    {
        Point ofs;
        src.locateROI(wholeSize, ofs);
    }

    size_t tryWorkItems = device.maxWorkGroupSize();
    if (device.isIntel() && 128 < tryWorkItems)
        tryWorkItems = 128;
    char cvt[2][40];

    // For smaller filter kernels, there is a special OpenCL kernel that is more
    // efficient than the general one.
    UMat kernelDataUMat;
    if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) &&
        ((ksize.width < 5 && ksize.height < 5) ||
         (ksize.width == 5 && ksize.height == 5 && cn == 1)))
    {
        kernelMat = kernelMat.reshape(0, 1);
        String kerStr = ocl::kernelToStr(kernelMat, CV_32F);
        int h = isolated ? sz.height : wholeSize.height;
        int w = isolated ? sz.width : wholeSize.width;

        if (w < ksize.width || h < ksize.height)
            return false;

        // Figure out what vector size to use for loading the pixels.
        int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4;
        int pxLoadVecSize = cn * pxLoadNumPixels;

        // Figure out how many pixels per work item to compute in X and Y
        // directions. Too many and we run out of registers.
        int pxPerWorkItemX = 1;
        int pxPerWorkItemY = 1;
        if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4)
        {
            pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        }
        else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4))
        {
            pxPerWorkItemX = sz.width % 2 ? 1 : 2;
            pxPerWorkItemY = sz.height % 2 ? 1 : 2;
        }
        globalsize[0] = sz.width / pxPerWorkItemX;
        globalsize[1] = sz.height / pxPerWorkItemY;

        // Need some padding in the private array for pixels
        int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels);

        // Make the global size a nice round number so the runtime can pick
        // from reasonable choices for the workgroup size
        const int wgRound = 256;
        globalsize[0] = ROUNDUP(globalsize[0], wgRound);

        char build_options[1024];
        sprintf(build_options, "-D cn=%d "
                "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
                "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
                "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
                "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                "-D convertToWT=%s -D convertToDstT=%s %s",
                cn, anchor.x, anchor.y, ksize.width, ksize.height,
                pxLoadVecSize, pxLoadNumPixels,
                pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType],
                isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1,
                ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str());

        if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options))
            return false;
    }
    else
    {
        localsize = localsize_general;
        std::vector<float> kernelMatDataFloat;
        int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat);
        String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F);

        for ( ; ; )
        {
            size_t BLOCK_SIZE = tryWorkItems;
            while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2)
                BLOCK_SIZE /= 2;

            if ((size_t)ksize.width > BLOCK_SIZE)
                return false;

            int requiredTop = anchor.y;
            int requiredLeft = (int)BLOCK_SIZE; // deliberately the whole block, not just anchor.x
            int requiredBottom = ksize.height - 1 - anchor.y;
            int requiredRight = (int)BLOCK_SIZE; // deliberately the whole block, not just ksize.width - 1 - anchor.x
            int h = isolated ? sz.height : wholeSize.height;
            int w = isolated ? sz.width : wholeSize.width;
            bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight;

            if ((w < ksize.width) || (h < ksize.height))
                return false;

            String opts = format("-D LOCAL_SIZE=%d -D cn=%d "
                                 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
                                 "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
                                 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
                                 "-D convertToWT=%s -D convertToDstT=%s",
                                 (int)BLOCK_SIZE, cn, anchor.x, anchor.y,
                                 ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType],
                                 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(),
                                 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype),
                                 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth),
                                 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                                 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]));

            localsize[0] = BLOCK_SIZE;
            globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE;
            globalsize[1] = sz.height;

            if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts))
                return false;

            size_t kernelWorkGroupSize = k.workGroupSize();
            if (localsize[0] <= kernelWorkGroupSize)
                break;
            if (BLOCK_SIZE < kernelWorkGroupSize)
                return false;
            tryWorkItems = kernelWorkGroupSize;
        }
    }

    _dst.create(sz, dtype);
    UMat dst = _dst.getUMat();

    int srcOffsetX = (int)((src.offset % src.step) / src.elemSize());
    int srcOffsetY = (int)(src.offset / src.step);
    int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width);
    int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY,
           srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta);

    return k.run(2, globalsize, localsize, false);
}

const int shift_bits = 8;

static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor,
                               int borderType, int ddepth, bool fast8uc1, bool int_arithm)
{
    int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    Size bufSize = buf.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
        return false;

#ifdef ANDROID
    size_t localsize[2] = {16, 10};
#else
    size_t localsize[2] = {16, 16};
#endif

    size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]};
    if (fast8uc1)
        globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0];

    int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1;

    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" },
        * const btype = borderMap[borderType & ~BORDER_ISOLATED];

    bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1;
    extra_extrapolation |= src.rows < radiusY;
    extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1;
    extra_extrapolation |= src.cols < radiusX;

    char cvt[40];
    cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s",
                                          radiusX, (int)localsize[0], (int)localsize[1], cn, btype,
                                          extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION",
                                          isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED",
                                          ocl::typeToStr(type), ocl::typeToStr(buf_type),
                                          ocl::convertTypeStr(sdepth, bdepth, cn, cvt),
                                          ocl::typeToStr(sdepth), ocl::typeToStr(bdepth),
                                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelX, bdepth);

    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    String kernelName("row_filter");
    if (fast8uc1)
        kernelName += "_C1_D0";

    ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc,
                  build_options);
    if (k.empty())
        return false;

    if (fast8uc1)
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()),
               buf.cols, buf.rows, radiusY);
    else
        k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x,
               srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height,
               ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY);

    return k.run(2, globalsize, localsize, false);
}

static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm)
{
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (dst.depth() == CV_64F && !doubleSupport)
        return false;

#ifdef ANDROID
    size_t localsize[2] = { 16, 10 };
#else
    size_t localsize[2] = { 16, 16 };
#endif
    size_t globalsize[2] = { 0, 0 };

    int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype);
    Size sz = dst.size();
    int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type);

    globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1];
    globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0];

    char cvt[40];
    cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d"
                                          " -D srcT=%s -D dstT=%s -D convertToDstT=%s"
                                          " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s",
                                          anchor, (int)localsize[0], (int)localsize[1], cn,
                                          ocl::typeToStr(buf_type), ocl::typeToStr(dtype),
                                          ocl::convertTypeStr(bdepth, ddepth, cn, cvt),
                                          ocl::typeToStr(bdepth), ocl::typeToStr(ddepth),
                                          2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                                          int_arithm ? " -D INTEGER_ARITHMETIC" : "");
    build_options += ocl::kernelToStr(kernelY, bdepth);

    ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc,
                  build_options);
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, globalsize, localsize, false);
}

const int optimizedSepFilterLocalWidth = 16;
const int optimizedSepFilterLocalHeight = 8;

static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst,
                                       Mat row_kernel, Mat col_kernel,
                                       double delta, int borderType, int ddepth, int bdepth, bool int_arithm)
{
    Size size = _src.size(), wholeSize;
    Point origin;
    int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
        esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth),
        dtype = CV_MAKE_TYPE(ddepth, cn);
    size_t src_step = _src.step(), src_offset = _src.offset();
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

    if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) ||
        !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE ||
          borderType == BORDER_REFLECT || borderType == BORDER_WRAP ||
          borderType == BORDER_REFLECT_101))
        return false;

    size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight };
    size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]};

    char cvt[2][40];
    const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
                                       "BORDER_REFLECT_101" };

    String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s"
                             " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s"
                             " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s",
                             (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2,
                             ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(),
                             ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(),
                             ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]),
                             ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype),
                             ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType],
                             ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth),
                             cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : "");

    ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts);
    if (k.empty())
        return false;

    UMat src = _src.getUMat();
    _dst.create(size, dtype);
    UMat dst = _dst.getUMat();

    int src_offset_x = static_cast<int>((src_offset % src_step) / esz);
    int src_offset_y = static_cast<int>(src_offset / src_step);

    src.locateROI(wholeSize, origin);

    k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y,
           wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst),
           static_cast<float>(delta));

    return k.run(2, gt2, lt2, false);
}

static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                             InputArray _kernelX, InputArray _kernelY, Point anchor,
                             double delta, int borderType )
{
    const ocl::Device & d = ocl::Device::getDefault();
    Size imgSize = _src.size();

    int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    if (cn > 4)
        return false;

    Mat kernelX = _kernelX.getMat().reshape(1, 1);
    if (kernelX.cols % 2 != 1)
        return false;
    Mat kernelY = _kernelY.getMat().reshape(1, 1);
    if (kernelY.cols % 2 != 1)
        return false;

    if (ddepth < 0)
        ddepth = sdepth;

    if (anchor.x < 0)
        anchor.x = kernelX.cols >> 1;
    if (anchor.y < 0)
        anchor.y = kernelY.cols >> 1;

    int rtype = getKernelType(kernelX,
        kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x));
    int ctype = getKernelType(kernelY,
        kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y));

    int bdepth = CV_32F;
    bool int_arithm = false;
    if( sdepth == CV_8U && ddepth == CV_8U &&
        rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL &&
        ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL)
    {
        if (ocl::Device::getDefault().isIntel())
        {
            for (int i = 0; i < kernelX.cols; i++)
                kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits));
            if (kernelX.data != kernelY.data)
                for (int i = 0; i < kernelX.cols; i++)
                    kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits));
        }
        else
        {
            bdepth = CV_32S;
            kernelX.convertTo( kernelX, bdepth, 1 << shift_bits );
            kernelY.convertTo( kernelY, bdepth, 1 << shift_bits );
        }
        int_arithm = true;
    }

    CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 &&
                imgSize.width > optimizedSepFilterLocalWidth + anchor.x &&
                imgSize.height > optimizedSepFilterLocalHeight + anchor.y &&
                (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) &&
                anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) &&
                (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())),
                ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta,
                                           borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true)

    UMat src = _src.getUMat();
    Size srcWholeSize; Point srcOffset;
    src.locateROI(srcWholeSize, srcOffset);

    bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 &&
        src.cols % 4 == 0 && src.step % 4 == 0;

    Size srcSize = src.size();
    Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1);
    UMat buf(bufSize, CV_MAKETYPE(bdepth, cn));
    if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm))
        return false;

    _dst.create(srcSize, CV_MAKETYPE(ddepth, cn));
    UMat dst = _dst.getUMat();

    return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm);
}

#endif // HAVE_OPENCL

}

cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType,
                                            InputArray filter_kernel, Point anchor,
                                            double delta, int bits)
{
    Mat _kernel = filter_kernel.getMat();
    int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
    int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth();
    CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth );

    anchor = normalizeAnchor(anchor, _kernel.size());

    /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u> >
            (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits),
            FilterVec_8u(_kernel, bits, delta));
    if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S )
        return makePtr<Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s> >
            (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits),
            FilterVec_8u16s(_kernel, bits, delta));*/

    kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F;
    Mat kernel;
    if( _kernel.type() == kdepth )
        kernel = _kernel;
    else
        _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.);

    if( sdepth == CV_8U && ddepth == CV_8U )
        return makePtr<Filter2D<uchar, Cast<float, uchar>, FilterVec_8u> >
            (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_16U )
        return makePtr<Filter2D<uchar,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_16S )
        return makePtr<Filter2D<uchar, Cast<float, short>, FilterVec_8u16s> >
            (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta));
    if( sdepth == CV_8U && ddepth == CV_32F )
        return makePtr<Filter2D<uchar,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_8U && ddepth == CV_64F )
        return makePtr<Filter2D<uchar,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16U && ddepth == CV_16U )
        return makePtr<Filter2D<ushort,
            Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_32F )
        return makePtr<Filter2D<ushort,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16U && ddepth == CV_64F )
        return makePtr<Filter2D<ushort,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_16S && ddepth == CV_16S )
        return makePtr<Filter2D<short,
            Cast<float, short>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_32F )
        return makePtr<Filter2D<short,
            Cast<float, float>, FilterNoVec> >(kernel, anchor, delta);
    if( sdepth == CV_16S && ddepth == CV_64F )
        return makePtr<Filter2D<short,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    if( sdepth == CV_32F && ddepth == CV_32F )
        return makePtr<Filter2D<float, Cast<float, float>, FilterVec_32f> >
            (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta));
    if( sdepth == CV_64F && ddepth == CV_64F )
        return makePtr<Filter2D<double,
            Cast<double, double>, FilterNoVec> >(kernel, anchor, delta);

    CV_Error_( CV_StsNotImplemented,
        ("Unsupported combination of source format (=%d) and destination format (=%d)",
        srcType, dstType));

    return Ptr<BaseFilter>();
}
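
/*
    Usage sketch (illustrative only): a 2D filter object for a CV_8U -> CV_8U
    sharpening kernel. bits = 0 is assumed since the kernel is passed as
    floating point, and Point(-1, -1) lets normalizeAnchor() pick the center.

        Mat kern = (Mat_<float>(3, 3) << 0, -1,  0,
                                        -1,  5, -1,
                                         0, -1,  0);
        Ptr<BaseFilter> f2d =
            getLinearFilter(CV_8UC1, CV_8UC1, kern, Point(-1, -1), 0, 0);
*/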


cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType,
                                                  InputArray filter_kernel,
                                                  Point _anchor, double _delta,
                                                  int _rowBorderType, int _columnBorderType,
                                                  const Scalar& _borderValue )
{
    Mat _kernel = filter_kernel.getMat();
    _srcType = CV_MAT_TYPE(_srcType);
    _dstType = CV_MAT_TYPE(_dstType);
    int cn = CV_MAT_CN(_srcType);
    CV_Assert( cn == CV_MAT_CN(_dstType) );

    Mat kernel = _kernel;
    int bits = 0;

    /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType);
    int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor);
    if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) &&
        _kernel.rows*_kernel.cols <= (1 << 10) )
    {
        bits = (ktype & KERNEL_INTEGER) ? 0 : 11;
        _kernel.convertTo(kernel, CV_32S, 1 << bits);
    }*/

    Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType,
        kernel, _anchor, _delta, bits);

    return makePtr<FilterEngine>(_filter2D, Ptr<BaseRowFilter>(),
        Ptr<BaseColumnFilter>(), _srcType, _dstType, _srcType,
        _rowBorderType, _columnBorderType, _borderValue );
}
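
/*
    Usage sketch (illustrative only): the same kind of sharpening kernel wired
    into a FilterEngine, which is what filter2D() below falls back to for
    small kernels. The image size and contents are assumed.

        Mat kern = (Mat_<float>(3, 3) << 0, -1,  0,
                                        -1,  5, -1,
                                         0, -1,  0);
        Mat src(480, 640, CV_8UC3, Scalar::all(128)), dst(src.size(), src.type());
        Ptr<FilterEngine> fe = createLinearFilter(CV_8UC3, CV_8UC3, kern);
        fe->apply(src, dst, Rect(0, 0, -1, -1), Point(), false);
*/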


void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
                   InputArray _kernel, Point anchor0,
                   double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType))

    Mat src = _src.getMat(), kernel = _kernel.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

#if CV_SSE2
    int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) ||
        (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
#else
    int dft_filter_size = 50;
#endif

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();
    Point anchor = normalizeAnchor(anchor0, kernel.size());

#if IPP_VERSION_X100 > 0 && !defined HAVE_IPP_ICV_ONLY
    CV_IPP_CHECK()
    {
        typedef IppStatus (CV_STDCALL * ippiFilterBorder)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize dstRoiSize,
                                                          IppiBorderType border, const void * borderValue,
                                                          const IppiFilterBorderSpec* pSpec, Ipp8u* pBuffer);

        int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype),
            ktype = kernel.type(), kdepth = CV_MAT_DEPTH(ktype);
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        Point ippAnchor(kernel.cols >> 1, kernel.rows >> 1);
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        IppiBorderType ippBorderType = ippiGetBorderType(borderTypeNI);

        if (borderTypeNI == BORDER_CONSTANT || borderTypeNI == BORDER_REPLICATE)
        {
            ippiFilterBorder ippFunc =
                stype == CV_8UC1 ? (ippiFilterBorder)ippiFilterBorder_8u_C1R :
                stype == CV_8UC3 ? (ippiFilterBorder)ippiFilterBorder_8u_C3R :
                stype == CV_8UC4 ? (ippiFilterBorder)ippiFilterBorder_8u_C4R :
                stype == CV_16UC1 ? (ippiFilterBorder)ippiFilterBorder_16u_C1R :
                stype == CV_16UC3 ? (ippiFilterBorder)ippiFilterBorder_16u_C3R :
                stype == CV_16UC4 ? (ippiFilterBorder)ippiFilterBorder_16u_C4R :
                stype == CV_16SC1 ? (ippiFilterBorder)ippiFilterBorder_16s_C1R :
                stype == CV_16SC3 ? (ippiFilterBorder)ippiFilterBorder_16s_C3R :
                stype == CV_16SC4 ? (ippiFilterBorder)ippiFilterBorder_16s_C4R :
                stype == CV_32FC1 ? (ippiFilterBorder)ippiFilterBorder_32f_C1R :
                stype == CV_32FC3 ? (ippiFilterBorder)ippiFilterBorder_32f_C3R :
                stype == CV_32FC4 ? (ippiFilterBorder)ippiFilterBorder_32f_C4R : 0;

            if (sdepth == ddepth && (ktype == CV_16SC1 || ktype == CV_32FC1) &&
                ippFunc && (int)ippBorderType >= 0 && (!src.isSubmatrix() || isolated) &&
                std::fabs(delta - 0) < DBL_EPSILON && ippAnchor == anchor && dst.data != src.data)
            {
                IppiSize kernelSize = { kernel.cols, kernel.rows }, dstRoiSize = { dst.cols, dst.rows };
                IppDataType dataType = ippiGetDataType(ddepth), kernelType = ippiGetDataType(kdepth);
                Ipp32s specSize = 0, bufsize = 0;
                IppStatus status = (IppStatus)-1;

                if ((status = ippiFilterBorderGetSize(kernelSize, dstRoiSize, dataType, kernelType, cn, &specSize, &bufsize)) >= 0)
                {
                    IppiFilterBorderSpec * spec = (IppiFilterBorderSpec *)ippMalloc(specSize);
                    Ipp8u * buffer = ippsMalloc_8u(bufsize);
                    Ipp32f borderValue[4] = { 0, 0, 0, 0 };

                    Mat reversedKernel;
                    flip(kernel, reversedKernel, -1);

                    if ((kdepth == CV_32F && (status = ippiFilterBorderInit_32f((const Ipp32f *)reversedKernel.data, kernelSize,
                            dataType, cn, ippRndFinancial, spec)) >= 0 ) ||
                        (kdepth == CV_16S && (status = ippiFilterBorderInit_16s((const Ipp16s *)reversedKernel.data,
                            kernelSize, 0, dataType, cn, ippRndFinancial, spec)) >= 0))
                    {
                        status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize,
                                         ippBorderType, borderValue, spec, buffer);
                    }

                    ippsFree(buffer);
                    ippsFree(spec);
                }

                if (status >= 0)
                {
                    CV_IMPL_ADD(CV_IMPL_IPP);
                    return;
                }
                setIppErrorStatus();
            }
        }
    }
#endif

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::useTegra() && tegra::filter2D(src, dst, kernel, anchor, delta, borderType) )
        return;
#endif

    if( kernel.cols*kernel.rows >= dft_filter_size )
    {
        Mat temp;
        // crossCorr doesn't accept non-zero delta with multiple channels
        if( src.channels() != 1 && delta != 0 )
        {
            // The semantics of filter2D require that the delta be applied
            // as floating-point math. So we need an intermediate Mat
            // with a floating-point datatype. If the destination is already
            // floating point, we just use that.
            int corrDepth = dst.depth();
            if( (dst.depth() == CV_32F || dst.depth() == CV_64F) &&
                src.data != dst.data )
            {
                temp = dst;
            }
            else
            {
                corrDepth = dst.depth() == CV_64F ? CV_64F : CV_32F;
                temp.create( dst.size(), CV_MAKETYPE(corrDepth, dst.channels()) );
            }
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(corrDepth, src.channels()),
                       anchor, 0, borderType );
            add( temp, delta, temp );
            if ( temp.data != dst.data )
            {
                temp.convertTo( dst, dst.type() );
            }
        }
        else
        {
            if( src.data != dst.data )
                temp = dst;
            else
                temp.create(dst.size(), dst.type());
            crossCorr( src, kernel, temp, src.size(),
                       CV_MAKETYPE(ddepth, src.channels()),
                       anchor, delta, borderType );
            if( temp.data != dst.data )
                temp.copyTo(dst);
        }
        return;
    }

    Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel,
                                             anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
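
/*
    Usage sketch (illustrative only): the standard public entry point, applying
    a 3x3 sharpening kernel with the destination depth matching the source
    (ddepth = -1). Assumes "input.png" is readable via imread().

        Mat src = imread("input.png"), dst;
        Mat kern = (Mat_<float>(3, 3) << 0, -1,  0,
                                        -1,  5, -1,
                                         0, -1,  0);
        filter2D(src, dst, -1, kern);
*/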


void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth,
                      InputArray _kernelX, InputArray _kernelY, Point anchor,
                      double delta, int borderType )
{
    CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2,
               ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType))

    Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat();

    if( ddepth < 0 )
        ddepth = src.depth();

    _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) );
    Mat dst = _dst.getMat();

    Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(),
        dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED );
    f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 );
}
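
/*
    Usage sketch (illustrative only): separable Gaussian smoothing through
    sepFilter2D(), equivalent to filtering with the outer product of the two
    1D kernels. Assumes "input.png" is readable via imread().

        Mat src = imread("input.png"), dst;
        Mat g = getGaussianKernel(5, 1.5, CV_32F);
        sepFilter2D(src, dst, -1, g, g);
*/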


CV_IMPL void
cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat kernel = cv::cvarrToMat(_kernel);

    CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() );

    cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE );
}

/* End of file. */