1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                        Intel License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
14 // Third party copyrights are property of their respective owners.
15 //
16 // Redistribution and use in source and binary forms, with or without modification,
17 // are permitted provided that the following conditions are met:
18 //
19 //   * Redistribution's of source code must retain the above copyright notice,
20 //     this list of conditions and the following disclaimer.
21 //
22 //   * Redistribution's in binary form must reproduce the above copyright notice,
23 //     this list of conditions and the following disclaimer in the documentation
24 //     and/or other materials provided with the distribution.
25 //
26 //   * The name of Intel Corporation may not be used to endorse or promote products
27 //     derived from this software without specific prior written permission.
28 //
29 // This software is provided by the copyright holders and contributors "as is" and
30 // any express or implied warranties, including, but not limited to, the implied
31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
32 // In no event shall the Intel Corporation or contributors be liable for any direct,
33 // indirect, incidental, special, exemplary, or consequential damages
34 // (including, but not limited to, procurement of substitute goods or services;
35 // loss of use, data, or profits; or business interruption) however caused
36 // and on any theory of liability, whether in contract, strict liability,
37 // or tort (including negligence or otherwise) arising in any way out of
38 // the use of this software, even if advised of the possibility of such damage.
39 //
40 //M*/
41 
42 #include "_cxcore.h"
43 
44 /****************************************************************************************\
45 *                                         cvGEMM                                         *
46 \****************************************************************************************/
47 
48 icvBLAS_GEMM_32f_t icvBLAS_GEMM_32f_p = 0;
49 icvBLAS_GEMM_64f_t icvBLAS_GEMM_64f_p = 0;
50 icvBLAS_GEMM_32fc_t icvBLAS_GEMM_32fc_p = 0;
51 icvBLAS_GEMM_64fc_t icvBLAS_GEMM_64fc_p = 0;
52 
53 static void
icvGEMM_CopyBlock(const uchar * src,int src_step,uchar * dst,int dst_step,CvSize size,int pix_size)54 icvGEMM_CopyBlock( const uchar* src, int src_step,
55                    uchar* dst, int dst_step,
56                    CvSize size, int pix_size )
57 {
58     int j;
59     size.width = size.width * (pix_size / sizeof(int));
60 
61     for( ; size.height--; src += src_step, dst += dst_step )
62     {
63         for( j = 0; j <= size.width - 4; j += 4 )
64         {
65             int t0 = ((const int*)src)[j];
66             int t1 = ((const int*)src)[j+1];
67             ((int*)dst)[j] = t0;
68             ((int*)dst)[j+1] = t1;
69             t0 = ((const int*)src)[j+2];
70             t1 = ((const int*)src)[j+3];
71             ((int*)dst)[j+2] = t0;
72             ((int*)dst)[j+3] = t1;
73         }
74 
75         for( ; j < size.width; j++ )
76             ((int*)dst)[j] = ((const int*)src)[j];
77     }
78 }
79 
80 
81 static void
icvGEMM_TransposeBlock(const uchar * src,int src_step,uchar * dst,int dst_step,CvSize size,int pix_size)82 icvGEMM_TransposeBlock( const uchar* src, int src_step,
83                         uchar* dst, int dst_step,
84                         CvSize size, int pix_size )
85 {
86     int i, j;
87     for( i = 0; i < size.width; i++, dst += dst_step, src += pix_size )
88     {
89         const uchar* _src = src;
90         switch( pix_size )
91         {
92         case sizeof(int):
93             for( j = 0; j < size.height; j++, _src += src_step )
94                 ((int*)dst)[j] = ((int*)_src)[0];
95             break;
96         case sizeof(int)*2:
97             for( j = 0; j < size.height*2; j += 2, _src += src_step )
98             {
99                 int t0 = ((int*)_src)[0];
100                 int t1 = ((int*)_src)[1];
101                 ((int*)dst)[j] = t0;
102                 ((int*)dst)[j+1] = t1;
103             }
104             break;
105         case sizeof(int)*4:
106             for( j = 0; j < size.height*4; j += 4, _src += src_step )
107             {
108                 int t0 = ((int*)_src)[0];
109                 int t1 = ((int*)_src)[1];
110                 ((int*)dst)[j] = t0;
111                 ((int*)dst)[j+1] = t1;
112                 t0 = ((int*)_src)[2];
113                 t1 = ((int*)_src)[3];
114                 ((int*)dst)[j+2] = t0;
115                 ((int*)dst)[j+3] = t1;
116             }
117             break;
118         default:
119             assert(0);
120             return;
121         }
122     }
123 }
124 
125 #define ICV_DEF_GEMM_SINGLE_MUL( flavor, arrtype, worktype )                \
126 static CvStatus CV_STDCALL                                                  \
127 icvGEMMSingleMul_##flavor( const arrtype* a_data, size_t a_step,            \
128                          const arrtype* b_data, size_t b_step,              \
129                          const arrtype* c_data, size_t c_step,              \
130                          arrtype* d_data, size_t d_step,                    \
131                          CvSize a_size, CvSize d_size,                      \
132                          double alpha, double beta, int flags )             \
133 {                                                                           \
134     int i, j, k, n = a_size.width, m = d_size.width, drows = d_size.height; \
135     const arrtype *_a_data = a_data, *_b_data = b_data, *_c_data = c_data;  \
136     arrtype* a_buf = 0;                                                     \
137     size_t a_step0, a_step1, c_step0, c_step1, t_step;                      \
138                                                                             \
139     a_step /= sizeof(a_data[0]);                                            \
140     b_step /= sizeof(b_data[0]);                                            \
141     c_step /= sizeof(c_data[0]);                                            \
142     d_step /= sizeof(d_data[0]);                                            \
143     a_step0 = a_step;                                                       \
144     a_step1 = 1;                                                            \
145                                                                             \
146     if( !c_data )                                                           \
147         c_step0 = c_step1 = 0;                                              \
148     else if( !(flags & CV_GEMM_C_T) )                                       \
149         c_step0 = c_step, c_step1 = 1;                                      \
150     else                                                                    \
151         c_step0 = 1, c_step1 = c_step;                                      \
152                                                                             \
153     if( flags & CV_GEMM_A_T )                                               \
154     {                                                                       \
155         CV_SWAP( a_step0, a_step1, t_step );                                \
156         n = a_size.height;                                                  \
157         if( a_step > 1 && n > 1 )                                           \
158             a_buf = (arrtype*)cvStackAlloc(n*sizeof(a_data[0]));            \
159     }                                                                       \
160                                                                             \
161     if( n == 1 ) /* external product */                                     \
162     {                                                                       \
163         arrtype* b_buf = 0;                                                 \
164                                                                             \
165         if( a_step > 1 )                                                    \
166         {                                                                   \
167             a_buf = (arrtype*)cvStackAlloc(drows*sizeof(a_data[0]));        \
168             for( k = 0; k < drows; k++ )                                    \
169                 a_buf[k] = a_data[a_step*k];                                \
170             a_data = a_buf;                                                 \
171         }                                                                   \
172                                                                             \
173         if( b_step > 1 )                                                    \
174         {                                                                   \
175             b_buf = (arrtype*)cvStackAlloc(d_size.width*sizeof(b_buf[0]) ); \
176             for( j = 0; j < d_size.width; j++ )                             \
177                 b_buf[j] = b_data[j*b_step];                                \
178             b_data = b_buf;                                                 \
179         }                                                                   \
180                                                                             \
181         for( i = 0; i < drows; i++, _c_data += c_step0,                     \
182                                     d_data += d_step )                      \
183         {                                                                   \
184             worktype al = worktype(a_data[i])*alpha;                        \
185             c_data = _c_data;                                               \
186             for( j = 0; j <= d_size.width - 2; j += 2, c_data += 2*c_step1 )\
187             {                                                               \
188                 worktype s0 = al*b_data[j];                                 \
189                 worktype s1 = al*b_data[j+1];                               \
190                 if( !c_data )                                               \
191                 {                                                           \
192                     d_data[j] = arrtype(s0);                                \
193                     d_data[j+1] = arrtype(s1);                              \
194                 }                                                           \
195                 else                                                        \
196                 {                                                           \
197                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
198                     d_data[j+1] = arrtype(s1 + c_data[c_step1]*beta);       \
199                 }                                                           \
200             }                                                               \
201                                                                             \
202             for( ; j < d_size.width; j++, c_data += c_step1 )               \
203             {                                                               \
204                 worktype s0 = al*b_data[j];                                 \
205                 if( !c_data )                                               \
206                     d_data[j] = arrtype(s0);                                \
207                 else                                                        \
208                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
209             }                                                               \
210         }                                                                   \
211     }                                                                       \
212     else if( flags & CV_GEMM_B_T ) /* A * Bt */                             \
213     {                                                                       \
214         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
215                                     _c_data += c_step0,                     \
216                                     d_data += d_step )                      \
217         {                                                                   \
218             a_data = _a_data;                                               \
219             b_data = _b_data;                                               \
220             c_data = _c_data;                                               \
221                                                                             \
222             if( a_buf )                                                     \
223             {                                                               \
224                 for( k = 0; k < n; k++ )                                    \
225                     a_buf[k] = a_data[a_step1*k];                           \
226                 a_data = a_buf;                                             \
227             }                                                               \
228                                                                             \
229             for( j = 0; j < d_size.width; j++, b_data += b_step,            \
230                                                c_data += c_step1 )          \
231             {                                                               \
232                 worktype s0(0), s1(0), s2(0), s3(0);                        \
233                                                                             \
234                 for( k = 0; k <= n - 4; k += 4 )                            \
235                 {                                                           \
236                     s0 += worktype(a_data[k])*b_data[k];                    \
237                     s1 += worktype(a_data[k+1])*b_data[k+1];                \
238                     s2 += worktype(a_data[k+2])*b_data[k+2];                \
239                     s3 += worktype(a_data[k+3])*b_data[k+3];                \
240                 }                                                           \
241                                                                             \
242                 for( ; k < n; k++ )                                         \
243                     s0 += worktype(a_data[k])*b_data[k];                    \
244                 s0 = (s0+s1+s2+s3)*alpha;                                   \
245                                                                             \
246                 if( !c_data )                                               \
247                     d_data[j] = arrtype(s0);                                \
248                 else                                                        \
249                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
250             }                                                               \
251         }                                                                   \
252     }                                                                       \
253     else if( d_size.width*sizeof(d_data[0]) <= 1600 )                       \
254     {                                                                       \
255         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
256                                     _c_data += c_step0,                     \
257                                     d_data += d_step )                      \
258         {                                                                   \
259             a_data = _a_data, c_data = _c_data;                             \
260                                                                             \
261             if( a_buf )                                                     \
262             {                                                               \
263                 for( k = 0; k < n; k++ )                                    \
264                     a_buf[k] = a_data[a_step1*k];                           \
265                 a_data = a_buf;                                             \
266             }                                                               \
267                                                                             \
268             for( j = 0; j <= m - 4; j += 4, c_data += 4*c_step1 )           \
269             {                                                               \
270                 const arrtype* b = _b_data + j;                             \
271                 worktype s0(0), s1(0), s2(0), s3(0);                        \
272                                                                             \
273                 for( k = 0; k < n; k++, b += b_step )                       \
274                 {                                                           \
275                     worktype a(a_data[k]);                                  \
276                     s0 += a * b[0]; s1 += a * b[1];                         \
277                     s2 += a * b[2]; s3 += a * b[3];                         \
278                 }                                                           \
279                                                                             \
280                 if( !c_data )                                               \
281                 {                                                           \
282                     d_data[j] = arrtype(s0*alpha);                          \
283                     d_data[j+1] = arrtype(s1*alpha);                        \
284                     d_data[j+2] = arrtype(s2*alpha);                        \
285                     d_data[j+3] = arrtype(s3*alpha);                        \
286                 }                                                           \
287                 else                                                        \
288                 {                                                           \
289                     s0 = s0*alpha; s1 = s1*alpha;                           \
290                     s2 = s2*alpha; s3 = s3*alpha;                           \
291                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
292                     d_data[j+1] = arrtype(s1 + c_data[c_step1]*beta);       \
293                     d_data[j+2] = arrtype(s2 + c_data[c_step1*2]*beta);     \
294                     d_data[j+3] = arrtype(s3 + c_data[c_step1*3]*beta);     \
295                 }                                                           \
296             }                                                               \
297                                                                             \
298             for( ; j < m; j++, c_data += c_step1 )                          \
299             {                                                               \
300                 const arrtype* b = _b_data + j;                             \
301                 worktype s0(0);                                             \
302                                                                             \
303                 for( k = 0; k < n; k++, b += b_step )                       \
304                     s0 += worktype(a_data[k]) * b[0];                       \
305                                                                             \
306                 s0 = s0*alpha;                                              \
307                 if( !c_data )                                               \
308                     d_data[j] = arrtype(s0);                                \
309                 else                                                        \
310                     d_data[j] = arrtype(s0 + c_data[0]*beta);               \
311             }                                                               \
312         }                                                                   \
313     }                                                                       \
314     else                                                                    \
315     {                                                                       \
316         worktype* d_buf = (worktype*)cvStackAlloc(m*sizeof(d_buf[0]));      \
317                                                                             \
318         for( i = 0; i < drows; i++, _a_data += a_step0,                     \
319                                             _c_data += c_step0,             \
320                                             d_data += d_step )              \
321         {                                                                   \
322             a_data = _a_data;                                               \
323             b_data = _b_data;                                               \
324             c_data = _c_data;                                               \
325                                                                             \
326             if( a_buf )                                                     \
327             {                                                               \
328                 for( k = 0; k < n; k++ )                                    \
329                     a_buf[k] = _a_data[a_step1*k];                          \
330                 a_data = a_buf;                                             \
331             }                                                               \
332                                                                             \
333             for( j = 0; j < m; j++ )                                        \
334                 d_buf[j] = worktype(0);                                     \
335                                                                             \
336             for( k = 0; k < n; k++, b_data += b_step )                      \
337             {                                                               \
338                 worktype al(a_data[k]);                                     \
339                                                                             \
340                 for( j = 0; j <= m - 4; j += 4 )                            \
341                 {                                                           \
342                     worktype t0 = d_buf[j] + b_data[j]*al;                  \
343                     worktype t1 = d_buf[j+1] + b_data[j+1]*al;              \
344                     d_buf[j] = t0;                                          \
345                     d_buf[j+1] = t1;                                        \
346                     t0 = d_buf[j+2] + b_data[j+2]*al;                       \
347                     t1 = d_buf[j+3] + b_data[j+3]*al;                       \
348                     d_buf[j+2] = t0;                                        \
349                     d_buf[j+3] = t1;                                        \
350                 }                                                           \
351                                                                             \
352                 for( ; j < m; j++ )                                         \
353                     d_buf[j] += b_data[j]*al;                               \
354             }                                                               \
355                                                                             \
356             if( !c_data )                                                   \
357                 for( j = 0; j < m; j++ )                                    \
358                     d_data[j] = arrtype(d_buf[j]*alpha);                    \
359             else                                                            \
360                 for( j = 0; j < m; j++, c_data += c_step1 )                 \
361                 {                                                           \
362                     worktype t = d_buf[j]*alpha;                            \
363                     d_data[j] = arrtype(t + c_data[0]*beta);                \
364                 }                                                           \
365         }                                                                   \
366     }                                                                       \
367     return CV_OK;                                                           \
368 }
369 
370 
371 #define ICV_DEF_GEMM_BLOCK_MUL( flavor, arrtype, worktype )         \
372 static CvStatus CV_STDCALL                                          \
373 icvGEMMBlockMul_##flavor( const arrtype* a_data, size_t a_step,     \
374                         const arrtype* b_data, size_t b_step,       \
375                         worktype* d_data, size_t d_step,            \
376                         CvSize a_size, CvSize d_size, int flags )   \
377 {                                                                   \
378     int i, j, k, n = a_size.width, m = d_size.width;                \
379     const arrtype *_a_data = a_data, *_b_data = b_data;             \
380     arrtype* a_buf = 0;                                             \
381     size_t a_step0, a_step1, t_step;                                \
382     int do_acc = flags & 16;                                        \
383                                                                     \
384     a_step /= sizeof(a_data[0]);                                    \
385     b_step /= sizeof(b_data[0]);                                    \
386     d_step /= sizeof(d_data[0]);                                    \
387                                                                     \
388     a_step0 = a_step;                                               \
389     a_step1 = 1;                                                    \
390                                                                     \
391     if( flags & CV_GEMM_A_T )                                       \
392     {                                                               \
393         CV_SWAP( a_step0, a_step1, t_step );                        \
394         n = a_size.height;                                          \
395         a_buf = (arrtype*)cvStackAlloc(n*sizeof(a_data[0]));        \
396     }                                                               \
397                                                                     \
398     if( flags & CV_GEMM_B_T )                                       \
399     {                                                               \
400         /* second operand is transposed */                          \
401         for( i = 0; i < d_size.height; i++, _a_data += a_step0,     \
402                                             d_data += d_step )      \
403         {                                                           \
404             a_data = _a_data; b_data = _b_data;                     \
405                                                                     \
406             if( a_buf )                                             \
407             {                                                       \
408                 for( k = 0; k < n; k++ )                            \
409                     a_buf[k] = a_data[a_step1*k];                   \
410                 a_data = a_buf;                                     \
411             }                                                       \
412                                                                     \
413             for( j = 0; j < d_size.width; j++, b_data += b_step )   \
414             {                                                       \
415                 worktype s0 = do_acc ? d_data[j]:worktype(0), s1(0);\
416                 for( k = 0; k <= n - 2; k += 2 )                    \
417                 {                                                   \
418                     s0 += worktype(a_data[k])*b_data[k];            \
419                     s1 += worktype(a_data[k+1])*b_data[k+1];        \
420                 }                                                   \
421                                                                     \
422                 for( ; k < n; k++ )                                 \
423                     s0 += worktype(a_data[k])*b_data[k];            \
424                                                                     \
425                 d_data[j] = s0 + s1;                                \
426             }                                                       \
427         }                                                           \
428     }                                                               \
429     else                                                            \
430     {                                                               \
431         for( i = 0; i < d_size.height; i++, _a_data += a_step0,     \
432                                             d_data += d_step )      \
433         {                                                           \
434             a_data = _a_data, b_data = _b_data;                     \
435                                                                     \
436             if( a_buf )                                             \
437             {                                                       \
438                 for( k = 0; k < n; k++ )                            \
439                     a_buf[k] = a_data[a_step1*k];                   \
440                 a_data = a_buf;                                     \
441             }                                                       \
442                                                                     \
443             for( j = 0; j <= m - 4; j += 4 )                        \
444             {                                                       \
445                 worktype s0, s1, s2, s3;                            \
446                 const arrtype* b = b_data + j;                      \
447                                                                     \
448                 if( do_acc )                                        \
449                 {                                                   \
450                     s0 = d_data[j]; s1 = d_data[j+1];               \
451                     s2 = d_data[j+2]; s3 = d_data[j+3];             \
452                 }                                                   \
453                 else                                                \
454                     s0 = s1 = s2 = s3 = worktype(0);                \
455                                                                     \
456                 for( k = 0; k < n; k++, b += b_step )               \
457                 {                                                   \
458                     worktype a(a_data[k]);                          \
459                     s0 += a * b[0]; s1 += a * b[1];                 \
460                     s2 += a * b[2]; s3 += a * b[3];                 \
461                 }                                                   \
462                                                                     \
463                 d_data[j] = s0; d_data[j+1] = s1;                   \
464                 d_data[j+2] = s2; d_data[j+3] = s3;                 \
465             }                                                       \
466                                                                     \
467             for( ; j < m; j++ )                                     \
468             {                                                       \
469                 const arrtype* b = b_data + j;                      \
470                 worktype s0 = do_acc ? d_data[j] : worktype(0);     \
471                                                                     \
472                 for( k = 0; k < n; k++, b += b_step )               \
473                     s0 += worktype(a_data[k]) * b[0];               \
474                                                                     \
475                 d_data[j] = s0;                                     \
476             }                                                       \
477         }                                                           \
478     }                                                               \
479                                                                     \
480     return CV_OK;                                                   \
481 }
482 
483 
484 #define ICV_DEF_GEMM_STORE( flavor, arrtype, worktype )             \
485 static CvStatus CV_STDCALL                                          \
486 icvGEMMStore_##flavor( const arrtype* c_data, size_t c_step,        \
487                        const worktype* d_buf, size_t d_buf_step,    \
488                        arrtype* d_data, size_t d_step, CvSize d_size,\
489                        double alpha, double beta, int flags )       \
490 {                                                                   \
491     const arrtype* _c_data = c_data;                                \
492     int j;                                                          \
493     size_t c_step0, c_step1;                                        \
494                                                                     \
495     c_step /= sizeof(c_data[0]);                                    \
496     d_buf_step /= sizeof(d_buf[0]);                                 \
497     d_step /= sizeof(d_data[0]);                                    \
498                                                                     \
499     if( !c_data )                                                   \
500         c_step0 = c_step1 = 0;                                      \
501     else if( !(flags & CV_GEMM_C_T) )                               \
502         c_step0 = c_step, c_step1 = 1;                              \
503     else                                                            \
504         c_step0 = 1, c_step1 = c_step;                              \
505                                                                     \
506     for( ; d_size.height--; _c_data += c_step0,                     \
507                             d_buf += d_buf_step,                    \
508                             d_data += d_step )                      \
509     {                                                               \
510         if( _c_data )                                               \
511         {                                                           \
512             c_data = _c_data;                                       \
513             for( j = 0; j <= d_size.width - 4; j += 4, c_data += 4*c_step1 )\
514             {                                                       \
515                 worktype t0 = alpha*d_buf[j];                       \
516                 worktype t1 = alpha*d_buf[j+1];                     \
517                 t0 += beta*worktype(c_data[0]);                     \
518                 t1 += beta*worktype(c_data[c_step1]);               \
519                 d_data[j] = arrtype(t0);                            \
520                 d_data[j+1] = arrtype(t1);                          \
521                 t0 = alpha*d_buf[j+2];                              \
522                 t1 = alpha*d_buf[j+3];                              \
523                 t0 += beta*worktype(c_data[c_step1*2]);             \
524                 t1 += beta*worktype(c_data[c_step1*3]);             \
525                 d_data[j+2] = arrtype(t0);                          \
526                 d_data[j+3] = arrtype(t1);                          \
527             }                                                       \
528             for( ; j < d_size.width; j++, c_data += c_step1 )       \
529             {                                                       \
530                 worktype t0 = alpha*d_buf[j];                       \
531                 d_data[j] = arrtype(t0 + beta*c_data[0]);           \
532             }                                                       \
533         }                                                           \
534         else                                                        \
535         {                                                           \
536             for( j = 0; j <= d_size.width - 4; j += 4 )             \
537             {                                                       \
538                 worktype t0 = alpha*d_buf[j];                       \
539                 worktype t1 = alpha*d_buf[j+1];                     \
540                 d_data[j] = arrtype(t0);                            \
541                 d_data[j+1] = arrtype(t1);                          \
542                 t0 = alpha*d_buf[j+2];                              \
543                 t1 = alpha*d_buf[j+3];                              \
544                 d_data[j+2] = arrtype(t0);                          \
545                 d_data[j+3] = arrtype(t1);                          \
546             }                                                       \
547             for( ; j < d_size.width; j++ )                          \
548                 d_data[j] = arrtype(alpha*d_buf[j]);                \
549         }                                                           \
550     }                                                               \
551     return CV_OK;                                                   \
552 }
553 
554 
555 ICV_DEF_GEMM_SINGLE_MUL( 32f_C1R, float, double)
556 ICV_DEF_GEMM_BLOCK_MUL( 32f_C1R, float, double)
557 ICV_DEF_GEMM_STORE( 32f_C1R, float, double)
558 
559 ICV_DEF_GEMM_SINGLE_MUL( 64f_C1R, double, double)
560 ICV_DEF_GEMM_BLOCK_MUL( 64f_C1R, double, double)
561 ICV_DEF_GEMM_STORE( 64f_C1R, double, double)
562 
563 ICV_DEF_GEMM_SINGLE_MUL( 32f_C2R, CvComplex32f, CvComplex64f)
564 ICV_DEF_GEMM_BLOCK_MUL( 32f_C2R, CvComplex32f, CvComplex64f)
565 ICV_DEF_GEMM_STORE( 32f_C2R, CvComplex32f, CvComplex64f)
566 
567 ICV_DEF_GEMM_SINGLE_MUL( 64f_C2R, CvComplex64f, CvComplex64f)
568 ICV_DEF_GEMM_BLOCK_MUL( 64f_C2R, CvComplex64f, CvComplex64f)
569 ICV_DEF_GEMM_STORE( 64f_C2R, CvComplex64f, CvComplex64f)
570 
571 typedef CvStatus (CV_STDCALL *CvGEMMSingleMulFunc)( const void* src1, size_t step1,
572                    const void* src2, size_t step2, const void* src3, size_t step3,
573                    void* dst, size_t dststep, CvSize srcsize, CvSize dstsize,
574                    double alpha, double beta, int flags );
575 
576 typedef CvStatus (CV_STDCALL *CvGEMMBlockMulFunc)( const void* src1, size_t step1,
577                    const void* src2, size_t step2, void* dst, size_t dststep,
578                    CvSize srcsize, CvSize dstsize, int flags );
579 
580 typedef CvStatus (CV_STDCALL *CvGEMMStoreFunc)( const void* src1, size_t step1,
581                    const void* src2, size_t step2, void* dst, size_t dststep,
582                    CvSize dstsize, double alpha, double beta, int flags );
583 
584 
icvInitGEMMTable(CvBigFuncTable * single_mul_tab,CvBigFuncTable * block_mul_tab,CvBigFuncTable * store_tab)585 static void icvInitGEMMTable( CvBigFuncTable* single_mul_tab,
586                               CvBigFuncTable* block_mul_tab,
587                               CvBigFuncTable* store_tab )
588 {
589     single_mul_tab->fn_2d[CV_32FC1] = (void*)icvGEMMSingleMul_32f_C1R;
590     single_mul_tab->fn_2d[CV_64FC1] = (void*)icvGEMMSingleMul_64f_C1R;
591     single_mul_tab->fn_2d[CV_32FC2] = (void*)icvGEMMSingleMul_32f_C2R;
592     single_mul_tab->fn_2d[CV_64FC2] = (void*)icvGEMMSingleMul_64f_C2R;
593     block_mul_tab->fn_2d[CV_32FC1] = (void*)icvGEMMBlockMul_32f_C1R;
594     block_mul_tab->fn_2d[CV_64FC1] = (void*)icvGEMMBlockMul_64f_C1R;
595     block_mul_tab->fn_2d[CV_32FC2] = (void*)icvGEMMBlockMul_32f_C2R;
596     block_mul_tab->fn_2d[CV_64FC2] = (void*)icvGEMMBlockMul_64f_C2R;
597     store_tab->fn_2d[CV_32FC1] = (void*)icvGEMMStore_32f_C1R;
598     store_tab->fn_2d[CV_64FC1] = (void*)icvGEMMStore_64f_C1R;
599     store_tab->fn_2d[CV_32FC2] = (void*)icvGEMMStore_32f_C2R;
600     store_tab->fn_2d[CV_64FC2] = (void*)icvGEMMStore_64f_C2R;
601 }
602 
603 
604 CV_IMPL void
cvGEMM(const CvArr * Aarr,const CvArr * Barr,double alpha,const CvArr * Carr,double beta,CvArr * Darr,int flags)605 cvGEMM( const CvArr* Aarr, const CvArr* Barr, double alpha,
606         const CvArr* Carr, double beta, CvArr* Darr, int flags )
607 {
608     const int block_lin_size = 128;
609     const int block_size = block_lin_size * block_lin_size;
610 
611     static CvBigFuncTable single_mul_tab, block_mul_tab, store_tab;
612     static int inittab = 0;
613     static double zero[] = {0,0,0,0};
614     static float zerof[] = {0,0,0,0};
615 
616     uchar* buffer = 0;
617     int local_alloc = 0;
618     uchar* block_buffer = 0;
619 
620     CV_FUNCNAME( "cvGEMM" );
621 
622     __BEGIN__;
623 
624     CvMat *A = (CvMat*)Aarr;
625     CvMat *B = (CvMat*)Barr;
626     CvMat *C = (CvMat*)Carr;
627     CvMat *D = (CvMat*)Darr;
628     int len = 0;
629 
630     CvMat stub, stub1, stub2, stub3;
631     CvSize a_size, d_size;
632     int type;
633 
634     if( !CV_IS_MAT( A ))
635     {
636         int coi = 0;
637         CV_CALL( A = cvGetMat( A, &stub1, &coi ));
638 
639         if( coi != 0 )
640             CV_ERROR( CV_BadCOI, "" );
641     }
642 
643     if( !CV_IS_MAT( B ))
644     {
645         int coi = 0;
646         CV_CALL( B = cvGetMat( B, &stub2, &coi ));
647 
648         if( coi != 0 )
649             CV_ERROR( CV_BadCOI, "" );
650     }
651 
652     if( !CV_IS_MAT( D ))
653     {
654         int coi = 0;
655         CV_CALL( D = cvGetMat( D, &stub, &coi ));
656 
657         if( coi != 0 )
658             CV_ERROR( CV_BadCOI, "" );
659     }
660 
661     if( beta == 0 )
662         C = 0;
663 
664     if( C )
665     {
666         if( !CV_IS_MAT( C ))
667         {
668             int coi = 0;
669             CV_CALL( C = cvGetMat( C, &stub3, &coi ));
670 
671             if( coi != 0 )
672                 CV_ERROR( CV_BadCOI, "" );
673         }
674 
675         if( !CV_ARE_TYPES_EQ( C, D ))
676             CV_ERROR( CV_StsUnmatchedFormats, "" );
677 
678         if( ((flags&CV_GEMM_C_T) == 0 && (C->cols != D->cols || C->rows != D->rows)) ||
679             ((flags&CV_GEMM_C_T) != 0 && (C->rows != D->cols || C->cols != D->rows)))
680             CV_ERROR( CV_StsUnmatchedSizes, "" );
681 
682         if( (flags & CV_GEMM_C_T) != 0 && C->data.ptr == D->data.ptr )
683         {
684             cvTranspose( C, D );
685             C = D;
686             flags &= ~CV_GEMM_C_T;
687         }
688     }
689     else
690     {
691         C = &stub3;
692         C->data.ptr = 0;
693         C->step = 0;
694         C->type = CV_MAT_CONT_FLAG;
695     }
696 
697     type = CV_MAT_TYPE(A->type);
698     if( !CV_ARE_TYPES_EQ( A, B ) || !CV_ARE_TYPES_EQ( A, D ) )
699         CV_ERROR( CV_StsUnmatchedFormats, "" );
700 
701     a_size.width = A->cols;
702     a_size.height = A->rows;
703     d_size.width = D->cols;
704     d_size.height = D->rows;
705 
706     switch( flags & (CV_GEMM_A_T|CV_GEMM_B_T) )
707     {
708     case 0:
709         len = B->rows;
710         if( a_size.width != len ||
711             B->cols != d_size.width ||
712             a_size.height != d_size.height )
713             CV_ERROR( CV_StsUnmatchedSizes, "" );
714         break;
715     case 1:
716         len = B->rows;
717         if( a_size.height != len ||
718             B->cols != d_size.width ||
719             a_size.width != d_size.height )
720             CV_ERROR( CV_StsUnmatchedSizes, "" );
721         break;
722     case 2:
723         len = B->cols;
724         if( a_size.width != len ||
725             B->rows != d_size.width ||
726             a_size.height != d_size.height )
727             CV_ERROR( CV_StsUnmatchedSizes, "" );
728         break;
729     case 3:
730         len = B->cols;
731         if( a_size.height != len ||
732             B->rows != d_size.width ||
733             a_size.width != d_size.height )
734             CV_ERROR( CV_StsUnmatchedSizes, "" );
735         break;
736     }
737 
738     if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
739     {
740         int i;
741         if( type == CV_64F )
742         {
743             double* d = D->data.db;
744             const double *a = A->data.db, *b = B->data.db, *c = C->data.db;
745             size_t d_step = D->step/sizeof(d[0]),
746                    a_step = A->step/sizeof(a[0]),
747                    b_step = B->step/sizeof(b[0]),
748                    c_step = C->step/sizeof(c[0]);
749 
750             if( !c )
751                 c = zero;
752 
753             switch( len )
754             {
755             case 2:
756                 if( len == d_size.width && b != d )
757                 {
758                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
759                     {
760                         double t0 = a[0]*b[0] + a[1]*b[b_step];
761                         double t1 = a[0]*b[1] + a[1]*b[b_step+1];
762                         d[0] = t0*alpha + c[0]*beta;
763                         d[1] = t1*alpha + c[1]*beta;
764                     }
765                 }
766                 else if( a != d )
767                 {
768                     int c_step0 = 1;
769                     if( c == zero )
770                     {
771                         c_step0 = 0;
772                         c_step = 1;
773                     }
774 
775                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
776                     {
777                         double t0 = a[0]*b[0] + a[1]*b[b_step];
778                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
779                         d[0] = t0*alpha + c[0]*beta;
780                         d[d_step] = t1*alpha + c[c_step]*beta;
781                     }
782                 }
783                 else
784                     break;
785                 EXIT;
786             case 3:
787                 if( len == d_size.width && b != d )
788                 {
789                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
790                     {
791                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
792                         double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
793                         double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
794                         d[0] = t0*alpha + c[0]*beta;
795                         d[1] = t1*alpha + c[1]*beta;
796                         d[2] = t2*alpha + c[2]*beta;
797                     }
798                 }
799                 else if( a != d )
800                 {
801                     int c_step0 = 1;
802                     if( c == zero )
803                     {
804                         c_step0 = 0;
805                         c_step = 1;
806                     }
807 
808                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
809                     {
810                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
811                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
812                         double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
813 
814                         d[0] = t0*alpha + c[0]*beta;
815                         d[d_step] = t1*alpha + c[c_step]*beta;
816                         d[d_step*2] = t2*alpha + c[c_step*2]*beta;
817                     }
818                 }
819                 else
820                     break;
821                 EXIT;
822             case 4:
823                 if( len == d_size.width && b != d )
824                 {
825                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
826                     {
827                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
828                         double t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
829                         double t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
830                         double t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
831                         d[0] = t0*alpha + c[0]*beta;
832                         d[1] = t1*alpha + c[1]*beta;
833                         d[2] = t2*alpha + c[2]*beta;
834                         d[3] = t3*alpha + c[3]*beta;
835                     }
836                 }
837                 else if( d_size.width <= 16 && a != d )
838                 {
839                     int c_step0 = 1;
840                     if( c == zero )
841                     {
842                         c_step0 = 0;
843                         c_step = 1;
844                     }
845 
846                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
847                     {
848                         double t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
849                         double t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
850                                     a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
851                         double t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
852                                     a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
853                         double t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
854                                     a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
855                         d[0] = t0*alpha + c[0]*beta;
856                         d[d_step] = t1*alpha + c[c_step]*beta;
857                         d[d_step*2] = t2*alpha + c[c_step*2]*beta;
858                         d[d_step*3] = t3*alpha + c[c_step*3]*beta;
859                     }
860                 }
861                 else
862                     break;
863                 EXIT;
864             }
865         }
866 
867         if( type == CV_32F )
868         {
869             float* d = D->data.fl;
870             const float *a = A->data.fl, *b = B->data.fl, *c = C->data.fl;
871             size_t d_step = D->step/sizeof(d[0]),
872                    a_step = A->step/sizeof(a[0]),
873                    b_step = B->step/sizeof(b[0]),
874                    c_step = C->step/sizeof(c[0]);
875 
876             if( !c )
877                 c = zerof;
878 
879             switch( len )
880             {
881             case 2:
882                 if( len == d_size.width && b != d )
883                 {
884                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
885                     {
886                         float t0 = a[0]*b[0] + a[1]*b[b_step];
887                         float t1 = a[0]*b[1] + a[1]*b[b_step+1];
888                         d[0] = (float)(t0*alpha + c[0]*beta);
889                         d[1] = (float)(t1*alpha + c[1]*beta);
890                     }
891                 }
892                 else if( a != d )
893                 {
894                     int c_step0 = 1;
895                     if( c == zerof )
896                     {
897                         c_step0 = 0;
898                         c_step = 1;
899                     }
900 
901                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
902                     {
903                         float t0 = a[0]*b[0] + a[1]*b[b_step];
904                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step];
905                         d[0] = (float)(t0*alpha + c[0]*beta);
906                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
907                     }
908                 }
909                 else
910                     break;
911                 EXIT;
912             case 3:
913                 if( len == d_size.width && b != d )
914                 {
915                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
916                     {
917                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
918                         float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1];
919                         float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2];
920                         d[0] = (float)(t0*alpha + c[0]*beta);
921                         d[1] = (float)(t1*alpha + c[1]*beta);
922                         d[2] = (float)(t2*alpha + c[2]*beta);
923                     }
924                 }
925                 else if( a != d )
926                 {
927                     int c_step0 = 1;
928                     if( c == zerof )
929                     {
930                         c_step0 = 0;
931                         c_step = 1;
932                     }
933 
934                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
935                     {
936                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2];
937                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] + a[a_step+2]*b[b_step*2];
938                         float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] + a[a_step*2+2]*b[b_step*2];
939 
940                         d[0] = (float)(t0*alpha + c[0]*beta);
941                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
942                         d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
943                     }
944                 }
945                 else
946                     break;
947                 EXIT;
948             case 4:
949                 if( len == d_size.width && b != d )
950                 {
951                     for( i = 0; i < d_size.height; i++, d += d_step, a += a_step, c += c_step )
952                     {
953                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
954                         float t1 = a[0]*b[1] + a[1]*b[b_step+1] + a[2]*b[b_step*2+1] + a[3]*b[b_step*3+1];
955                         float t2 = a[0]*b[2] + a[1]*b[b_step+2] + a[2]*b[b_step*2+2] + a[3]*b[b_step*3+2];
956                         float t3 = a[0]*b[3] + a[1]*b[b_step+3] + a[2]*b[b_step*2+3] + a[3]*b[b_step*3+3];
957                         d[0] = (float)(t0*alpha + c[0]*beta);
958                         d[1] = (float)(t1*alpha + c[1]*beta);
959                         d[2] = (float)(t2*alpha + c[2]*beta);
960                         d[3] = (float)(t3*alpha + c[3]*beta);
961                     }
962                 }
963                 else if( len <= 16 && a != d )
964                 {
965                     int c_step0 = 1;
966                     if( c == zerof )
967                     {
968                         c_step0 = 0;
969                         c_step = 1;
970                     }
971 
972                     for( i = 0; i < d_size.width; i++, d++, b++, c += c_step0 )
973                     {
974                         float t0 = a[0]*b[0] + a[1]*b[b_step] + a[2]*b[b_step*2] + a[3]*b[b_step*3];
975                         float t1 = a[a_step]*b[0] + a[a_step+1]*b[b_step] +
976                                    a[a_step+2]*b[b_step*2] + a[a_step+3]*b[b_step*3];
977                         float t2 = a[a_step*2]*b[0] + a[a_step*2+1]*b[b_step] +
978                                    a[a_step*2+2]*b[b_step*2] + a[a_step*2+3]*b[b_step*3];
979                         float t3 = a[a_step*3]*b[0] + a[a_step*3+1]*b[b_step] +
980                                    a[a_step*3+2]*b[b_step*2] + a[a_step*3+3]*b[b_step*3];
981                         d[0] = (float)(t0*alpha + c[0]*beta);
982                         d[d_step] = (float)(t1*alpha + c[c_step]*beta);
983                         d[d_step*2] = (float)(t2*alpha + c[c_step*2]*beta);
984                         d[d_step*3] = (float)(t3*alpha + c[c_step*3]*beta);
985                     }
986                 }
987                 else
988                     break;
989                 EXIT;
990             }
991         }
992     }
993 
994     {
995         int b_step = B->step;
996         CvGEMMSingleMulFunc single_mul_func;
997         CvMat tmat, *D0 = D;
998         icvBLAS_GEMM_32f_t blas_func = 0;
999 
1000         if( !inittab )
1001         {
1002             icvInitGEMMTable( &single_mul_tab, &block_mul_tab, &store_tab );
1003             inittab = 1;
1004         }
1005 
1006         single_mul_func = (CvGEMMSingleMulFunc)single_mul_tab.fn_2d[type];
1007         if( !single_mul_func )
1008             CV_ERROR( CV_StsUnsupportedFormat, "" );
1009 
1010         if( D->data.ptr == A->data.ptr || D->data.ptr == B->data.ptr )
1011         {
1012             int buf_size = d_size.width*d_size.height*CV_ELEM_SIZE(type);
1013             if( d_size.width <= CV_MAX_LOCAL_MAT_SIZE )
1014             {
1015                 buffer = (uchar*)cvStackAlloc( buf_size );
1016                 local_alloc = 1;
1017             }
1018             else
1019                 CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));
1020 
1021             tmat = cvMat( d_size.height, d_size.width, type, buffer );
1022             D = &tmat;
1023         }
1024 
1025         if( (d_size.width == 1 || len == 1) && !(flags & CV_GEMM_B_T) && CV_IS_MAT_CONT(B->type) )
1026         {
1027             b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
1028             flags |= CV_GEMM_B_T;
1029         }
1030 
1031         if( (d_size.width | d_size.height | len) >= 16 && icvBLAS_GEMM_32f_p != 0 )
1032         {
1033             blas_func = type == CV_32FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32f_p :
1034                         type == CV_64FC1 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64f_p :
1035                         type == CV_32FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_32fc_p :
1036                         type == CV_64FC2 ? (icvBLAS_GEMM_32f_t)icvBLAS_GEMM_64fc_p : 0;
1037         }
1038 
1039         if( blas_func )
1040         {
1041             const char* transa = flags & CV_GEMM_A_T ? "t" : "n";
1042             const char* transb = flags & CV_GEMM_B_T ? "t" : "n";
1043             int lda, ldb, ldd;
1044 
1045             if( C->data.ptr )
1046             {
1047                 if( C->data.ptr != D->data.ptr )
1048                 {
1049                     if( !(flags & CV_GEMM_C_T) )
1050                         cvCopy( C, D );
1051                     else
1052                         cvTranspose( C, D );
1053                 }
1054             }
1055 
1056             if( CV_MAT_DEPTH(type) == CV_32F )
1057             {
1058                 CvComplex32f _alpha, _beta;
1059 
1060                 lda = A->step/sizeof(float);
1061                 ldb = b_step/sizeof(float);
1062                 ldd = D->step/sizeof(float);
1063                 _alpha.re = (float)alpha;
1064                 _alpha.im = 0;
1065                 _beta.re = C->data.ptr ? (float)beta : 0;
1066                 _beta.im = 0;
1067                 if( CV_MAT_CN(type) == 2 )
1068                     lda /= 2, ldb /= 2, ldd /= 2;
1069 
1070                 blas_func( transb, transa, &d_size.width, &d_size.height, &len,
1071                        &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
1072                        &_beta, D->data.ptr, &ldd );
1073             }
1074             else
1075             {
1076                 CvComplex64f _alpha, _beta;
1077 
1078                 lda = A->step/sizeof(double);
1079                 ldb = b_step/sizeof(double);
1080                 ldd = D->step/sizeof(double);
1081                 _alpha.re = alpha;
1082                 _alpha.im = 0;
1083                 _beta.re = C->data.ptr ? beta : 0;
1084                 _beta.im = 0;
1085                 if( CV_MAT_CN(type) == 2 )
1086                     lda /= 2, ldb /= 2, ldd /= 2;
1087 
1088                 blas_func( transb, transa, &d_size.width, &d_size.height, &len,
1089                        &_alpha, B->data.ptr, &ldb, A->data.ptr, &lda,
1090                        &_beta, D->data.ptr, &ldd );
1091             }
1092         }
1093         else if( ((d_size.height <= block_lin_size/2 || d_size.width <= block_lin_size/2) &&
1094             len <= 10000) || len <= 10 ||
1095             (d_size.width <= block_lin_size && d_size.height <= block_lin_size && len <= block_lin_size) )
1096         {
1097             single_mul_func( A->data.ptr, A->step, B->data.ptr, b_step,
1098                              C->data.ptr, C->step, D->data.ptr, D->step,
1099                              a_size, d_size, alpha, beta, flags );
1100         }
1101         else
1102         {
1103             int is_a_t = flags & CV_GEMM_A_T;
1104             int is_b_t = flags & CV_GEMM_B_T;
1105             int elem_size = CV_ELEM_SIZE(type);
1106             int dk0_1, dk0_2;
1107             int a_buf_size = 0, b_buf_size, d_buf_size;
1108             uchar* a_buf = 0;
1109             uchar* b_buf = 0;
1110             uchar* d_buf = 0;
1111             int i, j, k, di = 0, dj = 0, dk = 0;
1112             int dm0, dn0, dk0;
1113             int a_step0, a_step1, b_step0, b_step1, c_step0, c_step1;
1114             int work_elem_size = elem_size << (CV_MAT_DEPTH(type) == CV_32F ? 1 : 0);
1115             CvGEMMBlockMulFunc block_mul_func = (CvGEMMBlockMulFunc)block_mul_tab.fn_2d[type];
1116             CvGEMMStoreFunc store_func = (CvGEMMStoreFunc)store_tab.fn_2d[type];
1117 
1118             assert( block_mul_func && store_func );
1119 
1120             if( !is_a_t )
1121                 a_step0 = A->step, a_step1 = elem_size;
1122             else
1123                 a_step0 = elem_size, a_step1 = A->step;
1124 
1125             if( !is_b_t )
1126                 b_step0 = b_step, b_step1 = elem_size;
1127             else
1128                 b_step0 = elem_size, b_step1 = b_step;
1129 
1130             if( !C->data.ptr )
1131             {
1132                 c_step0 = c_step1 = 0;
1133                 flags &= ~CV_GEMM_C_T;
1134             }
1135             else if( !(flags & CV_GEMM_C_T) )
1136                 c_step0 = C->step, c_step1 = elem_size;
1137             else
1138                 c_step0 = elem_size, c_step1 = C->step;
1139 
1140             dm0 = MIN( block_lin_size, d_size.height );
1141             dn0 = MIN( block_lin_size, d_size.width );
1142             dk0_1 = block_size / dm0;
1143             dk0_2 = block_size / dn0;
1144             dk0 = MAX( dk0_1, dk0_2 );
1145             dk0 = MIN( dk0, len );
1146             if( dk0*dm0 > block_size )
1147                 dm0 = block_size / dk0;
1148             if( dk0*dn0 > block_size )
1149                 dn0 = block_size / dk0;
1150 
1151             dk0_1 = (dn0+dn0/8+2) & -2;
1152             b_buf_size = (dk0+dk0/8+1)*dk0_1*elem_size;
1153             d_buf_size = (dk0+dk0/8+1)*dk0_1*work_elem_size;
1154 
1155             if( is_a_t )
1156             {
1157                 a_buf_size = (dm0+dm0/8+1)*((dk0+dk0/8+2)&-2)*elem_size;
1158                 flags &= ~CV_GEMM_A_T;
1159             }
1160 
1161             CV_CALL( block_buffer = (uchar*)cvAlloc(a_buf_size + b_buf_size + d_buf_size));
1162             d_buf = block_buffer;
1163             b_buf = d_buf + d_buf_size;
1164 
1165             if( is_a_t )
1166                 a_buf = b_buf + b_buf_size;
1167 
1168             for( i = 0; i < d_size.height; i += di )
1169             {
1170                 di = dm0;
1171                 if( i + di >= d_size.height || 8*(i + di) + di > 8*d_size.height )
1172                     di = d_size.height - i;
1173 
1174                 for( j = 0; j < d_size.width; j += dj )
1175                 {
1176                     uchar* _d = D->data.ptr + i*D->step + j*elem_size;
1177                     const uchar* _c = C->data.ptr + i*c_step0 + j*c_step1;
1178                     int _d_step = D->step;
1179                     dj = dn0;
1180 
1181                     if( j + dj >= d_size.width || 8*(j + dj) + dj > 8*d_size.width )
1182                         dj = d_size.width - j;
1183 
1184                     flags &= 15;
1185                     if( dk0 < len )
1186                     {
1187                         _d = d_buf;
1188                         _d_step = dj*work_elem_size;
1189                     }
1190 
1191                     for( k = 0; k < len; k += dk )
1192                     {
1193                         const uchar* _a = A->data.ptr + i*a_step0 + k*a_step1;
1194                         int _a_step = A->step;
1195                         const uchar* _b = B->data.ptr + k*b_step0 + j*b_step1;
1196                         int _b_step = b_step;
1197                         CvSize a_bl_size;
1198 
1199                         dk = dk0;
1200                         if( k + dk >= len || 8*(k + dk) + dk > 8*len )
1201                             dk = len - k;
1202 
1203                         if( !is_a_t )
1204                             a_bl_size.width = dk, a_bl_size.height = di;
1205                         else
1206                             a_bl_size.width = di, a_bl_size.height = dk;
1207 
1208                         if( a_buf && is_a_t )
1209                         {
1210                             int t;
1211                             _a_step = dk*elem_size;
1212                             icvGEMM_TransposeBlock( _a, A->step, a_buf, _a_step, a_bl_size, elem_size );
1213                             CV_SWAP( a_bl_size.width, a_bl_size.height, t );
1214                             _a = a_buf;
1215                         }
1216 
1217                         if( dj < d_size.width )
1218                         {
1219                             CvSize b_size;
1220                             if( !is_b_t )
1221                                 b_size.width = dj, b_size.height = dk;
1222                             else
1223                                 b_size.width = dk, b_size.height = dj;
1224 
1225                             _b_step = b_size.width*elem_size;
1226                             icvGEMM_CopyBlock( _b, b_step, b_buf, _b_step, b_size, elem_size );
1227                             _b = b_buf;
1228                         }
1229 
1230                         if( dk0 < len )
1231                             block_mul_func( _a, _a_step, _b, _b_step, _d, _d_step,
1232                                             a_bl_size, cvSize(dj,di), flags );
1233                         else
1234                             single_mul_func( _a, _a_step, _b, _b_step, _c, C->step, _d, _d_step,
1235                                              a_bl_size, cvSize(dj,di), alpha, beta, flags );
1236                         flags |= 16;
1237                     }
1238 
1239                     if( dk0 < len )
1240                         store_func( _c, C->step, _d, _d_step, D->data.ptr + i*D->step + j*elem_size,
1241                                     D->step, cvSize(dj,di), alpha, beta, flags );
1242                 }
1243             }
1244         }
1245 
1246         if( D0 != D )
1247             CV_CALL( cvCopy( D, D0 ));
1248     }
1249 
1250     __END__;
1251 
1252     if( buffer && !local_alloc )
1253         cvFree( &buffer );
1254     if( block_buffer )
1255         cvFree( &block_buffer );
1256 }
1257 
1258 
1259 /****************************************************************************************\
1260 *                                        cvTransform                                     *
1261 \****************************************************************************************/
1262 
1263 #define  ICV_DEF_TRANSFORM_CASE_C1( arrtype, temptype, _ld_,        \
1264                                    _cast_macro1_, _cast_macro2_ )   \
1265 {                                                                   \
1266     for( i = 0; i < size.width; i++, dst += dst_cn )                \
1267     {                                                               \
1268         const double* _mat = mat;                                   \
1269         double v0 = _ld_(src[i]);                                   \
1270         for( k = 0; k < dst_cn; k++, _mat += 2 )                    \
1271         {                                                           \
1272             temptype t0 = _cast_macro1_(_mat[0]*v0 + _mat[1]);      \
1273             dst[k] = _cast_macro2_(t0);                             \
1274         }                                                           \
1275     }                                                               \
1276     src += size.width;                                              \
1277 }
1278 
1279 
1280 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C1( arrtype, temptype, _ld_,   \
1281                                   _cast_macro1_, _cast_macro2_ )    \
1282     for( i = 0; i < size.width; i++ )                               \
1283     {                                                               \
1284         double ft0;                                                 \
1285         temptype t0;                                                \
1286         ft0 = mat[0]*_ld_(src[i]) + mat[1];                         \
1287         t0 = _cast_macro1_(ft0);                                    \
1288         dst[i] = _cast_macro2_(t0);                                 \
1289     }
1290 
1291 
1292 #define  ICV_DEF_TRANSFORM_CASE_C2( arrtype, temptype, _ld_,        \
1293                                   _cast_macro1_, _cast_macro2_ )    \
1294 if( dst_cn == 2 )                                                   \
1295 {                                                                   \
1296     for( i = 0; i < size.width*2; i += 2 )                          \
1297     {                                                               \
1298         double ft0, ft1;                                            \
1299         temptype t0, t1;                                            \
1300         ft0 = mat[0]*_ld_(src[i]) + mat[1]*_ld_(src[i+1]) + mat[2]; \
1301         ft1 = mat[3]*_ld_(src[i]) + mat[4]*_ld_(src[i+1]) + mat[5]; \
1302         t0 = _cast_macro1_(ft0);                                    \
1303         t1 = _cast_macro1_(ft1);                                    \
1304         dst[i] = _cast_macro2_(t0);                                 \
1305         dst[i+1] = _cast_macro2_(t1);                               \
1306     }                                                               \
1307     src += size.width*2; dst += size.width*2;                       \
1308 }                                                                   \
1309 else                                                                \
1310     for( i = 0; i < size.width; i++, src += 2, dst += dst_cn )      \
1311     {                                                               \
1312         const double* _mat = mat;                                   \
1313         double v0 = _ld_(src[0]), v1 = src[1];                      \
1314         for( k = 0; k < dst_cn; k++, _mat += 3 )                    \
1315         {                                                           \
1316             temptype t0 =                                           \
1317                 _cast_macro1_(_mat[0]*v0 + _mat[1]*v1 + _mat[2]);   \
1318             dst[k] = _cast_macro2_(t0);                             \
1319         }                                                           \
1320     }
1321 
1322 
1323 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C2( arrtype, temptype, _ld_,   \
1324                                   _cast_macro1_, _cast_macro2_ )    \
1325     for( i = 0; i < size.width*2; i += 2 )                          \
1326     {                                                               \
1327         double ft0, ft1;                                            \
1328         temptype t0, t1;                                            \
1329         ft0 = mat[0]*_ld_(src[i]) + mat[2];                         \
1330         ft1 = mat[4]*_ld_(src[i+1]) + mat[5];                       \
1331         t0 = _cast_macro1_(ft0);                                    \
1332         t1 = _cast_macro1_(ft1);                                    \
1333         dst[i] = _cast_macro2_(t0);                                 \
1334         dst[i+1] = _cast_macro2_(t1);                               \
1335     }
1336 
1337 
1338 #define  ICV_DEF_TRANSFORM_CASE_C3( arrtype, temptype, _ld_,        \
1339                                   _cast_macro1_, _cast_macro2_ )    \
1340 if( dst_cn == 3 )                                                   \
1341 {                                                                   \
1342     for( i = 0; i < size.width*3; i += 3 )                          \
1343     {                                                               \
1344         double ft0, ft1, ft2;                                       \
1345         temptype t0, t1, t2;                                        \
1346         ft0 = mat[0]*_ld_(src[i]) + mat[1]*_ld_(src[i+1]) +         \
1347               mat[2]*_ld_(src[i+2]) + mat[3];                       \
1348         ft1 = mat[4]*_ld_(src[i]) + mat[5]*_ld_(src[i+1]) +         \
1349               mat[6]*_ld_(src[i+2]) + mat[7];                       \
1350         ft2 = mat[8]*_ld_(src[i]) + mat[9]*_ld_(src[i+1]) +         \
1351               mat[10]*_ld_(src[i+2]) + mat[11];                     \
1352         t0 = _cast_macro1_(ft0);                                    \
1353         t1 = _cast_macro1_(ft1);                                    \
1354         t2 = _cast_macro1_(ft2);                                    \
1355         dst[i] = _cast_macro2_(t0);                                 \
1356         dst[i+1] = _cast_macro2_(t1);                               \
1357         dst[i+2] = _cast_macro2_(t2);                               \
1358     }                                                               \
1359     src += size.width*3; dst += size.width*3;                       \
1360 }                                                                   \
1361 else if( dst_cn == 1 )                                              \
1362 {                                                                   \
1363     for( i = 0; i < size.width; i++, src += 3 )                     \
1364     {                                                               \
1365         temptype t0 = _cast_macro1_(mat[0]*_ld_(src[0]) +           \
1366             mat[1]*_ld_(src[1]) + mat[2]*_ld_(src[2]) + mat[3]);    \
1367         dst[i] = _cast_macro2_(t0);                                 \
1368     }                                                               \
1369     dst += size.width;                                              \
1370 }                                                                   \
1371 else                                                                \
1372     for( i = 0; i < size.width; i++, src += 3, dst += dst_cn )      \
1373     {                                                               \
1374         const double* _mat = mat;                                   \
1375         double v0=_ld_(src[0]), v1=_ld_(src[1]), v2=_ld_(src[2]);   \
1376         for( k = 0; k < dst_cn; k++, _mat += 4 )                    \
1377         {                                                           \
1378             temptype t0 = _cast_macro1_(_mat[0]*v0 +                \
1379                     _mat[1]*v1 + _mat[2]*v2 + _mat[3]);             \
1380             dst[k] = _cast_macro2_(t0);                             \
1381         }                                                           \
1382     }
1383 
1384 
1385 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C3( arrtype, temptype, _ld_,   \
1386                                   _cast_macro1_, _cast_macro2_ )    \
1387     for( i = 0; i < size.width*3; i += 3 )                          \
1388     {                                                               \
1389         double ft0, ft1, ft2;                                       \
1390         temptype t0, t1, t2;                                        \
1391         ft0 = mat[0]*_ld_(src[i]) + mat[3];                         \
1392         ft1 = mat[5]*_ld_(src[i+1]) + mat[7];                       \
1393         ft2 = mat[10]*_ld_(src[i+2]) + mat[11];                     \
1394         t0 = _cast_macro1_(ft0);                                    \
1395         t1 = _cast_macro1_(ft1);                                    \
1396         t2 = _cast_macro1_(ft2);                                    \
1397         dst[i] = _cast_macro2_(t0);                                 \
1398         dst[i+1] = _cast_macro2_(t1);                               \
1399         dst[i+2] = _cast_macro2_(t2);                               \
1400     }
1401 
1402 
1403 #define  ICV_DEF_TRANSFORM_CASE_C4( arrtype, temptype, _ld_,        \
1404                                   _cast_macro1_, _cast_macro2_ )    \
1405 for( i = 0; i < size.width; i++, src += 4, dst += dst_cn )          \
1406 {                                                                   \
1407     const double* _mat = mat;                                       \
1408     double v0 = _ld_(src[0]), v1 = _ld_(src[1]),                    \
1409            v2 = _ld_(src[2]), v3 = _ld_(src[3]);                    \
1410     for( k = 0; k < dst_cn; k++, _mat += 5 )                        \
1411     {                                                               \
1412         temptype t0 =_cast_macro1_(_mat[0]*v0+_mat[1]*v1+           \
1413                                    _mat[2]*v2+_mat[3]*v3+_mat[4]);  \
1414         dst[k] = _cast_macro2_(t0);                                 \
1415     }                                                               \
1416 }
1417 
1418 
1419 #define  ICV_DEF_DIAG_TRANSFORM_CASE_C4( arrtype, temptype, _ld_,   \
1420                                   _cast_macro1_, _cast_macro2_ )    \
1421     for( i = 0; i < size.width*4; i += 4 )                          \
1422     {                                                               \
1423         double ft0, ft1;                                            \
1424         temptype t0, t1;                                            \
1425         ft0 = mat[0]*_ld_(src[i]) + mat[4];                         \
1426         ft1 = mat[6]*_ld_(src[i+1]) + mat[9];                       \
1427         t0 = _cast_macro1_(ft0);                                    \
1428         t1 = _cast_macro1_(ft1);                                    \
1429         dst[i] = _cast_macro2_(t0);                                 \
1430         dst[i+1] = _cast_macro2_(t1);                               \
1431         ft0 = mat[12]*_ld_(src[i+2]) + mat[14];                     \
1432         ft1 = mat[18]*_ld_(src[i+3]) + mat[19];                     \
1433         t0 = _cast_macro1_(ft0);                                    \
1434         t1 = _cast_macro1_(ft1);                                    \
1435         dst[i+2] = _cast_macro2_(t0);                               \
1436         dst[i+3] = _cast_macro2_(t1);                               \
1437     }
1438 
1439 
1440 
1441 #define  ICV_DEF_TRANSFORM_FUNC( flavor, arrtype, temptype, _ld_,   \
1442                                  _cast_macro1_, _cast_macro2_, cn  )\
1443 static CvStatus CV_STDCALL                                          \
1444 icvTransform_##flavor( const arrtype* src, int srcstep,             \
1445                        arrtype* dst, int dststep, CvSize size,      \
1446                        const double* mat, int dst_cn )              \
1447 {                                                                   \
1448     srcstep = srcstep/sizeof(src[0]) - size.width*cn;               \
1449     dststep = dststep/sizeof(dst[0]) - size.width*dst_cn;           \
1450     for( ; size.height--; src += srcstep, dst += dststep )          \
1451     {                                                               \
1452         int i, k;                                                   \
1453         ICV_DEF_TRANSFORM_CASE_C##cn( arrtype, temptype, _ld_,      \
1454                                      _cast_macro1_, _cast_macro2_ ) \
1455     }                                                               \
1456                                                                     \
1457     return CV_OK;                                                   \
1458 }
1459 
1460 
1461 #define  ICV_DEF_DIAG_TRANSFORM_FUNC( flavor, arrtype, temptype, _ld_, \
1462                                  _cast_macro1_, _cast_macro2_, cn  )\
1463 static CvStatus CV_STDCALL                                          \
1464 icvDiagTransform_##flavor( const arrtype* src, int srcstep,         \
1465                        arrtype* dst, int dststep, CvSize size,      \
1466                        const double* mat )                          \
1467 {                                                                   \
1468     srcstep /= sizeof(src[0]);                                      \
1469     dststep /= sizeof(dst[0]);                                      \
1470     for( ; size.height--; src += srcstep, dst += dststep )          \
1471     {                                                               \
1472         int i;                                                      \
1473         ICV_DEF_DIAG_TRANSFORM_CASE_C##cn( arrtype, temptype, _ld_, \
1474                                      _cast_macro1_, _cast_macro2_ ) \
1475     }                                                               \
1476                                                                     \
1477     return CV_OK;                                                   \
1478 }
1479 
1480 
1481 ICV_DEF_TRANSFORM_FUNC( 8u_C1R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 1 )
1482 ICV_DEF_TRANSFORM_FUNC( 8u_C2R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 2 )
1483 ICV_DEF_TRANSFORM_FUNC( 8u_C3R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 3 )
1484 ICV_DEF_TRANSFORM_FUNC( 8u_C4R, uchar, int, CV_8TO32F, cvRound, CV_CAST_8U, 4 )
1485 
1486 ICV_DEF_TRANSFORM_FUNC( 16u_C1R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 1 )
1487 ICV_DEF_TRANSFORM_FUNC( 16u_C2R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 2 )
1488 ICV_DEF_TRANSFORM_FUNC( 16u_C3R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 3 )
1489 ICV_DEF_TRANSFORM_FUNC( 16u_C4R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 4 )
1490 
1491 ICV_DEF_TRANSFORM_FUNC( 16s_C1R, short, int, CV_NOP, cvRound, CV_CAST_16S, 1 )
1492 ICV_DEF_TRANSFORM_FUNC( 16s_C2R, short, int, CV_NOP, cvRound, CV_CAST_16S, 2 )
1493 ICV_DEF_TRANSFORM_FUNC( 16s_C3R, short, int, CV_NOP, cvRound, CV_CAST_16S, 3 )
1494 ICV_DEF_TRANSFORM_FUNC( 16s_C4R, short, int, CV_NOP, cvRound, CV_CAST_16S, 4 )
1495 
1496 ICV_DEF_TRANSFORM_FUNC( 32s_C1R, int, int, CV_NOP, cvRound, CV_NOP, 1 )
1497 ICV_DEF_TRANSFORM_FUNC( 32s_C2R, int, int, CV_NOP, cvRound, CV_NOP, 2 )
1498 ICV_DEF_TRANSFORM_FUNC( 32s_C3R, int, int, CV_NOP, cvRound, CV_NOP, 3 )
1499 ICV_DEF_TRANSFORM_FUNC( 32s_C4R, int, int, CV_NOP, cvRound, CV_NOP, 4 )
1500 
1501 ICV_DEF_TRANSFORM_FUNC( 32f_C1R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 1 )
1502 ICV_DEF_TRANSFORM_FUNC( 32f_C2R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 2 )
1503 ICV_DEF_TRANSFORM_FUNC( 32f_C3R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 3 )
1504 ICV_DEF_TRANSFORM_FUNC( 32f_C4R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 4 )
1505 
1506 ICV_DEF_TRANSFORM_FUNC( 64f_C1R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 1 )
1507 ICV_DEF_TRANSFORM_FUNC( 64f_C2R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 2 )
1508 ICV_DEF_TRANSFORM_FUNC( 64f_C3R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 3 )
1509 ICV_DEF_TRANSFORM_FUNC( 64f_C4R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 4 )
1510 
1511 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C1R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 1 )
1512 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C2R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 2 )
1513 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C3R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 3 )
1514 ICV_DEF_DIAG_TRANSFORM_FUNC( 16u_C4R, ushort, int, CV_NOP, cvRound, CV_CAST_16U, 4 )
1515 
1516 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C1R, short, int, CV_NOP, cvRound, CV_CAST_16S, 1 )
1517 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C2R, short, int, CV_NOP, cvRound, CV_CAST_16S, 2 )
1518 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C3R, short, int, CV_NOP, cvRound, CV_CAST_16S, 3 )
1519 ICV_DEF_DIAG_TRANSFORM_FUNC( 16s_C4R, short, int, CV_NOP, cvRound, CV_CAST_16S, 4 )
1520 
1521 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C1R, int, int, CV_NOP, cvRound, CV_NOP, 1 )
1522 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C2R, int, int, CV_NOP, cvRound, CV_NOP, 2 )
1523 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C3R, int, int, CV_NOP, cvRound, CV_NOP, 3 )
1524 ICV_DEF_DIAG_TRANSFORM_FUNC( 32s_C4R, int, int, CV_NOP, cvRound, CV_NOP, 4 )
1525 
1526 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C1R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 1 )
1527 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C2R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 2 )
1528 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C3R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 3 )
1529 ICV_DEF_DIAG_TRANSFORM_FUNC( 32f_C4R, float, double, CV_NOP, CV_NOP, CV_CAST_32F, 4 )
1530 
1531 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C1R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 1 )
1532 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C2R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 2 )
1533 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C3R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 3 )
1534 ICV_DEF_DIAG_TRANSFORM_FUNC( 64f_C4R, double, double, CV_NOP, CV_NOP, CV_CAST_64F, 4 )
1535 
1536 #define icvTransform_8s_C1R 0
1537 #define icvTransform_8s_C2R 0
1538 #define icvTransform_8s_C3R 0
1539 #define icvTransform_8s_C4R 0
1540 
1541 #define icvDiagTransform_8s_C1R 0
1542 #define icvDiagTransform_8s_C2R 0
1543 #define icvDiagTransform_8s_C3R 0
1544 #define icvDiagTransform_8s_C4R 0
1545 
1546 #define icvDiagTransform_8u_C1R 0
1547 #define icvDiagTransform_8u_C2R 0
1548 #define icvDiagTransform_8u_C3R 0
1549 #define icvDiagTransform_8u_C4R 0
1550 
1551 CV_DEF_INIT_BIG_FUNC_TAB_2D( Transform, R )
1552 CV_DEF_INIT_BIG_FUNC_TAB_2D( DiagTransform, R )
1553 
1554 typedef CvStatus (CV_STDCALL * CvTransformFunc)(
1555                        const void* src, int srcstep,
1556                        void* dst, int dststep, CvSize size,
1557                        const void* mat, int dst_cn );
1558 
1559 typedef CvStatus (CV_STDCALL * CvDiagTransformFunc)(
1560                        const void* src, int srcstep,
1561                        void* dst, int dststep, CvSize size,
1562                        const void* mat );
1563 
1564 typedef CvStatus (CV_STDCALL * CvDiagTransformFunc)(
1565                        const void* src, int srcstep,
1566                        void* dst, int dststep, CvSize size,
1567                        const void* mat );
1568 
1569 ///////////////////// IPP transform functions //////////////////
1570 
1571 icvColorTwist_8u_C3R_t icvColorTwist_8u_C3R_p = 0;
1572 icvColorTwist_16u_C3R_t icvColorTwist_16u_C3R_p = 0;
1573 icvColorTwist_16s_C3R_t icvColorTwist_16s_C3R_p = 0;
1574 icvColorTwist_32f_C3R_t icvColorTwist_32f_C3R_p = 0;
1575 icvColorTwist_32f_C4R_t icvColorTwist_32f_C4R_p = 0;
1576 
1577 icvColorToGray_8u_C3C1R_t icvColorToGray_8u_C3C1R_p = 0;
1578 icvColorToGray_16u_C3C1R_t icvColorToGray_16u_C3C1R_p = 0;
1579 icvColorToGray_16s_C3C1R_t icvColorToGray_16s_C3C1R_p = 0;
1580 icvColorToGray_32f_C3C1R_t icvColorToGray_32f_C3C1R_p = 0;
1581 
1582 icvColorToGray_8u_AC4C1R_t icvColorToGray_8u_AC4C1R_p = 0;
1583 icvColorToGray_16u_AC4C1R_t icvColorToGray_16u_AC4C1R_p = 0;
1584 icvColorToGray_16s_AC4C1R_t icvColorToGray_16s_AC4C1R_p = 0;
1585 icvColorToGray_32f_AC4C1R_t icvColorToGray_32f_AC4C1R_p = 0;
1586 
1587 typedef CvStatus (CV_STDCALL * CvColorTwistIPPFunc)( const void* src, int srcstep,
1588                         void* dst, int dststep, CvSize size, const float* coeffs );
1589 
1590 ////////////////////////////////////////////////////////////////
1591 
1592 CV_IMPL void
cvTransform(const CvArr * srcarr,CvArr * dstarr,const CvMat * transmat,const CvMat * shiftvec)1593 cvTransform( const CvArr* srcarr, CvArr* dstarr,
1594              const CvMat* transmat, const CvMat* shiftvec )
1595 {
1596     static CvBigFuncTable transform_tab, diag_transform_tab;
1597     static int inittab = 0;
1598     CvMat* lut = 0;
1599 
1600     CV_FUNCNAME( "cvTransform" );
1601 
1602     __BEGIN__;
1603 
1604     CvMat srcstub, *src = (CvMat*)srcarr;
1605     CvMat dststub, *dst = (CvMat*)dstarr;
1606     CvMat rotstub, *rot = (CvMat*)transmat;
1607     CvMat shiftstub, *shift = (CvMat*)shiftvec;
1608     CvSeq *src_seq = 0, *dst_seq = 0;
1609     CvSeq hdr; // need only one copy of stub header & seqblock (either for src or dst)
1610     CvSeqBlock block_hdr;
1611     int i, j, type, cn, dst_cn;
1612     int coi = 0, coi2 = 0;
1613     double* buffer = (double*)cvStackAlloc( CV_CN_MAX*(CV_CN_MAX+1)*sizeof(buffer[0]) );
1614 
1615     if( !inittab )
1616     {
1617         icvInitTransformRTable( &transform_tab );
1618         icvInitDiagTransformRTable( &diag_transform_tab );
1619         inittab = 1;
1620     }
1621 
1622     if( CV_IS_SEQ( src ))
1623     {
1624         src_seq = (CvSeq*)src;
1625         if( CV_ELEM_SIZE(src_seq->flags) != src_seq->elem_size )
1626             CV_ERROR( CV_StsUnsupportedFormat, "Unsupported type of sequence elements" );
1627     }
1628     else
1629         CV_CALL( src = cvGetMat( src, &srcstub, &coi ));
1630 
1631     if( CV_IS_SEQ( dst ))
1632     {
1633         dst_seq = (CvSeq*)dst;
1634         if( CV_ELEM_SIZE(dst_seq->flags) != dst_seq->elem_size )
1635             CV_ERROR( CV_StsUnsupportedFormat, "Unsupported type of sequence elements" );
1636     }
1637     else
1638         CV_CALL( dst = cvGetMat( dst, &dststub, &coi2 ));
1639 
1640     if( coi != 0 || coi2 != 0 )
1641         CV_ERROR( CV_BadCOI, "" );
1642 
1643     if( !CV_ARE_DEPTHS_EQ(src, dst) )
1644         CV_ERROR( CV_StsUnmatchedFormats, "" );
1645 
1646     if( src_seq || dst_seq )
1647     {
1648         if( !src_seq )
1649         {
1650             if( CV_IS_MAT_CONT(src->type) || (src->rows != 1 && src->cols != 1) )
1651                 CV_ERROR( CV_StsBadSize, "if eigher the source or destination is a sequence, "
1652                 "the other array must be also a sequence of continous 1d vector" );
1653             src_seq = cvMakeSeqHeaderForArray( CV_MAT_TYPE(src->type), sizeof(hdr),
1654                                        CV_ELEM_SIZE(src->type), src->data.ptr,
1655                                        src->rows + src->cols + 1, &hdr, &block_hdr );
1656         }
1657 
1658         if( !dst_seq )
1659         {
1660             if( CV_IS_MAT_CONT(dst->type) || (dst->rows != 1 && dst->cols != 1) )
1661                 CV_ERROR( CV_StsBadSize, "if eigher the source or destination is a sequence, "
1662                 "the other array must be also a sequence of continous 1d vector" );
1663             if( dst->rows + dst->cols - 1 != src_seq->total )
1664                 CV_ERROR( CV_StsUnmatchedFormats,
1665                 "source sequence and destination vector have different sizes" );
1666             dst_seq = cvMakeSeqHeaderForArray( CV_MAT_TYPE(dst->type), sizeof(hdr),
1667                                            CV_ELEM_SIZE(dst->type), dst->data.ptr,
1668                                            dst->rows + dst->cols + 1, &hdr, &block_hdr );
1669         }
1670         else if( dst_seq->total != src_seq->total )
1671         {
1672             if( dst_seq->total > src_seq->total )
1673                 cvSeqPopMulti( dst_seq, 0, dst_seq->total - src_seq->total );
1674             else
1675                 cvSeqPushMulti( dst_seq, 0, src_seq->total - dst_seq->total );
1676         }
1677     }
1678     else if( !CV_ARE_SIZES_EQ( src, dst ))
1679         CV_ERROR( CV_StsUnmatchedSizes, "" );
1680 
1681     type = CV_MAT_TYPE( src->type );
1682     cn = CV_MAT_CN( type );
1683     dst_cn = CV_MAT_CN( dst->type );
1684 
1685     if( cn > 4 || dst_cn > 4 )
1686         CV_ERROR( CV_StsOutOfRange, "Both input and output array must have at most 4 channels" );
1687 
1688     if( !CV_IS_MAT( rot ))
1689         CV_CALL( rot = cvGetMat( rot, &rotstub, &coi ));
1690 
1691     if( rot->rows != dst_cn )
1692         CV_ERROR( CV_StsBadSize,
1693         "The height of transmat matrix must be equal to number of channels" );
1694 
1695     if( rot->cols == cn + 1 || rot->cols == cn )
1696     {
1697         if( CV_MAT_TYPE( rot->type ) == CV_64FC1 )
1698         {
1699             for( i = 0; i < dst_cn; i++ )
1700             {
1701                 buffer[i*(cn+1) + cn] = 0;
1702                 for( j = 0; j < rot->cols; j++ )
1703                     buffer[i*(cn+1) + j] = ((double*)(rot->data.ptr + rot->step*i))[j];
1704             }
1705         }
1706         else if( CV_MAT_TYPE( rot->type ) == CV_32FC1 )
1707         {
1708             for( i = 0; i < dst_cn; i++ )
1709             {
1710                 buffer[i*(cn+1) + cn] = 0;
1711                 for( j = 0; j < rot->cols; j++ )
1712                     buffer[i*(cn+1) + j] = ((float*)(rot->data.ptr + rot->step*i))[j];
1713             }
1714         }
1715         else
1716             CV_ERROR( CV_StsUnsupportedFormat, "Rotation matrix must be 32fC1 or 64fC1" );
1717     }
1718     else
1719         CV_ERROR( CV_StsUnmatchedSizes, "If the source array has <cn> channels, "
1720            "the transformation matrix must have <cn> x <cn>+1 or <cn> x <cn> size" );
1721 
1722     if( shift )
1723     {
1724         if( !CV_IS_MAT( shift ))
1725             CV_CALL( shift = cvGetMat( shift, &shiftstub, &coi ));
1726 
1727         if( CV_MAT_CN( shift->type ) * shift->cols * shift->rows == dst_cn &&
1728             (shift->rows == 1 || shift->cols == 1) )
1729         {
1730             if( CV_MAT_DEPTH( shift->type ) == CV_64F )
1731             {
1732                 int step = shift->step ? shift->step/sizeof(double) : 1;
1733                 for( i = 0; i < dst_cn; i++ )
1734                     buffer[i*(cn+1) + cn] += shift->data.db[i*step];
1735             }
1736             else if( CV_MAT_DEPTH( shift->type ) == CV_32F )
1737             {
1738                 int step = shift->step ? shift->step/sizeof(float) : 1;
1739                 for( i = 0; i < dst_cn; i++ )
1740                     buffer[i*(cn+1) + cn] += shift->data.fl[i*step];
1741             }
1742             else
1743                 CV_ERROR( CV_StsUnsupportedFormat, "Shift vector must be 32f or 64f" );
1744         }
1745         else
1746         {
1747             CV_ERROR( CV_StsUnmatchedSizes,
1748                 "Shift (if present) must be 1 dimensional vector with the number "
1749                 "of elements equal to number of channels in the processed array" );
1750         }
1751     }
1752 
1753     if( coi != 0 || coi2 != 0 )
1754         CV_ERROR( CV_BadCOI, "" );
1755 
1756     {
1757         CvTransformFunc func = (CvTransformFunc)(transform_tab.fn_2d[type]);
1758         CvDiagTransformFunc diag_func = 0;
1759         CvLUT_TransformFunc lut_func = 0;
1760         int diag_transform = 0;
1761         CvColorTwistIPPFunc ipp_func = 0;
1762         CvSize size;
1763         float* ipp_coeffs = (float*)cvStackAlloc( 16*sizeof(ipp_coeffs[0]) );
1764 
1765         if( !func )
1766             CV_ERROR( CV_StsUnsupportedFormat, "" );
1767 
1768         if( cn == dst_cn )
1769             ipp_func = type == CV_8UC3 ? icvColorTwist_8u_C3R_p :
1770                        type == CV_16UC3 ? icvColorTwist_16u_C3R_p :
1771                        type == CV_16SC3 ? icvColorTwist_16s_C3R_p :
1772                        type == CV_32FC3 ? icvColorTwist_32f_C3R_p :
1773                        type == CV_32FC4 && fabs(buffer[4]) < DBL_EPSILON &&
1774                        fabs(buffer[9]) < DBL_EPSILON && fabs(buffer[14]) < DBL_EPSILON &&
1775                        fabs(buffer[19]) < DBL_EPSILON ? icvColorTwist_32f_C4R_p : 0;
1776         else if( dst_cn == 1 && (cn == 3 || cn == 4) &&
1777                  buffer[0] >= 0 && buffer[1] >= 0 && buffer[2] >= 0 &&
1778                  buffer[0] + buffer[1] + buffer[2] <= 1.01 &&
1779                  fabs(buffer[3]) < DBL_EPSILON && (cn == 3 || fabs(buffer[4]) < DBL_EPSILON) )
1780         {
1781             if( cn == 3 )
1782                 ipp_func = type == CV_8UC3 ? icvColorToGray_8u_C3C1R_p :
1783                            type == CV_16UC3 ? icvColorToGray_16u_C3C1R_p :
1784                            type == CV_16SC3 ? icvColorToGray_16s_C3C1R_p :
1785                            type == CV_32FC3 ? icvColorToGray_32f_C3C1R_p : 0;
1786             else
1787                 ipp_func = type == CV_8UC4 ? icvColorToGray_8u_AC4C1R_p :
1788                            type == CV_16UC4 ? icvColorToGray_16u_AC4C1R_p :
1789                            type == CV_16SC4 ? icvColorToGray_16s_AC4C1R_p :
1790                            type == CV_32FC4 ? icvColorToGray_32f_AC4C1R_p : 0;
1791         }
1792 
1793         if( dst_cn == cn )
1794         {
1795             diag_transform = 1;
1796             for( i = 0; i < dst_cn; i++ )
1797                 for( j = 0; j < cn; j++ )
1798                 {
1799                     if( i != j && fabs(buffer[i*(cn+1) + j]) > DBL_EPSILON )
1800                     {
1801                         diag_transform = 0;
1802                         break;
1803                     }
1804                 }
1805 
1806             if( diag_transform )
1807             {
1808                 if( CV_MAT_DEPTH(type) == CV_8U )
1809                 {
1810                     CV_CALL( lut = cvCreateMat( 1, 256, type ));
1811                     for( i = 0; i < cn; i++ )
1812                     {
1813                         double a = buffer[i*(cn+1) + i], b = buffer[i*(cn+1) + cn];
1814                         uchar* ltab = lut->data.ptr;
1815                         for( j = 0; j < 256; j++ )
1816                         {
1817                             int t = cvRound(a*j + b);
1818                             ltab[j*cn + i] = CV_CAST_8U(t);
1819                         }
1820                     }
1821                     lut_func = cn == 1 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C1R :
1822                                cn == 2 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C2R :
1823                                cn == 3 ? (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C3R :
1824                                (CvLUT_TransformFunc)icvLUT_Transform8u_8u_C4R;
1825                 }
1826                 else
1827                     diag_func = (CvDiagTransformFunc)(diag_transform_tab.fn_2d[type]);
1828             }
1829         }
1830 
1831         if( ipp_func )
1832         {
1833             const double* ptr = buffer;
1834 
1835             // fill cn x 4 ipp_coeffs array
1836             for( i = 0; i < cn*4; i += 4, ptr += cn+1 )
1837             {
1838                 float t0 = (float)ptr[0];
1839                 float t1 = (float)ptr[1];
1840                 ipp_coeffs[i] = t0;
1841                 ipp_coeffs[i+1] = t1;
1842                 t0 = (float)ptr[2];
1843                 t1 = (float)ptr[3];
1844                 ipp_coeffs[i+2] = t0;
1845                 ipp_coeffs[i+3] = t1;
1846             }
1847         }
1848 
1849         if( !src_seq )
1850         {
1851             int srcstep = src->step;
1852             int dststep = dst->step;
1853             size = cvGetMatSize( src );
1854 
1855             if( CV_IS_MAT_CONT( src->type & dst->type ))
1856             {
1857                 size.width *= size.height;
1858                 size.height = 1;
1859                 srcstep = dststep = CV_STUB_STEP;
1860             }
1861 
1862             if( lut_func )
1863                 lut_func( src->data.ptr, src->step, dst->data.ptr,
1864                           dst->step, size, lut->data.ptr );
1865             else if( ipp_func )
1866             {
1867                 IPPI_CALL( ipp_func( src->data.ptr, srcstep, dst->data.ptr,
1868                                      dststep, size, ipp_coeffs ));
1869             }
1870             else if( diag_transform )
1871                 diag_func( src->data.ptr, src->step, dst->data.ptr,
1872                            dst->step, size, buffer );
1873             else
1874                 func( src->data.ptr, src->step, dst->data.ptr,
1875                       dst->step, size, buffer, dst_cn );
1876         }
1877         else
1878         {
1879             CvSeqBlock* src_block = src_seq->first;
1880             CvSeqBlock* dst_block = dst_seq->first;
1881             int src_idx = 0, dst_idx = 0;
1882             int src_elem_size = CV_ELEM_SIZE(src_seq->flags);
1883             int dst_elem_size = CV_ELEM_SIZE(dst_seq->flags);
1884 
1885             for( i = src_seq->total; i > 0; )
1886             {
1887                 int src_len = src_block->count - src_idx;
1888                 int dst_len = dst_block->count - dst_idx;
1889                 const void* srcptr = src_block->data + src_idx*src_elem_size;
1890                 void* dstptr = dst_block->data + dst_idx*dst_elem_size;
1891                 src_len = MIN(src_len, dst_len);
1892 
1893                 if( lut_func )
1894                     lut_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
1895                               cvSize( src_len, 1 ), lut->data.ptr );
1896                 else if( ipp_func )
1897                 {
1898                     IPPI_CALL( ipp_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
1899                                          cvSize( src_len, 1 ), ipp_coeffs ));
1900                 }
1901                 else if( diag_transform )
1902                     diag_func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
1903                                cvSize( src_len, 1 ), buffer );
1904                 else
1905                     func( srcptr, CV_STUB_STEP, dstptr, CV_STUB_STEP,
1906                           cvSize( src_len, 1 ), buffer, dst_cn );
1907 
1908                 if( (src_idx += src_len) == src_block->count )
1909                     src_block = src_block->next, src_idx = 0;
1910                 if( (dst_idx += src_len) == dst_block->count )
1911                     dst_block = dst_block->next, dst_idx = 0;
1912                 i -= src_len;
1913             }
1914         }
1915     }
1916 
1917     __END__;
1918 
1919     cvReleaseMat( &lut );
1920 }
1921 
1922 
1923 /****************************************************************************************\
1924 *                                        cvPerspectiveTransform                          *
1925 \****************************************************************************************/
1926 
1927 #define ICV_PERSPECTIVE_TRANSFORM_FUNC_2( flavor, arrtype )                             \
1928 static CvStatus CV_STDCALL                                                              \
1929 icvPerspectiveTransform_##flavor##_C2R( const arrtype* src, int srcstep,                \
1930                                         arrtype* dst, int dststep,                      \
1931                                         CvSize size, const double* mat )                \
1932 {                                                                                       \
1933     int i;                                                                              \
1934     size.width *= 2;                                                                    \
1935     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                               \
1936                                                                                         \
1937     for( ; size.height--; src += srcstep, dst += dststep )                              \
1938     {                                                                                   \
1939         for( i = 0; i < size.width; i += 2 )                                            \
1940         {                                                                               \
1941             arrtype x = src[i], y = src[i + 1];                                         \
1942             double w = x*mat[6] + y*mat[7] + mat[8];                                    \
1943                                                                                         \
1944             if( fabs(w) > FLT_EPSILON )                                                 \
1945             {                                                                           \
1946                 w = 1./w;                                                               \
1947                 dst[i] = (arrtype)((x*mat[0] + y*mat[1] + mat[2]) * w);                 \
1948                 dst[i+1] = (arrtype)((x*mat[3] + y*mat[4] + mat[5]) * w);               \
1949             }                                                                           \
1950             else                                                                        \
1951             {                                                                           \
1952                 dst[i] = (arrtype)0;                                                    \
1953                 dst[i+1] = (arrtype)0;                                                  \
1954             }                                                                           \
1955         }                                                                               \
1956     }                                                                                   \
1957                                                                                         \
1958     return CV_OK;                                                                       \
1959 }
1960 
1961 
1962 #define ICV_PERSPECTIVE_TRANSFORM_FUNC_3( flavor, arrtype )                             \
1963 static CvStatus CV_STDCALL                                                              \
1964 icvPerspectiveTransform_##flavor##_C3R( const arrtype* src, int srcstep,                \
1965                                              arrtype* dst, int dststep,                 \
1966                                              CvSize size, const double* mat )           \
1967 {                                                                                       \
1968     int i;                                                                              \
1969     size.width *= 3;                                                                    \
1970     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                               \
1971                                                                                         \
1972     for( ; size.height--; src += srcstep, dst += dststep )                              \
1973     {                                                                                   \
1974         for( i = 0; i < size.width; i += 3 )                                            \
1975         {                                                                               \
1976             arrtype x = src[i], y = src[i + 1], z = src[i + 2];                         \
1977             double w = x*mat[12] + y*mat[13] + z*mat[14] + mat[15];                     \
1978                                                                                         \
1979             if( fabs(w) > FLT_EPSILON )                                                 \
1980             {                                                                           \
1981                 w = 1./w;                                                               \
1982                 dst[i] = (arrtype)((x*mat[0] + y*mat[1] + z*mat[2] + mat[3]) * w);      \
1983                 dst[i+1] = (arrtype)((x*mat[4] + y*mat[5] + z*mat[6] + mat[7]) * w);    \
1984                 dst[i+2] = (arrtype)((x*mat[8] + y*mat[9] + z*mat[10] + mat[11]) * w);  \
1985             }                                                                           \
1986             else                                                                        \
1987             {                                                                           \
1988                 dst[i] = (arrtype)0;                                                    \
1989                 dst[i+1] = (arrtype)0;                                                  \
1990                 dst[i+2] = (arrtype)0;                                                  \
1991             }                                                                           \
1992         }                                                                               \
1993     }                                                                                   \
1994                                                                                         \
1995     return CV_OK;                                                                       \
1996 }
1997 
1998 ICV_PERSPECTIVE_TRANSFORM_FUNC_2( 32f, float )
1999 ICV_PERSPECTIVE_TRANSFORM_FUNC_2( 64f, double )
2000 ICV_PERSPECTIVE_TRANSFORM_FUNC_3( 32f, float )
2001 ICV_PERSPECTIVE_TRANSFORM_FUNC_3( 64f, double )
2002 
icvInitPerspectiveTransformTable(CvFuncTable * tab2,CvFuncTable * tab3)2003 static void icvInitPerspectiveTransformTable( CvFuncTable* tab2, CvFuncTable* tab3 )\
2004 {                                                                                   \
2005     tab2->fn_2d[CV_32F] = (void*)icvPerspectiveTransform_32f_C2R;                   \
2006     tab2->fn_2d[CV_64F] = (void*)icvPerspectiveTransform_64f_C2R;                   \
2007     tab3->fn_2d[CV_32F] = (void*)icvPerspectiveTransform_32f_C3R;                   \
2008     tab3->fn_2d[CV_64F] = (void*)icvPerspectiveTransform_64f_C3R;                   \
2009 }
2010 
2011 
2012 CV_IMPL void
cvPerspectiveTransform(const CvArr * srcarr,CvArr * dstarr,const CvMat * mat)2013 cvPerspectiveTransform( const CvArr* srcarr, CvArr* dstarr, const CvMat* mat )
2014 {
2015     static CvFuncTable tab[2];
2016     static int inittab = 0;
2017     double buffer[16];
2018 
2019     CV_FUNCNAME( "cvPerspectiveProject" );
2020 
2021     __BEGIN__;
2022 
2023     CvMat sstub, *src = (CvMat*)srcarr;
2024     CvMat dstub, *dst = (CvMat*)dstarr;
2025     int i, j, type, cn;
2026     CvFunc2D_2A1P func = 0;
2027     CvSize size;
2028 
2029     if( !inittab )
2030     {
2031         icvInitPerspectiveTransformTable( &tab[0], &tab[1] );
2032         inittab = 1;
2033     }
2034 
2035     if( !CV_IS_MAT( src ))
2036     {
2037         int coi = 0;
2038         CV_CALL( src = cvGetMat( src, &sstub, &coi ));
2039 
2040         if( coi != 0 )
2041             CV_ERROR( CV_BadCOI, "" );
2042     }
2043 
2044     if( !CV_IS_MAT( dst ))
2045     {
2046         int coi = 0;
2047         CV_CALL( dst = cvGetMat( dst, &dstub, &coi ));
2048 
2049         if( coi != 0 )
2050             CV_ERROR( CV_BadCOI, "" );
2051     }
2052 
2053     if( !CV_ARE_TYPES_EQ( src, dst ))
2054         CV_ERROR( CV_StsUnmatchedFormats, "" );
2055 
2056     if( !CV_ARE_SIZES_EQ( src, dst ))
2057         CV_ERROR( CV_StsUnmatchedSizes, "" );
2058 
2059     type = CV_MAT_TYPE( src->type );
2060     cn = CV_MAT_CN( type );
2061 
2062     if( cn != 2 && cn != 3 )
2063         CV_ERROR( CV_BadNumChannels, cvUnsupportedFormat );
2064 
2065     if( !CV_IS_MAT( mat ))
2066         CV_ERROR( CV_StsBadArg, "Invalid transformation matrix" );
2067 
2068     if( mat->rows != cn + 1 && mat->cols != mat->rows )
2069         CV_ERROR( CV_StsBadSize,
2070         "The size of transform matrix must be equal to number of channels" );
2071 
2072     if( CV_MAT_TYPE( mat->type ) == CV_64FC1 )
2073     {
2074         for( i = 0; i <= cn; i++ )
2075         {
2076             for( j = 0; j <= cn; j++ )
2077                 buffer[i*(cn+1) + j] = ((double*)(mat->data.ptr + mat->step*i))[j];
2078         }
2079     }
2080     else if( CV_MAT_TYPE( mat->type ) == CV_32FC1 )
2081     {
2082         for( i = 0; i <= cn; i++ )
2083         {
2084             for( j = 0; j <= cn; j++ )
2085                 buffer[i*(cn+1) + j] = ((float*)(mat->data.ptr + mat->step*i))[j];
2086         }
2087     }
2088     else
2089     {
2090         CV_ERROR( CV_StsUnsupportedFormat, "Rotation matrix must be 32fC1 or 64fC1" );
2091     }
2092 
2093     func = (CvFunc2D_2A1P)tab[cn == 3].fn_2d[CV_MAT_DEPTH(type)];
2094 
2095     if( !func )
2096         CV_ERROR( CV_StsUnsupportedFormat, "" );
2097 
2098     size = cvGetMatSize( src );
2099 
2100     if( CV_IS_MAT_CONT( src->type & dst->type ))
2101     {
2102         size.width *= size.height;
2103         size.height = 1;
2104     }
2105 
2106     IPPI_CALL( func( src->data.ptr, src->step, dst->data.ptr, dst->step, size, buffer));
2107 
2108     CV_CHECK_NANS( dst );
2109 
2110     __END__;
2111 }
2112 
2113 
2114 /****************************************************************************************\
2115 *                                       cvScaleAdd                                       *
2116 \****************************************************************************************/
2117 
2118 #define  ICV_DEF_MULADDC_CASE_C1( arrtype, temptype, src1, src2, dst, len )     \
2119 {                                                                               \
2120     int i;                                                                      \
2121                                                                                 \
2122     for( i = 0; i <= (len) - 4; i += 4 )                                        \
2123     {                                                                           \
2124         temptype t0 = (src1)[i]*s0 + (src2)[i];                                 \
2125         temptype t1 = (src1)[i+1]*s0 + (src2)[i+1];                             \
2126                                                                                 \
2127         (dst)[i] = (arrtype)t0;                                                 \
2128         (dst)[i+1] = (arrtype)t1;                                               \
2129                                                                                 \
2130         t0 = (src1)[i+2]*s0 + (src2)[i+2];                                      \
2131         t1 = (src1)[i+3]*s0 + (src2)[i+3];                                      \
2132                                                                                 \
2133         (dst)[i+2] = (arrtype)t0;                                               \
2134         (dst)[i+3] = (arrtype)t1;                                               \
2135     }                                                                           \
2136                                                                                 \
2137     for( ; i < (len); i++ )                                                     \
2138     {                                                                           \
2139         temptype t0 = (src1)[i]*s0 + (src2)[i];                                 \
2140         (dst)[i] = (arrtype)t0;                                                 \
2141     }                                                                           \
2142 }
2143 
2144 
2145 #define  ICV_DEF_MULADDC_CASE_C2( arrtype, temptype, src1, src2, dst, len )     \
2146 {                                                                               \
2147     int i;                                                                      \
2148                                                                                 \
2149     for( i = 0; i <= (len) - 4; i += 4 )                                        \
2150     {                                                                           \
2151         temptype t0 = (src1)[i]*s0 - (src1)[i+1]*s1 + (src2)[i];                \
2152         temptype t1 = (src1)[i]*s1 + (src1)[i+1]*s0 + (src2)[i+1];              \
2153                                                                                 \
2154         (dst)[i] = (arrtype)t0;                                                 \
2155         (dst)[i+1] = (arrtype)t1;                                               \
2156                                                                                 \
2157         t0 = (src1)[i+2]*s0 - (src1)[i+3]*s1 + (src2)[i+2];                     \
2158         t1 = (src1)[i+2]*s1 + (src1)[i+3]*s0 + (src2)[i+3];                     \
2159                                                                                 \
2160         (dst)[i+2] = (arrtype)t0;                                               \
2161         (dst)[i+3] = (arrtype)t1;                                               \
2162     }                                                                           \
2163                                                                                 \
2164     for( ; i < (len); i += 2 )                                                  \
2165     {                                                                           \
2166         temptype t0 = (src1)[i]*s0 - (src1)[i+1]*s1 + (src2)[i];                \
2167         temptype t1 = (src1)[i]*s1 + (src1)[i+1]*s0 + (src2)[i+1];              \
2168                                                                                 \
2169         (dst)[i] = (arrtype)t0;                                                 \
2170         (dst)[i+1] = (arrtype)t1;                                               \
2171     }                                                                           \
2172 }
2173 
2174 
2175 #define  ICV_DEF_MULADDS_FUNC( flavor, arrtype, scalartype, entry, cn )     \
2176 static CvStatus CV_STDCALL                                                  \
2177 icvMulAddC_##flavor( const arrtype* src1, int srcstep1,                     \
2178                       const arrtype* src2, int srcstep2,                    \
2179                       arrtype* dst, int dststep, CvSize size,               \
2180                       const scalartype* scalar )                            \
2181 {                                                                           \
2182     entry(scalartype);                                                      \
2183     size.width *= (cn);                                                     \
2184     srcstep1 /= sizeof(src1[0]); srcstep2 /= sizeof(src2[0]);               \
2185     dststep /= sizeof(dst[0]);                                              \
2186                                                                             \
2187     for( ; size.height--; src1+=srcstep1, src2+=srcstep2, dst+=dststep )    \
2188     {                                                                       \
2189         ICV_DEF_MULADDC_CASE_C##cn( arrtype, scalartype, src1, src2,        \
2190                                     dst, size.width )                       \
2191     }                                                                       \
2192                                                                             \
2193     return CV_OK;                                                           \
2194 }
2195 
2196 
2197 ICV_DEF_MULADDS_FUNC( 32f_C1R, float, double, CV_UN_ENTRY_C1, 1 )
2198 ICV_DEF_MULADDS_FUNC( 32f_C2R, float, double, CV_UN_ENTRY_C2, 2 )
2199 ICV_DEF_MULADDS_FUNC( 64f_C1R, double, double, CV_UN_ENTRY_C1, 1 )
2200 ICV_DEF_MULADDS_FUNC( 64f_C2R, double, double, CV_UN_ENTRY_C2, 2 )
2201 
2202 
2203 static void
icvInitMulAddCTable(CvBigFuncTable * tab)2204 icvInitMulAddCTable( CvBigFuncTable* tab )
2205 {
2206     tab->fn_2d[CV_32FC1] = (void*)icvMulAddC_32f_C1R;
2207     tab->fn_2d[CV_32FC2] = (void*)icvMulAddC_32f_C2R;
2208     tab->fn_2d[CV_64FC1] = (void*)icvMulAddC_64f_C1R;
2209     tab->fn_2d[CV_64FC2] = (void*)icvMulAddC_64f_C2R;
2210 }
2211 
2212 
2213 CV_IMPL void
cvScaleAdd(const CvArr * srcarr1,CvScalar scale,const CvArr * srcarr2,CvArr * dstarr)2214 cvScaleAdd( const CvArr* srcarr1, CvScalar scale,
2215             const CvArr* srcarr2, CvArr* dstarr )
2216 {
2217     static CvBigFuncTable muladds_tab;
2218     static int inittab = 0;
2219 
2220     CV_FUNCNAME( "cvScaleAdd" );
2221 
2222     __BEGIN__;
2223 
2224     CvMat stub1, *src1 = (CvMat*)srcarr1;
2225     CvMat stub2, *src2 = (CvMat*)srcarr2;
2226     CvMat stub, *dst = (CvMat*)dstarr;
2227     CvSize size;
2228     int type;
2229 
2230     if( !CV_IS_MAT( src1 ) || !CV_IS_MAT(src2) || !CV_IS_MAT(dst))
2231     {
2232         int coi1 = 0, coi2 = 0, coi3 = 0;
2233         CV_CALL( src1 = cvGetMat( src1, &stub1, &coi1 ));
2234         CV_CALL( src2 = cvGetMat( src2, &stub2, &coi2 ));
2235         CV_CALL( dst = cvGetMat( dst, &stub, &coi3 ));
2236 
2237         if( coi1 + coi2 + coi3 != 0 )
2238             CV_ERROR( CV_BadCOI, "" );
2239     }
2240 
2241     if( !CV_ARE_TYPES_EQ( src1, dst ) || !CV_ARE_TYPES_EQ( src2, dst ))
2242         CV_ERROR( CV_StsUnmatchedFormats, "" );
2243 
2244     if( !CV_ARE_SIZES_EQ( src1, dst ) || !CV_ARE_SIZES_EQ( src2, dst ))
2245         CV_ERROR( CV_StsUnmatchedSizes, "" );
2246 
2247     type = CV_MAT_TYPE( src1->type );
2248     size = cvGetMatSize( src1 );
2249 
2250     if( CV_IS_MAT_CONT( src1->type & src2->type & dst->type ))
2251     {
2252         size.width *= size.height;
2253 
2254         if( size.width <= CV_MAX_INLINE_MAT_OP_SIZE )
2255         {
2256             if( type == CV_32FC1 )
2257             {
2258                 float* mA = src1->data.fl;
2259                 float* mB = src2->data.fl;
2260                 float* mC = dst->data.fl;
2261 
2262                 do
2263                 {
2264                     mC[size.width - 1] = (float)(mA[size.width - 1]*scale.val[0] +
2265                                          mB[size.width - 1]);
2266                 }
2267                 while( --size.width );
2268 
2269                 EXIT;
2270             }
2271 
2272             if( type == CV_64FC1 )
2273             {
2274                 double* mA = src1->data.db;
2275                 double* mB = src2->data.db;
2276                 double* mC = dst->data.db;
2277 
2278                 do
2279                 {
2280                     mC[size.width - 1] = mA[size.width - 1]*scale.val[0] +
2281                                          mB[size.width - 1];
2282                 }
2283                 while( --size.width );
2284 
2285                 EXIT;
2286             }
2287         }
2288 
2289         size.height = 1;
2290     }
2291 
2292     if( !inittab )
2293     {
2294         icvInitMulAddCTable( &muladds_tab );
2295         inittab = 1;
2296     }
2297 
2298     if( CV_MAT_CN(type) > 2 )
2299         CV_ERROR( CV_StsOutOfRange, "The function only supports 1- and 2-channel arrays" );
2300 
2301     {
2302         CvFunc2D_3A1P func = (CvFunc2D_3A1P)(muladds_tab.fn_2d[type]);
2303 
2304         if( !func )
2305             CV_ERROR( CV_StsUnsupportedFormat, "" );
2306 
2307         IPPI_CALL( func( src1->data.ptr, src1->step, src2->data.ptr, src2->step,
2308                          dst->data.ptr, dst->step, size, scale.val ));
2309     }
2310 
2311     CV_CHECK_NANS( dst );
2312 
2313     __END__;
2314 }
2315 
2316 
2317 /****************************************************************************************\
2318 *                                    cvCalcCovarMatrix                                   *
2319 \****************************************************************************************/
2320 
2321 #define ICV_DOT_PRODUCT_CASE( flavor, srctype, avgtype, load_macro )                    \
2322 static CvStatus CV_STDCALL                                                              \
2323 icvDotProductShifted_##flavor##_C1R( const srctype* vec1, int vecstep1,                 \
2324                                      const srctype* vec2, int vecstep2,                 \
2325                                      const avgtype* avg, int avgstep,                   \
2326                                      CvSize size, double* _result )                     \
2327 {                                                                                       \
2328     double result = 0;                                                                  \
2329     vecstep1 /= sizeof(vec1[0]); vecstep2 /= sizeof(vec2[0]); avgstep /= sizeof(avg[0]);\
2330                                                                                         \
2331     for( ; size.height--; vec1 += vecstep1, vec2 += vecstep2, avg += avgstep )          \
2332     {                                                                                   \
2333         int x;                                                                          \
2334         for( x = 0; x <= size.width - 4; x += 4 )                                       \
2335             result += (load_macro(vec1[x]) - avg[x])*(load_macro(vec2[x]) - avg[x]) +   \
2336                 (load_macro(vec1[x+1]) - avg[x+1])*(load_macro(vec2[x+1]) - avg[x+1]) + \
2337                 (load_macro(vec1[x+2]) - avg[x+2])*(load_macro(vec2[x+2]) - avg[x+2]) + \
2338                 (load_macro(vec1[x+3]) - avg[x+3])*(load_macro(vec2[x+3]) - avg[x+3]);  \
2339         for( ; x < size.width; x++ )                                                    \
2340             result += (load_macro(vec1[x]) - avg[x])*(load_macro(vec2[x]) - avg[x]);    \
2341     }                                                                                   \
2342                                                                                         \
2343     *_result = result;                                                                  \
2344     return CV_OK;                                                                       \
2345 }
2346 
2347 
2348 ICV_DOT_PRODUCT_CASE( 8u32f, uchar, float, CV_8TO32F )
2349 ICV_DOT_PRODUCT_CASE( 8u64f, uchar, double, CV_8TO32F )
2350 ICV_DOT_PRODUCT_CASE( 16u32f, ushort, float, CV_NOP )
2351 ICV_DOT_PRODUCT_CASE( 16u64f, ushort, double, CV_NOP )
2352 ICV_DOT_PRODUCT_CASE( 16s32f, short, float, CV_NOP )
2353 ICV_DOT_PRODUCT_CASE( 16s64f, short, double, CV_NOP )
2354 ICV_DOT_PRODUCT_CASE( 32f, float, float, CV_NOP )
2355 ICV_DOT_PRODUCT_CASE( 32f64f, float, double, CV_NOP )
2356 ICV_DOT_PRODUCT_CASE( 64f, double, double, CV_NOP )
2357 
icvInitDotProductShiftedTable(CvFuncTable * tabfl,CvFuncTable * tabdb)2358 static void  icvInitDotProductShiftedTable( CvFuncTable* tabfl, CvFuncTable* tabdb )
2359 {
2360     tabfl->fn_2d[CV_8U] = (void*)icvDotProductShifted_8u32f_C1R;
2361     tabfl->fn_2d[CV_8S] = 0;
2362     tabfl->fn_2d[CV_16U] = (void*)icvDotProductShifted_16u32f_C1R;
2363     tabfl->fn_2d[CV_16S] = (void*)icvDotProductShifted_16s32f_C1R;
2364     tabfl->fn_2d[CV_32S] = 0;
2365     tabfl->fn_2d[CV_32F] = (void*)icvDotProductShifted_32f_C1R;
2366     tabfl->fn_2d[CV_64F] = 0;
2367 
2368     tabdb->fn_2d[CV_8U] = (void*)icvDotProductShifted_8u64f_C1R;
2369     tabdb->fn_2d[CV_8S] = 0;
2370     tabdb->fn_2d[CV_16U] = (void*)icvDotProductShifted_16u64f_C1R;
2371     tabdb->fn_2d[CV_16S] = (void*)icvDotProductShifted_16s64f_C1R;
2372     tabdb->fn_2d[CV_32S] = 0;
2373     tabdb->fn_2d[CV_32F] = (void*)icvDotProductShifted_32f64f_C1R;
2374     tabdb->fn_2d[CV_64F] = (void*)icvDotProductShifted_64f_C1R;
2375 }
2376 
2377 #define ICV_EXT_PRODUCT_CASE( flavor, srctype, avgtype, load_macro )                    \
2378 static CvStatus CV_STDCALL                                                              \
2379 icvExtProductShifted_##flavor##_C1R( const srctype* vec, int vecstep,                   \
2380                                      const avgtype* avg, int avgstep,                   \
2381                                      avgtype* dst, int dststep,                         \
2382                                      CvSize size, avgtype* tempbuf )                    \
2383 {                                                                                       \
2384     int x, y, dstsize = size.width * size.height;                                       \
2385                                                                                         \
2386     vecstep /= sizeof(vec[0]); avgstep /= sizeof(avg[0]);                               \
2387     for( y = 0; y < size.height; y++, vec += vecstep, avg += avgstep )                  \
2388         for( x = 0; x < size.width; x++ )                                               \
2389             *tempbuf++ = load_macro(vec[x]) - avg[x];                                   \
2390     tempbuf -= dstsize;                                                                 \
2391                                                                                         \
2392     dststep /= sizeof(dst[0]);                                                          \
2393     for( y = 0; y < dstsize; y++, dst += dststep )                                      \
2394     {                                                                                   \
2395         double ty = tempbuf[y];                                                         \
2396         for( x = 0; x <= y - 3; x += 4 )                                                \
2397         {                                                                               \
2398             double t0 = dst[x] + ty*tempbuf[x];                                         \
2399             double t1 = dst[x+1] + ty*tempbuf[x+1];                                     \
2400             dst[x] = (avgtype)t0;                                                       \
2401             dst[x+1] = (avgtype)t1;                                                     \
2402             t0 = dst[x+2] + ty*tempbuf[x+2];                                            \
2403             t1 = dst[x+3] + ty*tempbuf[x+3];                                            \
2404             dst[x+2] = (avgtype)t0;                                                     \
2405             dst[x+3] = (avgtype)t1;                                                     \
2406         }                                                                               \
2407         for( ; x <= y; x++ )                                                            \
2408             dst[x] = (avgtype)(dst[x] + ty*tempbuf[x]);                                 \
2409     }                                                                                   \
2410                                                                                         \
2411     return CV_OK;                                                                       \
2412 }
2413 
2414 ICV_EXT_PRODUCT_CASE( 8u32f, uchar, float, CV_8TO32F )
2415 ICV_EXT_PRODUCT_CASE( 8u64f, uchar, double, CV_8TO32F )
2416 ICV_EXT_PRODUCT_CASE( 16u32f, ushort, float, CV_NOP )
2417 ICV_EXT_PRODUCT_CASE( 16u64f, ushort, double, CV_NOP )
2418 ICV_EXT_PRODUCT_CASE( 16s32f, short, float, CV_NOP )
2419 ICV_EXT_PRODUCT_CASE( 16s64f, short, double, CV_NOP )
2420 ICV_EXT_PRODUCT_CASE( 32f, float, float, CV_NOP )
2421 ICV_EXT_PRODUCT_CASE( 32f64f, float, double, CV_NOP )
2422 ICV_EXT_PRODUCT_CASE( 64f, double, double, CV_NOP )
2423 
2424 
icvInitExtProductShiftedTable(CvFuncTable * tabfl,CvFuncTable * tabdb)2425 static void  icvInitExtProductShiftedTable( CvFuncTable* tabfl, CvFuncTable* tabdb )
2426 {
2427     tabfl->fn_2d[CV_8U] = (void*)icvExtProductShifted_8u32f_C1R;
2428     tabfl->fn_2d[CV_8S] = 0;
2429     tabfl->fn_2d[CV_16U] = (void*)icvExtProductShifted_16u32f_C1R;
2430     tabfl->fn_2d[CV_16S] = (void*)icvExtProductShifted_16s32f_C1R;
2431     tabfl->fn_2d[CV_32S] = 0;
2432     tabfl->fn_2d[CV_32F] = (void*)icvExtProductShifted_32f_C1R;
2433     tabfl->fn_2d[CV_64F] = 0;
2434 
2435     tabdb->fn_2d[CV_8U] = (void*)icvExtProductShifted_8u64f_C1R;
2436     tabdb->fn_2d[CV_8S] = 0;
2437     tabdb->fn_2d[CV_16U] = (void*)icvExtProductShifted_16u64f_C1R;
2438     tabdb->fn_2d[CV_16S] = (void*)icvExtProductShifted_16s64f_C1R;
2439     tabdb->fn_2d[CV_32S] = 0;
2440     tabdb->fn_2d[CV_32F] = (void*)icvExtProductShifted_32f64f_C1R;
2441     tabdb->fn_2d[CV_64F] = (void*)icvExtProductShifted_64f_C1R;
2442 }
2443 
2444 
2445 typedef struct vec_data
2446 {
2447     void* ptr;
2448     int step;
2449 }
2450 vec_data;
2451 
2452 CV_IMPL void
cvCalcCovarMatrix(const CvArr ** vecarr,int count,CvArr * covarr,CvArr * avgarr,int flags)2453 cvCalcCovarMatrix( const CvArr** vecarr, int count,
2454                    CvArr* covarr, CvArr* avgarr, int flags )
2455 {
2456     static CvFuncTable dot_tab[2];
2457     static CvFuncTable ext_tab[2];
2458     static int inittab = 0;
2459     vec_data* vecdata = 0;
2460     CvMat *tempvec = 0;
2461 
2462     CV_FUNCNAME( "cvCalcCovarMatrix" );
2463 
2464     __BEGIN__;
2465 
2466     CvMat covstub, *cov = (CvMat*)covarr;
2467     CvMat avgstub, *avg = (CvMat*)avgarr;
2468     CvMat vecstub0, *vecmat = 0;
2469     CvSize srcsize, contsize;
2470     int srctype = 0, dsttype = 0;
2471     int i, j;
2472     int cont_flag, vec_delta = 0, vec_step = 0;
2473     int is_covar_normal = (flags & CV_COVAR_NORMAL) != 0;
2474     double scale;
2475 
2476     if( !inittab )
2477     {
2478         icvInitDotProductShiftedTable( dot_tab + 0, dot_tab + 1 );
2479         icvInitExtProductShiftedTable( ext_tab + 0, ext_tab + 1 );
2480         inittab = 1;
2481     }
2482 
2483     if( !vecarr )
2484         CV_ERROR( CV_StsNullPtr, "NULL vec pointer" );
2485 
2486     CV_CALL( cov = cvGetMat( cov, &covstub ));
2487     CV_CALL( avg = cvGetMat( avg, &avgstub ));
2488 
2489     if( !CV_ARE_TYPES_EQ( cov, avg ))
2490         CV_ERROR( CV_StsUnmatchedFormats,
2491         "Covariation matrix and average vector should have the same types" );
2492 
2493     dsttype = CV_MAT_TYPE( cov->type );
2494     if( dsttype != CV_32FC1 && dsttype != CV_64FC1 )
2495         CV_ERROR( CV_StsUnsupportedFormat, "Covariation matrix must be 32fC1 or 64fC1" );
2496 
2497     if( cov->rows != cov->cols )
2498         CV_ERROR( CV_StsBadSize, "Covariation matrix must be square" );
2499 
2500     srcsize = cvGetMatSize( avg );
2501     contsize.width = srcsize.width * srcsize.height;
2502     contsize.height = 1;
2503     cont_flag = avg->type;
2504 
2505     if( flags & (CV_COVAR_ROWS|CV_COVAR_COLS) )
2506     {
2507         CV_CALL( vecmat = cvGetMat( vecarr[0], &vecstub0 ));
2508         srctype = CV_MAT_TYPE(vecmat->type);
2509         if( flags & CV_COVAR_COLS )
2510         {
2511             count = vecmat->cols;
2512             if( avg->cols != 1 || avg->rows != vecmat->rows )
2513                 CV_ERROR( CV_StsUnmatchedSizes,
2514                 "The number of input vectors does not match to avg vector size" );
2515             cont_flag = 0;
2516             vec_delta = CV_ELEM_SIZE(vecmat->type);
2517             vec_step = vecmat->step;
2518         }
2519         else
2520         {
2521             count = vecmat->rows;
2522             if( avg->rows != 1 || avg->cols != vecmat->cols )
2523                 CV_ERROR( CV_StsUnmatchedSizes,
2524                 "The number of input vectors does not match to avg vector size" );
2525             vec_delta = vecmat->step;
2526             vec_step = CV_STUB_STEP;
2527         }
2528 
2529         if( !(flags & CV_COVAR_USE_AVG) )
2530             CV_CALL( cvReduce( vecmat, avg, -1, CV_REDUCE_AVG ));
2531 
2532         scale = !(flags & CV_COVAR_SCALE) ? 1. : 1./count;
2533 
2534         cvMulTransposed( vecmat, cov, ((flags & CV_COVAR_ROWS)!=0) ^ ((flags & CV_COVAR_NORMAL)==0), avg, scale );
2535         EXIT;
2536     }
2537 
2538     scale = !(flags & CV_COVAR_SCALE) ? 1. : 1./count;
2539 
2540     if( is_covar_normal )
2541     {
2542         if( count <= 0 )
2543             CV_ERROR( CV_StsBadSize,
2544             "The number of vectors is zero or negative" );
2545         if( cov->rows != contsize.width )
2546             CV_ERROR( CV_StsUnmatchedSizes,
2547             "The size of input vectors does not match with the size of covariation matrix" );
2548 
2549         CV_CALL( tempvec = cvCreateMat( avg->rows, avg->cols, dsttype ));
2550     }
2551     else if( count != cov->rows )
2552         CV_ERROR( CV_StsUnmatchedSizes,
2553         "The vector count and covariance matrix size do not match" );
2554 
2555     if( !(flags & (CV_COVAR_ROWS|CV_COVAR_COLS)) )
2556     {
2557         if( !(flags & CV_COVAR_USE_AVG) )
2558             cvZero( avg );
2559 
2560         CV_CALL( vecdata = (vec_data*)cvAlloc( count*sizeof(vecdata[0])));
2561 
2562         for( i = 0; i < count; i++ )
2563         {
2564             CvMat vecstub, *vec = (CvMat*)vecarr[i];
2565             CvMat* temp;
2566 
2567             if( !CV_IS_MAT(vec) )
2568                 CV_CALL( vec = cvGetMat( vec, &vecstub ));
2569 
2570             if( !CV_ARE_SIZES_EQ( vec, avg ))
2571                 CV_ERROR( CV_StsUnmatchedSizes,
2572                 "All input vectors and average vector must have the same size" );
2573 
2574             vecdata[i].ptr = vec->data.ptr;
2575             vecdata[i].step = vec->step;
2576             cont_flag &= vec->type;
2577             temp = vec;
2578             if( i == 0 )
2579             {
2580                 srctype = CV_MAT_TYPE( vec->type );
2581                 if( CV_MAT_CN( srctype ) != 1 )
2582                     CV_ERROR( CV_BadNumChannels, "All vectors must have a single channel" );
2583                 if( srctype != dsttype && !tempvec && !(flags & CV_COVAR_USE_AVG))
2584                     CV_CALL( tempvec = cvCreateMat( vec->rows, vec->cols, dsttype ));
2585             }
2586             else if( CV_MAT_TYPE(vec->type) != srctype )
2587                 CV_ERROR( CV_StsUnmatchedFormats,
2588                 "All input vectors must have the same type" );
2589 
2590             if( !(flags & CV_COVAR_USE_AVG) )
2591             {
2592                 if( tempvec )
2593                 {
2594                     temp = tempvec;
2595                     cvConvert( vec, temp );
2596                 }
2597                 cvAdd( temp, avg, avg );
2598             }
2599         }
2600 
2601         if( !(flags & CV_COVAR_USE_AVG) )
2602             cvScale( avg, avg, 1./count );
2603     }
2604 
2605     cont_flag = CV_IS_MAT_CONT( cont_flag );
2606     if( cont_flag )
2607         srcsize = contsize;
2608 
2609     if( !is_covar_normal )
2610     {
2611         CvFunc2D_3A1P dot_func =
2612             (CvFunc2D_3A1P)dot_tab[dsttype == CV_64FC1].fn_2d[CV_MAT_DEPTH(srctype)];
2613 
2614         if( !dot_func )
2615             CV_ERROR( CV_StsUnsupportedFormat,
2616             "The format of input vectors is not supported" );
2617 
2618         for( i = 0; i < count; i++ )
2619         {
2620             int a, b, delta;
2621             if( !(i & 1) )
2622                 a = 0, b = i+1, delta = 1;
2623             else
2624                 a = i, b = -1, delta = -1;
2625 
2626             for( j = a; j != b; j += delta )
2627             {
2628                 double result = 0;
2629                 void *v_i, *v_j;
2630                 int step_i, step_j;
2631 
2632                 if( !vecmat )
2633                 {
2634                     v_i = vecdata[i].ptr;
2635                     v_j = vecdata[j].ptr;
2636                     step_i = vecdata[i].step;
2637                     step_j = vecdata[j].step;
2638                 }
2639                 else
2640                 {
2641                     v_i = vecmat->data.ptr + vec_delta*i;
2642                     v_j = vecmat->data.ptr + vec_delta*j;
2643                     step_i = step_j = vec_step;
2644                 }
2645 
2646                 dot_func( v_i, step_i, v_j, step_j, avg->data.ptr, avg->step, srcsize, &result );
2647 
2648                 if( dsttype == CV_64FC1 )
2649                 {
2650                     ((double*)(cov->data.ptr + i*cov->step))[j] =
2651                     ((double*)(cov->data.ptr + j*cov->step))[i] = result*scale;
2652                 }
2653                 else
2654                 {
2655                     ((float*)(cov->data.ptr + i*cov->step))[j] =
2656                     ((float*)(cov->data.ptr + j*cov->step))[i] = (float)(result*scale);
2657                 }
2658             }
2659         }
2660     }
2661     else
2662     {
2663         uchar* cov_ptr = cov->data.ptr;
2664         int cov_step = cov->step;
2665         int cov_size = cov->rows;
2666         CvFunc2D_3A1P ext_func =
2667             (CvFunc2D_3A1P)ext_tab[dsttype == CV_64FC1].fn_2d[CV_MAT_DEPTH(srctype)];
2668         if( !ext_func )
2669             CV_ERROR( CV_StsUnsupportedFormat,
2670             "The format of input vectors is not supported" );
2671 
2672         cvZero( cov );
2673 
2674         for( i = 0; i < count; i++ )
2675         {
2676             void* v;
2677             int vstep;
2678             if( !vecmat )
2679             {
2680                 v = vecdata[i].ptr;
2681                 vstep = vecdata[i].step;
2682             }
2683             else
2684             {
2685                 v = vecmat->data.ptr + vec_delta*i;
2686                 vstep = vec_step;
2687             }
2688 
2689             ext_func( v, vstep, avg->data.ptr, avg->step,
2690                       cov_ptr, cov_step, srcsize, tempvec->data.ptr );
2691         }
2692 
2693         if( dsttype == CV_64FC1 )
2694             for( i = 0; i < cov_size; i++ )
2695                 for( j = 0; j <= i; j++ )
2696                 {
2697                     double* cov1 = ((double*)(cov_ptr + i*cov_step)) + j;
2698                     double* cov2 = ((double*)(cov_ptr + j*cov_step)) + i;
2699 
2700                     if( flags & CV_COVAR_SCALE )
2701                         *cov1 = *cov2 = *cov1*scale;
2702                     else
2703                         *cov2 = *cov1;
2704                 }
2705         else
2706             for( i = 0; i < cov_size; i++ )
2707                 for( j = 0; j <= i; j++ )
2708                 {
2709                     float* cov1 = ((float*)(cov_ptr + i*cov_step)) + j;
2710                     float* cov2 = ((float*)(cov_ptr + j*cov_step)) + i;
2711 
2712                     if( flags & CV_COVAR_SCALE )
2713                         *cov1 = *cov2 = (float)(*cov1*scale);
2714                     else
2715                         *cov2 = *cov1;
2716                 }
2717     }
2718 
2719     __END__;
2720 
2721     cvFree( &vecdata );
2722     cvReleaseMat( &tempvec );
2723 }
2724 
2725 /****************************************************************************************\
2726 *                                        cvMahalanobis                                   *
2727 \****************************************************************************************/
2728 
2729 #define ICV_MAHALANOBIS( flavor, arrtype )                                              \
2730 static CvStatus CV_STDCALL                                                              \
2731 icvMahalanobis_##flavor##_C1R( const arrtype* mat, int matstep,                         \
2732                                const arrtype* vec, int len, double* _result )           \
2733 {                                                                                       \
2734     int i, j;                                                                           \
2735     double result = 0;                                                                  \
2736                                                                                         \
2737     matstep /= sizeof(mat[0]);                                                          \
2738     for( i = 0; i < len; i++, mat += matstep )                                          \
2739     {                                                                                   \
2740         double row_sum = 0;                                                             \
2741         for( j = 0; j <= len - 4; j += 4 )                                              \
2742             row_sum += vec[j]*mat[j] + vec[j+1]*mat[j+1] +                              \
2743                        vec[j+2]*mat[j+2] + vec[j+3]*mat[j+3];                           \
2744         for( ; j < len; j++ )                                                           \
2745             row_sum += vec[j]*mat[j];                                                   \
2746         result += row_sum * vec[i];                                                     \
2747     }                                                                                   \
2748     *_result = result;                                                                  \
2749                                                                                         \
2750     return CV_OK;                                                                       \
2751 }
2752 
2753 ICV_MAHALANOBIS( 32f, float )
2754 ICV_MAHALANOBIS( 64f, double )
2755 
icvInitMahalanobisTable(CvFuncTable * tab)2756 static void  icvInitMahalanobisTable( CvFuncTable* tab )
2757 {
2758     tab->fn_2d[CV_32F] = (void*)icvMahalanobis_32f_C1R;
2759     tab->fn_2d[CV_64F] = (void*)icvMahalanobis_64f_C1R;
2760 }
2761 
2762 typedef CvStatus (CV_STDCALL * CvMahalanobisFunc)( const void* mat, int matstep,
2763                                                    const void* vec, int len, double* _result );
2764 
2765 CV_IMPL double
cvMahalanobis(const CvArr * srcAarr,const CvArr * srcBarr,CvArr * matarr)2766 cvMahalanobis( const CvArr* srcAarr, const CvArr* srcBarr, CvArr* matarr )
2767 {
2768     static CvFuncTable mahal_tab;
2769     static int inittab = 0;
2770     uchar* buffer = 0;
2771     int local_alloc = 0;
2772     double dist = 0;
2773 
2774     CV_FUNCNAME( "cvMahalanobis" );
2775 
2776     __BEGIN__;
2777 
2778     int buf_size, elem_size, len;
2779     CvMat stubA, *srcA = (CvMat*)srcAarr;
2780     CvMat stubB, *srcB = (CvMat*)srcBarr;
2781     CvMat stub, *mat = (CvMat*)matarr;
2782     CvMat temp;
2783     CvMahalanobisFunc func;
2784 
2785     if( !inittab )
2786     {
2787         icvInitMahalanobisTable( &mahal_tab );
2788         inittab = 1;
2789     }
2790 
2791     if( !CV_IS_MAT(srcA) )
2792         CV_CALL( srcA = cvGetMat( srcA, &stubA ));
2793 
2794     if( !CV_IS_MAT(srcB) )
2795         CV_CALL( srcB = cvGetMat( srcB, &stubB ));
2796 
2797     if( !CV_IS_MAT(mat) )
2798         CV_CALL( mat = cvGetMat( mat, &stub ));
2799 
2800     if( srcA->rows != 1 && srcA->cols != 1 )
2801         CV_ERROR( CV_StsBadSize, "Input matrices must be 1-d vectors" );
2802 
2803     len = srcA->rows + srcA->cols - 1;
2804 
2805     if( !CV_ARE_SIZES_EQ(srcA,srcB) )
2806         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors have different sizes" );
2807 
2808     if( mat->rows != len || mat->cols != len )
2809         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors and covariation matrix have different sizes" );
2810 
2811     func = (CvMahalanobisFunc)mahal_tab.fn_2d[CV_MAT_DEPTH(srcA->type)];
2812 
2813     if( CV_MAT_CN(srcA->type) > 1 || !func )
2814         CV_ERROR( CV_StsUnsupportedFormat,
2815         "Only single-channel floating-point vectors are supported" );
2816 
2817     if( !CV_ARE_TYPES_EQ(srcA,srcB) || !CV_ARE_TYPES_EQ(srcA,mat) )
2818         CV_ERROR( CV_StsUnmatchedSizes, "Input vectors have different sizes" );
2819 
2820     elem_size = CV_ELEM_SIZE(srcA->type);
2821     buf_size = len*elem_size;
2822 
2823     if( buf_size <= CV_MAX_LOCAL_SIZE )
2824     {
2825         buffer = (uchar*)cvStackAlloc( buf_size );
2826         local_alloc = 1;
2827     }
2828     else
2829     {
2830         CV_CALL( buffer = (uchar*)cvAlloc( buf_size ));
2831     }
2832 
2833     temp = cvMat( srcA->rows, srcA->cols, srcA->type, buffer );
2834     CV_CALL( cvSub( srcA, srcB, &temp ));
2835 
2836     IPPI_CALL( func( mat->data.ptr, mat->step, temp.data.ptr, len, &dist ));
2837     dist = sqrt(dist);
2838 
2839     __END__;
2840 
2841     if( buffer && !local_alloc )
2842         cvFree( &buffer );
2843 
2844     return  dist;
2845 }
2846 
2847 
2848 /****************************************************************************************\
2849 *                                        cvMulTransposed                                 *
2850 \****************************************************************************************/
2851 
2852 #define ICV_DEF_MULTRANS_R_FUNC( flavor, srctype, dsttype, load_macro )         \
2853 static CvStatus CV_STDCALL                                                      \
2854 icvMulTransposedR_##flavor( const srctype* src, int srcstep,                    \
2855                        dsttype* dst, int dststep,                               \
2856                        const dsttype* delta, int deltastep,                     \
2857                        CvSize size, int delta_cols, double scale )              \
2858 {                                                                               \
2859     int i, j, k;                                                                \
2860     dsttype* tdst = dst;                                                        \
2861     dsttype* col_buf = 0;                                                       \
2862     dsttype* delta_buf = 0;                                                     \
2863     int local_alloc = 0;                                                        \
2864     int buf_size = size.height*sizeof(dsttype);                                 \
2865                                                                                 \
2866     if( delta && delta_cols < size.width )                                      \
2867     {                                                                           \
2868         assert( delta_cols == 1 );                                              \
2869         buf_size += 4*buf_size;                                                 \
2870     }                                                                           \
2871                                                                                 \
2872     if( buf_size <= CV_MAX_LOCAL_SIZE )                                         \
2873     {                                                                           \
2874         col_buf = (dsttype*)cvStackAlloc( buf_size );                           \
2875         local_alloc = 1;                                                        \
2876     }                                                                           \
2877     else                                                                        \
2878     {                                                                           \
2879         col_buf = (dsttype*)cvAlloc( buf_size );                                \
2880         if( !col_buf )                                                          \
2881             return CV_OUTOFMEM_ERR;                                             \
2882     }                                                                           \
2883                                                                                 \
2884     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                       \
2885     deltastep /= sizeof(delta[0]);                                              \
2886                                                                                 \
2887     if( delta && delta_cols < size.width )                                      \
2888     {                                                                           \
2889         delta_buf = col_buf + size.height;                                      \
2890         for( i = 0; i < size.height; i++ )                                      \
2891             delta_buf[i*4] = delta_buf[i*4+1] =                                 \
2892                 delta_buf[i*4+2] = delta_buf[i*4+3] = delta[i*deltastep];       \
2893         delta = delta_buf;                                                      \
2894         deltastep = deltastep ? 4 : 0;                                          \
2895     }                                                                           \
2896                                                                                 \
2897     if( !delta )                                                                \
2898         for( i = 0; i < size.width; i++, tdst += dststep )                      \
2899         {                                                                       \
2900             for( k = 0; k < size.height; k++ )                                  \
2901                 col_buf[k] = src[k*srcstep+i];                                  \
2902                                                                                 \
2903             for( j = i; j <= size.width - 4; j += 4 )                           \
2904             {                                                                   \
2905                 double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                          \
2906                 const srctype *tsrc = src + j;                                  \
2907                                                                                 \
2908                 for( k = 0; k < size.height; k++, tsrc += srcstep )             \
2909                 {                                                               \
2910                     double a = col_buf[k];                                      \
2911                     s0 += a * load_macro(tsrc[0]);                              \
2912                     s1 += a * load_macro(tsrc[1]);                              \
2913                     s2 += a * load_macro(tsrc[2]);                              \
2914                     s3 += a * load_macro(tsrc[3]);                              \
2915                 }                                                               \
2916                                                                                 \
2917                 tdst[j] = (dsttype)(s0*scale);                                  \
2918                 tdst[j+1] = (dsttype)(s1*scale);                                \
2919                 tdst[j+2] = (dsttype)(s2*scale);                                \
2920                 tdst[j+3] = (dsttype)(s3*scale);                                \
2921             }                                                                   \
2922                                                                                 \
2923             for( ; j < size.width; j++ )                                        \
2924             {                                                                   \
2925                 double s0 = 0;                                                  \
2926                 const srctype *tsrc = src + j;                                  \
2927                                                                                 \
2928                 for( k = 0; k < size.height; k++, tsrc += srcstep )             \
2929                     s0 += col_buf[k] * tsrc[0];                                 \
2930                                                                                 \
2931                 tdst[j] = (dsttype)(s0*scale);                                  \
2932             }                                                                   \
2933         }                                                                       \
2934     else                                                                        \
2935         for( i = 0; i < size.width; i++, tdst += dststep )                      \
2936         {                                                                       \
2937             if( !delta_buf )                                                    \
2938                 for( k = 0; k < size.height; k++ )                              \
2939                     col_buf[k] = load_macro(src[k*srcstep+i]) - delta[k*deltastep+i]; \
2940             else                                                                \
2941                 for( k = 0; k < size.height; k++ )                              \
2942                     col_buf[k] = load_macro(src[k*srcstep+i]) - delta_buf[k*deltastep]; \
2943                                                                                 \
2944             for( j = i; j <= size.width - 4; j += 4 )                           \
2945             {                                                                   \
2946                 double s0 = 0, s1 = 0, s2 = 0, s3 = 0;                          \
2947                 const srctype *tsrc = src + j;                                  \
2948                 const dsttype *d = delta_buf ? delta_buf : delta + j;           \
2949                                                                                 \
2950                 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) \
2951                 {                                                               \
2952                     double a = col_buf[k];                                      \
2953                     s0 += a * (load_macro(tsrc[0]) - d[0]);                     \
2954                     s1 += a * (load_macro(tsrc[1]) - d[1]);                     \
2955                     s2 += a * (load_macro(tsrc[2]) - d[2]);                     \
2956                     s3 += a * (load_macro(tsrc[3]) - d[3]);                     \
2957                 }                                                               \
2958                                                                                 \
2959                 tdst[j] = (dsttype)(s0*scale);                                  \
2960                 tdst[j+1] = (dsttype)(s1*scale);                                \
2961                 tdst[j+2] = (dsttype)(s2*scale);                                \
2962                 tdst[j+3] = (dsttype)(s3*scale);                                \
2963             }                                                                   \
2964                                                                                 \
2965             for( ; j < size.width; j++ )                                        \
2966             {                                                                   \
2967                 double s0 = 0;                                                  \
2968                 const srctype *tsrc = src + j;                                  \
2969                 const dsttype *d = delta_buf ? delta_buf : delta + j;           \
2970                                                                                 \
2971                 for( k = 0; k < size.height; k++, tsrc+=srcstep, d+=deltastep ) \
2972                     s0 += col_buf[k] * (load_macro(tsrc[0]) - d[0]);            \
2973                                                                                 \
2974                 tdst[j] = (dsttype)(s0*scale);                                  \
2975             }                                                                   \
2976         }                                                                       \
2977                                                                                 \
2978     /* fill the lower part of the destination matrix */                         \
2979     for( i = 1; i < size.width; i++ )                                           \
2980         for( j = 0; j < i; j++ )                                                \
2981             dst[dststep*i + j] = dst[dststep*j + i];                            \
2982                                                                                 \
2983     if( col_buf && !local_alloc )                                               \
2984         cvFree( &col_buf );                                                     \
2985                                                                                 \
2986     return CV_NO_ERR;                                                           \
2987 }
2988 
2989 
2990 #define ICV_DEF_MULTRANS_L_FUNC( flavor, srctype, dsttype, load_macro )         \
2991 static CvStatus CV_STDCALL                                                      \
2992 icvMulTransposedL_##flavor( const srctype* src, int srcstep,                    \
2993                             dsttype* dst, int dststep,                          \
2994                             dsttype* delta, int deltastep,                      \
2995                             CvSize size, int delta_cols, double scale )         \
2996 {                                                                               \
2997     int i, j, k;                                                                \
2998     dsttype* tdst = dst;                                                        \
2999                                                                                 \
3000     srcstep /= sizeof(src[0]); dststep /= sizeof(dst[0]);                       \
3001     deltastep /= sizeof(delta[0]);                                              \
3002                                                                                 \
3003     if( !delta )                                                                \
3004         for( i = 0; i < size.height; i++, tdst += dststep )                     \
3005             for( j = i; j < size.height; j++ )                                  \
3006             {                                                                   \
3007                 double s = 0;                                                   \
3008                 const srctype *tsrc1 = src + i*srcstep;                         \
3009                 const srctype *tsrc2 = src + j*srcstep;                         \
3010                                                                                 \
3011                 for( k = 0; k <= size.width - 4; k += 4 )                       \
3012                     s += tsrc1[k]*tsrc2[k] + tsrc1[k+1]*tsrc2[k+1] +            \
3013                          tsrc1[k+2]*tsrc2[k+2] + tsrc1[k+3]*tsrc2[k+3];         \
3014                 for( ; k < size.width; k++ )                                    \
3015                     s += tsrc1[k] * tsrc2[k];                                   \
3016                 tdst[j] = (dsttype)(s*scale);                                   \
3017             }                                                                   \
3018     else                                                                        \
3019     {                                                                           \
3020         dsttype* row_buf = 0;                                                   \
3021         int local_alloc = 0;                                                    \
3022         int buf_size = size.width*sizeof(dsttype);                              \
3023         dsttype delta_buf[4];                                                   \
3024         int delta_shift = delta_cols == size.width ? 4 : 0;                     \
3025                                                                                 \
3026         if( buf_size <= CV_MAX_LOCAL_SIZE )                                     \
3027         {                                                                       \
3028             row_buf = (dsttype*)cvStackAlloc( buf_size );                       \
3029             local_alloc = 1;                                                    \
3030         }                                                                       \
3031         else                                                                    \
3032         {                                                                       \
3033             row_buf = (dsttype*)cvAlloc( buf_size );                            \
3034             if( !row_buf )                                                      \
3035                 return CV_OUTOFMEM_ERR;                                         \
3036         }                                                                       \
3037                                                                                 \
3038         for( i = 0; i < size.height; i++, tdst += dststep )                     \
3039         {                                                                       \
3040             const srctype *tsrc1 = src + i*srcstep;                             \
3041             const dsttype *tdelta1 = delta + i*deltastep;                       \
3042                                                                                 \
3043             if( delta_cols < size.width )                                       \
3044                 for( k = 0; k < size.width; k++ )                               \
3045                     row_buf[k] = tsrc1[k] - tdelta1[0];                         \
3046             else                                                                \
3047                 for( k = 0; k < size.width; k++ )                               \
3048                     row_buf[k] = tsrc1[k] - tdelta1[k];                         \
3049                                                                                 \
3050             for( j = i; j < size.height; j++ )                                  \
3051             {                                                                   \
3052                 double s = 0;                                                   \
3053                 const srctype *tsrc2 = src + j*srcstep;                         \
3054                 const dsttype *tdelta2 = delta + j*deltastep;                   \
3055                 if( delta_cols < size.width )                                   \
3056                 {                                                               \
3057                     delta_buf[0] = delta_buf[1] =                               \
3058                         delta_buf[2] = delta_buf[3] = tdelta2[0];               \
3059                     tdelta2 = delta_buf;                                        \
3060                 }                                                               \
3061                 for( k = 0; k <= size.width-4; k += 4, tdelta2 += delta_shift ) \
3062                     s += row_buf[k]*(load_macro(tsrc2[k]) - tdelta2[0]) +       \
3063                          row_buf[k+1]*(load_macro(tsrc2[k+1]) - tdelta2[1]) +   \
3064                          row_buf[k+2]*(load_macro(tsrc2[k+2]) - tdelta2[2]) +   \
3065                          row_buf[k+3]*(load_macro(tsrc2[k+3]) - tdelta2[3]);    \
3066                 for( ; k < size.width; k++, tdelta2++ )                         \
3067                     s += row_buf[k]*(load_macro(tsrc2[k]) - tdelta2[0]);        \
3068                 tdst[j] = (dsttype)(s*scale);                                   \
3069             }                                                                   \
3070         }                                                                       \
3071                                                                                 \
3072         if( row_buf && !local_alloc )                                           \
3073             cvFree( &row_buf );                                                 \
3074     }                                                                           \
3075                                                                                 \
3076     /* fill the lower part of the destination matrix */                         \
3077     for( j = 0; j < size.height - 1; j++ )                                      \
3078         for( i = j; i < size.height; i++ )                                      \
3079             dst[dststep*i + j] = dst[dststep*j + i];                            \
3080                                                                                 \
3081     return CV_NO_ERR;                                                           \
3082 }
3083 
3084 
3085 ICV_DEF_MULTRANS_R_FUNC( 8u32f, uchar, float, CV_8TO32F )
3086 ICV_DEF_MULTRANS_R_FUNC( 8u64f, uchar, double, CV_8TO32F )
3087 ICV_DEF_MULTRANS_R_FUNC( 32f, float, float, CV_NOP )
3088 ICV_DEF_MULTRANS_R_FUNC( 32f64f, float, double, CV_NOP )
3089 ICV_DEF_MULTRANS_R_FUNC( 64f, double, double, CV_NOP )
3090 ICV_DEF_MULTRANS_R_FUNC( 16u32f, ushort, float, CV_NOP )
3091 ICV_DEF_MULTRANS_R_FUNC( 16u64f, ushort, double, CV_NOP )
3092 ICV_DEF_MULTRANS_R_FUNC( 16s32f, short, float, CV_NOP )
3093 ICV_DEF_MULTRANS_R_FUNC( 16s64f, short, double, CV_NOP )
3094 
3095 ICV_DEF_MULTRANS_L_FUNC( 8u32f, uchar, float, CV_8TO32F )
3096 ICV_DEF_MULTRANS_L_FUNC( 8u64f, uchar, double, CV_8TO32F )
3097 ICV_DEF_MULTRANS_L_FUNC( 32f, float, float, CV_NOP )
3098 ICV_DEF_MULTRANS_L_FUNC( 32f64f, float, double, CV_NOP )
3099 ICV_DEF_MULTRANS_L_FUNC( 64f, double, double, CV_NOP )
3100 ICV_DEF_MULTRANS_L_FUNC( 16u32f, ushort, float, CV_NOP )
3101 ICV_DEF_MULTRANS_L_FUNC( 16u64f, ushort, double, CV_NOP )
3102 ICV_DEF_MULTRANS_L_FUNC( 16s32f, short, float, CV_NOP )
3103 ICV_DEF_MULTRANS_L_FUNC( 16s64f, short, double, CV_NOP )
3104 
3105 
3106 typedef CvStatus (CV_STDCALL * CvMulTransposedFunc)
3107     ( const void* src, int srcstep,
3108       void* dst, int dststep, const void* delta,
3109       int deltastep, CvSize size, int delta_cols, double scale );
3110 
3111 CV_IMPL void
cvMulTransposed(const CvArr * srcarr,CvArr * dstarr,int order,const CvArr * deltaarr,double scale)3112 cvMulTransposed( const CvArr* srcarr, CvArr* dstarr,
3113                  int order, const CvArr* deltaarr, double scale )
3114 {
3115     const int gemm_level = 100; // boundary above which GEMM is faster.
3116     CvMat* src2 = 0;
3117 
3118     CV_FUNCNAME( "cvMulTransposed" );
3119 
3120     __BEGIN__;
3121 
3122     CvMat sstub, *src = (CvMat*)srcarr;
3123     CvMat dstub, *dst = (CvMat*)dstarr;
3124     CvMat deltastub, *delta = (CvMat*)deltaarr;
3125     int stype, dtype;
3126 
3127     if( !CV_IS_MAT( src ))
3128         CV_CALL( src = cvGetMat( src, &sstub ));
3129 
3130     if( !CV_IS_MAT( dst ))
3131         CV_CALL( dst = cvGetMat( dst, &dstub ));
3132 
3133     if( delta )
3134     {
3135         if( !CV_IS_MAT( delta ))
3136             CV_CALL( delta = cvGetMat( delta, &deltastub ));
3137 
3138         if( !CV_ARE_TYPES_EQ( dst, delta ))
3139             CV_ERROR( CV_StsUnmatchedFormats, "" );
3140 
3141         if( (delta->rows != src->rows && delta->rows != 1) ||
3142             (delta->cols != src->cols && delta->cols != 1) )
3143             CV_ERROR( CV_StsUnmatchedSizes, "" );
3144     }
3145     else
3146     {
3147         delta = &deltastub;
3148         delta->data.ptr = 0;
3149         delta->step = 0;
3150         delta->rows = delta->cols = 0;
3151     }
3152 
3153     stype = CV_MAT_TYPE( src->type );
3154     dtype = CV_MAT_TYPE( dst->type );
3155 
3156     if( dst->rows != dst->cols )
3157         CV_ERROR( CV_StsBadSize, "The destination matrix must be square" );
3158 
3159     if( (order != 0 && src->cols != dst->cols) ||
3160         (order == 0 && src->rows != dst->rows))
3161         CV_ERROR( CV_StsUnmatchedSizes, "" );
3162 
3163     if( src->data.ptr == dst->data.ptr || (stype == dtype &&
3164         (dst->cols >= gemm_level && dst->rows >= gemm_level &&
3165          src->cols >= gemm_level && src->rows >= gemm_level)))
3166     {
3167         if( deltaarr )
3168         {
3169             CV_CALL( src2 = cvCreateMat( src->rows, src->cols, src->type ));
3170             cvRepeat( delta, src2 );
3171             cvSub( src, src2, src2 );
3172             src = src2;
3173         }
3174         cvGEMM( src, src, scale, 0, 0, dst, order == 0 ? CV_GEMM_B_T : CV_GEMM_A_T );
3175     }
3176     else
3177     {
3178         CvMulTransposedFunc func =
3179             stype == CV_8U && dtype == CV_32F ?
3180             (order ? (CvMulTransposedFunc)icvMulTransposedR_8u32f :
3181                     (CvMulTransposedFunc)icvMulTransposedL_8u32f) :
3182             stype == CV_8U && dtype == CV_64F ?
3183             (order ? (CvMulTransposedFunc)icvMulTransposedR_8u64f :
3184                     (CvMulTransposedFunc)icvMulTransposedL_8u64f) :
3185             stype == CV_16U && dtype == CV_32F ?
3186             (order ? (CvMulTransposedFunc)icvMulTransposedR_16u32f :
3187                     (CvMulTransposedFunc)icvMulTransposedL_16u32f) :
3188             stype == CV_16U && dtype == CV_64F ?
3189             (order ? (CvMulTransposedFunc)icvMulTransposedR_16u64f :
3190                     (CvMulTransposedFunc)icvMulTransposedL_16u64f) :
3191             stype == CV_16S && dtype == CV_32F ?
3192             (order ? (CvMulTransposedFunc)icvMulTransposedR_16s32f :
3193                     (CvMulTransposedFunc)icvMulTransposedL_16s32f) :
3194             stype == CV_16S && dtype == CV_64F ?
3195             (order ? (CvMulTransposedFunc)icvMulTransposedR_16s64f :
3196                     (CvMulTransposedFunc)icvMulTransposedL_16s64f) :
3197             stype == CV_32F && dtype == CV_32F ?
3198             (order ? (CvMulTransposedFunc)icvMulTransposedR_32f :
3199                     (CvMulTransposedFunc)icvMulTransposedL_32f) :
3200             stype == CV_32F && dtype == CV_64F ?
3201             (order ? (CvMulTransposedFunc)icvMulTransposedR_32f64f :
3202                     (CvMulTransposedFunc)icvMulTransposedL_32f64f) :
3203             stype == CV_64F && dtype == CV_64F ?
3204             (order ? (CvMulTransposedFunc)icvMulTransposedR_64f :
3205                     (CvMulTransposedFunc)icvMulTransposedL_64f) : 0;
3206 
3207         if( !func )
3208             CV_ERROR( CV_StsUnsupportedFormat, "" );
3209 
3210         IPPI_CALL( func( src->data.ptr, src->step, dst->data.ptr, dst->step,
3211                          delta->data.ptr, delta->step, cvGetMatSize( src ),
3212                          delta->cols, scale ));
3213     }
3214 
3215     __END__;
3216 
3217     if( src2 )
3218         cvReleaseMat( &src2 );
3219 }
3220 
3221 
3222 /****************************************************************************************\
3223 *                                        cvDotProduct                                    *
3224 \****************************************************************************************/
3225 
3226 #define ICV_DEF_DOT_PROD_FUNC_2D( flavor, arrtype, temptype, sumtype )  \
3227 static CvStatus CV_STDCALL                                              \
3228 icvDotProduct_##flavor##_C1R( const arrtype* src1, int step1,           \
3229                               const arrtype* src2, int step2,           \
3230                               CvSize size, sumtype* _sum )              \
3231 {                                                                       \
3232     sumtype sum = 0;                                                    \
3233     step1 /= sizeof(src1[0]); step2 /= sizeof(src2[0]);                 \
3234                                                                         \
3235     for( ; size.height--; src1 += step1, src2 += step2 )                \
3236     {                                                                   \
3237         int i;                                                          \
3238                                                                         \
3239         for( i = 0; i <= size.width - 4; i += 4 )                       \
3240         {                                                               \
3241             temptype t0 = (temptype)src1[i]*src2[i];                    \
3242             temptype t1 = (temptype)src1[i+1]*src2[i+1];                \
3243             t0 += (temptype)src1[i+2]*src2[i+2];                        \
3244             t1 += (temptype)src1[i+3]*src2[i+3];                        \
3245             sum += t0 + t1;                                             \
3246         }                                                               \
3247                                                                         \
3248         for( ; i < size.width; i++ )                                    \
3249         {                                                               \
3250             sum += (temptype)src1[i]*src2[i];                           \
3251         }                                                               \
3252     }                                                                   \
3253                                                                         \
3254     *_sum = sum;                                                        \
3255     return CV_OK;                                                       \
3256 }
3257 
3258 
3259 ICV_DEF_DOT_PROD_FUNC_2D( 8u, uchar, int, int64 )
3260 ICV_DEF_DOT_PROD_FUNC_2D( 16u, ushort, int64, int64 )
3261 ICV_DEF_DOT_PROD_FUNC_2D( 16s, short, int64, int64 )
3262 ICV_DEF_DOT_PROD_FUNC_2D( 32s, int, double, double )
3263 ICV_DEF_DOT_PROD_FUNC_2D( 32f, float, double, double )
3264 ICV_DEF_DOT_PROD_FUNC_2D( 64f, double, double, double )
3265 
3266 #define icvDotProduct_8s_C1R 0
3267 
CV_DEF_INIT_FUNC_TAB_2D(DotProduct,C1R)3268 CV_DEF_INIT_FUNC_TAB_2D( DotProduct, C1R )
3269 
3270 CV_IMPL double
3271 cvDotProduct( const CvArr* srcAarr, const CvArr* srcBarr )
3272 {
3273     static CvFuncTable tab_2d;
3274     static int inittab = 0;
3275 
3276     Cv64suf result;
3277     result.f = 0;
3278 
3279     CV_FUNCNAME( "cvDotProduct" );
3280 
3281     __BEGIN__;
3282 
3283     CvMat stubA, *srcA = (CvMat*)srcAarr;
3284     CvMat stubB, *srcB = (CvMat*)srcBarr;
3285     CvSize size;
3286     int type, depth;
3287     CvFunc2D_2A1P func;
3288 
3289     if( !inittab )
3290     {
3291         icvInitDotProductC1RTable( &tab_2d );
3292         inittab = 1;
3293     }
3294 
3295     if( !CV_IS_MAT( srcA ))
3296     {
3297         int coi = 0;
3298         CV_CALL( srcA = cvGetMat( srcA, &stubA, &coi ));
3299         if( coi != 0 )
3300             CV_ERROR( CV_BadCOI, "coi is not supported" );
3301     }
3302 
3303     if( srcBarr == srcAarr )
3304         srcB = srcA;
3305     else
3306     {
3307         if( !CV_IS_MAT( srcB ))
3308         {
3309             int coi = 0;
3310             CV_CALL( srcB = cvGetMat( srcB, &stubB, &coi ));
3311 
3312             if( coi != 0 )
3313                 CV_ERROR( CV_BadCOI, "coi is not supported" );
3314         }
3315 
3316         if( !CV_ARE_TYPES_EQ( srcA, srcB ))
3317             CV_ERROR( CV_StsUnmatchedFormats, "" );
3318 
3319         if( !CV_ARE_SIZES_EQ( srcA, srcB ))
3320             CV_ERROR( CV_StsUnmatchedSizes, "" );
3321     }
3322 
3323     type = CV_MAT_TYPE( srcA->type );
3324     size = cvGetMatSize( srcA );
3325 
3326     size.width *= CV_MAT_CN( type );
3327     depth = CV_MAT_DEPTH( type );
3328 
3329     if( CV_IS_MAT_CONT( srcA->type & srcB->type ))
3330     {
3331         size.width *= size.height;
3332 
3333         if( size.width <= CV_MAX_INLINE_MAT_OP_SIZE )
3334         {
3335             if( depth == CV_32F )
3336             {
3337                 float* mA = srcA->data.fl;
3338                 float* mB = srcB->data.fl;
3339                 double sum = 0;
3340                 do
3341                     sum += (double)mA[size.width - 1]*mB[size.width - 1];
3342                 while( --size.width );
3343                 result.f = sum;
3344                 EXIT;
3345             }
3346 
3347             if( depth == CV_64F )
3348             {
3349                 double* mA = srcA->data.db;
3350                 double* mB = srcB->data.db;
3351                 double sum = 0;
3352                 do
3353                     sum += mA[size.width - 1]*mB[size.width - 1];
3354                 while( --size.width );
3355                 result.f = sum;
3356                 EXIT;
3357             }
3358         }
3359         size.height = 1;
3360     }
3361 
3362     func = (CvFunc2D_2A1P)(tab_2d.fn_2d[depth]);
3363     if( !func )
3364         CV_ERROR( CV_StsUnsupportedFormat, "" );
3365 
3366     IPPI_CALL( func( srcA->data.ptr, srcA->step,
3367                      srcB->data.ptr, srcB->step,
3368                      size, &result ));
3369 
3370     if( depth < CV_32S )
3371         result.f = (double)result.i;
3372 
3373     __END__;
3374 
3375     return result.f;
3376 }
3377 
3378 /* End of file. */
3379