1 /*********************************************************************** 2 * Software License Agreement (BSD License) 3 * 4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. 5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. 6 * 7 * THE BSD LICENSE 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 *************************************************************************/ 30 31 #ifndef OPENCV_FLANN_KMEANS_INDEX_H_ 32 #define OPENCV_FLANN_KMEANS_INDEX_H_ 33 34 #include <algorithm> 35 #include <map> 36 #include <cassert> 37 #include <limits> 38 #include <cmath> 39 40 #include "general.h" 41 #include "nn_index.h" 42 #include "dist.h" 43 #include "matrix.h" 44 #include "result_set.h" 45 #include "heap.h" 46 #include "allocator.h" 47 #include "random.h" 48 #include "saving.h" 49 #include "logger.h" 50 51 52 namespace cvflann 53 { 54 55 struct KMeansIndexParams : public IndexParams 56 { 57 KMeansIndexParams(int branching = 32, int iterations = 11, 58 flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM, float cb_index = 0.2 ) 59 { 60 (*this)["algorithm"] = FLANN_INDEX_KMEANS; 61 // branching factor 62 (*this)["branching"] = branching; 63 // max iterations to perform in one kmeans clustering (kmeans tree) 64 (*this)["iterations"] = iterations; 65 // algorithm used for picking the initial cluster centers for kmeans tree 66 (*this)["centers_init"] = centers_init; 67 // cluster boundary index. Used when searching the kmeans tree 68 (*this)["cb_index"] = cb_index; 69 } 70 }; 71 72 73 /** 74 * Hierarchical kmeans index 75 * 76 * Contains a tree constructed through a hierarchical kmeans clustering 77 * and other information for indexing a set of points for nearest-neighbour matching. 78 */ 79 template <typename Distance> 80 class KMeansIndex : public NNIndex<Distance> 81 { 82 public: 83 typedef typename Distance::ElementType ElementType; 84 typedef typename Distance::ResultType DistanceType; 85 86 87 88 typedef void (KMeansIndex::* centersAlgFunction)(int, int*, int, int*, int&); 89 90 /** 91 * The function used for choosing the cluster centers. 92 */ 93 centersAlgFunction chooseCenters; 94 95 96 97 /** 98 * Chooses the initial centers in the k-means clustering in a random manner. 99 * 100 * Params: 101 * k = number of centers 102 * vecs = the dataset of points 103 * indices = indices in the dataset 104 * indices_length = length of indices vector 105 * 106 */ chooseCentersRandom(int k,int * indices,int indices_length,int * centers,int & centers_length)107 void chooseCentersRandom(int k, int* indices, int indices_length, int* centers, int& centers_length) 108 { 109 UniqueRandom r(indices_length); 110 111 int index; 112 for (index=0; index<k; ++index) { 113 bool duplicate = true; 114 int rnd; 115 while (duplicate) { 116 duplicate = false; 117 rnd = r.next(); 118 if (rnd<0) { 119 centers_length = index; 120 return; 121 } 122 123 centers[index] = indices[rnd]; 124 125 for (int j=0; j<index; ++j) { 126 DistanceType sq = distance_(dataset_[centers[index]], dataset_[centers[j]], dataset_.cols); 127 if (sq<1e-16) { 128 duplicate = true; 129 } 130 } 131 } 132 } 133 134 centers_length = index; 135 } 136 137 138 /** 139 * Chooses the initial centers in the k-means using Gonzales' algorithm 140 * so that the centers are spaced apart from each other. 141 * 142 * Params: 143 * k = number of centers 144 * vecs = the dataset of points 145 * indices = indices in the dataset 146 * Returns: 147 */ chooseCentersGonzales(int k,int * indices,int indices_length,int * centers,int & centers_length)148 void chooseCentersGonzales(int k, int* indices, int indices_length, int* centers, int& centers_length) 149 { 150 int n = indices_length; 151 152 int rnd = rand_int(n); 153 assert(rnd >=0 && rnd < n); 154 155 centers[0] = indices[rnd]; 156 157 int index; 158 for (index=1; index<k; ++index) { 159 160 int best_index = -1; 161 DistanceType best_val = 0; 162 for (int j=0; j<n; ++j) { 163 DistanceType dist = distance_(dataset_[centers[0]],dataset_[indices[j]],dataset_.cols); 164 for (int i=1; i<index; ++i) { 165 DistanceType tmp_dist = distance_(dataset_[centers[i]],dataset_[indices[j]],dataset_.cols); 166 if (tmp_dist<dist) { 167 dist = tmp_dist; 168 } 169 } 170 if (dist>best_val) { 171 best_val = dist; 172 best_index = j; 173 } 174 } 175 if (best_index!=-1) { 176 centers[index] = indices[best_index]; 177 } 178 else { 179 break; 180 } 181 } 182 centers_length = index; 183 } 184 185 186 /** 187 * Chooses the initial centers in the k-means using the algorithm 188 * proposed in the KMeans++ paper: 189 * Arthur, David; Vassilvitskii, Sergei - k-means++: The Advantages of Careful Seeding 190 * 191 * Implementation of this function was converted from the one provided in Arthur's code. 192 * 193 * Params: 194 * k = number of centers 195 * vecs = the dataset of points 196 * indices = indices in the dataset 197 * Returns: 198 */ chooseCentersKMeanspp(int k,int * indices,int indices_length,int * centers,int & centers_length)199 void chooseCentersKMeanspp(int k, int* indices, int indices_length, int* centers, int& centers_length) 200 { 201 int n = indices_length; 202 203 double currentPot = 0; 204 DistanceType* closestDistSq = new DistanceType[n]; 205 206 // Choose one random center and set the closestDistSq values 207 int index = rand_int(n); 208 assert(index >=0 && index < n); 209 centers[0] = indices[index]; 210 211 for (int i = 0; i < n; i++) { 212 closestDistSq[i] = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); 213 closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] ); 214 currentPot += closestDistSq[i]; 215 } 216 217 218 const int numLocalTries = 1; 219 220 // Choose each center 221 int centerCount; 222 for (centerCount = 1; centerCount < k; centerCount++) { 223 224 // Repeat several trials 225 double bestNewPot = -1; 226 int bestNewIndex = -1; 227 for (int localTrial = 0; localTrial < numLocalTries; localTrial++) { 228 229 // Choose our center - have to be slightly careful to return a valid answer even accounting 230 // for possible rounding errors 231 double randVal = rand_double(currentPot); 232 for (index = 0; index < n-1; index++) { 233 if (randVal <= closestDistSq[index]) break; 234 else randVal -= closestDistSq[index]; 235 } 236 237 // Compute the new potential 238 double newPot = 0; 239 for (int i = 0; i < n; i++) { 240 DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[index]], dataset_.cols); 241 newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); 242 } 243 244 // Store the best result 245 if ((bestNewPot < 0)||(newPot < bestNewPot)) { 246 bestNewPot = newPot; 247 bestNewIndex = index; 248 } 249 } 250 251 // Add the appropriate center 252 centers[centerCount] = indices[bestNewIndex]; 253 currentPot = bestNewPot; 254 for (int i = 0; i < n; i++) { 255 DistanceType dist = distance_(dataset_[indices[i]], dataset_[indices[bestNewIndex]], dataset_.cols); 256 closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] ); 257 } 258 } 259 260 centers_length = centerCount; 261 262 delete[] closestDistSq; 263 } 264 265 266 267 public: 268 getType()269 flann_algorithm_t getType() const 270 { 271 return FLANN_INDEX_KMEANS; 272 } 273 274 class KMeansDistanceComputer : public cv::ParallelLoopBody 275 { 276 public: KMeansDistanceComputer(Distance _distance,const Matrix<ElementType> & _dataset,const int _branching,const int * _indices,const Matrix<double> & _dcenters,const size_t _veclen,int * _count,int * _belongs_to,std::vector<DistanceType> & _radiuses,bool & _converged,cv::Mutex & _mtx)277 KMeansDistanceComputer(Distance _distance, const Matrix<ElementType>& _dataset, 278 const int _branching, const int* _indices, const Matrix<double>& _dcenters, const size_t _veclen, 279 int* _count, int* _belongs_to, std::vector<DistanceType>& _radiuses, bool& _converged, cv::Mutex& _mtx) 280 : distance(_distance) 281 , dataset(_dataset) 282 , branching(_branching) 283 , indices(_indices) 284 , dcenters(_dcenters) 285 , veclen(_veclen) 286 , count(_count) 287 , belongs_to(_belongs_to) 288 , radiuses(_radiuses) 289 , converged(_converged) 290 , mtx(_mtx) 291 { 292 } 293 operator()294 void operator()(const cv::Range& range) const 295 { 296 const int begin = range.start; 297 const int end = range.end; 298 299 for( int i = begin; i<end; ++i) 300 { 301 DistanceType sq_dist = distance(dataset[indices[i]], dcenters[0], veclen); 302 int new_centroid = 0; 303 for (int j=1; j<branching; ++j) { 304 DistanceType new_sq_dist = distance(dataset[indices[i]], dcenters[j], veclen); 305 if (sq_dist>new_sq_dist) { 306 new_centroid = j; 307 sq_dist = new_sq_dist; 308 } 309 } 310 if (sq_dist > radiuses[new_centroid]) { 311 radiuses[new_centroid] = sq_dist; 312 } 313 if (new_centroid != belongs_to[i]) { 314 count[belongs_to[i]]--; 315 count[new_centroid]++; 316 belongs_to[i] = new_centroid; 317 mtx.lock(); 318 converged = false; 319 mtx.unlock(); 320 } 321 } 322 } 323 324 private: 325 Distance distance; 326 const Matrix<ElementType>& dataset; 327 const int branching; 328 const int* indices; 329 const Matrix<double>& dcenters; 330 const size_t veclen; 331 int* count; 332 int* belongs_to; 333 std::vector<DistanceType>& radiuses; 334 bool& converged; 335 cv::Mutex& mtx; 336 KMeansDistanceComputer& operator=( const KMeansDistanceComputer & ) { return *this; } 337 }; 338 339 /** 340 * Index constructor 341 * 342 * Params: 343 * inputData = dataset with the input features 344 * params = parameters passed to the hierarchical k-means algorithm 345 */ 346 KMeansIndex(const Matrix<ElementType>& inputData, const IndexParams& params = KMeansIndexParams(), 347 Distance d = Distance()) dataset_(inputData)348 : dataset_(inputData), index_params_(params), root_(NULL), indices_(NULL), distance_(d) 349 { 350 memoryCounter_ = 0; 351 352 size_ = dataset_.rows; 353 veclen_ = dataset_.cols; 354 355 branching_ = get_param(params,"branching",32); 356 iterations_ = get_param(params,"iterations",11); 357 if (iterations_<0) { 358 iterations_ = (std::numeric_limits<int>::max)(); 359 } 360 centers_init_ = get_param(params,"centers_init",FLANN_CENTERS_RANDOM); 361 362 if (centers_init_==FLANN_CENTERS_RANDOM) { 363 chooseCenters = &KMeansIndex::chooseCentersRandom; 364 } 365 else if (centers_init_==FLANN_CENTERS_GONZALES) { 366 chooseCenters = &KMeansIndex::chooseCentersGonzales; 367 } 368 else if (centers_init_==FLANN_CENTERS_KMEANSPP) { 369 chooseCenters = &KMeansIndex::chooseCentersKMeanspp; 370 } 371 else { 372 throw FLANNException("Unknown algorithm for choosing initial centers."); 373 } 374 cb_index_ = 0.4f; 375 376 } 377 378 379 KMeansIndex(const KMeansIndex&); 380 KMeansIndex& operator=(const KMeansIndex&); 381 382 383 /** 384 * Index destructor. 385 * 386 * Release the memory used by the index. 387 */ ~KMeansIndex()388 virtual ~KMeansIndex() 389 { 390 if (root_ != NULL) { 391 free_centers(root_); 392 } 393 if (indices_!=NULL) { 394 delete[] indices_; 395 } 396 } 397 398 /** 399 * Returns size of index. 400 */ size()401 size_t size() const 402 { 403 return size_; 404 } 405 406 /** 407 * Returns the length of an index feature. 408 */ veclen()409 size_t veclen() const 410 { 411 return veclen_; 412 } 413 414 set_cb_index(float index)415 void set_cb_index( float index) 416 { 417 cb_index_ = index; 418 } 419 420 /** 421 * Computes the inde memory usage 422 * Returns: memory used by the index 423 */ usedMemory()424 int usedMemory() const 425 { 426 return pool_.usedMemory+pool_.wastedMemory+memoryCounter_; 427 } 428 429 /** 430 * Builds the index 431 */ buildIndex()432 void buildIndex() 433 { 434 if (branching_<2) { 435 throw FLANNException("Branching factor must be at least 2"); 436 } 437 438 indices_ = new int[size_]; 439 for (size_t i=0; i<size_; ++i) { 440 indices_[i] = int(i); 441 } 442 443 root_ = pool_.allocate<KMeansNode>(); 444 computeNodeStatistics(root_, indices_, (int)size_); 445 computeClustering(root_, indices_, (int)size_, branching_,0); 446 } 447 448 saveIndex(FILE * stream)449 void saveIndex(FILE* stream) 450 { 451 save_value(stream, branching_); 452 save_value(stream, iterations_); 453 save_value(stream, memoryCounter_); 454 save_value(stream, cb_index_); 455 save_value(stream, *indices_, (int)size_); 456 457 save_tree(stream, root_); 458 } 459 460 loadIndex(FILE * stream)461 void loadIndex(FILE* stream) 462 { 463 load_value(stream, branching_); 464 load_value(stream, iterations_); 465 load_value(stream, memoryCounter_); 466 load_value(stream, cb_index_); 467 if (indices_!=NULL) { 468 delete[] indices_; 469 } 470 indices_ = new int[size_]; 471 load_value(stream, *indices_, size_); 472 473 if (root_!=NULL) { 474 free_centers(root_); 475 } 476 load_tree(stream, root_); 477 478 index_params_["algorithm"] = getType(); 479 index_params_["branching"] = branching_; 480 index_params_["iterations"] = iterations_; 481 index_params_["centers_init"] = centers_init_; 482 index_params_["cb_index"] = cb_index_; 483 484 } 485 486 487 /** 488 * Find set of nearest neighbors to vec. Their indices are stored inside 489 * the result object. 490 * 491 * Params: 492 * result = the result object in which the indices of the nearest-neighbors are stored 493 * vec = the vector for which to search the nearest neighbors 494 * searchParams = parameters that influence the search algorithm (checks, cb_index) 495 */ findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams & searchParams)496 void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) 497 { 498 499 int maxChecks = get_param(searchParams,"checks",32); 500 501 if (maxChecks==FLANN_CHECKS_UNLIMITED) { 502 findExactNN(root_, result, vec); 503 } 504 else { 505 // Priority queue storing intermediate branches in the best-bin-first search 506 Heap<BranchSt>* heap = new Heap<BranchSt>((int)size_); 507 508 int checks = 0; 509 findNN(root_, result, vec, checks, maxChecks, heap); 510 511 BranchSt branch; 512 while (heap->popMin(branch) && (checks<maxChecks || !result.full())) { 513 KMeansNodePtr node = branch.node; 514 findNN(node, result, vec, checks, maxChecks, heap); 515 } 516 assert(result.full()); 517 518 delete heap; 519 } 520 521 } 522 523 /** 524 * Clustering function that takes a cut in the hierarchical k-means 525 * tree and return the clusters centers of that clustering. 526 * Params: 527 * numClusters = number of clusters to have in the clustering computed 528 * Returns: number of cluster centers 529 */ getClusterCenters(Matrix<DistanceType> & centers)530 int getClusterCenters(Matrix<DistanceType>& centers) 531 { 532 int numClusters = centers.rows; 533 if (numClusters<1) { 534 throw FLANNException("Number of clusters must be at least 1"); 535 } 536 537 DistanceType variance; 538 KMeansNodePtr* clusters = new KMeansNodePtr[numClusters]; 539 540 int clusterCount = getMinVarianceClusters(root_, clusters, numClusters, variance); 541 542 Logger::info("Clusters requested: %d, returning %d\n",numClusters, clusterCount); 543 544 for (int i=0; i<clusterCount; ++i) { 545 DistanceType* center = clusters[i]->pivot; 546 for (size_t j=0; j<veclen_; ++j) { 547 centers[i][j] = center[j]; 548 } 549 } 550 delete[] clusters; 551 552 return clusterCount; 553 } 554 getParameters()555 IndexParams getParameters() const 556 { 557 return index_params_; 558 } 559 560 561 private: 562 /** 563 * Struture representing a node in the hierarchical k-means tree. 564 */ 565 struct KMeansNode 566 { 567 /** 568 * The cluster center. 569 */ 570 DistanceType* pivot; 571 /** 572 * The cluster radius. 573 */ 574 DistanceType radius; 575 /** 576 * The cluster mean radius. 577 */ 578 DistanceType mean_radius; 579 /** 580 * The cluster variance. 581 */ 582 DistanceType variance; 583 /** 584 * The cluster size (number of points in the cluster) 585 */ 586 int size; 587 /** 588 * Child nodes (only for non-terminal nodes) 589 */ 590 KMeansNode** childs; 591 /** 592 * Node points (only for terminal nodes) 593 */ 594 int* indices; 595 /** 596 * Level 597 */ 598 int level; 599 }; 600 typedef KMeansNode* KMeansNodePtr; 601 602 /** 603 * Alias definition for a nicer syntax. 604 */ 605 typedef BranchStruct<KMeansNodePtr, DistanceType> BranchSt; 606 607 608 609 save_tree(FILE * stream,KMeansNodePtr node)610 void save_tree(FILE* stream, KMeansNodePtr node) 611 { 612 save_value(stream, *node); 613 save_value(stream, *(node->pivot), (int)veclen_); 614 if (node->childs==NULL) { 615 int indices_offset = (int)(node->indices - indices_); 616 save_value(stream, indices_offset); 617 } 618 else { 619 for(int i=0; i<branching_; ++i) { 620 save_tree(stream, node->childs[i]); 621 } 622 } 623 } 624 625 load_tree(FILE * stream,KMeansNodePtr & node)626 void load_tree(FILE* stream, KMeansNodePtr& node) 627 { 628 node = pool_.allocate<KMeansNode>(); 629 load_value(stream, *node); 630 node->pivot = new DistanceType[veclen_]; 631 load_value(stream, *(node->pivot), (int)veclen_); 632 if (node->childs==NULL) { 633 int indices_offset; 634 load_value(stream, indices_offset); 635 node->indices = indices_ + indices_offset; 636 } 637 else { 638 node->childs = pool_.allocate<KMeansNodePtr>(branching_); 639 for(int i=0; i<branching_; ++i) { 640 load_tree(stream, node->childs[i]); 641 } 642 } 643 } 644 645 646 /** 647 * Helper function 648 */ free_centers(KMeansNodePtr node)649 void free_centers(KMeansNodePtr node) 650 { 651 delete[] node->pivot; 652 if (node->childs!=NULL) { 653 for (int k=0; k<branching_; ++k) { 654 free_centers(node->childs[k]); 655 } 656 } 657 } 658 659 /** 660 * Computes the statistics of a node (mean, radius, variance). 661 * 662 * Params: 663 * node = the node to use 664 * indices = the indices of the points belonging to the node 665 */ computeNodeStatistics(KMeansNodePtr node,int * indices,int indices_length)666 void computeNodeStatistics(KMeansNodePtr node, int* indices, int indices_length) 667 { 668 669 DistanceType radius = 0; 670 DistanceType variance = 0; 671 DistanceType* mean = new DistanceType[veclen_]; 672 memoryCounter_ += int(veclen_*sizeof(DistanceType)); 673 674 memset(mean,0,veclen_*sizeof(DistanceType)); 675 676 for (size_t i=0; i<size_; ++i) { 677 ElementType* vec = dataset_[indices[i]]; 678 for (size_t j=0; j<veclen_; ++j) { 679 mean[j] += vec[j]; 680 } 681 variance += distance_(vec, ZeroIterator<ElementType>(), veclen_); 682 } 683 for (size_t j=0; j<veclen_; ++j) { 684 mean[j] /= size_; 685 } 686 variance /= size_; 687 variance -= distance_(mean, ZeroIterator<ElementType>(), veclen_); 688 689 DistanceType tmp = 0; 690 for (int i=0; i<indices_length; ++i) { 691 tmp = distance_(mean, dataset_[indices[i]], veclen_); 692 if (tmp>radius) { 693 radius = tmp; 694 } 695 } 696 697 node->variance = variance; 698 node->radius = radius; 699 node->pivot = mean; 700 } 701 702 703 /** 704 * The method responsible with actually doing the recursive hierarchical 705 * clustering 706 * 707 * Params: 708 * node = the node to cluster 709 * indices = indices of the points belonging to the current node 710 * branching = the branching factor to use in the clustering 711 * 712 * TODO: for 1-sized clusters don't store a cluster center (it's the same as the single cluster point) 713 */ computeClustering(KMeansNodePtr node,int * indices,int indices_length,int branching,int level)714 void computeClustering(KMeansNodePtr node, int* indices, int indices_length, int branching, int level) 715 { 716 node->size = indices_length; 717 node->level = level; 718 719 if (indices_length < branching) { 720 node->indices = indices; 721 std::sort(node->indices,node->indices+indices_length); 722 node->childs = NULL; 723 return; 724 } 725 726 cv::AutoBuffer<int> centers_idx_buf(branching); 727 int* centers_idx = (int*)centers_idx_buf; 728 int centers_length; 729 (this->*chooseCenters)(branching, indices, indices_length, centers_idx, centers_length); 730 731 if (centers_length<branching) { 732 node->indices = indices; 733 std::sort(node->indices,node->indices+indices_length); 734 node->childs = NULL; 735 return; 736 } 737 738 739 cv::AutoBuffer<double> dcenters_buf(branching*veclen_); 740 Matrix<double> dcenters((double*)dcenters_buf,branching,veclen_); 741 for (int i=0; i<centers_length; ++i) { 742 ElementType* vec = dataset_[centers_idx[i]]; 743 for (size_t k=0; k<veclen_; ++k) { 744 dcenters[i][k] = double(vec[k]); 745 } 746 } 747 748 std::vector<DistanceType> radiuses(branching); 749 cv::AutoBuffer<int> count_buf(branching); 750 int* count = (int*)count_buf; 751 for (int i=0; i<branching; ++i) { 752 radiuses[i] = 0; 753 count[i] = 0; 754 } 755 756 // assign points to clusters 757 cv::AutoBuffer<int> belongs_to_buf(indices_length); 758 int* belongs_to = (int*)belongs_to_buf; 759 for (int i=0; i<indices_length; ++i) { 760 761 DistanceType sq_dist = distance_(dataset_[indices[i]], dcenters[0], veclen_); 762 belongs_to[i] = 0; 763 for (int j=1; j<branching; ++j) { 764 DistanceType new_sq_dist = distance_(dataset_[indices[i]], dcenters[j], veclen_); 765 if (sq_dist>new_sq_dist) { 766 belongs_to[i] = j; 767 sq_dist = new_sq_dist; 768 } 769 } 770 if (sq_dist>radiuses[belongs_to[i]]) { 771 radiuses[belongs_to[i]] = sq_dist; 772 } 773 count[belongs_to[i]]++; 774 } 775 776 bool converged = false; 777 int iteration = 0; 778 while (!converged && iteration<iterations_) { 779 converged = true; 780 iteration++; 781 782 // compute the new cluster centers 783 for (int i=0; i<branching; ++i) { 784 memset(dcenters[i],0,sizeof(double)*veclen_); 785 radiuses[i] = 0; 786 } 787 for (int i=0; i<indices_length; ++i) { 788 ElementType* vec = dataset_[indices[i]]; 789 double* center = dcenters[belongs_to[i]]; 790 for (size_t k=0; k<veclen_; ++k) { 791 center[k] += vec[k]; 792 } 793 } 794 for (int i=0; i<branching; ++i) { 795 int cnt = count[i]; 796 for (size_t k=0; k<veclen_; ++k) { 797 dcenters[i][k] /= cnt; 798 } 799 } 800 801 // reassign points to clusters 802 cv::Mutex mtx; 803 KMeansDistanceComputer invoker(distance_, dataset_, branching, indices, dcenters, veclen_, count, belongs_to, radiuses, converged, mtx); 804 parallel_for_(cv::Range(0, (int)indices_length), invoker); 805 806 for (int i=0; i<branching; ++i) { 807 // if one cluster converges to an empty cluster, 808 // move an element into that cluster 809 if (count[i]==0) { 810 int j = (i+1)%branching; 811 while (count[j]<=1) { 812 j = (j+1)%branching; 813 } 814 815 for (int k=0; k<indices_length; ++k) { 816 if (belongs_to[k]==j) { 817 // for cluster j, we move the furthest element from the center to the empty cluster i 818 if ( distance_(dataset_[indices[k]], dcenters[j], veclen_) == radiuses[j] ) { 819 belongs_to[k] = i; 820 count[j]--; 821 count[i]++; 822 break; 823 } 824 } 825 } 826 converged = false; 827 } 828 } 829 830 } 831 832 DistanceType** centers = new DistanceType*[branching]; 833 834 for (int i=0; i<branching; ++i) { 835 centers[i] = new DistanceType[veclen_]; 836 memoryCounter_ += (int)(veclen_*sizeof(DistanceType)); 837 for (size_t k=0; k<veclen_; ++k) { 838 centers[i][k] = (DistanceType)dcenters[i][k]; 839 } 840 } 841 842 843 // compute kmeans clustering for each of the resulting clusters 844 node->childs = pool_.allocate<KMeansNodePtr>(branching); 845 int start = 0; 846 int end = start; 847 for (int c=0; c<branching; ++c) { 848 int s = count[c]; 849 850 DistanceType variance = 0; 851 DistanceType mean_radius =0; 852 for (int i=0; i<indices_length; ++i) { 853 if (belongs_to[i]==c) { 854 DistanceType d = distance_(dataset_[indices[i]], ZeroIterator<ElementType>(), veclen_); 855 variance += d; 856 mean_radius += sqrt(d); 857 std::swap(indices[i],indices[end]); 858 std::swap(belongs_to[i],belongs_to[end]); 859 end++; 860 } 861 } 862 variance /= s; 863 mean_radius /= s; 864 variance -= distance_(centers[c], ZeroIterator<ElementType>(), veclen_); 865 866 node->childs[c] = pool_.allocate<KMeansNode>(); 867 node->childs[c]->radius = radiuses[c]; 868 node->childs[c]->pivot = centers[c]; 869 node->childs[c]->variance = variance; 870 node->childs[c]->mean_radius = mean_radius; 871 node->childs[c]->indices = NULL; 872 computeClustering(node->childs[c],indices+start, end-start, branching, level+1); 873 start=end; 874 } 875 } 876 877 878 879 /** 880 * Performs one descent in the hierarchical k-means tree. The branches not 881 * visited are stored in a priority queue. 882 * 883 * Params: 884 * node = node to explore 885 * result = container for the k-nearest neighbors found 886 * vec = query points 887 * checks = how many points in the dataset have been checked so far 888 * maxChecks = maximum dataset points to checks 889 */ 890 891 findNN(KMeansNodePtr node,ResultSet<DistanceType> & result,const ElementType * vec,int & checks,int maxChecks,Heap<BranchSt> * heap)892 void findNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks, 893 Heap<BranchSt>* heap) 894 { 895 // Ignore those clusters that are too far away 896 { 897 DistanceType bsq = distance_(vec, node->pivot, veclen_); 898 DistanceType rsq = node->radius; 899 DistanceType wsq = result.worstDist(); 900 901 DistanceType val = bsq-rsq-wsq; 902 DistanceType val2 = val*val-4*rsq*wsq; 903 904 //if (val>0) { 905 if ((val>0)&&(val2>0)) { 906 return; 907 } 908 } 909 910 if (node->childs==NULL) { 911 if (checks>=maxChecks) { 912 if (result.full()) return; 913 } 914 checks += node->size; 915 for (int i=0; i<node->size; ++i) { 916 int index = node->indices[i]; 917 DistanceType dist = distance_(dataset_[index], vec, veclen_); 918 result.addPoint(dist, index); 919 } 920 } 921 else { 922 DistanceType* domain_distances = new DistanceType[branching_]; 923 int closest_center = exploreNodeBranches(node, vec, domain_distances, heap); 924 delete[] domain_distances; 925 findNN(node->childs[closest_center],result,vec, checks, maxChecks, heap); 926 } 927 } 928 929 /** 930 * Helper function that computes the nearest childs of a node to a given query point. 931 * Params: 932 * node = the node 933 * q = the query point 934 * distances = array with the distances to each child node. 935 * Returns: 936 */ exploreNodeBranches(KMeansNodePtr node,const ElementType * q,DistanceType * domain_distances,Heap<BranchSt> * heap)937 int exploreNodeBranches(KMeansNodePtr node, const ElementType* q, DistanceType* domain_distances, Heap<BranchSt>* heap) 938 { 939 940 int best_index = 0; 941 domain_distances[best_index] = distance_(q, node->childs[best_index]->pivot, veclen_); 942 for (int i=1; i<branching_; ++i) { 943 domain_distances[i] = distance_(q, node->childs[i]->pivot, veclen_); 944 if (domain_distances[i]<domain_distances[best_index]) { 945 best_index = i; 946 } 947 } 948 949 // float* best_center = node->childs[best_index]->pivot; 950 for (int i=0; i<branching_; ++i) { 951 if (i != best_index) { 952 domain_distances[i] -= cb_index_*node->childs[i]->variance; 953 954 // float dist_to_border = getDistanceToBorder(node.childs[i].pivot,best_center,q); 955 // if (domain_distances[i]<dist_to_border) { 956 // domain_distances[i] = dist_to_border; 957 // } 958 heap->insert(BranchSt(node->childs[i],domain_distances[i])); 959 } 960 } 961 962 return best_index; 963 } 964 965 966 /** 967 * Function the performs exact nearest neighbor search by traversing the entire tree. 968 */ findExactNN(KMeansNodePtr node,ResultSet<DistanceType> & result,const ElementType * vec)969 void findExactNN(KMeansNodePtr node, ResultSet<DistanceType>& result, const ElementType* vec) 970 { 971 // Ignore those clusters that are too far away 972 { 973 DistanceType bsq = distance_(vec, node->pivot, veclen_); 974 DistanceType rsq = node->radius; 975 DistanceType wsq = result.worstDist(); 976 977 DistanceType val = bsq-rsq-wsq; 978 DistanceType val2 = val*val-4*rsq*wsq; 979 980 // if (val>0) { 981 if ((val>0)&&(val2>0)) { 982 return; 983 } 984 } 985 986 987 if (node->childs==NULL) { 988 for (int i=0; i<node->size; ++i) { 989 int index = node->indices[i]; 990 DistanceType dist = distance_(dataset_[index], vec, veclen_); 991 result.addPoint(dist, index); 992 } 993 } 994 else { 995 int* sort_indices = new int[branching_]; 996 997 getCenterOrdering(node, vec, sort_indices); 998 999 for (int i=0; i<branching_; ++i) { 1000 findExactNN(node->childs[sort_indices[i]],result,vec); 1001 } 1002 1003 delete[] sort_indices; 1004 } 1005 } 1006 1007 1008 /** 1009 * Helper function. 1010 * 1011 * I computes the order in which to traverse the child nodes of a particular node. 1012 */ getCenterOrdering(KMeansNodePtr node,const ElementType * q,int * sort_indices)1013 void getCenterOrdering(KMeansNodePtr node, const ElementType* q, int* sort_indices) 1014 { 1015 DistanceType* domain_distances = new DistanceType[branching_]; 1016 for (int i=0; i<branching_; ++i) { 1017 DistanceType dist = distance_(q, node->childs[i]->pivot, veclen_); 1018 1019 int j=0; 1020 while (domain_distances[j]<dist && j<i) j++; 1021 for (int k=i; k>j; --k) { 1022 domain_distances[k] = domain_distances[k-1]; 1023 sort_indices[k] = sort_indices[k-1]; 1024 } 1025 domain_distances[j] = dist; 1026 sort_indices[j] = i; 1027 } 1028 delete[] domain_distances; 1029 } 1030 1031 /** 1032 * Method that computes the squared distance from the query point q 1033 * from inside region with center c to the border between this 1034 * region and the region with center p 1035 */ getDistanceToBorder(DistanceType * p,DistanceType * c,DistanceType * q)1036 DistanceType getDistanceToBorder(DistanceType* p, DistanceType* c, DistanceType* q) 1037 { 1038 DistanceType sum = 0; 1039 DistanceType sum2 = 0; 1040 1041 for (int i=0; i<veclen_; ++i) { 1042 DistanceType t = c[i]-p[i]; 1043 sum += t*(q[i]-(c[i]+p[i])/2); 1044 sum2 += t*t; 1045 } 1046 1047 return sum*sum/sum2; 1048 } 1049 1050 1051 /** 1052 * Helper function the descends in the hierarchical k-means tree by spliting those clusters that minimize 1053 * the overall variance of the clustering. 1054 * Params: 1055 * root = root node 1056 * clusters = array with clusters centers (return value) 1057 * varianceValue = variance of the clustering (return value) 1058 * Returns: 1059 */ getMinVarianceClusters(KMeansNodePtr root,KMeansNodePtr * clusters,int clusters_length,DistanceType & varianceValue)1060 int getMinVarianceClusters(KMeansNodePtr root, KMeansNodePtr* clusters, int clusters_length, DistanceType& varianceValue) 1061 { 1062 int clusterCount = 1; 1063 clusters[0] = root; 1064 1065 DistanceType meanVariance = root->variance*root->size; 1066 1067 while (clusterCount<clusters_length) { 1068 DistanceType minVariance = (std::numeric_limits<DistanceType>::max)(); 1069 int splitIndex = -1; 1070 1071 for (int i=0; i<clusterCount; ++i) { 1072 if (clusters[i]->childs != NULL) { 1073 1074 DistanceType variance = meanVariance - clusters[i]->variance*clusters[i]->size; 1075 1076 for (int j=0; j<branching_; ++j) { 1077 variance += clusters[i]->childs[j]->variance*clusters[i]->childs[j]->size; 1078 } 1079 if (variance<minVariance) { 1080 minVariance = variance; 1081 splitIndex = i; 1082 } 1083 } 1084 } 1085 1086 if (splitIndex==-1) break; 1087 if ( (branching_+clusterCount-1) > clusters_length) break; 1088 1089 meanVariance = minVariance; 1090 1091 // split node 1092 KMeansNodePtr toSplit = clusters[splitIndex]; 1093 clusters[splitIndex] = toSplit->childs[0]; 1094 for (int i=1; i<branching_; ++i) { 1095 clusters[clusterCount++] = toSplit->childs[i]; 1096 } 1097 } 1098 1099 varianceValue = meanVariance/root->size; 1100 return clusterCount; 1101 } 1102 1103 private: 1104 /** The branching factor used in the hierarchical k-means clustering */ 1105 int branching_; 1106 1107 /** Maximum number of iterations to use when performing k-means clustering */ 1108 int iterations_; 1109 1110 /** Algorithm for choosing the cluster centers */ 1111 flann_centers_init_t centers_init_; 1112 1113 /** 1114 * Cluster border index. This is used in the tree search phase when determining 1115 * the closest cluster to explore next. A zero value takes into account only 1116 * the cluster centres, a value greater then zero also take into account the size 1117 * of the cluster. 1118 */ 1119 float cb_index_; 1120 1121 /** 1122 * The dataset used by this index 1123 */ 1124 const Matrix<ElementType> dataset_; 1125 1126 /** Index parameters */ 1127 IndexParams index_params_; 1128 1129 /** 1130 * Number of features in the dataset. 1131 */ 1132 size_t size_; 1133 1134 /** 1135 * Length of each feature. 1136 */ 1137 size_t veclen_; 1138 1139 /** 1140 * The root node in the tree. 1141 */ 1142 KMeansNodePtr root_; 1143 1144 /** 1145 * Array of indices to vectors in the dataset. 1146 */ 1147 int* indices_; 1148 1149 /** 1150 * The distance 1151 */ 1152 Distance distance_; 1153 1154 /** 1155 * Pooled memory allocator. 1156 */ 1157 PooledAllocator pool_; 1158 1159 /** 1160 * Memory occupied by the index. 1161 */ 1162 int memoryCounter_; 1163 }; 1164 1165 } 1166 1167 #endif //OPENCV_FLANN_KMEANS_INDEX_H_ 1168