1 /*********************************************************************** 2 * Software License Agreement (BSD License) 3 * 4 * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. 5 * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. 6 * 7 * THE BSD LICENSE 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 *************************************************************************/ 30 #ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_ 31 #define OPENCV_FLANN_AUTOTUNED_INDEX_H_ 32 33 #include "general.h" 34 #include "nn_index.h" 35 #include "ground_truth.h" 36 #include "index_testing.h" 37 #include "sampling.h" 38 #include "kdtree_index.h" 39 #include "kdtree_single_index.h" 40 #include "kmeans_index.h" 41 #include "composite_index.h" 42 #include "linear_index.h" 43 #include "logger.h" 44 45 namespace cvflann 46 { 47 48 template<typename Distance> 49 NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance); 50 51 52 struct AutotunedIndexParams : public IndexParams 53 { 54 AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1) 55 { 56 (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED; 57 // precision desired (used for autotuning, -1 otherwise) 58 (*this)["target_precision"] = target_precision; 59 // build tree time weighting factor 60 (*this)["build_weight"] = build_weight; 61 // index memory weighting factor 62 (*this)["memory_weight"] = memory_weight; 63 // what fraction of the dataset to use for autotuning 64 (*this)["sample_fraction"] = sample_fraction; 65 } 66 }; 67 68 69 template <typename Distance> 70 class AutotunedIndex : public NNIndex<Distance> 71 { 72 public: 73 typedef typename Distance::ElementType ElementType; 74 typedef typename Distance::ResultType DistanceType; 75 76 AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) : dataset_(inputData)77 dataset_(inputData), distance_(d) 78 { 79 target_precision_ = get_param(params, "target_precision",0.8f); 80 build_weight_ = get_param(params,"build_weight", 0.01f); 81 memory_weight_ = get_param(params, "memory_weight", 0.0f); 82 sample_fraction_ = get_param(params,"sample_fraction", 0.1f); 83 bestIndex_ = NULL; 84 } 85 86 AutotunedIndex(const AutotunedIndex&); 87 AutotunedIndex& operator=(const AutotunedIndex&); 88 ~AutotunedIndex()89 virtual ~AutotunedIndex() 90 { 91 if (bestIndex_ != NULL) { 92 delete bestIndex_; 93 bestIndex_ = NULL; 94 } 95 } 96 97 /** 98 * Method responsible with building the index. 99 */ buildIndex()100 virtual void buildIndex() 101 { 102 std::ostringstream stream; 103 bestParams_ = estimateBuildParams(); 104 print_params(bestParams_, stream); 105 Logger::info("----------------------------------------------------\n"); 106 Logger::info("Autotuned parameters:\n"); 107 Logger::info("%s", stream.str().c_str()); 108 Logger::info("----------------------------------------------------\n"); 109 110 bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_); 111 bestIndex_->buildIndex(); 112 speedup_ = estimateSearchParams(bestSearchParams_); 113 stream.str(std::string()); 114 print_params(bestSearchParams_, stream); 115 Logger::info("----------------------------------------------------\n"); 116 Logger::info("Search parameters:\n"); 117 Logger::info("%s", stream.str().c_str()); 118 Logger::info("----------------------------------------------------\n"); 119 } 120 121 /** 122 * Saves the index to a stream 123 */ saveIndex(FILE * stream)124 virtual void saveIndex(FILE* stream) 125 { 126 save_value(stream, (int)bestIndex_->getType()); 127 bestIndex_->saveIndex(stream); 128 save_value(stream, get_param<int>(bestSearchParams_, "checks")); 129 } 130 131 /** 132 * Loads the index from a stream 133 */ loadIndex(FILE * stream)134 virtual void loadIndex(FILE* stream) 135 { 136 int index_type; 137 138 load_value(stream, index_type); 139 IndexParams params; 140 params["algorithm"] = (flann_algorithm_t)index_type; 141 bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_); 142 bestIndex_->loadIndex(stream); 143 int checks; 144 load_value(stream, checks); 145 bestSearchParams_["checks"] = checks; 146 } 147 148 /** 149 * Method that searches for nearest-neighbors 150 */ findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams & searchParams)151 virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) 152 { 153 int checks = get_param<int>(searchParams,"checks",FLANN_CHECKS_AUTOTUNED); 154 if (checks == FLANN_CHECKS_AUTOTUNED) { 155 bestIndex_->findNeighbors(result, vec, bestSearchParams_); 156 } 157 else { 158 bestIndex_->findNeighbors(result, vec, searchParams); 159 } 160 } 161 162 getParameters()163 IndexParams getParameters() const 164 { 165 return bestIndex_->getParameters(); 166 } 167 getSearchParameters()168 SearchParams getSearchParameters() const 169 { 170 return bestSearchParams_; 171 } 172 getSpeedup()173 float getSpeedup() const 174 { 175 return speedup_; 176 } 177 178 179 /** 180 * Number of features in this index. 181 */ size()182 virtual size_t size() const 183 { 184 return bestIndex_->size(); 185 } 186 187 /** 188 * The length of each vector in this index. 189 */ veclen()190 virtual size_t veclen() const 191 { 192 return bestIndex_->veclen(); 193 } 194 195 /** 196 * The amount of memory (in bytes) this index uses. 197 */ usedMemory()198 virtual int usedMemory() const 199 { 200 return bestIndex_->usedMemory(); 201 } 202 203 /** 204 * Algorithm name 205 */ getType()206 virtual flann_algorithm_t getType() const 207 { 208 return FLANN_INDEX_AUTOTUNED; 209 } 210 211 private: 212 213 struct CostData 214 { 215 float searchTimeCost; 216 float buildTimeCost; 217 float memoryCost; 218 float totalCost; 219 IndexParams params; 220 }; 221 evaluate_kmeans(CostData & cost)222 void evaluate_kmeans(CostData& cost) 223 { 224 StartStopTimer t; 225 int checks; 226 const int nn = 1; 227 228 Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n", 229 get_param<int>(cost.params,"iterations"), 230 get_param<int>(cost.params,"branching")); 231 KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_); 232 // measure index build time 233 t.start(); 234 kmeans.buildIndex(); 235 t.stop(); 236 float buildTime = (float)t.value; 237 238 // measure search time 239 float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn); 240 241 float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float)); 242 cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory; 243 cost.searchTimeCost = searchTime; 244 cost.buildTimeCost = buildTime; 245 Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_); 246 } 247 248 evaluate_kdtree(CostData & cost)249 void evaluate_kdtree(CostData& cost) 250 { 251 StartStopTimer t; 252 int checks; 253 const int nn = 1; 254 255 Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees")); 256 KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_); 257 258 t.start(); 259 kdtree.buildIndex(); 260 t.stop(); 261 float buildTime = (float)t.value; 262 263 //measure search time 264 float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn); 265 266 float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float)); 267 cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory; 268 cost.searchTimeCost = searchTime; 269 cost.buildTimeCost = buildTime; 270 Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime); 271 } 272 273 274 // struct KMeansSimpleDownhillFunctor { 275 // 276 // Autotune& autotuner; 277 // KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {} 278 // 279 // float operator()(int* params) { 280 // 281 // float maxFloat = numeric_limits<float>::max(); 282 // 283 // if (params[0]<2) return maxFloat; 284 // if (params[1]<0) return maxFloat; 285 // 286 // CostData c; 287 // c.params["algorithm"] = KMEANS; 288 // c.params["centers-init"] = CENTERS_RANDOM; 289 // c.params["branching"] = params[0]; 290 // c.params["max-iterations"] = params[1]; 291 // 292 // autotuner.evaluate_kmeans(c); 293 // 294 // return c.timeCost; 295 // 296 // } 297 // }; 298 // 299 // struct KDTreeSimpleDownhillFunctor { 300 // 301 // Autotune& autotuner; 302 // KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {} 303 // 304 // float operator()(int* params) { 305 // float maxFloat = numeric_limits<float>::max(); 306 // 307 // if (params[0]<1) return maxFloat; 308 // 309 // CostData c; 310 // c.params["algorithm"] = KDTREE; 311 // c.params["trees"] = params[0]; 312 // 313 // autotuner.evaluate_kdtree(c); 314 // 315 // return c.timeCost; 316 // 317 // } 318 // }; 319 320 321 optimizeKMeans(std::vector<CostData> & costs)322 void optimizeKMeans(std::vector<CostData>& costs) 323 { 324 Logger::info("KMEANS, Step 1: Exploring parameter space\n"); 325 326 // explore kmeans parameters space using combinations of the parameters below 327 int maxIterations[] = { 1, 5, 10, 15 }; 328 int branchingFactors[] = { 16, 32, 64, 128, 256 }; 329 330 int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors); 331 costs.reserve(costs.size() + kmeansParamSpaceSize); 332 333 // evaluate kmeans for all parameter combinations 334 for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) { 335 for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) { 336 CostData cost; 337 cost.params["algorithm"] = FLANN_INDEX_KMEANS; 338 cost.params["centers_init"] = FLANN_CENTERS_RANDOM; 339 cost.params["iterations"] = maxIterations[i]; 340 cost.params["branching"] = branchingFactors[j]; 341 342 evaluate_kmeans(cost); 343 costs.push_back(cost); 344 } 345 } 346 347 // Logger::info("KMEANS, Step 2: simplex-downhill optimization\n"); 348 // 349 // const int n = 2; 350 // // choose initial simplex points as the best parameters so far 351 // int kmeansNMPoints[n*(n+1)]; 352 // float kmeansVals[n+1]; 353 // for (int i=0;i<n+1;++i) { 354 // kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"]; 355 // kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"]; 356 // kmeansVals[i] = kmeansCosts[i].timeCost; 357 // } 358 // KMeansSimpleDownhillFunctor kmeans_cost_func(*this); 359 // // run optimization 360 // optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals); 361 // // store results 362 // for (int i=0;i<n+1;++i) { 363 // kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2]; 364 // kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1]; 365 // kmeansCosts[i].timeCost = kmeansVals[i]; 366 // } 367 } 368 369 optimizeKDTree(std::vector<CostData> & costs)370 void optimizeKDTree(std::vector<CostData>& costs) 371 { 372 Logger::info("KD-TREE, Step 1: Exploring parameter space\n"); 373 374 // explore kd-tree parameters space using the parameters below 375 int testTrees[] = { 1, 4, 8, 16, 32 }; 376 377 // evaluate kdtree for all parameter combinations 378 for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) { 379 CostData cost; 380 cost.params["trees"] = testTrees[i]; 381 382 evaluate_kdtree(cost); 383 costs.push_back(cost); 384 } 385 386 // Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n"); 387 // 388 // const int n = 1; 389 // // choose initial simplex points as the best parameters so far 390 // int kdtreeNMPoints[n*(n+1)]; 391 // float kdtreeVals[n+1]; 392 // for (int i=0;i<n+1;++i) { 393 // kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"]; 394 // kdtreeVals[i] = kdtreeCosts[i].timeCost; 395 // } 396 // KDTreeSimpleDownhillFunctor kdtree_cost_func(*this); 397 // // run optimization 398 // optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals); 399 // // store results 400 // for (int i=0;i<n+1;++i) { 401 // kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i]; 402 // kdtreeCosts[i].timeCost = kdtreeVals[i]; 403 // } 404 } 405 406 /** 407 * Chooses the best nearest-neighbor algorithm and estimates the optimal 408 * parameters to use when building the index (for a given precision). 409 * Returns a dictionary with the optimal parameters. 410 */ estimateBuildParams()411 IndexParams estimateBuildParams() 412 { 413 std::vector<CostData> costs; 414 415 int sampleSize = int(sample_fraction_ * dataset_.rows); 416 int testSampleSize = std::min(sampleSize / 10, 1000); 417 418 Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", dataset_.rows, sampleSize, testSampleSize, target_precision_); 419 420 // For a very small dataset, it makes no sense to build any fancy index, just 421 // use linear search 422 if (testSampleSize < 10) { 423 Logger::info("Choosing linear, dataset too small\n"); 424 return LinearIndexParams(); 425 } 426 427 // We use a fraction of the original dataset to speedup the autotune algorithm 428 sampledDataset_ = random_sample(dataset_, sampleSize); 429 // We use a cross-validation approach, first we sample a testset from the dataset 430 testDataset_ = random_sample(sampledDataset_, testSampleSize, true); 431 432 // We compute the ground truth using linear search 433 Logger::info("Computing ground truth... \n"); 434 gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1); 435 StartStopTimer t; 436 t.start(); 437 compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_); 438 t.stop(); 439 440 CostData linear_cost; 441 linear_cost.searchTimeCost = (float)t.value; 442 linear_cost.buildTimeCost = 0; 443 linear_cost.memoryCost = 0; 444 linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR; 445 446 costs.push_back(linear_cost); 447 448 // Start parameter autotune process 449 Logger::info("Autotuning parameters...\n"); 450 451 optimizeKMeans(costs); 452 optimizeKDTree(costs); 453 454 float bestTimeCost = costs[0].searchTimeCost; 455 for (size_t i = 0; i < costs.size(); ++i) { 456 float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost; 457 if (timeCost < bestTimeCost) { 458 bestTimeCost = timeCost; 459 } 460 } 461 462 float bestCost = costs[0].searchTimeCost / bestTimeCost; 463 IndexParams bestParams = costs[0].params; 464 if (bestTimeCost > 0) { 465 for (size_t i = 0; i < costs.size(); ++i) { 466 float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost + 467 memory_weight_ * costs[i].memoryCost; 468 if (crtCost < bestCost) { 469 bestCost = crtCost; 470 bestParams = costs[i].params; 471 } 472 } 473 } 474 475 delete[] gt_matches_.data; 476 delete[] testDataset_.data; 477 delete[] sampledDataset_.data; 478 479 return bestParams; 480 } 481 482 483 484 /** 485 * Estimates the search time parameters needed to get the desired precision. 486 * Precondition: the index is built 487 * Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search. 488 */ estimateSearchParams(SearchParams & searchParams)489 float estimateSearchParams(SearchParams& searchParams) 490 { 491 const int nn = 1; 492 const size_t SAMPLE_COUNT = 1000; 493 494 assert(bestIndex_ != NULL); // must have a valid index 495 496 float speedup = 0; 497 498 int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT); 499 if (samples > 0) { 500 Matrix<ElementType> testDataset = random_sample(dataset_, samples); 501 502 Logger::info("Computing ground truth\n"); 503 504 // we need to compute the ground truth first 505 Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1); 506 StartStopTimer t; 507 t.start(); 508 compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_); 509 t.stop(); 510 float linear = (float)t.value; 511 512 int checks; 513 Logger::info("Estimating number of checks\n"); 514 515 float searchTime; 516 float cb_index; 517 if (bestIndex_->getType() == FLANN_INDEX_KMEANS) { 518 Logger::info("KMeans algorithm, estimating cluster border factor\n"); 519 KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_; 520 float bestSearchTime = -1; 521 float best_cb_index = -1; 522 int best_checks = -1; 523 for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) { 524 kmeans->set_cb_index(cb_index); 525 searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1); 526 if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) { 527 bestSearchTime = searchTime; 528 best_cb_index = cb_index; 529 best_checks = checks; 530 } 531 } 532 searchTime = bestSearchTime; 533 cb_index = best_cb_index; 534 checks = best_checks; 535 536 kmeans->set_cb_index(best_cb_index); 537 Logger::info("Optimum cb_index: %g\n", cb_index); 538 bestParams_["cb_index"] = cb_index; 539 } 540 else { 541 searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1); 542 } 543 544 Logger::info("Required number of checks: %d \n", checks); 545 searchParams["checks"] = checks; 546 547 speedup = linear / searchTime; 548 549 delete[] gt_matches.data; 550 delete[] testDataset.data; 551 } 552 553 return speedup; 554 } 555 556 private: 557 NNIndex<Distance>* bestIndex_; 558 559 IndexParams bestParams_; 560 SearchParams bestSearchParams_; 561 562 Matrix<ElementType> sampledDataset_; 563 Matrix<ElementType> testDataset_; 564 Matrix<int> gt_matches_; 565 566 float speedup_; 567 568 /** 569 * The dataset used by this index 570 */ 571 const Matrix<ElementType> dataset_; 572 573 /** 574 * Index parameters 575 */ 576 float target_precision_; 577 float build_weight_; 578 float memory_weight_; 579 float sample_fraction_; 580 581 Distance distance_; 582 583 584 }; 585 } 586 587 #endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */ 588