1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 #ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_
31 #define OPENCV_FLANN_AUTOTUNED_INDEX_H_
32 
33 #include "general.h"
34 #include "nn_index.h"
35 #include "ground_truth.h"
36 #include "index_testing.h"
37 #include "sampling.h"
38 #include "kdtree_index.h"
39 #include "kdtree_single_index.h"
40 #include "kmeans_index.h"
41 #include "composite_index.h"
42 #include "linear_index.h"
43 #include "logger.h"
44 
45 namespace cvflann
46 {
47 
// Factory declared here, defined elsewhere in the library: creates the concrete
// NNIndex implementation for the given dataset/distance. Presumably dispatches
// on the "algorithm" entry of params — it is used that way in
// AutotunedIndex::loadIndex() below (TODO confirm against the definition).
template<typename Distance>
NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);
50 
51 
52 struct AutotunedIndexParams : public IndexParams
53 {
54     AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
55     {
56         (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED;
57         // precision desired (used for autotuning, -1 otherwise)
58         (*this)["target_precision"] = target_precision;
59         // build tree time weighting factor
60         (*this)["build_weight"] = build_weight;
61         // index memory weighting factor
62         (*this)["memory_weight"] = memory_weight;
63         // what fraction of the dataset to use for autotuning
64         (*this)["sample_fraction"] = sample_fraction;
65     }
66 };
67 
68 
69 template <typename Distance>
70 class AutotunedIndex : public NNIndex<Distance>
71 {
72 public:
73     typedef typename Distance::ElementType ElementType;
74     typedef typename Distance::ResultType DistanceType;
75 
76     AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
dataset_(inputData)77         dataset_(inputData), distance_(d)
78     {
79         target_precision_ = get_param(params, "target_precision",0.8f);
80         build_weight_ =  get_param(params,"build_weight", 0.01f);
81         memory_weight_ = get_param(params, "memory_weight", 0.0f);
82         sample_fraction_ = get_param(params,"sample_fraction", 0.1f);
83         bestIndex_ = NULL;
84     }
85 
86     AutotunedIndex(const AutotunedIndex&);
87     AutotunedIndex& operator=(const AutotunedIndex&);
88 
~AutotunedIndex()89     virtual ~AutotunedIndex()
90     {
91         if (bestIndex_ != NULL) {
92             delete bestIndex_;
93             bestIndex_ = NULL;
94         }
95     }
96 
97     /**
98      *          Method responsible with building the index.
99      */
buildIndex()100     virtual void buildIndex()
101     {
102         std::ostringstream stream;
103         bestParams_ = estimateBuildParams();
104         print_params(bestParams_, stream);
105         Logger::info("----------------------------------------------------\n");
106         Logger::info("Autotuned parameters:\n");
107         Logger::info("%s", stream.str().c_str());
108         Logger::info("----------------------------------------------------\n");
109 
110         bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
111         bestIndex_->buildIndex();
112         speedup_ = estimateSearchParams(bestSearchParams_);
113         stream.str(std::string());
114         print_params(bestSearchParams_, stream);
115         Logger::info("----------------------------------------------------\n");
116         Logger::info("Search parameters:\n");
117         Logger::info("%s", stream.str().c_str());
118         Logger::info("----------------------------------------------------\n");
119     }
120 
121     /**
122      *  Saves the index to a stream
123      */
saveIndex(FILE * stream)124     virtual void saveIndex(FILE* stream)
125     {
126         save_value(stream, (int)bestIndex_->getType());
127         bestIndex_->saveIndex(stream);
128         save_value(stream, get_param<int>(bestSearchParams_, "checks"));
129     }
130 
131     /**
132      *  Loads the index from a stream
133      */
loadIndex(FILE * stream)134     virtual void loadIndex(FILE* stream)
135     {
136         int index_type;
137 
138         load_value(stream, index_type);
139         IndexParams params;
140         params["algorithm"] = (flann_algorithm_t)index_type;
141         bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
142         bestIndex_->loadIndex(stream);
143         int checks;
144         load_value(stream, checks);
145         bestSearchParams_["checks"] = checks;
146     }
147 
148     /**
149      *      Method that searches for nearest-neighbors
150      */
findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams & searchParams)151     virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
152     {
153         int checks = get_param<int>(searchParams,"checks",FLANN_CHECKS_AUTOTUNED);
154         if (checks == FLANN_CHECKS_AUTOTUNED) {
155             bestIndex_->findNeighbors(result, vec, bestSearchParams_);
156         }
157         else {
158             bestIndex_->findNeighbors(result, vec, searchParams);
159         }
160     }
161 
162 
getParameters()163     IndexParams getParameters() const
164     {
165         return bestIndex_->getParameters();
166     }
167 
getSearchParameters()168     SearchParams getSearchParameters() const
169     {
170         return bestSearchParams_;
171     }
172 
getSpeedup()173     float getSpeedup() const
174     {
175         return speedup_;
176     }
177 
178 
179     /**
180      *      Number of features in this index.
181      */
size()182     virtual size_t size() const
183     {
184         return bestIndex_->size();
185     }
186 
187     /**
188      *  The length of each vector in this index.
189      */
veclen()190     virtual size_t veclen() const
191     {
192         return bestIndex_->veclen();
193     }
194 
195     /**
196      * The amount of memory (in bytes) this index uses.
197      */
usedMemory()198     virtual int usedMemory() const
199     {
200         return bestIndex_->usedMemory();
201     }
202 
203     /**
204      * Algorithm name
205      */
getType()206     virtual flann_algorithm_t getType() const
207     {
208         return FLANN_INDEX_AUTOTUNED;
209     }
210 
211 private:
212 
213     struct CostData
214     {
215         float searchTimeCost;
216         float buildTimeCost;
217         float memoryCost;
218         float totalCost;
219         IndexParams params;
220     };
221 
evaluate_kmeans(CostData & cost)222     void evaluate_kmeans(CostData& cost)
223     {
224         StartStopTimer t;
225         int checks;
226         const int nn = 1;
227 
228         Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
229                      get_param<int>(cost.params,"iterations"),
230                      get_param<int>(cost.params,"branching"));
231         KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);
232         // measure index build time
233         t.start();
234         kmeans.buildIndex();
235         t.stop();
236         float buildTime = (float)t.value;
237 
238         // measure search time
239         float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
240 
241         float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
242         cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
243         cost.searchTimeCost = searchTime;
244         cost.buildTimeCost = buildTime;
245         Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
246     }
247 
248 
evaluate_kdtree(CostData & cost)249     void evaluate_kdtree(CostData& cost)
250     {
251         StartStopTimer t;
252         int checks;
253         const int nn = 1;
254 
255         Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params,"trees"));
256         KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);
257 
258         t.start();
259         kdtree.buildIndex();
260         t.stop();
261         float buildTime = (float)t.value;
262 
263         //measure search time
264         float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);
265 
266         float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
267         cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
268         cost.searchTimeCost = searchTime;
269         cost.buildTimeCost = buildTime;
270         Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
271     }
272 
273 
274     //    struct KMeansSimpleDownhillFunctor {
275     //
276     //        Autotune& autotuner;
277     //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
278     //
279     //        float operator()(int* params) {
280     //
281     //            float maxFloat = numeric_limits<float>::max();
282     //
283     //            if (params[0]<2) return maxFloat;
284     //            if (params[1]<0) return maxFloat;
285     //
286     //            CostData c;
287     //            c.params["algorithm"] = KMEANS;
288     //            c.params["centers-init"] = CENTERS_RANDOM;
289     //            c.params["branching"] = params[0];
290     //            c.params["max-iterations"] = params[1];
291     //
292     //            autotuner.evaluate_kmeans(c);
293     //
294     //            return c.timeCost;
295     //
296     //        }
297     //    };
298     //
299     //    struct KDTreeSimpleDownhillFunctor {
300     //
301     //        Autotune& autotuner;
302     //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
303     //
304     //        float operator()(int* params) {
305     //            float maxFloat = numeric_limits<float>::max();
306     //
307     //            if (params[0]<1) return maxFloat;
308     //
309     //            CostData c;
310     //            c.params["algorithm"] = KDTREE;
311     //            c.params["trees"] = params[0];
312     //
313     //            autotuner.evaluate_kdtree(c);
314     //
315     //            return c.timeCost;
316     //
317     //        }
318     //    };
319 
320 
321 
optimizeKMeans(std::vector<CostData> & costs)322     void optimizeKMeans(std::vector<CostData>& costs)
323     {
324         Logger::info("KMEANS, Step 1: Exploring parameter space\n");
325 
326         // explore kmeans parameters space using combinations of the parameters below
327         int maxIterations[] = { 1, 5, 10, 15 };
328         int branchingFactors[] = { 16, 32, 64, 128, 256 };
329 
330         int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors);
331         costs.reserve(costs.size() + kmeansParamSpaceSize);
332 
333         // evaluate kmeans for all parameter combinations
334         for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) {
335             for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) {
336                 CostData cost;
337                 cost.params["algorithm"] = FLANN_INDEX_KMEANS;
338                 cost.params["centers_init"] = FLANN_CENTERS_RANDOM;
339                 cost.params["iterations"] = maxIterations[i];
340                 cost.params["branching"] = branchingFactors[j];
341 
342                 evaluate_kmeans(cost);
343                 costs.push_back(cost);
344             }
345         }
346 
347         //         Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
348         //
349         //         const int n = 2;
350         //         // choose initial simplex points as the best parameters so far
351         //         int kmeansNMPoints[n*(n+1)];
352         //         float kmeansVals[n+1];
353         //         for (int i=0;i<n+1;++i) {
354         //             kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
355         //             kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
356         //             kmeansVals[i] = kmeansCosts[i].timeCost;
357         //         }
358         //         KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
359         //         // run optimization
360         //         optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
361         //         // store results
362         //         for (int i=0;i<n+1;++i) {
363         //             kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
364         //             kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
365         //             kmeansCosts[i].timeCost = kmeansVals[i];
366         //         }
367     }
368 
369 
optimizeKDTree(std::vector<CostData> & costs)370     void optimizeKDTree(std::vector<CostData>& costs)
371     {
372         Logger::info("KD-TREE, Step 1: Exploring parameter space\n");
373 
374         // explore kd-tree parameters space using the parameters below
375         int testTrees[] = { 1, 4, 8, 16, 32 };
376 
377         // evaluate kdtree for all parameter combinations
378         for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) {
379             CostData cost;
380             cost.params["trees"] = testTrees[i];
381 
382             evaluate_kdtree(cost);
383             costs.push_back(cost);
384         }
385 
386         //         Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
387         //
388         //         const int n = 1;
389         //         // choose initial simplex points as the best parameters so far
390         //         int kdtreeNMPoints[n*(n+1)];
391         //         float kdtreeVals[n+1];
392         //         for (int i=0;i<n+1;++i) {
393         //             kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
394         //             kdtreeVals[i] = kdtreeCosts[i].timeCost;
395         //         }
396         //         KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
397         //         // run optimization
398         //         optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
399         //         // store results
400         //         for (int i=0;i<n+1;++i) {
401         //             kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
402         //             kdtreeCosts[i].timeCost = kdtreeVals[i];
403         //         }
404     }
405 
406     /**
407      *  Chooses the best nearest-neighbor algorithm and estimates the optimal
408      *  parameters to use when building the index (for a given precision).
409      *  Returns a dictionary with the optimal parameters.
410      */
estimateBuildParams()411     IndexParams estimateBuildParams()
412     {
413         std::vector<CostData> costs;
414 
415         int sampleSize = int(sample_fraction_ * dataset_.rows);
416         int testSampleSize = std::min(sampleSize / 10, 1000);
417 
418         Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", dataset_.rows, sampleSize, testSampleSize, target_precision_);
419 
420         // For a very small dataset, it makes no sense to build any fancy index, just
421         // use linear search
422         if (testSampleSize < 10) {
423             Logger::info("Choosing linear, dataset too small\n");
424             return LinearIndexParams();
425         }
426 
427         // We use a fraction of the original dataset to speedup the autotune algorithm
428         sampledDataset_ = random_sample(dataset_, sampleSize);
429         // We use a cross-validation approach, first we sample a testset from the dataset
430         testDataset_ = random_sample(sampledDataset_, testSampleSize, true);
431 
432         // We compute the ground truth using linear search
433         Logger::info("Computing ground truth... \n");
434         gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
435         StartStopTimer t;
436         t.start();
437         compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
438         t.stop();
439 
440         CostData linear_cost;
441         linear_cost.searchTimeCost = (float)t.value;
442         linear_cost.buildTimeCost = 0;
443         linear_cost.memoryCost = 0;
444         linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;
445 
446         costs.push_back(linear_cost);
447 
448         // Start parameter autotune process
449         Logger::info("Autotuning parameters...\n");
450 
451         optimizeKMeans(costs);
452         optimizeKDTree(costs);
453 
454         float bestTimeCost = costs[0].searchTimeCost;
455         for (size_t i = 0; i < costs.size(); ++i) {
456             float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
457             if (timeCost < bestTimeCost) {
458                 bestTimeCost = timeCost;
459             }
460         }
461 
462         float bestCost = costs[0].searchTimeCost / bestTimeCost;
463         IndexParams bestParams = costs[0].params;
464         if (bestTimeCost > 0) {
465             for (size_t i = 0; i < costs.size(); ++i) {
466                 float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
467                                 memory_weight_ * costs[i].memoryCost;
468                 if (crtCost < bestCost) {
469                     bestCost = crtCost;
470                     bestParams = costs[i].params;
471                 }
472             }
473         }
474 
475         delete[] gt_matches_.data;
476         delete[] testDataset_.data;
477         delete[] sampledDataset_.data;
478 
479         return bestParams;
480     }
481 
482 
483 
484     /**
485      *  Estimates the search time parameters needed to get the desired precision.
486      *  Precondition: the index is built
487      *  Postcondition: the searchParams will have the optimum params set, also the speedup obtained over linear search.
488      */
estimateSearchParams(SearchParams & searchParams)489     float estimateSearchParams(SearchParams& searchParams)
490     {
491         const int nn = 1;
492         const size_t SAMPLE_COUNT = 1000;
493 
494         assert(bestIndex_ != NULL); // must have a valid index
495 
496         float speedup = 0;
497 
498         int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
499         if (samples > 0) {
500             Matrix<ElementType> testDataset = random_sample(dataset_, samples);
501 
502             Logger::info("Computing ground truth\n");
503 
504             // we need to compute the ground truth first
505             Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
506             StartStopTimer t;
507             t.start();
508             compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
509             t.stop();
510             float linear = (float)t.value;
511 
512             int checks;
513             Logger::info("Estimating number of checks\n");
514 
515             float searchTime;
516             float cb_index;
517             if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
518                 Logger::info("KMeans algorithm, estimating cluster border factor\n");
519                 KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
520                 float bestSearchTime = -1;
521                 float best_cb_index = -1;
522                 int best_checks = -1;
523                 for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
524                     kmeans->set_cb_index(cb_index);
525                     searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
526                     if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
527                         bestSearchTime = searchTime;
528                         best_cb_index = cb_index;
529                         best_checks = checks;
530                     }
531                 }
532                 searchTime = bestSearchTime;
533                 cb_index = best_cb_index;
534                 checks = best_checks;
535 
536                 kmeans->set_cb_index(best_cb_index);
537                 Logger::info("Optimum cb_index: %g\n", cb_index);
538                 bestParams_["cb_index"] = cb_index;
539             }
540             else {
541                 searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
542             }
543 
544             Logger::info("Required number of checks: %d \n", checks);
545             searchParams["checks"] = checks;
546 
547             speedup = linear / searchTime;
548 
549             delete[] gt_matches.data;
550             delete[] testDataset.data;
551         }
552 
553         return speedup;
554     }
555 
556 private:
557     NNIndex<Distance>* bestIndex_;
558 
559     IndexParams bestParams_;
560     SearchParams bestSearchParams_;
561 
562     Matrix<ElementType> sampledDataset_;
563     Matrix<ElementType> testDataset_;
564     Matrix<int> gt_matches_;
565 
566     float speedup_;
567 
568     /**
569      * The dataset used by this index
570      */
571     const Matrix<ElementType> dataset_;
572 
573     /**
574      * Index parameters
575      */
576     float target_precision_;
577     float build_weight_;
578     float memory_weight_;
579     float sample_fraction_;
580 
581     Distance distance_;
582 
583 
584 };
585 }
586 
587 #endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */
588