1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 /***********************************************************************
32  * Author: Vincent Rabaud
33  *************************************************************************/
34 
35 #ifndef OPENCV_FLANN_LSH_INDEX_H_
36 #define OPENCV_FLANN_LSH_INDEX_H_
37 
38 #include <algorithm>
39 #include <cassert>
40 #include <cstring>
41 #include <map>
42 #include <vector>
43 
44 #include "general.h"
45 #include "nn_index.h"
46 #include "matrix.h"
47 #include "result_set.h"
48 #include "heap.h"
49 #include "lsh_table.h"
50 #include "allocator.h"
51 #include "random.h"
52 #include "saving.h"
53 
54 namespace cvflann
55 {
56 
57 struct LshIndexParams : public IndexParams
58 {
59     LshIndexParams(unsigned int table_number = 12, unsigned int key_size = 20, unsigned int multi_probe_level = 2)
60     {
61         (* this)["algorithm"] = FLANN_INDEX_LSH;
62         // The number of hash tables to use
63         (*this)["table_number"] = table_number;
64         // The length of the key in the hash tables
65         (*this)["key_size"] = key_size;
66         // Number of levels to use in multi-probe (0 for standard LSH)
67         (*this)["multi_probe_level"] = multi_probe_level;
68     }
69 };
70 
71 /**
72  * Randomized kd-tree index
73  *
74  * Contains the k-d trees and other information for indexing a set of points
75  * for nearest-neighbor matching.
76  */
77 template<typename Distance>
78 class LshIndex : public NNIndex<Distance>
79 {
80 public:
81     typedef typename Distance::ElementType ElementType;
82     typedef typename Distance::ResultType DistanceType;
83 
84     /** Constructor
85      * @param input_data dataset with the input features
86      * @param params parameters passed to the LSH algorithm
87      * @param d the distance used
88      */
89     LshIndex(const Matrix<ElementType>& input_data, const IndexParams& params = LshIndexParams(),
90              Distance d = Distance()) :
dataset_(input_data)91         dataset_(input_data), index_params_(params), distance_(d)
92     {
93         // cv::flann::IndexParams sets integer params as 'int', so it is used with get_param
94         // in place of 'unsigned int'
95         table_number_ = (unsigned int)get_param<int>(index_params_,"table_number",12);
96         key_size_ = (unsigned int)get_param<int>(index_params_,"key_size",20);
97         multi_probe_level_ = (unsigned int)get_param<int>(index_params_,"multi_probe_level",2);
98 
99         feature_size_ = (unsigned)dataset_.cols;
100         fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
101     }
102 
103 
104     LshIndex(const LshIndex&);
105     LshIndex& operator=(const LshIndex&);
106 
107     /**
108      * Builds the index
109      */
buildIndex()110     void buildIndex()
111     {
112         tables_.resize(table_number_);
113         for (unsigned int i = 0; i < table_number_; ++i) {
114             lsh::LshTable<ElementType>& table = tables_[i];
115             table = lsh::LshTable<ElementType>(feature_size_, key_size_);
116 
117             // Add the features to the table
118             table.add(dataset_);
119         }
120     }
121 
getType()122     flann_algorithm_t getType() const
123     {
124         return FLANN_INDEX_LSH;
125     }
126 
127 
saveIndex(FILE * stream)128     void saveIndex(FILE* stream)
129     {
130         save_value(stream,table_number_);
131         save_value(stream,key_size_);
132         save_value(stream,multi_probe_level_);
133         save_value(stream, dataset_);
134     }
135 
loadIndex(FILE * stream)136     void loadIndex(FILE* stream)
137     {
138         load_value(stream, table_number_);
139         load_value(stream, key_size_);
140         load_value(stream, multi_probe_level_);
141         load_value(stream, dataset_);
142         // Building the index is so fast we can afford not storing it
143         buildIndex();
144 
145         index_params_["algorithm"] = getType();
146         index_params_["table_number"] = table_number_;
147         index_params_["key_size"] = key_size_;
148         index_params_["multi_probe_level"] = multi_probe_level_;
149     }
150 
151     /**
152      *  Returns size of index.
153      */
size()154     size_t size() const
155     {
156         return dataset_.rows;
157     }
158 
159     /**
160      * Returns the length of an index feature.
161      */
veclen()162     size_t veclen() const
163     {
164         return feature_size_;
165     }
166 
167     /**
168      * Computes the index memory usage
169      * Returns: memory used by the index
170      */
usedMemory()171     int usedMemory() const
172     {
173         return (int)(dataset_.rows * sizeof(int));
174     }
175 
176 
getParameters()177     IndexParams getParameters() const
178     {
179         return index_params_;
180     }
181 
182     /**
183      * \brief Perform k-nearest neighbor search
184      * \param[in] queries The query points for which to find the nearest neighbors
185      * \param[out] indices The indices of the nearest neighbors found
186      * \param[out] dists Distances to the nearest neighbors found
187      * \param[in] knn Number of nearest neighbors to return
188      * \param[in] params Search parameters
189      */
knnSearch(const Matrix<ElementType> & queries,Matrix<int> & indices,Matrix<DistanceType> & dists,int knn,const SearchParams & params)190     virtual void knnSearch(const Matrix<ElementType>& queries, Matrix<int>& indices, Matrix<DistanceType>& dists, int knn, const SearchParams& params)
191     {
192         assert(queries.cols == veclen());
193         assert(indices.rows >= queries.rows);
194         assert(dists.rows >= queries.rows);
195         assert(int(indices.cols) >= knn);
196         assert(int(dists.cols) >= knn);
197 
198 
199         KNNUniqueResultSet<DistanceType> resultSet(knn);
200         for (size_t i = 0; i < queries.rows; i++) {
201             resultSet.clear();
202             std::fill_n(indices[i], knn, -1);
203             std::fill_n(dists[i], knn, std::numeric_limits<DistanceType>::max());
204             findNeighbors(resultSet, queries[i], params);
205             if (get_param(params,"sorted",true)) resultSet.sortAndCopy(indices[i], dists[i], knn);
206             else resultSet.copy(indices[i], dists[i], knn);
207         }
208     }
209 
210 
211     /**
212      * Find set of nearest neighbors to vec. Their indices are stored inside
213      * the result object.
214      *
215      * Params:
216      *     result = the result object in which the indices of the nearest-neighbors are stored
217      *     vec = the vector for which to search the nearest neighbors
218      *     maxCheck = the maximum number of restarts (in a best-bin-first manner)
219      */
findNeighbors(ResultSet<DistanceType> & result,const ElementType * vec,const SearchParams &)220     void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& /*searchParams*/)
221     {
222         getNeighbors(vec, result);
223     }
224 
225 private:
226     /** Defines the comparator on score and index
227      */
228     typedef std::pair<float, unsigned int> ScoreIndexPair;
229     struct SortScoreIndexPairOnSecond
230     {
operatorSortScoreIndexPairOnSecond231         bool operator()(const ScoreIndexPair& left, const ScoreIndexPair& right) const
232         {
233             return left.second < right.second;
234         }
235     };
236 
237     /** Fills the different xor masks to use when getting the neighbors in multi-probe LSH
238      * @param key the key we build neighbors from
239      * @param lowest_index the lowest index of the bit set
240      * @param level the multi-probe level we are at
241      * @param xor_masks all the xor mask
242      */
fill_xor_mask(lsh::BucketKey key,int lowest_index,unsigned int level,std::vector<lsh::BucketKey> & xor_masks)243     void fill_xor_mask(lsh::BucketKey key, int lowest_index, unsigned int level,
244                        std::vector<lsh::BucketKey>& xor_masks)
245     {
246         xor_masks.push_back(key);
247         if (level == 0) return;
248         for (int index = lowest_index - 1; index >= 0; --index) {
249             // Create a new key
250             lsh::BucketKey new_key = key | (1 << index);
251             fill_xor_mask(new_key, index, level - 1, xor_masks);
252         }
253     }
254 
255     /** Performs the approximate nearest-neighbor search.
256      * @param vec the feature to analyze
257      * @param do_radius flag indicating if we check the radius too
258      * @param radius the radius if it is a radius search
259      * @param do_k flag indicating if we limit the number of nn
260      * @param k_nn the number of nearest neighbors
261      * @param checked_average used for debugging
262      */
getNeighbors(const ElementType * vec,bool,float radius,bool do_k,unsigned int k_nn,float &)263     void getNeighbors(const ElementType* vec, bool /*do_radius*/, float radius, bool do_k, unsigned int k_nn,
264                       float& /*checked_average*/)
265     {
266         static std::vector<ScoreIndexPair> score_index_heap;
267 
268         if (do_k) {
269             unsigned int worst_score = std::numeric_limits<unsigned int>::max();
270             typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
271             typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
272             for (; table != table_end; ++table) {
273                 size_t key = table->getKey(vec);
274                 std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
275                 std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
276                 for (; xor_mask != xor_mask_end; ++xor_mask) {
277                     size_t sub_key = key ^ (*xor_mask);
278                     const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
279                     if (bucket == 0) continue;
280 
281                     // Go over each descriptor index
282                     std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
283                     std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
284                     DistanceType hamming_distance;
285 
286                     // Process the rest of the candidates
287                     for (; training_index < last_training_index; ++training_index) {
288                         hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
289 
290                         if (hamming_distance < worst_score) {
291                             // Insert the new element
292                             score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index));
293                             std::push_heap(score_index_heap.begin(), score_index_heap.end());
294 
295                             if (score_index_heap.size() > (unsigned int)k_nn) {
296                                 // Remove the highest distance value as we have too many elements
297                                 std::pop_heap(score_index_heap.begin(), score_index_heap.end());
298                                 score_index_heap.pop_back();
299                                 // Keep track of the worst score
300                                 worst_score = score_index_heap.front().first;
301                             }
302                         }
303                     }
304                 }
305             }
306         }
307         else {
308             typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
309             typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
310             for (; table != table_end; ++table) {
311                 size_t key = table->getKey(vec);
312                 std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
313                 std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
314                 for (; xor_mask != xor_mask_end; ++xor_mask) {
315                     size_t sub_key = key ^ (*xor_mask);
316                     const lsh::Bucket* bucket = table->getBucketFromKey(sub_key);
317                     if (bucket == 0) continue;
318 
319                     // Go over each descriptor index
320                     std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
321                     std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
322                     DistanceType hamming_distance;
323 
324                     // Process the rest of the candidates
325                     for (; training_index < last_training_index; ++training_index) {
326                         // Compute the Hamming distance
327                         hamming_distance = distance_(vec, dataset_[*training_index], dataset_.cols);
328                         if (hamming_distance < radius) score_index_heap.push_back(ScoreIndexPair(hamming_distance, training_index));
329                     }
330                 }
331             }
332         }
333     }
334 
335     /** Performs the approximate nearest-neighbor search.
336      * This is a slower version than the above as it uses the ResultSet
337      * @param vec the feature to analyze
338      */
getNeighbors(const ElementType * vec,ResultSet<DistanceType> & result)339     void getNeighbors(const ElementType* vec, ResultSet<DistanceType>& result)
340     {
341         typename std::vector<lsh::LshTable<ElementType> >::const_iterator table = tables_.begin();
342         typename std::vector<lsh::LshTable<ElementType> >::const_iterator table_end = tables_.end();
343         for (; table != table_end; ++table) {
344             size_t key = table->getKey(vec);
345             std::vector<lsh::BucketKey>::const_iterator xor_mask = xor_masks_.begin();
346             std::vector<lsh::BucketKey>::const_iterator xor_mask_end = xor_masks_.end();
347             for (; xor_mask != xor_mask_end; ++xor_mask) {
348                 size_t sub_key = key ^ (*xor_mask);
349                 const lsh::Bucket* bucket = table->getBucketFromKey((lsh::BucketKey)sub_key);
350                 if (bucket == 0) continue;
351 
352                 // Go over each descriptor index
353                 std::vector<lsh::FeatureIndex>::const_iterator training_index = bucket->begin();
354                 std::vector<lsh::FeatureIndex>::const_iterator last_training_index = bucket->end();
355                 DistanceType hamming_distance;
356 
357                 // Process the rest of the candidates
358                 for (; training_index < last_training_index; ++training_index) {
359                     // Compute the Hamming distance
360                     hamming_distance = distance_(vec, dataset_[*training_index], (int)dataset_.cols);
361                     result.addPoint(hamming_distance, *training_index);
362                 }
363             }
364         }
365     }
366 
367     /** The different hash tables */
368     std::vector<lsh::LshTable<ElementType> > tables_;
369 
370     /** The data the LSH tables where built from */
371     Matrix<ElementType> dataset_;
372 
373     /** The size of the features (as ElementType[]) */
374     unsigned int feature_size_;
375 
376     IndexParams index_params_;
377 
378     /** table number */
379     unsigned int table_number_;
380     /** key size */
381     unsigned int key_size_;
382     /** How far should we look for neighbors in multi-probe LSH */
383     unsigned int multi_probe_level_;
384 
385     /** The XOR masks to apply to a key to get the neighboring buckets */
386     std::vector<lsh::BucketKey> xor_masks_;
387 
388     Distance distance_;
389 };
390 }
391 
392 #endif //OPENCV_FLANN_LSH_INDEX_H_
393