1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
17 #define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
18 
19 #include <algorithm>
20 #include <vector>
21 #include "tensorflow/core/framework/types.h"
22 #include "tensorflow/core/platform/macros.h"
23 #include "tensorflow/core/platform/prefetch.h"
24 
25 namespace tensorflow {
26 
27 // Class for efficiently storing key->value mappings when the size is
28 // known in advance and the keys are pre-hashed into uint64s.
29 // Keys should have "good enough" randomness (be spread across the
30 // entire 64 bit space).
31 //
32 // Important:  Clients wishing to use deterministic keys must
33 // ensure that their keys fall in the range 0 .. (uint64max-1);
34 // the table uses 2^64-1 as the "not occupied" flag.
35 //
36 // Inserted keys must be unique, and there are no update
37 // or delete functions (until some subsequent use of this table
38 // requires them).
39 //
40 // Threads must synchronize their access to a PresizedCuckooMap.
41 //
42 // The cuckoo hash table is 4-way associative (each "bucket" has 4
43 // "slots" for key/value entries).  Uses breadth-first-search to find
44 // a good cuckoo path with less data movement (see
45 // http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf )
46 
47 namespace presized_cuckoo_map {
48 // Utility function to compute (x * y) >> 64, or "multiply high".
49 // On x86-64, this is a single instruction, but not all platforms
50 // support the __uint128_t type, so we provide a generic
51 // implementation as well.
multiply_high_u64(uint64 x,uint64 y)52 inline uint64 multiply_high_u64(uint64 x, uint64 y) {
53 #if defined(__SIZEOF_INT128__)
54   return (uint64)(((__uint128_t)x * (__uint128_t)y) >> 64);
55 #else
56   // For platforms without int128 support, do it the long way.
57   uint64 x_lo = x & 0xffffffff;
58   uint64 x_hi = x >> 32;
59   uint64 buckets_lo = y & 0xffffffff;
60   uint64 buckets_hi = y >> 32;
61   uint64 prod_hi = x_hi * buckets_hi;
62   uint64 prod_lo = x_lo * buckets_lo;
63   uint64 prod_mid1 = x_hi * buckets_lo;
64   uint64 prod_mid2 = x_lo * buckets_hi;
65   uint64 carry =
66       ((prod_mid1 & 0xffffffff) + (prod_mid2 & 0xffffffff) + (prod_lo >> 32)) >>
67       32;
68   return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
69 #endif
70 }
71 }  // namespace presized_cuckoo_map
72 
73 template <class value>
74 class PresizedCuckooMap {
75  public:
76   // The key type is fixed as a pre-hashed key for this specialized use.
77   typedef uint64 key_type;
78 
PresizedCuckooMap(uint64 num_entries)79   explicit PresizedCuckooMap(uint64 num_entries) { Clear(num_entries); }
80 
Clear(uint64 num_entries)81   void Clear(uint64 num_entries) {
82     cpq_.reset(new CuckooPathQueue());
83     double n(num_entries);
84     n /= kLoadFactor;
85     num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket);
86     // Very small cuckoo tables don't work, because the probability
87     // of having same-bucket hashes is large.  We compromise for those
88     // uses by having a larger static starting size.
89     num_buckets_ += 32;
90     Bucket empty_bucket;
91     for (int i = 0; i < kSlotsPerBucket; i++) {
92       empty_bucket.keys[i] = kUnusedSlot;
93     }
94     buckets_.clear();
95     buckets_.resize(num_buckets_, empty_bucket);
96   }
97 
98   // Returns false if k is already in table or if the table
99   // is full; true otherwise.
InsertUnique(const key_type k,const value & v)100   bool InsertUnique(const key_type k, const value& v) {
101     uint64 tk = key_transform(k);
102     uint64 b1 = fast_map_to_buckets(tk);
103     uint64 b2 = fast_map_to_buckets(h2(tk));
104 
105     // Merged find and duplicate checking.
106     uint64 target_bucket = 0;
107     int target_slot = kNoSpace;
108 
109     for (auto bucket : {b1, b2}) {
110       Bucket* bptr = &buckets_[bucket];
111       for (int slot = 0; slot < kSlotsPerBucket; slot++) {
112         if (bptr->keys[slot] == k) {  // Duplicates are not allowed.
113           return false;
114         } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) {
115           target_bucket = bucket;
116           target_slot = slot;
117         }
118       }
119     }
120 
121     if (target_slot != kNoSpace) {
122       InsertInternal(tk, v, target_bucket, target_slot);
123       return true;
124     }
125 
126     return CuckooInsert(tk, v, b1, b2);
127   }
128 
129   // Returns true if found.  Sets *out = value.
Find(const key_type k,value * out)130   bool Find(const key_type k, value* out) const {
131     uint64 tk = key_transform(k);
132     return FindInBucket(k, fast_map_to_buckets(tk), out) ||
133            FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
134   }
135 
136   // Prefetch memory associated with the key k into cache levels specified by
137   // hint.
138   template <port::PrefetchHint hint = port::PREFETCH_HINT_T0>
PrefetchKey(const key_type k)139   void PrefetchKey(const key_type k) const {
140     const uint64 tk = key_transform(k);
141     port::prefetch<hint>(&buckets_[fast_map_to_buckets(tk)].keys);
142     port::prefetch<hint>(&buckets_[fast_map_to_buckets(h2(tk))].keys);
143   }
144 
MemoryUsed()145   int64 MemoryUsed() const {
146     return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
147   }
148 
149  private:
150   static constexpr int kSlotsPerBucket = 4;
151 
152   // The load factor is chosen slightly conservatively for speed and
153   // to avoid the need for a table rebuild on insertion failure.
154   // 0.94 is achievable, but 0.85 is faster and keeps the code simple
155   // at the cost of a small amount of memory.
156   // NOTE:  0 < kLoadFactor <= 1.0
157   static constexpr double kLoadFactor = 0.85;
158 
159   // Cuckoo insert:  The maximum number of entries to scan should be ~400
160   // (Source:  Personal communication with Michael Mitzenmacher;  empirical
161   // experiments validate.).  After trying 400 candidate locations, declare
162   // the table full - it's probably full of unresolvable cycles.  Less than
163   // 400 reduces max occupancy;  much more results in very poor performance
164   // around the full point.  For (2,4) a max BFS path len of 5 results in ~682
165   // nodes to visit, calculated below, and is a good value.
166 
167   static constexpr uint8 kMaxBFSPathLen = 5;
168 
169   // Constants for BFS cuckoo path search:
170   // The visited list must be maintained for all but the last level of search
171   // in order to trace back the path.  The BFS search has two roots
172   // and each can go to a total depth (including the root) of 5.
173   // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682.
174   // The visited queue, however, does not need to hold the deepest level,
175   // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170
176   static constexpr int kMaxQueueSize = 682;
177   static constexpr int kVisitedListSize = 170;
178 
179   static constexpr int kNoSpace = -1;  // SpaceAvailable return
180   static constexpr uint64 kUnusedSlot = ~(0ULL);
181 
182   // Buckets are organized with key_types clustered for access speed
183   // and for compactness while remaining aligned.
184   struct Bucket {
185     key_type keys[kSlotsPerBucket];
186     value values[kSlotsPerBucket];
187   };
188 
189   // Insert uses the BFS optimization (search before moving) to reduce
190   // the number of cache lines dirtied during search.
191 
192   struct CuckooPathEntry {
193     uint64 bucket;
194     int depth;
195     int parent;       // To index in the visited array.
196     int parent_slot;  // Which slot in our parent did we come from?  -1 == root.
197   };
198 
199   // CuckooPathQueue is a trivial circular queue for path entries.
200   // The caller is responsible for not inserting more than kMaxQueueSize
201   // entries.  Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue
202   // that it reuses across inserts.
203   class CuckooPathQueue {
204    public:
CuckooPathQueue()205     CuckooPathQueue() : head_(0), tail_(0) {}
206 
push_back(CuckooPathEntry e)207     void push_back(CuckooPathEntry e) {
208       queue_[tail_] = e;
209       tail_ = (tail_ + 1) % kMaxQueueSize;
210     }
211 
pop_front()212     CuckooPathEntry pop_front() {
213       CuckooPathEntry& e = queue_[head_];
214       head_ = (head_ + 1) % kMaxQueueSize;
215       return e;
216     }
217 
empty()218     bool empty() const { return head_ == tail_; }
219 
full()220     bool full() const { return ((tail_ + 1) % kMaxQueueSize) == head_; }
221 
reset()222     void reset() { head_ = tail_ = 0; }
223 
224    private:
225     CuckooPathEntry queue_[kMaxQueueSize];
226     int head_;
227     int tail_;
228   };
229 
230   typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath;
231 
232   // Callers are expected to have pre-hashed the keys into a uint64
233   // and are expected to be able to handle (a very low rate) of
234   // collisions, OR must ensure that their keys are always in
235   // the range 0 - (uint64max - 1).  This transforms 'not found flag'
236   // keys into something else.
key_transform(const key_type k)237   inline uint64 key_transform(const key_type k) const {
238     return k + (k == kUnusedSlot);
239   }
240 
241   // h2 performs a very quick mix of h to generate the second bucket hash.
242   // Assumes there is plenty of remaining entropy in the initial h.
h2(uint64 h)243   inline uint64 h2(uint64 h) const {
244     const uint64 m = 0xc6a4a7935bd1e995;
245     return m * ((h >> 32) | (h << 32));
246   }
247 
248   // alt_bucket identifies the "other" bucket for key k, where
249   // other is "the one that isn't bucket b"
alt_bucket(key_type k,uint64 b)250   inline uint64 alt_bucket(key_type k, uint64 b) const {
251     if (fast_map_to_buckets(k) != b) {
252       return fast_map_to_buckets(k);
253     }
254     return fast_map_to_buckets(h2(k));
255   }
256 
InsertInternal(key_type k,const value & v,uint64 b,int slot)257   inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) {
258     Bucket* bptr = &buckets_[b];
259     bptr->keys[slot] = k;
260     bptr->values[slot] = v;
261   }
262 
263   // For the associative cuckoo table, check all of the slots in
264   // the bucket to see if the key is present.
FindInBucket(key_type k,uint64 b,value * out)265   bool FindInBucket(key_type k, uint64 b, value* out) const {
266     const Bucket& bref = buckets_[b];
267     for (int i = 0; i < kSlotsPerBucket; i++) {
268       if (bref.keys[i] == k) {
269         *out = bref.values[i];
270         return true;
271       }
272     }
273     return false;
274   }
275 
276   //  returns either kNoSpace or the index of an
277   //  available slot (0 <= slot < kSlotsPerBucket)
SpaceAvailable(uint64 bucket)278   inline int SpaceAvailable(uint64 bucket) const {
279     const Bucket& bref = buckets_[bucket];
280     for (int i = 0; i < kSlotsPerBucket; i++) {
281       if (bref.keys[i] == kUnusedSlot) {
282         return i;
283       }
284     }
285     return kNoSpace;
286   }
287 
CopyItem(uint64 src_bucket,int src_slot,uint64 dst_bucket,int dst_slot)288   inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket,
289                        int dst_slot) {
290     Bucket& src_ref = buckets_[src_bucket];
291     Bucket& dst_ref = buckets_[dst_bucket];
292     dst_ref.keys[dst_slot] = src_ref.keys[src_slot];
293     dst_ref.values[dst_slot] = src_ref.values[src_slot];
294   }
295 
CuckooInsert(key_type k,const value & v,uint64 b1,uint64 b2)296   bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) {
297     int visited_end = 0;
298     cpq_->reset();
299 
300     cpq_->push_back({b1, 1, 0, 0});  // Note depth starts at 1.
301     cpq_->push_back({b2, 1, 0, 0});
302 
303     while (!cpq_->empty()) {
304       CuckooPathEntry e = cpq_->pop_front();
305       int free_slot;
306       free_slot = SpaceAvailable(e.bucket);
307       if (free_slot != kNoSpace) {
308         while (e.depth > 1) {
309           // "copy" instead of "swap" because one entry is always zero.
310           // After, write target key/value over top of last copied entry.
311           CuckooPathEntry parent = visited_[e.parent];
312           CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot);
313           free_slot = e.parent_slot;
314           e = parent;
315         }
316         InsertInternal(k, v, e.bucket, free_slot);
317         return true;
318       } else {
319         if (e.depth < (kMaxBFSPathLen)) {
320           auto parent_index = visited_end;
321           visited_[visited_end] = e;
322           visited_end++;
323           // Don't always start with the same slot, to even out the path depth.
324           int start_slot = (k + e.bucket) % kSlotsPerBucket;
325           const Bucket& bref = buckets_[e.bucket];
326           for (int i = 0; i < kSlotsPerBucket; i++) {
327             int slot = (start_slot + i) % kSlotsPerBucket;
328             uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket);
329             // Optimization:  Avoid single-step cycles (from e, don't
330             // add a child node that is actually e's parent).
331             uint64 e_parent_bucket = visited_[e.parent].bucket;
332             if (next_bucket != e_parent_bucket) {
333               cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot});
334             }
335           }
336         }
337       }
338     }
339 
340     LOG(WARNING) << "Cuckoo path finding failed: Table too small?";
341     return false;
342   }
343 
fast_map_to_buckets(uint64 x)344   inline uint64 fast_map_to_buckets(uint64 x) const {
345     // Map x (uniform in 2^64) to the range [0, num_buckets_ -1]
346     // using Lemire's alternative to modulo reduction:
347     // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
348     // Instead of x % N, use (x * N) >> 64.
349     return presized_cuckoo_map::multiply_high_u64(x, num_buckets_);
350   }
351 
352   // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket.
353   uint64 num_buckets_;
354   std::vector<Bucket> buckets_;
355 
356   std::unique_ptr<CuckooPathQueue> cpq_;
357   CuckooPathEntry visited_[kVisitedListSize];
358 
359   TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap);
360 };
361 
362 }  // namespace tensorflow
363 
364 #endif  // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
365