/* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef ART_LIBARTBASE_BASE_HASH_SET_H_ #define ART_LIBARTBASE_BASE_HASH_SET_H_ #include #include #include #include #include #include #include #include #include "base/data_hash.h" #include "bit_utils.h" #include "macros.h" namespace art { template class HashSetIterator : std::iterator { public: HashSetIterator(const HashSetIterator&) = default; HashSetIterator(HashSetIterator&&) = default; HashSetIterator(HashSetType* hash_set, size_t index) : index_(index), hash_set_(hash_set) {} // Conversion from iterator to const_iterator. template ::value && std::is_same::value>::type> HashSetIterator(const HashSetIterator& other) : index_(other.index_), hash_set_(other.hash_set_) {} HashSetIterator& operator=(const HashSetIterator&) = default; HashSetIterator& operator=(HashSetIterator&&) = default; bool operator==(const HashSetIterator& other) const { return hash_set_ == other.hash_set_ && this->index_ == other.index_; } bool operator!=(const HashSetIterator& other) const { return !(*this == other); } HashSetIterator operator++() { // Value after modification. this->index_ = hash_set_->NextNonEmptySlot(index_); return *this; } HashSetIterator operator++(int) { HashSetIterator temp = *this; ++*this; return temp; } Elem& operator*() const { DCHECK(!hash_set_->IsFreeSlot(this->index_)); return hash_set_->ElementForIndex(this->index_); } Elem* operator->() const { return &**this; } private: size_t index_; HashSetType* hash_set_; template friend bool operator==(const HashSetIterator& lhs, const HashSetIterator& rhs); template friend class HashSet; template friend class HashSetIterator; }; template bool operator==(const HashSetIterator& lhs, const HashSetIterator& rhs) { static_assert( std::is_convertible, HashSetIterator>::value || std::is_convertible, HashSetIterator>::value, "Bad iterator types."); DCHECK_EQ(lhs.hash_set_, rhs.hash_set_); return lhs.index_ == rhs.index_; } template bool operator!=(const HashSetIterator& lhs, const HashSetIterator& rhs) { return !(lhs == rhs); } // Returns true if an item is empty. template class DefaultEmptyFn { public: void MakeEmpty(T& item) const { item = T(); } bool IsEmpty(const T& item) const { return item == T(); } }; template class DefaultEmptyFn { public: void MakeEmpty(T*& item) const { item = nullptr; } bool IsEmpty(T* const& item) const { return item == nullptr; } }; template using DefaultHashFn = typename std::conditional::value, DataHash, std::hash>::type; struct DefaultStringEquals { // Allow comparison with anything that can be compared to std::string, // for example std::string_view. template bool operator()(const std::string& lhs, const T& rhs) const { return lhs == rhs; } }; template using DefaultPred = typename std::conditional::value, DefaultStringEquals, std::equal_to>::type; // Low memory version of a hash set, uses less memory than std::unordered_multiset since elements // aren't boxed. Uses linear probing to resolve collisions. // EmptyFn needs to implement two functions MakeEmpty(T& item) and IsEmpty(const T& item). // TODO: We could get rid of this requirement by using a bitmap, though maybe this would be slower // and more complicated. template , class HashFn = DefaultHashFn, class Pred = DefaultPred, class Alloc = std::allocator> class HashSet { public: using value_type = T; using allocator_type = Alloc; using reference = T&; using const_reference = const T&; using pointer = T*; using const_pointer = const T*; using iterator = HashSetIterator; using const_iterator = HashSetIterator; using size_type = size_t; using difference_type = ptrdiff_t; static constexpr double kDefaultMinLoadFactor = 0.4; static constexpr double kDefaultMaxLoadFactor = 0.7; static constexpr size_t kMinBuckets = 1000; // If we don't own the data, this will create a new array which owns the data. void clear() { DeallocateStorage(); num_elements_ = 0; elements_until_expand_ = 0; } HashSet() : HashSet(kDefaultMinLoadFactor, kDefaultMaxLoadFactor) {} HashSet(double min_load_factor, double max_load_factor) noexcept : num_elements_(0u), num_buckets_(0u), elements_until_expand_(0u), owns_data_(false), data_(nullptr), min_load_factor_(min_load_factor), max_load_factor_(max_load_factor) { DCHECK_GT(min_load_factor, 0.0); DCHECK_LT(max_load_factor, 1.0); } explicit HashSet(const allocator_type& alloc) noexcept : allocfn_(alloc), hashfn_(), emptyfn_(), pred_(), num_elements_(0u), num_buckets_(0u), elements_until_expand_(0u), owns_data_(false), data_(nullptr), min_load_factor_(kDefaultMinLoadFactor), max_load_factor_(kDefaultMaxLoadFactor) { } HashSet(const HashSet& other) noexcept : allocfn_(other.allocfn_), hashfn_(other.hashfn_), emptyfn_(other.emptyfn_), pred_(other.pred_), num_elements_(other.num_elements_), num_buckets_(0), elements_until_expand_(other.elements_until_expand_), owns_data_(false), data_(nullptr), min_load_factor_(other.min_load_factor_), max_load_factor_(other.max_load_factor_) { AllocateStorage(other.NumBuckets()); for (size_t i = 0; i < num_buckets_; ++i) { ElementForIndex(i) = other.data_[i]; } } // noexcept required so that the move constructor is used instead of copy constructor. // b/27860101 HashSet(HashSet&& other) noexcept : allocfn_(std::move(other.allocfn_)), hashfn_(std::move(other.hashfn_)), emptyfn_(std::move(other.emptyfn_)), pred_(std::move(other.pred_)), num_elements_(other.num_elements_), num_buckets_(other.num_buckets_), elements_until_expand_(other.elements_until_expand_), owns_data_(other.owns_data_), data_(other.data_), min_load_factor_(other.min_load_factor_), max_load_factor_(other.max_load_factor_) { other.num_elements_ = 0u; other.num_buckets_ = 0u; other.elements_until_expand_ = 0u; other.owns_data_ = false; other.data_ = nullptr; } // Construct from existing data. // Read from a block of memory, if make_copy_of_data is false, then data_ points to within the // passed in ptr_. HashSet(const uint8_t* ptr, bool make_copy_of_data, size_t* read_count) noexcept { uint64_t temp; size_t offset = 0; offset = ReadFromBytes(ptr, offset, &temp); num_elements_ = static_cast(temp); offset = ReadFromBytes(ptr, offset, &temp); num_buckets_ = static_cast(temp); CHECK_LE(num_elements_, num_buckets_); offset = ReadFromBytes(ptr, offset, &temp); elements_until_expand_ = static_cast(temp); offset = ReadFromBytes(ptr, offset, &min_load_factor_); offset = ReadFromBytes(ptr, offset, &max_load_factor_); if (!make_copy_of_data) { owns_data_ = false; data_ = const_cast(reinterpret_cast(ptr + offset)); offset += sizeof(*data_) * num_buckets_; } else { AllocateStorage(num_buckets_); // Write elements, not that this may not be safe for cross compilation if the elements are // pointer sized. for (size_t i = 0; i < num_buckets_; ++i) { offset = ReadFromBytes(ptr, offset, &data_[i]); } } // Caller responsible for aligning. *read_count = offset; } // Returns how large the table is after being written. If target is null, then no writing happens // but the size is still returned. Target must be 8 byte aligned. size_t WriteToMemory(uint8_t* ptr) const { size_t offset = 0; offset = WriteToBytes(ptr, offset, static_cast(num_elements_)); offset = WriteToBytes(ptr, offset, static_cast(num_buckets_)); offset = WriteToBytes(ptr, offset, static_cast(elements_until_expand_)); offset = WriteToBytes(ptr, offset, min_load_factor_); offset = WriteToBytes(ptr, offset, max_load_factor_); // Write elements, not that this may not be safe for cross compilation if the elements are // pointer sized. for (size_t i = 0; i < num_buckets_; ++i) { offset = WriteToBytes(ptr, offset, data_[i]); } // Caller responsible for aligning. return offset; } ~HashSet() { DeallocateStorage(); } HashSet& operator=(HashSet&& other) noexcept { HashSet(std::move(other)).swap(*this); // NOLINT [runtime/explicit] [5] return *this; } HashSet& operator=(const HashSet& other) noexcept { HashSet(other).swap(*this); // NOLINT(runtime/explicit) - a case of lint gone mad. return *this; } // Lower case for c++11 for each. iterator begin() { iterator ret(this, 0); if (num_buckets_ != 0 && IsFreeSlot(ret.index_)) { ++ret; // Skip all the empty slots. } return ret; } // Lower case for c++11 for each. const version. const_iterator begin() const { const_iterator ret(this, 0); if (num_buckets_ != 0 && IsFreeSlot(ret.index_)) { ++ret; // Skip all the empty slots. } return ret; } // Lower case for c++11 for each. iterator end() { return iterator(this, NumBuckets()); } // Lower case for c++11 for each. const version. const_iterator end() const { return const_iterator(this, NumBuckets()); } size_t size() const { return num_elements_; } bool empty() const { return size() == 0; } // Erase algorithm: // Make an empty slot where the iterator is pointing. // Scan forwards until we hit another empty slot. // If an element in between doesn't rehash to the range from the current empty slot to the // iterator. It must be before the empty slot, in that case we can move it to the empty slot // and set the empty slot to be the location we just moved from. // Relies on maintaining the invariant that there's no empty slots from the 'ideal' index of an // element to its actual location/index. // Note that since erase shuffles back elements, it may result in the same element being visited // twice during HashSet iteration. This happens when an element already visited during iteration // gets shuffled to the end of the bucket array. iterator erase(iterator it) { // empty_index is the index that will become empty. size_t empty_index = it.index_; DCHECK(!IsFreeSlot(empty_index)); size_t next_index = empty_index; bool filled = false; // True if we filled the empty index. while (true) { next_index = NextIndex(next_index); T& next_element = ElementForIndex(next_index); // If the next element is empty, we are done. Make sure to clear the current empty index. if (emptyfn_.IsEmpty(next_element)) { emptyfn_.MakeEmpty(ElementForIndex(empty_index)); break; } // Otherwise try to see if the next element can fill the current empty index. const size_t next_hash = hashfn_(next_element); // Calculate the ideal index, if it is within empty_index + 1 to next_index then there is // nothing we can do. size_t next_ideal_index = IndexForHash(next_hash); // Loop around if needed for our check. size_t unwrapped_next_index = next_index; if (unwrapped_next_index < empty_index) { unwrapped_next_index += NumBuckets(); } // Loop around if needed for our check. size_t unwrapped_next_ideal_index = next_ideal_index; if (unwrapped_next_ideal_index < empty_index) { unwrapped_next_ideal_index += NumBuckets(); } if (unwrapped_next_ideal_index <= empty_index || unwrapped_next_ideal_index > unwrapped_next_index) { // If the target index isn't within our current range it must have been probed from before // the empty index. ElementForIndex(empty_index) = std::move(next_element); filled = true; // TODO: Optimize empty_index = next_index; } } --num_elements_; // If we didn't fill the slot then we need go to the next non free slot. if (!filled) { ++it; } return it; } // Find an element, returns end() if not found. // Allows custom key (K) types, example of when this is useful: // Set of Class* sorted by name, want to find a class with a name but can't allocate a dummy // object in the heap for performance solution. template iterator find(const K& key) { return FindWithHash(key, hashfn_(key)); } template const_iterator find(const K& key) const { return FindWithHash(key, hashfn_(key)); } template iterator FindWithHash(const K& key, size_t hash) { return iterator(this, FindIndex(key, hash)); } template const_iterator FindWithHash(const K& key, size_t hash) const { return const_iterator(this, FindIndex(key, hash)); } // Insert an element with hint, allows duplicates. // Note: The hint is not very useful for a HashSet<> unless there are many hash conflicts // and in that case the use of HashSet<> itself should be reconsidered. iterator insert(const_iterator hint ATTRIBUTE_UNUSED, const T& element) { return insert(element); } iterator insert(const_iterator hint ATTRIBUTE_UNUSED, T&& element) { return insert(std::move(element)); } // Insert an element, allows duplicates. iterator insert(const T& element) { return InsertWithHash(element, hashfn_(element)); } iterator insert(T&& element) { return InsertWithHash(std::move(element), hashfn_(element)); } template ::value>::type> iterator InsertWithHash(U&& element, size_t hash) { DCHECK_EQ(hash, hashfn_(element)); if (num_elements_ >= elements_until_expand_) { Expand(); DCHECK_LT(num_elements_, elements_until_expand_); } const size_t index = FirstAvailableSlot(IndexForHash(hash)); data_[index] = std::forward(element); ++num_elements_; return iterator(this, index); } void swap(HashSet& other) { // Use argument-dependent lookup with fall-back to std::swap() for function objects. using std::swap; swap(allocfn_, other.allocfn_); swap(hashfn_, other.hashfn_); swap(emptyfn_, other.emptyfn_); swap(pred_, other.pred_); std::swap(data_, other.data_); std::swap(num_buckets_, other.num_buckets_); std::swap(num_elements_, other.num_elements_); std::swap(elements_until_expand_, other.elements_until_expand_); std::swap(min_load_factor_, other.min_load_factor_); std::swap(max_load_factor_, other.max_load_factor_); std::swap(owns_data_, other.owns_data_); } allocator_type get_allocator() const { return allocfn_; } void ShrinkToMaximumLoad() { Resize(size() / max_load_factor_); } // Reserve enough room to insert until Size() == num_elements without requiring to grow the hash // set. No-op if the hash set is already large enough to do this. void reserve(size_t num_elements) { size_t num_buckets = num_elements / max_load_factor_; // Deal with rounding errors. Add one for rounding. while (static_cast(num_buckets * max_load_factor_) <= num_elements + 1u) { ++num_buckets; } if (num_buckets > NumBuckets()) { Resize(num_buckets); } } // To distance that inserted elements were probed. Used for measuring how good hash functions // are. size_t TotalProbeDistance() const { size_t total = 0; for (size_t i = 0; i < NumBuckets(); ++i) { const T& element = ElementForIndex(i); if (!emptyfn_.IsEmpty(element)) { size_t ideal_location = IndexForHash(hashfn_(element)); if (ideal_location > i) { total += i + NumBuckets() - ideal_location; } else { total += i - ideal_location; } } } return total; } // Calculate the current load factor and return it. double CalculateLoadFactor() const { return static_cast(size()) / static_cast(NumBuckets()); } // Make sure that everything reinserts in the right spot. Returns the number of errors. size_t Verify() NO_THREAD_SAFETY_ANALYSIS { size_t errors = 0; for (size_t i = 0; i < num_buckets_; ++i) { T& element = data_[i]; if (!emptyfn_.IsEmpty(element)) { T temp; emptyfn_.MakeEmpty(temp); std::swap(temp, element); size_t first_slot = FirstAvailableSlot(IndexForHash(hashfn_(temp))); if (i != first_slot) { LOG(ERROR) << "Element " << i << " should be in slot " << first_slot; ++errors; } std::swap(temp, element); } } return errors; } double GetMinLoadFactor() const { return min_load_factor_; } double GetMaxLoadFactor() const { return max_load_factor_; } // Change the load factor of the hash set. If the current load factor is greater than the max // specified, then we resize the hash table storage. void SetLoadFactor(double min_load_factor, double max_load_factor) { DCHECK_LT(min_load_factor, max_load_factor); DCHECK_GT(min_load_factor, 0.0); DCHECK_LT(max_load_factor, 1.0); min_load_factor_ = min_load_factor; max_load_factor_ = max_load_factor; elements_until_expand_ = NumBuckets() * max_load_factor_; // If the current load factor isn't in the range, then resize to the mean of the minimum and // maximum load factor. const double load_factor = CalculateLoadFactor(); if (load_factor > max_load_factor_) { Resize(size() / ((min_load_factor_ + max_load_factor_) * 0.5)); } } // The hash set expands when Size() reaches ElementsUntilExpand(). size_t ElementsUntilExpand() const { return elements_until_expand_; } size_t NumBuckets() const { return num_buckets_; } private: T& ElementForIndex(size_t index) { DCHECK_LT(index, NumBuckets()); DCHECK(data_ != nullptr); return data_[index]; } const T& ElementForIndex(size_t index) const { DCHECK_LT(index, NumBuckets()); DCHECK(data_ != nullptr); return data_[index]; } size_t IndexForHash(size_t hash) const { // Protect against undefined behavior (division by zero). if (UNLIKELY(num_buckets_ == 0)) { return 0; } return hash % num_buckets_; } size_t NextIndex(size_t index) const { if (UNLIKELY(++index >= num_buckets_)) { DCHECK_EQ(index, NumBuckets()); return 0; } return index; } // Find the hash table slot for an element, or return NumBuckets() if not found. // This value for not found is important so that iterator(this, FindIndex(...)) == end(). template size_t FindIndex(const K& element, size_t hash) const { // Guard against failing to get an element for a non-existing index. if (UNLIKELY(NumBuckets() == 0)) { return 0; } DCHECK_EQ(hashfn_(element), hash); size_t index = IndexForHash(hash); while (true) { const T& slot = ElementForIndex(index); if (emptyfn_.IsEmpty(slot)) { return NumBuckets(); } if (pred_(slot, element)) { return index; } index = NextIndex(index); } } bool IsFreeSlot(size_t index) const { return emptyfn_.IsEmpty(ElementForIndex(index)); } // Allocate a number of buckets. void AllocateStorage(size_t num_buckets) { num_buckets_ = num_buckets; data_ = allocfn_.allocate(num_buckets_); owns_data_ = true; for (size_t i = 0; i < num_buckets_; ++i) { allocfn_.construct(allocfn_.address(data_[i])); emptyfn_.MakeEmpty(data_[i]); } } void DeallocateStorage() { if (owns_data_) { for (size_t i = 0; i < NumBuckets(); ++i) { allocfn_.destroy(allocfn_.address(data_[i])); } if (data_ != nullptr) { allocfn_.deallocate(data_, NumBuckets()); } owns_data_ = false; } data_ = nullptr; num_buckets_ = 0; } // Expand the set based on the load factors. void Expand() { size_t min_index = static_cast(size() / min_load_factor_); // Resize based on the minimum load factor. Resize(min_index); } // Expand / shrink the table to the new specified size. void Resize(size_t new_size) { if (new_size < kMinBuckets) { new_size = kMinBuckets; } DCHECK_GE(new_size, size()); T* const old_data = data_; size_t old_num_buckets = num_buckets_; // Reinsert all of the old elements. const bool owned_data = owns_data_; AllocateStorage(new_size); for (size_t i = 0; i < old_num_buckets; ++i) { T& element = old_data[i]; if (!emptyfn_.IsEmpty(element)) { data_[FirstAvailableSlot(IndexForHash(hashfn_(element)))] = std::move(element); } if (owned_data) { allocfn_.destroy(allocfn_.address(element)); } } if (owned_data) { allocfn_.deallocate(old_data, old_num_buckets); } // When we hit elements_until_expand_, we are at the max load factor and must expand again. elements_until_expand_ = NumBuckets() * max_load_factor_; } ALWAYS_INLINE size_t FirstAvailableSlot(size_t index) const { DCHECK_LT(index, NumBuckets()); // Don't try to get a slot out of range. size_t non_empty_count = 0; while (!emptyfn_.IsEmpty(data_[index])) { index = NextIndex(index); non_empty_count++; DCHECK_LE(non_empty_count, NumBuckets()); // Don't loop forever. } return index; } size_t NextNonEmptySlot(size_t index) const { const size_t num_buckets = NumBuckets(); DCHECK_LT(index, num_buckets); do { ++index; } while (index < num_buckets && IsFreeSlot(index)); return index; } // Return new offset. template static size_t WriteToBytes(uint8_t* ptr, size_t offset, Elem n) { DCHECK_ALIGNED(ptr + offset, sizeof(n)); if (ptr != nullptr) { *reinterpret_cast(ptr + offset) = n; } return offset + sizeof(n); } template static size_t ReadFromBytes(const uint8_t* ptr, size_t offset, Elem* out) { DCHECK(ptr != nullptr); DCHECK_ALIGNED(ptr + offset, sizeof(*out)); *out = *reinterpret_cast(ptr + offset); return offset + sizeof(*out); } Alloc allocfn_; // Allocator function. HashFn hashfn_; // Hashing function. EmptyFn emptyfn_; // IsEmpty/SetEmpty function. Pred pred_; // Equals function. size_t num_elements_; // Number of inserted elements. size_t num_buckets_; // Number of hash table buckets. size_t elements_until_expand_; // Maximum number of elements until we expand the table. bool owns_data_; // If we own data_ and are responsible for freeing it. T* data_; // Backing storage. double min_load_factor_; double max_load_factor_; template friend class HashSetIterator; ART_FRIEND_TEST(InternTableTest, CrossHash); }; template void swap(HashSet& lhs, HashSet& rhs) { lhs.swap(rhs); } } // namespace art #endif // ART_LIBARTBASE_BASE_HASH_SET_H_