1 // Copyright 2017 The Chromium OS Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "bsdiff/suffix_array_index.h" 6 7 #include <limits> 8 #include <vector> 9 10 #include <divsufsort.h> 11 #include <divsufsort64.h> 12 13 #include "bsdiff/logging.h" 14 15 namespace { 16 17 // libdivsufsort C++ overloaded functions used to allow calling the right 18 // implementation based on the pointer size. 19 int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) { 20 return divsufsort(text, sa, n); 21 } 22 int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) { 23 return divsufsort64(text, sa, n); 24 } 25 26 saidx_t CallSaSearch(const uint8_t* text, 27 size_t text_size, 28 const uint8_t* pattern, 29 size_t pattern_size, 30 const saidx_t* sa, 31 size_t sa_size, 32 saidx_t* left) { 33 return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left); 34 } 35 saidx64_t CallSaSearch(const uint8_t* text, 36 size_t text_size, 37 const uint8_t* pattern, 38 size_t pattern_size, 39 const saidx64_t* sa, 40 size_t sa_size, 41 saidx64_t* left) { 42 return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left); 43 } 44 45 } // namespace 46 47 namespace bsdiff { 48 49 // The SAIDX template type must be either saidx_t or saidx64_t, which will 50 // depend on the maximum size of the input text needed. 51 template <typename SAIDX> 52 class SuffixArrayIndex : public SuffixArrayIndexInterface { 53 public: 54 SuffixArrayIndex() = default; 55 56 // Initialize and construct the suffix array of the |text| string of length 57 // |n|. The memory pointed by |text| must be kept alive. Returns whether the 58 // construction succeeded. 59 bool Init(const uint8_t* text, size_t n); 60 61 // SuffixArrayIndexInterface overrides. 62 void SearchPrefix(const uint8_t* target, 63 size_t length, 64 size_t* out_length, 65 uint64_t* out_pos) const override; 66 67 private: 68 const uint8_t* text_{nullptr}; // Owned by the caller. 69 size_t n_{0}; 70 71 std::vector<SAIDX> sa_; 72 }; 73 74 template <typename SAIDX> 75 bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) { 76 if (!sa_.empty()) { 77 // Already initialized. 78 LOG(ERROR) << "SuffixArray already initialized"; 79 return false; 80 } 81 if (static_cast<uint64_t>(n) > 82 static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) { 83 LOG(ERROR) << "Input too big (" << n << ") for this implementation"; 84 return false; 85 } 86 text_ = text; 87 n_ = n; 88 sa_.resize(n + 1); 89 90 if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) { 91 LOG(ERROR) << "divsufsrot() failed"; 92 return false; 93 } 94 95 return true; 96 } 97 98 template <typename SAIDX> 99 void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target, 100 size_t length, 101 size_t* out_length, 102 uint64_t* out_pos) const { 103 SAIDX suf_left; 104 SAIDX count = 105 CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left); 106 if (count > 0) { 107 // This is the simple case where we found the whole |target| string was 108 // found. 109 *out_pos = sa_[suf_left]; 110 *out_length = length; 111 return; 112 } 113 // In this case, |suf_left| points to the first suffix array position such 114 // that the suffix at that position is lexicographically larger than |target|. 115 // We only need to check whether the previous entry or the current entry is a 116 // longer match. 117 size_t prev_suffix_len = 0; 118 if (suf_left > 0) { 119 const size_t prev_max_len = 120 std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length); 121 const uint8_t* prev_suffix = text_ + sa_[suf_left - 1]; 122 prev_suffix_len = 123 std::mismatch(target, target + prev_max_len, prev_suffix).first - 124 target; 125 } 126 127 size_t next_suffix_len = 0; 128 if (static_cast<size_t>(suf_left) < n_) { 129 const uint8_t* next_suffix = text_ + sa_[suf_left]; 130 const size_t next_max_len = 131 std::min(n_ - static_cast<size_t>(sa_[suf_left]), length); 132 next_suffix_len = 133 std::mismatch(target, target + next_max_len, next_suffix).first - 134 target; 135 } 136 137 *out_length = std::max(next_suffix_len, prev_suffix_len); 138 if (!*out_length) { 139 *out_pos = 0; 140 } else if (next_suffix_len >= prev_suffix_len) { 141 *out_pos = sa_[suf_left]; 142 } else { 143 *out_pos = sa_[suf_left - 1]; 144 } 145 } 146 147 std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex( 148 const uint8_t* text, 149 size_t n) { 150 // The maximum supported size when using the suffix array based on the 32-bit 151 // saidx_t type. We limit this to something a bit smaller (16 bytes smaller) 152 // than the maximum representable number so references like "n + 1" are don't 153 // overflow. 154 const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16; 155 std::unique_ptr<SuffixArrayIndexInterface> ret; 156 157 if (n > kMaxSaidxSize) { 158 SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>(); 159 ret.reset(sa_ptr); 160 if (!sa_ptr->Init(text, n)) 161 return nullptr; 162 } else { 163 SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>(); 164 ret.reset(sa_ptr); 165 if (!sa_ptr->Init(text, n)) 166 return nullptr; 167 } 168 return ret; 169 } 170 171 172 } // namespace bsdiff 173