1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "bsdiff/suffix_array_index.h"
6 
7 #include <limits>
8 #include <vector>
9 
10 #include <divsufsort.h>
11 #include <divsufsort64.h>
12 
13 #include "bsdiff/logging.h"
14 
15 namespace {
16 
17 // libdivsufsort C++ overloaded functions used to allow calling the right
18 // implementation based on the pointer size.
CallDivSufSort(const uint8_t * text,saidx_t * sa,size_t n)19 int CallDivSufSort(const uint8_t* text, saidx_t* sa, size_t n) {
20   return divsufsort(text, sa, n);
21 }
CallDivSufSort(const uint8_t * text,saidx64_t * sa,size_t n)22 int CallDivSufSort(const uint8_t* text, saidx64_t* sa, size_t n) {
23   return divsufsort64(text, sa, n);
24 }
25 
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx_t * sa,size_t sa_size,saidx_t * left)26 saidx_t CallSaSearch(const uint8_t* text,
27                      size_t text_size,
28                      const uint8_t* pattern,
29                      size_t pattern_size,
30                      const saidx_t* sa,
31                      size_t sa_size,
32                      saidx_t* left) {
33   return sa_search(text, text_size, pattern, pattern_size, sa, sa_size, left);
34 }
CallSaSearch(const uint8_t * text,size_t text_size,const uint8_t * pattern,size_t pattern_size,const saidx64_t * sa,size_t sa_size,saidx64_t * left)35 saidx64_t CallSaSearch(const uint8_t* text,
36                        size_t text_size,
37                        const uint8_t* pattern,
38                        size_t pattern_size,
39                        const saidx64_t* sa,
40                        size_t sa_size,
41                        saidx64_t* left) {
42   return sa_search64(text, text_size, pattern, pattern_size, sa, sa_size, left);
43 }
44 
45 }  // namespace
46 
47 namespace bsdiff {
48 
49 // The SAIDX template type must be either saidx_t or saidx64_t, which will
50 // depend on the maximum size of the input text needed.
51 template <typename SAIDX>
52 class SuffixArrayIndex : public SuffixArrayIndexInterface {
53  public:
54   SuffixArrayIndex() = default;
55 
56   // Initialize and construct the suffix array of the |text| string of length
57   // |n|. The memory pointed by |text| must be kept alive. Returns whether the
58   // construction succeeded.
59   bool Init(const uint8_t* text, size_t n);
60 
61   // SuffixArrayIndexInterface overrides.
62   void SearchPrefix(const uint8_t* target,
63                     size_t length,
64                     size_t* out_length,
65                     uint64_t* out_pos) const override;
66 
67  private:
68   const uint8_t* text_{nullptr};  // Owned by the caller.
69   size_t n_{0};
70 
71   std::vector<SAIDX> sa_;
72 };
73 
74 template <typename SAIDX>
Init(const uint8_t * text,size_t n)75 bool SuffixArrayIndex<SAIDX>::Init(const uint8_t* text, size_t n) {
76   if (!sa_.empty()) {
77     // Already initialized.
78     LOG(ERROR) << "SuffixArray already initialized";
79     return false;
80   }
81   if (static_cast<uint64_t>(n) >
82       static_cast<uint64_t>(std::numeric_limits<SAIDX>::max())) {
83     LOG(ERROR) << "Input too big (" << n << ") for this implementation";
84     return false;
85   }
86   text_ = text;
87   n_ = n;
88   sa_.resize(n + 1);
89 
90   if (n > 0 && CallDivSufSort(text_, sa_.data(), n) != 0) {
91     LOG(ERROR) << "divsufsrot() failed";
92     return false;
93   }
94 
95   return true;
96 }
97 
98 template <typename SAIDX>
SearchPrefix(const uint8_t * target,size_t length,size_t * out_length,uint64_t * out_pos) const99 void SuffixArrayIndex<SAIDX>::SearchPrefix(const uint8_t* target,
100                                            size_t length,
101                                            size_t* out_length,
102                                            uint64_t* out_pos) const {
103   SAIDX suf_left;
104   SAIDX count =
105       CallSaSearch(text_, n_, target, length, sa_.data(), n_, &suf_left);
106   if (count > 0) {
107     // This is the simple case where we found the whole |target| string was
108     // found.
109     *out_pos = sa_[suf_left];
110     *out_length = length;
111     return;
112   }
113   // In this case, |suf_left| points to the first suffix array position such
114   // that the suffix at that position is lexicographically larger than |target|.
115   // We only need to check whether the previous entry or the current entry is a
116   // longer match.
117   size_t prev_suffix_len = 0;
118   if (suf_left > 0) {
119     const size_t prev_max_len =
120         std::min(n_ - static_cast<size_t>(sa_[suf_left - 1]), length);
121     const uint8_t* prev_suffix = text_ + sa_[suf_left - 1];
122     prev_suffix_len =
123         std::mismatch(target, target + prev_max_len, prev_suffix).first -
124         target;
125   }
126 
127   size_t next_suffix_len = 0;
128   if (static_cast<size_t>(suf_left) < n_) {
129     const uint8_t* next_suffix = text_ + sa_[suf_left];
130     const size_t next_max_len =
131         std::min(n_ - static_cast<size_t>(sa_[suf_left]), length);
132     next_suffix_len =
133         std::mismatch(target, target + next_max_len, next_suffix).first -
134         target;
135   }
136 
137   *out_length = std::max(next_suffix_len, prev_suffix_len);
138   if (!*out_length) {
139     *out_pos = 0;
140   } else if (next_suffix_len >= prev_suffix_len) {
141     *out_pos = sa_[suf_left];
142   } else {
143     *out_pos = sa_[suf_left - 1];
144   }
145 }
146 
CreateSuffixArrayIndex(const uint8_t * text,size_t n)147 std::unique_ptr<SuffixArrayIndexInterface> CreateSuffixArrayIndex(
148     const uint8_t* text,
149     size_t n) {
150   // The maximum supported size when using the suffix array based on the 32-bit
151   // saidx_t type. We limit this to something a bit smaller (16 bytes smaller)
152   // than the maximum representable number so references like "n + 1" are don't
153   // overflow.
154   const size_t kMaxSaidxSize = std::numeric_limits<saidx_t>::max() - 16;
155   std::unique_ptr<SuffixArrayIndexInterface> ret;
156 
157   if (n > kMaxSaidxSize) {
158     SuffixArrayIndex<saidx64_t>* sa_ptr = new SuffixArrayIndex<saidx64_t>();
159     ret.reset(sa_ptr);
160     if (!sa_ptr->Init(text, n))
161       return nullptr;
162   } else {
163     SuffixArrayIndex<saidx_t>* sa_ptr = new SuffixArrayIndex<saidx_t>();
164     ret.reset(sa_ptr);
165     if (!sa_ptr->Init(text, n))
166       return nullptr;
167   }
168   return ret;
169 }
170 
171 
172 }  // namespace bsdiff
173