1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_CMDLINE_TOKEN_RANGE_H_
18 #define ART_CMDLINE_TOKEN_RANGE_H_
19 
20 #include <assert.h>
21 #include <vector>
22 #include <string>
23 #include <algorithm>
24 #include <memory>
25 
26 namespace art {
27 // A range of tokens to make token matching algorithms easier.
28 //
29 // We try really hard to avoid copying and store only a pointer and iterators to the
30 // interiors of the vector, so a typical copy constructor never ends up doing a deep copy.
31 // It is up to the user to play nice and not to mutate the strings in-place.
32 //
33 // Tokens are only copied if a mutating operation is performed (and even then only
34 // if it *actually* mutates the token).
35 struct TokenRange {
36   // Short-hand for a vector of strings. A single string and a token is synonymous.
37   using TokenList = std::vector<std::string>;
38 
39   // Copying-from-vector constructor.
TokenRangeTokenRange40   explicit TokenRange(const TokenList& token_list)
41     : token_list_(new TokenList(token_list)),
42       begin_(token_list_->begin()),
43       end_(token_list_->end())
44   {}
45 
46   // Copying-from-iterator constructor
47   template <typename ForwardIterator>
TokenRangeTokenRange48   explicit TokenRange(ForwardIterator it_begin, ForwardIterator it_end)
49     : token_list_(new TokenList(it_begin, it_end)),
50       begin_(token_list_->begin()),
51       end_(token_list_->end())
52   {}
53 
54 #if 0
55   // Copying-from-vector constructor.
56   TokenRange(const TokenList& token_list ATTRIBUTE_UNUSED,
57              TokenList::const_iterator it_begin,
58              TokenList::const_iterator it_end)
59     : token_list_(new TokenList(it_begin, it_end)),
60       begin_(token_list_->begin()),
61       end_(token_list_->end()) {
62     assert(it_begin >= token_list.begin());
63     assert(it_end <= token_list.end());
64   }
65 #endif
66 
67   // Copying from char array constructor, convertings into tokens (strings) along the way.
TokenRangeTokenRange68   TokenRange(const char* token_list[], size_t length)
69     : token_list_(new TokenList(&token_list[0], &token_list[length])),
70       begin_(token_list_->begin()),
71       end_(token_list_->end())
72   {}
73 
74   // Non-copying move-from-vector constructor. Takes over the token vector.
TokenRangeTokenRange75   explicit TokenRange(TokenList&& token_list)
76     : token_list_(new TokenList(std::forward<TokenList>(token_list))),
77       begin_(token_list_->begin()),
78       end_(token_list_->end())
79   {}
80 
81   // Non-copying constructor. Retain reference to existing list of tokens.
TokenRangeTokenRange82   TokenRange(std::shared_ptr<TokenList> token_list,
83              TokenList::const_iterator it_begin,
84              TokenList::const_iterator it_end)
85     : token_list_(token_list),
86       begin_(it_begin),
87       end_(it_end) {
88     assert(it_begin >= token_list->begin());
89     assert(it_end <= token_list->end());
90   }
91 
92   // Non-copying copy constructor.
93   TokenRange(const TokenRange&) = default;
94 
95   // Non-copying move constructor.
96   TokenRange(TokenRange&&) = default;
97 
98   // Non-copying constructor. Retains reference to an existing list of tokens, with offset.
TokenRangeTokenRange99   explicit TokenRange(std::shared_ptr<TokenList> token_list)
100     : token_list_(token_list),
101       begin_(token_list_->begin()),
102       end_(token_list_->end())
103   {}
104 
105   // Iterator type for begin() and end(). Guaranteed to be a RandomAccessIterator.
106   using iterator = TokenList::const_iterator;
107 
108   // Iterator type for const begin() and const end(). Guaranteed to be a RandomAccessIterator.
109   using const_iterator = iterator;
110 
111   // Create a token range by splitting a string. Each separator gets their own token.
112   // Since the separator are retained as tokens, it might be useful to call
113   // RemoveToken afterwards.
SplitTokenRange114   static TokenRange Split(const std::string& string, std::initializer_list<char> separators) {
115     TokenList new_token_list;
116 
117     std::string tok;
118     for (auto&& c : string) {
119       for (char sep : separators) {
120         if (c == sep) {
121           // We spotted a separator character.
122           // Push back everything before the last separator as a new token.
123           // Push back the separator as a token.
124           if (!tok.empty()) {
125             new_token_list.push_back(tok);
126             tok = "";
127           }
128           new_token_list.push_back(std::string() + sep);
129         } else {
130           // Build up the token with another character.
131           tok += c;
132         }
133       }
134     }
135 
136     if (!tok.empty()) {
137       new_token_list.push_back(tok);
138     }
139 
140     return TokenRange(std::move(new_token_list));
141   }
142 
143   // A RandomAccessIterator to the first element in this range.
beginTokenRange144   iterator begin() const {
145     return begin_;
146   }
147 
148   // A RandomAccessIterator to one past the last element in this range.
endTokenRange149   iterator end() const {
150     return end_;
151   }
152 
153   // The size of the range, i.e. how many tokens are in it.
SizeTokenRange154   size_t Size() const {
155     return std::distance(begin_, end_);
156   }
157 
158   // Are there 0 tokens in this range?
IsEmptyTokenRange159   bool IsEmpty() const {
160     return Size() > 0;
161   }
162 
163   // Look up a token by it's offset.
GetTokenTokenRange164   const std::string& GetToken(size_t offset) const {
165     assert(offset < Size());
166     return *(begin_ + offset);
167   }
168 
169   // Does this token range equal the other range?
170   // Equality is defined as having both the same size, and
171   // each corresponding token being equal.
172   bool operator==(const TokenRange& other) const {
173     if (this == &other) {
174       return true;
175     }
176 
177     if (Size() != other.Size()) {
178       return false;
179     }
180 
181     return std::equal(begin(), end(), other.begin());
182   }
183 
184   // Look up the token at the requested index.
185   const std::string& operator[](int index) const {
186     assert(index >= 0 && static_cast<size_t>(index) < Size());
187     return *(begin() + index);
188   }
189 
190   // Does this current range start with the other range?
StartsWithTokenRange191   bool StartsWith(const TokenRange& other) const {
192     if (this == &other) {
193       return true;
194     }
195 
196     if (Size() < other.Size()) {
197       return false;
198     }
199 
200     auto& smaller = Size() < other.Size() ? *this : other;
201     auto& greater = Size() < other.Size() ? other : *this;
202 
203     return std::equal(smaller.begin(), smaller.end(), greater.begin());
204   }
205 
206   // Remove all characters 'c' from each token, potentially copying the underlying tokens.
RemoveCharacterTokenRange207   TokenRange RemoveCharacter(char c) const {
208     TokenList new_token_list(begin(), end());
209 
210     bool changed = false;
211     for (auto&& token : new_token_list) {
212       auto it = std::remove_if(token.begin(), token.end(), [&](char ch) {
213         if (ch == c) {
214           changed = true;
215           return true;
216         }
217         return false;
218       });
219       token.erase(it, token.end());
220     }
221 
222     if (!changed) {
223       return *this;
224     }
225 
226     return TokenRange(std::move(new_token_list));
227   }
228 
229   // Remove all tokens matching this one, potentially copying the underlying tokens.
RemoveTokenTokenRange230   TokenRange RemoveToken(const std::string& token) {
231     return RemoveIf([&](const std::string& tok) { return tok == token; });
232   }
233 
234   // Discard all empty tokens, potentially copying the underlying tokens.
DiscardEmptyTokenRange235   TokenRange DiscardEmpty() const {
236     return RemoveIf([](const std::string& token) { return token.empty(); });
237   }
238 
239   // Create a non-copying subset of this range.
240   // Length is trimmed so that the Slice does not go out of range.
241   TokenRange Slice(size_t offset, size_t length = std::string::npos) const {
242     assert(offset < Size());
243 
244     if (length != std::string::npos && offset + length > Size()) {
245       length = Size() - offset;
246     }
247 
248     iterator it_end;
249     if (length == std::string::npos) {
250       it_end = end();
251     } else {
252       it_end = begin() + offset + length;
253     }
254 
255     return TokenRange(token_list_, begin() + offset, it_end);
256   }
257 
258   // Try to match the string with tokens from this range.
259   // Each token is used to match exactly once (after which the next token is used, and so on).
260   // The matching happens from left-to-right in a non-greedy fashion.
261   // If the currently-matched token is the wildcard, then the new outputted token will
262   // contain as much as possible until the next token is matched.
263   //
264   // For example, if this == ["a:", "_", "b:] and "_" is the match string, then
265   // MatchSubstrings on "a:foob:" will yield: ["a:", "foo", "b:"]
266   //
267   // Since the string matching can fail (e.g. ["foo"] against "bar"), then this
268   // function can fail, in which cause it will return null.
MatchSubstringsTokenRange269   std::unique_ptr<TokenRange> MatchSubstrings(const std::string& string,
270                                               const std::string& wildcard) const {
271     TokenList new_token_list;
272 
273     size_t wildcard_idx = std::string::npos;
274     size_t string_idx = 0;
275 
276     // Function to push all the characters matched as a wildcard so far
277     // as a brand new token. It resets the wildcard matching.
278     // Empty wildcards are possible and ok, but only if wildcard matching was on.
279     auto maybe_push_wildcard_token = [&]() {
280       if (wildcard_idx != std::string::npos) {
281         size_t wildcard_length = string_idx - wildcard_idx;
282         std::string wildcard_substr = string.substr(wildcard_idx, wildcard_length);
283         new_token_list.push_back(std::move(wildcard_substr));
284 
285         wildcard_idx = std::string::npos;
286       }
287     };
288 
289     for (iterator it = begin(); it != end(); ++it) {
290       const std::string& tok = *it;
291 
292       if (tok == wildcard) {
293         maybe_push_wildcard_token();
294         wildcard_idx = string_idx;
295         continue;
296       }
297 
298       size_t next_token_idx = string.find(tok);
299       if (next_token_idx == std::string::npos) {
300         // Could not find token at all
301         return nullptr;
302       } else if (next_token_idx != string_idx && wildcard_idx == std::string::npos) {
303         // Found the token at a non-starting location, and we weren't
304         // trying to parse the wildcard.
305         return nullptr;
306       }
307 
308       new_token_list.push_back(string.substr(next_token_idx, tok.size()));
309       maybe_push_wildcard_token();
310       string_idx += tok.size();
311     }
312 
313     size_t remaining = string.size() - string_idx;
314     if (remaining > 0) {
315       if (wildcard_idx == std::string::npos) {
316         // Some characters were still remaining in the string,
317         // but it wasn't trying to match a wildcard.
318         return nullptr;
319       }
320     }
321 
322     // If some characters are remaining, the rest must be a wildcard.
323     string_idx += remaining;
324     maybe_push_wildcard_token();
325 
326     return std::unique_ptr<TokenRange>(new TokenRange(std::move(new_token_list)));
327   }
328 
329   // Do a quick match token-by-token, and see if they match.
330   // Any tokens with a wildcard in them are only matched up until the wildcard.
331   // If this is true, then the wildcard matching later on can still fail, so this is not
332   // a guarantee that the argument is correct, it's more of a strong hint that the
333   // user-provided input *probably* was trying to match this argument.
334   //
335   // Returns how many tokens were either matched (or ignored because there was a
336   // wildcard present). 0 means no match. If the size() tokens are returned.
MaybeMatchesTokenRange337   size_t MaybeMatches(const TokenRange& token_list, const std::string& wildcard) const {
338     auto token_it = token_list.begin();
339     auto token_end = token_list.end();
340     auto name_it = begin();
341     auto name_end = end();
342 
343     size_t matched_tokens = 0;
344 
345     while (token_it != token_end && name_it != name_end) {
346       // Skip token matching when the corresponding name has a wildcard in it.
347       const std::string& name = *name_it;
348 
349       size_t wildcard_idx = name.find(wildcard);
350       if (wildcard_idx == std::string::npos) {  // No wildcard present
351         // Did the definition token match the user token?
352         if (name != *token_it) {
353           return matched_tokens;
354         }
355       } else {
356         std::string name_prefix = name.substr(0, wildcard_idx);
357 
358         // Did the user token start with the up-to-the-wildcard prefix?
359         if (!StartsWith(*token_it, name_prefix)) {
360           return matched_tokens;
361         }
362       }
363 
364       ++token_it;
365       ++name_it;
366       ++matched_tokens;
367     }
368 
369     // If we got this far, it's either a full match or the token list was too short.
370     return matched_tokens;
371   }
372 
373   // Flatten the token range by joining every adjacent token with the separator character.
374   // e.g. ["hello", "world"].join('$') == "hello$world"
JoinTokenRange375   std::string Join(char separator) const {
376     TokenList tmp(begin(), end());
377     return art::Join(tmp, separator);
378     // TODO: Join should probably take an offset or iterators
379   }
380 
381  private:
StartsWithTokenRange382   static bool StartsWith(const std::string& larger, const std::string& smaller) {
383     if (larger.size() >= smaller.size()) {
384       return std::equal(smaller.begin(), smaller.end(), larger.begin());
385     }
386 
387     return false;
388   }
389 
390   template <typename TPredicate>
RemoveIfTokenRange391   TokenRange RemoveIf(const TPredicate& predicate) const {
392     // If any of the tokens in the token lists are empty, then
393     // we need to remove them and compress the token list into a smaller one.
394     bool remove = false;
395     for (auto it = begin_; it != end_; ++it) {
396       auto&& token = *it;
397 
398       if (predicate(token)) {
399         remove = true;
400         break;
401       }
402     }
403 
404     // Actually copy the token list and remove the tokens that don't match our predicate.
405     if (remove) {
406       auto token_list = std::make_shared<TokenList>(begin(), end());
407       TokenList::iterator new_end =
408           std::remove_if(token_list->begin(), token_list->end(), predicate);
409       token_list->erase(new_end, token_list->end());
410 
411       assert(token_list_->size() > token_list->size() && "Nothing was actually removed!");
412 
413       return TokenRange(token_list);
414     }
415 
416     return *this;
417   }
418 
419   const std::shared_ptr<std::vector<std::string>> token_list_;
420   const iterator begin_;
421   const iterator end_;
422 };
423 }  // namespace art
424 
425 #endif  // ART_CMDLINE_TOKEN_RANGE_H_
426