1 // Copyright 2020 The Pigweed Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not 4 // use this file except in compliance with the License. You may obtain a copy of 5 // the License at 6 // 7 // https://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 // License for the specific language governing permissions and limitations under 13 // the License. 14 15 // This file provides the Detokenizer class, which is used to decode tokenized 16 // strings. To use a Detokenizer, load a binary format token database into 17 // memory, construct a TokenDatabase, and pass it to a Detokenizer: 18 // 19 // std::vector data = ReadFile("my_tokenized_strings.db"); 20 // Detokenizer detok(TokenDatabase::Create(data)); 21 // 22 // DetokenizedString result = detok.Detokenize(my_data); 23 // std::cout << result.BestString() << '\n'; 24 // 25 #pragma once 26 27 #include <cstddef> 28 #include <cstdint> 29 #include <span> 30 #include <string> 31 #include <unordered_map> 32 #include <utility> 33 #include <vector> 34 35 #include "pw_tokenizer/internal/decode.h" 36 #include "pw_tokenizer/token_database.h" 37 38 namespace pw::tokenizer { 39 40 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>; 41 42 // A string that has been detokenized. This class tracks all possible results if 43 // there are token collisions. 44 class DetokenizedString { 45 public: 46 DetokenizedString(uint32_t token, 47 const std::span<const TokenizedStringEntry>& entries, 48 const std::span<const uint8_t>& arguments); 49 DetokenizedString()50 DetokenizedString() : has_token_(false) {} 51 52 // True if there was only one valid match and it decoded successfully. ok()53 bool ok() const { return matches_.size() == 1 && matches_[0].ok(); } 54 55 // Returns the strings that matched the token, with the best matches first. matches()56 const std::vector<DecodedFormatString>& matches() const { return matches_; } 57 58 // Returns the detokenized string or an empty string if there were no matches. 59 // If there are multiple possible results, the DetokenizedString returns the 60 // first match. 61 std::string BestString() const; 62 63 // Returns the best match, with error messages inserted for arguments that 64 // failed to parse. 65 std::string BestStringWithErrors() const; 66 67 private: 68 uint32_t token_; 69 bool has_token_; 70 std::vector<DecodedFormatString> matches_; 71 }; 72 73 // Decodes and detokenizes strings from a TokenDatabase. This class builds a 74 // hash table from the TokenDatabase to give O(1) token lookups. 75 class Detokenizer { 76 public: 77 // Constructs a detokenizer from a TokenDatabase. The TokenDatabase is not 78 // referenced by the Detokenizer after construction; its memory can be freed. 79 Detokenizer(const TokenDatabase& database); 80 81 // Decodes and detokenizes the encoded message. Returns a DetokenizedString 82 // that stores all possible detokenized string results. 83 DetokenizedString Detokenize(const std::span<const uint8_t>& encoded) const; 84 Detokenize(const std::string_view & encoded)85 DetokenizedString Detokenize(const std::string_view& encoded) const { 86 return Detokenize(encoded.data(), encoded.size()); 87 } 88 Detokenize(const void * encoded,size_t size_bytes)89 DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const { 90 return Detokenize( 91 std::span(static_cast<const uint8_t*>(encoded), size_bytes)); 92 } 93 94 private: 95 std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_; 96 }; 97 98 } // namespace pw::tokenizer 99