1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 // This file provides the Detokenizer class, which is used to decode tokenized
16 // strings.  To use a Detokenizer, load a binary format token database into
17 // memory, construct a TokenDatabase, and pass it to a Detokenizer:
18 //
19 //   std::vector data = ReadFile("my_tokenized_strings.db");
20 //   Detokenizer detok(TokenDatabase::Create(data));
21 //
22 //   DetokenizedString result = detok.Detokenize(my_data);
23 //   std::cout << result.BestString() << '\n';
24 //
25 #pragma once
26 
#include <cstddef>
#include <cstdint>
#include <span>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/token_database.h"
37 
38 namespace pw::tokenizer {
39 
40 using TokenizedStringEntry = std::pair<FormatString, uint32_t /*date removed*/>;
41 
42 // A string that has been detokenized. This class tracks all possible results if
43 // there are token collisions.
44 class DetokenizedString {
45  public:
46   DetokenizedString(uint32_t token,
47                     const std::span<const TokenizedStringEntry>& entries,
48                     const std::span<const uint8_t>& arguments);
49 
DetokenizedString()50   DetokenizedString() : has_token_(false) {}
51 
52   // True if there was only one valid match and it decoded successfully.
ok()53   bool ok() const { return matches_.size() == 1 && matches_[0].ok(); }
54 
55   // Returns the strings that matched the token, with the best matches first.
matches()56   const std::vector<DecodedFormatString>& matches() const { return matches_; }
57 
58   // Returns the detokenized string or an empty string if there were no matches.
59   // If there are multiple possible results, the DetokenizedString returns the
60   // first match.
61   std::string BestString() const;
62 
63   // Returns the best match, with error messages inserted for arguments that
64   // failed to parse.
65   std::string BestStringWithErrors() const;
66 
67  private:
68   uint32_t token_;
69   bool has_token_;
70   std::vector<DecodedFormatString> matches_;
71 };
72 
73 // Decodes and detokenizes strings from a TokenDatabase. This class builds a
74 // hash table from the TokenDatabase to give O(1) token lookups.
75 class Detokenizer {
76  public:
77   // Constructs a detokenizer from a TokenDatabase. The TokenDatabase is not
78   // referenced by the Detokenizer after construction; its memory can be freed.
79   Detokenizer(const TokenDatabase& database);
80 
81   // Decodes and detokenizes the encoded message. Returns a DetokenizedString
82   // that stores all possible detokenized string results.
83   DetokenizedString Detokenize(const std::span<const uint8_t>& encoded) const;
84 
Detokenize(const std::string_view & encoded)85   DetokenizedString Detokenize(const std::string_view& encoded) const {
86     return Detokenize(encoded.data(), encoded.size());
87   }
88 
Detokenize(const void * encoded,size_t size_bytes)89   DetokenizedString Detokenize(const void* encoded, size_t size_bytes) const {
90     return Detokenize(
91         std::span(static_cast<const uint8_t*>(encoded), size_bytes));
92   }
93 
94  private:
95   std::unordered_map<uint32_t, std::vector<TokenizedStringEntry>> database_;
96 };
97 
98 }  // namespace pw::tokenizer
99