1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_HEADER_POLICY_H
18 #define LATINIME_HEADER_POLICY_H
19 
20 #include <cstdint>
21 
22 #include "defines.h"
23 #include "dictionary/header/header_read_write_utils.h"
24 #include "dictionary/interface/dictionary_header_structure_policy.h"
25 #include "dictionary/utils/entry_counters.h"
26 #include "dictionary/utils/format_utils.h"
27 #include "utils/char_utils.h"
28 #include "utils/time_keeper.h"
29 
30 namespace latinime {
31 
32 class HeaderPolicy : public DictionaryHeaderStructurePolicy {
33  public:
34     // Reads information from existing dictionary buffer.
HeaderPolicy(const uint8_t * const dictBuf,const FormatUtils::FORMAT_VERSION formatVersion)35     HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
36             : mDictFormatVersion(formatVersion),
37               mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
38               mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
39               mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
40               mLocale(readLocale()),
41               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
42               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
43               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
44                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
45               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
46                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
47               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
48                       LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
49               mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
50               mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
51                       EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
52               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
53                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
54               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
55                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
56                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
57               mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
58 
59     // Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,const std::vector<int> & locale,const DictionaryHeaderStructurePolicy::AttributeMap * const attributeMap)60     HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
61             const std::vector<int> &locale,
62             const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
63             : mDictFormatVersion(dictFormatVersion),
64               mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
65                       attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
66               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
67               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
68               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
69                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
70               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
71                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
72               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
73                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
74               mNgramCounts(readNgramCounts()), mMaxNgramCounts(readMaxNgramCounts()),
75               mExtendedRegionSize(0),
76               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
77                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
78               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
79                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
80                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
81               mCodePointTable(HeaderReadWriteUtils::readCodePointTable(&mAttributeMap)) {}
82 
83     // Copy header information
HeaderPolicy(const HeaderPolicy * const headerPolicy)84     HeaderPolicy(const HeaderPolicy *const headerPolicy)
85             : mDictFormatVersion(headerPolicy->mDictFormatVersion),
86               mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
87               mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
88               mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
89               mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
90               mIsDecayingDict(headerPolicy->mIsDecayingDict),
91               mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
92               mNgramCounts(headerPolicy->mNgramCounts),
93               mMaxNgramCounts(headerPolicy->mMaxNgramCounts),
94               mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
95               mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
96               mForgettingCurveProbabilityValuesTableId(
97                       headerPolicy->mForgettingCurveProbabilityValuesTableId),
98               mCodePointTable(headerPolicy->mCodePointTable) {}
99 
100     // Temporary dummy header.
HeaderPolicy()101     HeaderPolicy()
102             : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
103               mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
104               mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
105               mDate(0), mLastDecayedTime(0), mNgramCounts(), mMaxNgramCounts(),
106               mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
107               mForgettingCurveProbabilityValuesTableId(0), mCodePointTable(nullptr) {}
108 
~HeaderPolicy()109     ~HeaderPolicy() {}
110 
getFormatVersionNumber()111     virtual int getFormatVersionNumber() const {
112         // Conceptually this converts the symbolic value we use in the code into the
113         // hardcoded of the bytes in the file. But we want the constants to be the
114         // same so we use them for both here.
115         switch (mDictFormatVersion) {
116             case FormatUtils::VERSION_2:
117             case FormatUtils::VERSION_201:
118                 AKLOGE("Dictionary versions 2 and 201 are incompatible with this version");
119                 return FormatUtils::UNKNOWN_VERSION;
120             case FormatUtils::VERSION_202:
121                 return FormatUtils::VERSION_202;
122             case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
123                 return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
124             case FormatUtils::VERSION_402:
125                 return FormatUtils::VERSION_402;
126             case FormatUtils::VERSION_403:
127                 return FormatUtils::VERSION_403;
128             default:
129                 return FormatUtils::UNKNOWN_VERSION;
130         }
131     }
132 
isValid()133     AK_FORCE_INLINE bool isValid() const {
134         // Decaying dictionary must have historical information.
135         if (!mIsDecayingDict) {
136             return true;
137         }
138         if (mHasHistoricalInfoOfWords) {
139             return true;
140         } else {
141             return false;
142         }
143     }
144 
getSize()145     AK_FORCE_INLINE int getSize() const {
146         return mSize;
147     }
148 
getMultiWordCostMultiplier()149     AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
150         return mMultiWordCostMultiplier;
151     }
152 
isDecayingDict()153     AK_FORCE_INLINE bool isDecayingDict() const {
154         return mIsDecayingDict;
155     }
156 
requiresGermanUmlautProcessing()157     AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
158         return mRequiresGermanUmlautProcessing;
159     }
160 
getDate()161     AK_FORCE_INLINE int getDate() const {
162         return mDate;
163     }
164 
getLastDecayedTime()165     AK_FORCE_INLINE int getLastDecayedTime() const {
166         return mLastDecayedTime;
167     }
168 
getNgramCounts()169     AK_FORCE_INLINE const EntryCounts &getNgramCounts() const {
170         return mNgramCounts;
171     }
172 
getMaxNgramCounts()173     AK_FORCE_INLINE const EntryCounts getMaxNgramCounts() const {
174         return mMaxNgramCounts;
175     }
176 
getExtendedRegionSize()177     AK_FORCE_INLINE int getExtendedRegionSize() const {
178         return mExtendedRegionSize;
179     }
180 
hasHistoricalInfoOfWords()181     AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
182         return mHasHistoricalInfoOfWords;
183     }
184 
shouldBoostExactMatches()185     AK_FORCE_INLINE bool shouldBoostExactMatches() const {
186         // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
187         return !isDecayingDict();
188     }
189 
getAttributeMap()190     const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
191         return &mAttributeMap;
192     }
193 
getForgettingCurveProbabilityValuesTableId()194     AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
195         return mForgettingCurveProbabilityValuesTableId;
196     }
197 
198     void readHeaderValueOrQuestionMark(const char *const key,
199             int *outValue, int outValueSize) const;
200 
201     bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
202             const EntryCounts &entryCounts, const int extendedRegionSize,
203             BufferWithExtendableBuffer *const outBuffer) const;
204 
205     void fillInHeader(const bool updatesLastDecayedTime, const EntryCounts &entryCounts,
206             const int extendedRegionSize,
207             DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
208 
getLocale()209     AK_FORCE_INLINE const std::vector<int> *getLocale() const {
210         return &mLocale;
211     }
212 
supportsBeginningOfSentence()213     bool supportsBeginningOfSentence() const {
214         return mDictFormatVersion >= FormatUtils::VERSION_402;
215     }
216 
getCodePointTable()217     const int *getCodePointTable() const {
218         return mCodePointTable;
219     }
220 
221  private:
222     DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
223 
224     static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
225     static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
226     static const char *const IS_DECAYING_DICT_KEY;
227     static const char *const DATE_KEY;
228     static const char *const LAST_DECAYED_TIME_KEY;
229     static const char *const NGRAM_COUNT_KEYS[];
230     static const char *const MAX_NGRAM_COUNT_KEYS[];
231     static const int DEFAULT_MAX_NGRAM_COUNTS[];
232     static const char *const EXTENDED_REGION_SIZE_KEY;
233     static const char *const HAS_HISTORICAL_INFO_KEY;
234     static const char *const LOCALE_KEY;
235     static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
236     static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
237     static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
238     static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
239     static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
240     static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
241 
242     const FormatUtils::FORMAT_VERSION mDictFormatVersion;
243     const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
244     const int mSize;
245     DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
246     const std::vector<int> mLocale;
247     const float mMultiWordCostMultiplier;
248     const bool mRequiresGermanUmlautProcessing;
249     const bool mIsDecayingDict;
250     const int mDate;
251     const int mLastDecayedTime;
252     const EntryCounts mNgramCounts;
253     const EntryCounts mMaxNgramCounts;
254     const int mExtendedRegionSize;
255     const bool mHasHistoricalInfoOfWords;
256     const int mForgettingCurveProbabilityValuesTableId;
257     const int *const mCodePointTable;
258 
259     const std::vector<int> readLocale() const;
260     float readMultipleWordCostMultiplier() const;
261     bool readRequiresGermanUmlautProcessing() const;
262     const EntryCounts readNgramCounts() const;
263     const EntryCounts readMaxNgramCounts() const;
264     static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
265             const uint8_t *const dictBuf);
266 };
267 } // namespace latinime
268 #endif /* LATINIME_HEADER_POLICY_H */
269