1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_HEADER_POLICY_H
18 #define LATINIME_HEADER_POLICY_H
19 
#include <cstdint>
#include <vector>

#include "defines.h"
#include "suggest/core/policy/dictionary_header_structure_policy.h"
#include "suggest/policyimpl/dictionary/header/header_read_write_utils.h"
#include "suggest/policyimpl/dictionary/utils/format_utils.h"
#include "utils/char_utils.h"
#include "utils/time_keeper.h"
28 
29 namespace latinime {
30 
31 class HeaderPolicy : public DictionaryHeaderStructurePolicy {
32  public:
33     // Reads information from existing dictionary buffer.
HeaderPolicy(const uint8_t * const dictBuf,const FormatUtils::FORMAT_VERSION formatVersion)34     HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion)
35             : mDictFormatVersion(formatVersion),
36               mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)),
37               mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)),
38               mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)),
39               mLocale(readLocale()),
40               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
41               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
42               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
43                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
44               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
45                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
46               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
47                       LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
48               mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
49                       UNIGRAM_COUNT_KEY, 0 /* defaultValue */)),
50               mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
51                       BIGRAM_COUNT_KEY, 0 /* defaultValue */)),
52               mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
53                       EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)),
54               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
55                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
56               mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
57                       &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
58                       DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
59               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
60                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
61                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
62               mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
63                       &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
64                       DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
65               mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
66                       &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
67               mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
68                       &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
69 
70     // Constructs header information using an attribute map.
HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,const std::vector<int> & locale,const DictionaryHeaderStructurePolicy::AttributeMap * const attributeMap)71     HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,
72             const std::vector<int> &locale,
73             const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap)
74             : mDictFormatVersion(dictFormatVersion),
75               mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap(
76                       attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale),
77               mMultiWordCostMultiplier(readMultipleWordCostMultiplier()),
78               mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()),
79               mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
80                       IS_DECAYING_DICT_KEY, false /* defaultValue */)),
81               mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
82                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
83               mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
84                       DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)),
85               mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0),
86               mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue(
87                       &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)),
88               mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue(
89                       &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY,
90                       DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)),
91               mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue(
92                       &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY,
93                       DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)),
94               mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue(
95                       &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY,
96                       DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)),
97               mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(
98                       &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)),
99               mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue(
100                       &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {}
101 
102     // Copy header information
HeaderPolicy(const HeaderPolicy * const headerPolicy)103     HeaderPolicy(const HeaderPolicy *const headerPolicy)
104             : mDictFormatVersion(headerPolicy->mDictFormatVersion),
105               mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize),
106               mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale),
107               mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier),
108               mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing),
109               mIsDecayingDict(headerPolicy->mIsDecayingDict),
110               mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime),
111               mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount),
112               mExtendedRegionSize(headerPolicy->mExtendedRegionSize),
113               mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords),
114               mForgettingCurveOccurrencesToLevelUp(
115                       headerPolicy->mForgettingCurveOccurrencesToLevelUp),
116               mForgettingCurveProbabilityValuesTableId(
117                       headerPolicy->mForgettingCurveProbabilityValuesTableId),
118               mForgettingCurveDurationToLevelDown(
119                       headerPolicy->mForgettingCurveDurationToLevelDown),
120               mMaxUnigramCount(headerPolicy->mMaxUnigramCount),
121               mMaxBigramCount(headerPolicy->mMaxBigramCount) {}
122 
123     // Temporary dummy header.
HeaderPolicy()124     HeaderPolicy()
125             : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0),
126               mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f),
127               mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false),
128               mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0),
129               mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false),
130               mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0),
131               mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {}
132 
~HeaderPolicy()133     ~HeaderPolicy() {}
134 
getFormatVersionNumber()135     virtual int getFormatVersionNumber() const {
136         // Conceptually this converts the symbolic value we use in the code into the
137         // hardcoded of the bytes in the file. But we want the constants to be the
138         // same so we use them for both here.
139         switch (mDictFormatVersion) {
140             case FormatUtils::VERSION_2:
141                 return FormatUtils::VERSION_2;
142             case FormatUtils::VERSION_4_ONLY_FOR_TESTING:
143                 return FormatUtils::VERSION_4_ONLY_FOR_TESTING;
144             case FormatUtils::VERSION_4:
145                 return FormatUtils::VERSION_4;
146             case FormatUtils::VERSION_4_DEV:
147                 return FormatUtils::VERSION_4_DEV;
148             default:
149                 return FormatUtils::UNKNOWN_VERSION;
150         }
151     }
152 
isValid()153     AK_FORCE_INLINE bool isValid() const {
154         // Decaying dictionary must have historical information.
155         if (!mIsDecayingDict) {
156             return true;
157         }
158         if (mHasHistoricalInfoOfWords) {
159             return true;
160         } else {
161             return false;
162         }
163     }
164 
getSize()165     AK_FORCE_INLINE int getSize() const {
166         return mSize;
167     }
168 
getMultiWordCostMultiplier()169     AK_FORCE_INLINE float getMultiWordCostMultiplier() const {
170         return mMultiWordCostMultiplier;
171     }
172 
isDecayingDict()173     AK_FORCE_INLINE bool isDecayingDict() const {
174         return mIsDecayingDict;
175     }
176 
requiresGermanUmlautProcessing()177     AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const {
178         return mRequiresGermanUmlautProcessing;
179     }
180 
getDate()181     AK_FORCE_INLINE int getDate() const {
182         return mDate;
183     }
184 
getLastDecayedTime()185     AK_FORCE_INLINE int getLastDecayedTime() const {
186         return mLastDecayedTime;
187     }
188 
getUnigramCount()189     AK_FORCE_INLINE int getUnigramCount() const {
190         return mUnigramCount;
191     }
192 
getBigramCount()193     AK_FORCE_INLINE int getBigramCount() const {
194         return mBigramCount;
195     }
196 
getExtendedRegionSize()197     AK_FORCE_INLINE int getExtendedRegionSize() const {
198         return mExtendedRegionSize;
199     }
200 
hasHistoricalInfoOfWords()201     AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const {
202         return mHasHistoricalInfoOfWords;
203     }
204 
shouldBoostExactMatches()205     AK_FORCE_INLINE bool shouldBoostExactMatches() const {
206         // TODO: Investigate better ways to handle exact matches for personalized dictionaries.
207         return !isDecayingDict();
208     }
209 
getAttributeMap()210     const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const {
211         return &mAttributeMap;
212     }
213 
getForgettingCurveOccurrencesToLevelUp()214     AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const {
215         return mForgettingCurveOccurrencesToLevelUp;
216     }
217 
getForgettingCurveProbabilityValuesTableId()218     AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const {
219         return mForgettingCurveProbabilityValuesTableId;
220     }
221 
getForgettingCurveDurationToLevelDown()222     AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const {
223         return mForgettingCurveDurationToLevelDown;
224     }
225 
getMaxUnigramCount()226     AK_FORCE_INLINE int getMaxUnigramCount() const {
227         return mMaxUnigramCount;
228     }
229 
getMaxBigramCount()230     AK_FORCE_INLINE int getMaxBigramCount() const {
231         return mMaxBigramCount;
232     }
233 
234     void readHeaderValueOrQuestionMark(const char *const key,
235             int *outValue, int outValueSize) const;
236 
237     bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
238             const int unigramCount, const int bigramCount,
239             const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const;
240 
241     void fillInHeader(const bool updatesLastDecayedTime,
242             const int unigramCount, const int bigramCount, const int extendedRegionSize,
243             DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const;
244 
getLocale()245     AK_FORCE_INLINE const std::vector<int> *getLocale() const {
246         return &mLocale;
247     }
248 
supportsBeginningOfSentence()249     bool supportsBeginningOfSentence() const {
250         return mDictFormatVersion >= FormatUtils::VERSION_4;
251     }
252 
253  private:
254     DISALLOW_COPY_AND_ASSIGN(HeaderPolicy);
255 
256     static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY;
257     static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY;
258     static const char *const IS_DECAYING_DICT_KEY;
259     static const char *const DATE_KEY;
260     static const char *const LAST_DECAYED_TIME_KEY;
261     static const char *const UNIGRAM_COUNT_KEY;
262     static const char *const BIGRAM_COUNT_KEY;
263     static const char *const EXTENDED_REGION_SIZE_KEY;
264     static const char *const HAS_HISTORICAL_INFO_KEY;
265     static const char *const LOCALE_KEY;
266     static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY;
267     static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY;
268     static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY;
269     static const char *const MAX_UNIGRAM_COUNT_KEY;
270     static const char *const MAX_BIGRAM_COUNT_KEY;
271     static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE;
272     static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE;
273     static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP;
274     static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID;
275     static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS;
276     static const int DEFAULT_MAX_UNIGRAM_COUNT;
277     static const int DEFAULT_MAX_BIGRAM_COUNT;
278 
279     const FormatUtils::FORMAT_VERSION mDictFormatVersion;
280     const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags;
281     const int mSize;
282     DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap;
283     const std::vector<int> mLocale;
284     const float mMultiWordCostMultiplier;
285     const bool mRequiresGermanUmlautProcessing;
286     const bool mIsDecayingDict;
287     const int mDate;
288     const int mLastDecayedTime;
289     const int mUnigramCount;
290     const int mBigramCount;
291     const int mExtendedRegionSize;
292     const bool mHasHistoricalInfoOfWords;
293     const int mForgettingCurveOccurrencesToLevelUp;
294     const int mForgettingCurveProbabilityValuesTableId;
295     const int mForgettingCurveDurationToLevelDown;
296     const int mMaxUnigramCount;
297     const int mMaxBigramCount;
298 
299     const std::vector<int> readLocale() const;
300     float readMultipleWordCostMultiplier() const;
301     bool readRequiresGermanUmlautProcessing() const;
302 
303     static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes(
304             const uint8_t *const dictBuf);
305 };
306 } // namespace latinime
307 #endif /* LATINIME_HEADER_POLICY_H */
308