1 /* 2 * Copyright (C) 2013, The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LATINIME_HEADER_POLICY_H 18 #define LATINIME_HEADER_POLICY_H 19 20 #include <cstdint> 21 22 #include "defines.h" 23 #include "suggest/core/policy/dictionary_header_structure_policy.h" 24 #include "suggest/policyimpl/dictionary/header/header_read_write_utils.h" 25 #include "suggest/policyimpl/dictionary/utils/format_utils.h" 26 #include "utils/char_utils.h" 27 #include "utils/time_keeper.h" 28 29 namespace latinime { 30 31 class HeaderPolicy : public DictionaryHeaderStructurePolicy { 32 public: 33 // Reads information from existing dictionary buffer. HeaderPolicy(const uint8_t * const dictBuf,const FormatUtils::FORMAT_VERSION formatVersion)34 HeaderPolicy(const uint8_t *const dictBuf, const FormatUtils::FORMAT_VERSION formatVersion) 35 : mDictFormatVersion(formatVersion), 36 mDictionaryFlags(HeaderReadWriteUtils::getFlags(dictBuf)), 37 mSize(HeaderReadWriteUtils::getHeaderSize(dictBuf)), 38 mAttributeMap(createAttributeMapAndReadAllAttributes(dictBuf)), 39 mLocale(readLocale()), 40 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), 41 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), 42 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, 43 IS_DECAYING_DICT_KEY, false /* defaultValue */)), 44 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 45 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 46 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 47 LAST_DECAYED_TIME_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 48 mUnigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 49 UNIGRAM_COUNT_KEY, 0 /* defaultValue */)), 50 mBigramCount(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 51 BIGRAM_COUNT_KEY, 0 /* defaultValue */)), 52 mExtendedRegionSize(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 53 EXTENDED_REGION_SIZE_KEY, 0 /* defaultValue */)), 54 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( 55 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), 56 mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( 57 &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, 58 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), 59 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( 60 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, 61 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), 62 mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( 63 &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, 64 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), 65 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( 66 &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), 67 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( 68 &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} 69 70 // Constructs header information using an attribute map. HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion,const std::vector<int> & locale,const DictionaryHeaderStructurePolicy::AttributeMap * const attributeMap)71 HeaderPolicy(const FormatUtils::FORMAT_VERSION dictFormatVersion, 72 const std::vector<int> &locale, 73 const DictionaryHeaderStructurePolicy::AttributeMap *const attributeMap) 74 : mDictFormatVersion(dictFormatVersion), 75 mDictionaryFlags(HeaderReadWriteUtils::createAndGetDictionaryFlagsUsingAttributeMap( 76 attributeMap)), mSize(0), mAttributeMap(*attributeMap), mLocale(locale), 77 mMultiWordCostMultiplier(readMultipleWordCostMultiplier()), 78 mRequiresGermanUmlautProcessing(readRequiresGermanUmlautProcessing()), 79 mIsDecayingDict(HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap, 80 IS_DECAYING_DICT_KEY, false /* defaultValue */)), 81 mDate(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 82 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 83 mLastDecayedTime(HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap, 84 DATE_KEY, TimeKeeper::peekCurrentTime() /* defaultValue */)), 85 mUnigramCount(0), mBigramCount(0), mExtendedRegionSize(0), 86 mHasHistoricalInfoOfWords(HeaderReadWriteUtils::readBoolAttributeValue( 87 &mAttributeMap, HAS_HISTORICAL_INFO_KEY, false /* defaultValue */)), 88 mForgettingCurveOccurrencesToLevelUp(HeaderReadWriteUtils::readIntAttributeValue( 89 &mAttributeMap, FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY, 90 DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP)), 91 mForgettingCurveProbabilityValuesTableId(HeaderReadWriteUtils::readIntAttributeValue( 92 &mAttributeMap, FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY, 93 DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID)), 94 mForgettingCurveDurationToLevelDown(HeaderReadWriteUtils::readIntAttributeValue( 95 &mAttributeMap, FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY, 96 DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS)), 97 mMaxUnigramCount(HeaderReadWriteUtils::readIntAttributeValue( 98 &mAttributeMap, MAX_UNIGRAM_COUNT_KEY, DEFAULT_MAX_UNIGRAM_COUNT)), 99 mMaxBigramCount(HeaderReadWriteUtils::readIntAttributeValue( 100 &mAttributeMap, MAX_BIGRAM_COUNT_KEY, DEFAULT_MAX_BIGRAM_COUNT)) {} 101 102 // Copy header information HeaderPolicy(const HeaderPolicy * const headerPolicy)103 HeaderPolicy(const HeaderPolicy *const headerPolicy) 104 : mDictFormatVersion(headerPolicy->mDictFormatVersion), 105 mDictionaryFlags(headerPolicy->mDictionaryFlags), mSize(headerPolicy->mSize), 106 mAttributeMap(headerPolicy->mAttributeMap), mLocale(headerPolicy->mLocale), 107 mMultiWordCostMultiplier(headerPolicy->mMultiWordCostMultiplier), 108 mRequiresGermanUmlautProcessing(headerPolicy->mRequiresGermanUmlautProcessing), 109 mIsDecayingDict(headerPolicy->mIsDecayingDict), 110 mDate(headerPolicy->mDate), mLastDecayedTime(headerPolicy->mLastDecayedTime), 111 mUnigramCount(headerPolicy->mUnigramCount), mBigramCount(headerPolicy->mBigramCount), 112 mExtendedRegionSize(headerPolicy->mExtendedRegionSize), 113 mHasHistoricalInfoOfWords(headerPolicy->mHasHistoricalInfoOfWords), 114 mForgettingCurveOccurrencesToLevelUp( 115 headerPolicy->mForgettingCurveOccurrencesToLevelUp), 116 mForgettingCurveProbabilityValuesTableId( 117 headerPolicy->mForgettingCurveProbabilityValuesTableId), 118 mForgettingCurveDurationToLevelDown( 119 headerPolicy->mForgettingCurveDurationToLevelDown), 120 mMaxUnigramCount(headerPolicy->mMaxUnigramCount), 121 mMaxBigramCount(headerPolicy->mMaxBigramCount) {} 122 123 // Temporary dummy header. HeaderPolicy()124 HeaderPolicy() 125 : mDictFormatVersion(FormatUtils::UNKNOWN_VERSION), mDictionaryFlags(0), mSize(0), 126 mAttributeMap(), mLocale(CharUtils::EMPTY_STRING), mMultiWordCostMultiplier(0.0f), 127 mRequiresGermanUmlautProcessing(false), mIsDecayingDict(false), 128 mDate(0), mLastDecayedTime(0), mUnigramCount(0), mBigramCount(0), 129 mExtendedRegionSize(0), mHasHistoricalInfoOfWords(false), 130 mForgettingCurveOccurrencesToLevelUp(0), mForgettingCurveProbabilityValuesTableId(0), 131 mForgettingCurveDurationToLevelDown(0), mMaxUnigramCount(0), mMaxBigramCount(0) {} 132 ~HeaderPolicy()133 ~HeaderPolicy() {} 134 getFormatVersionNumber()135 virtual int getFormatVersionNumber() const { 136 // Conceptually this converts the symbolic value we use in the code into the 137 // hardcoded of the bytes in the file. But we want the constants to be the 138 // same so we use them for both here. 139 switch (mDictFormatVersion) { 140 case FormatUtils::VERSION_2: 141 return FormatUtils::VERSION_2; 142 case FormatUtils::VERSION_4_ONLY_FOR_TESTING: 143 return FormatUtils::VERSION_4_ONLY_FOR_TESTING; 144 case FormatUtils::VERSION_4: 145 return FormatUtils::VERSION_4; 146 case FormatUtils::VERSION_4_DEV: 147 return FormatUtils::VERSION_4_DEV; 148 default: 149 return FormatUtils::UNKNOWN_VERSION; 150 } 151 } 152 isValid()153 AK_FORCE_INLINE bool isValid() const { 154 // Decaying dictionary must have historical information. 155 if (!mIsDecayingDict) { 156 return true; 157 } 158 if (mHasHistoricalInfoOfWords) { 159 return true; 160 } else { 161 return false; 162 } 163 } 164 getSize()165 AK_FORCE_INLINE int getSize() const { 166 return mSize; 167 } 168 getMultiWordCostMultiplier()169 AK_FORCE_INLINE float getMultiWordCostMultiplier() const { 170 return mMultiWordCostMultiplier; 171 } 172 isDecayingDict()173 AK_FORCE_INLINE bool isDecayingDict() const { 174 return mIsDecayingDict; 175 } 176 requiresGermanUmlautProcessing()177 AK_FORCE_INLINE bool requiresGermanUmlautProcessing() const { 178 return mRequiresGermanUmlautProcessing; 179 } 180 getDate()181 AK_FORCE_INLINE int getDate() const { 182 return mDate; 183 } 184 getLastDecayedTime()185 AK_FORCE_INLINE int getLastDecayedTime() const { 186 return mLastDecayedTime; 187 } 188 getUnigramCount()189 AK_FORCE_INLINE int getUnigramCount() const { 190 return mUnigramCount; 191 } 192 getBigramCount()193 AK_FORCE_INLINE int getBigramCount() const { 194 return mBigramCount; 195 } 196 getExtendedRegionSize()197 AK_FORCE_INLINE int getExtendedRegionSize() const { 198 return mExtendedRegionSize; 199 } 200 hasHistoricalInfoOfWords()201 AK_FORCE_INLINE bool hasHistoricalInfoOfWords() const { 202 return mHasHistoricalInfoOfWords; 203 } 204 shouldBoostExactMatches()205 AK_FORCE_INLINE bool shouldBoostExactMatches() const { 206 // TODO: Investigate better ways to handle exact matches for personalized dictionaries. 207 return !isDecayingDict(); 208 } 209 getAttributeMap()210 const DictionaryHeaderStructurePolicy::AttributeMap *getAttributeMap() const { 211 return &mAttributeMap; 212 } 213 getForgettingCurveOccurrencesToLevelUp()214 AK_FORCE_INLINE int getForgettingCurveOccurrencesToLevelUp() const { 215 return mForgettingCurveOccurrencesToLevelUp; 216 } 217 getForgettingCurveProbabilityValuesTableId()218 AK_FORCE_INLINE int getForgettingCurveProbabilityValuesTableId() const { 219 return mForgettingCurveProbabilityValuesTableId; 220 } 221 getForgettingCurveDurationToLevelDown()222 AK_FORCE_INLINE int getForgettingCurveDurationToLevelDown() const { 223 return mForgettingCurveDurationToLevelDown; 224 } 225 getMaxUnigramCount()226 AK_FORCE_INLINE int getMaxUnigramCount() const { 227 return mMaxUnigramCount; 228 } 229 getMaxBigramCount()230 AK_FORCE_INLINE int getMaxBigramCount() const { 231 return mMaxBigramCount; 232 } 233 234 void readHeaderValueOrQuestionMark(const char *const key, 235 int *outValue, int outValueSize) const; 236 237 bool fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime, 238 const int unigramCount, const int bigramCount, 239 const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const; 240 241 void fillInHeader(const bool updatesLastDecayedTime, 242 const int unigramCount, const int bigramCount, const int extendedRegionSize, 243 DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const; 244 getLocale()245 AK_FORCE_INLINE const std::vector<int> *getLocale() const { 246 return &mLocale; 247 } 248 supportsBeginningOfSentence()249 bool supportsBeginningOfSentence() const { 250 return mDictFormatVersion >= FormatUtils::VERSION_4; 251 } 252 253 private: 254 DISALLOW_COPY_AND_ASSIGN(HeaderPolicy); 255 256 static const char *const MULTIPLE_WORDS_DEMOTION_RATE_KEY; 257 static const char *const REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY; 258 static const char *const IS_DECAYING_DICT_KEY; 259 static const char *const DATE_KEY; 260 static const char *const LAST_DECAYED_TIME_KEY; 261 static const char *const UNIGRAM_COUNT_KEY; 262 static const char *const BIGRAM_COUNT_KEY; 263 static const char *const EXTENDED_REGION_SIZE_KEY; 264 static const char *const HAS_HISTORICAL_INFO_KEY; 265 static const char *const LOCALE_KEY; 266 static const char *const FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY; 267 static const char *const FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY; 268 static const char *const FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY; 269 static const char *const MAX_UNIGRAM_COUNT_KEY; 270 static const char *const MAX_BIGRAM_COUNT_KEY; 271 static const int DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE; 272 static const float MULTIPLE_WORD_COST_MULTIPLIER_SCALE; 273 static const int DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP; 274 static const int DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID; 275 static const int DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS; 276 static const int DEFAULT_MAX_UNIGRAM_COUNT; 277 static const int DEFAULT_MAX_BIGRAM_COUNT; 278 279 const FormatUtils::FORMAT_VERSION mDictFormatVersion; 280 const HeaderReadWriteUtils::DictionaryFlags mDictionaryFlags; 281 const int mSize; 282 DictionaryHeaderStructurePolicy::AttributeMap mAttributeMap; 283 const std::vector<int> mLocale; 284 const float mMultiWordCostMultiplier; 285 const bool mRequiresGermanUmlautProcessing; 286 const bool mIsDecayingDict; 287 const int mDate; 288 const int mLastDecayedTime; 289 const int mUnigramCount; 290 const int mBigramCount; 291 const int mExtendedRegionSize; 292 const bool mHasHistoricalInfoOfWords; 293 const int mForgettingCurveOccurrencesToLevelUp; 294 const int mForgettingCurveProbabilityValuesTableId; 295 const int mForgettingCurveDurationToLevelDown; 296 const int mMaxUnigramCount; 297 const int mMaxBigramCount; 298 299 const std::vector<int> readLocale() const; 300 float readMultipleWordCostMultiplier() const; 301 bool readRequiresGermanUmlautProcessing() const; 302 303 static DictionaryHeaderStructurePolicy::AttributeMap createAttributeMapAndReadAllAttributes( 304 const uint8_t *const dictBuf); 305 }; 306 } // namespace latinime 307 #endif /* LATINIME_HEADER_POLICY_H */ 308