1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "suggest/policyimpl/dictionary/header/header_policy.h"
18 
19 #include <algorithm>
20 
21 namespace latinime {
22 
23 // Note that these are corresponding definitions in Java side in DictionaryHeader.
24 const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
25 const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
26         "REQUIRES_GERMAN_UMLAUT_PROCESSING";
27 // TODO: Change attribute string to "IS_DECAYING_DICT".
28 const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
29 const char *const HeaderPolicy::DATE_KEY = "date";
30 const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
31 const char *const HeaderPolicy::UNIGRAM_COUNT_KEY = "UNIGRAM_COUNT";
32 const char *const HeaderPolicy::BIGRAM_COUNT_KEY = "BIGRAM_COUNT";
33 const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
34 // Historical info is information that is needed to support decaying such as timestamp, level and
35 // count.
36 const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
37 const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
38 const char *const HeaderPolicy::FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP_KEY =
39         "FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP";
40 const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
41         "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
42 const char *const HeaderPolicy::FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS_KEY =
43         "FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS";
44 
45 const char *const HeaderPolicy::MAX_UNIGRAM_COUNT_KEY = "MAX_UNIGRAM_COUNT";
46 const char *const HeaderPolicy::MAX_BIGRAM_COUNT_KEY = "MAX_BIGRAM_COUNT";
47 
48 const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
49 const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
50 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_OCCURRENCES_TO_LEVEL_UP = 2;
51 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
52 // 30 days
53 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_DURATION_TO_LEVEL_DOWN_IN_SECONDS =
54         30 * 24 * 60 * 60;
55 
56 const int HeaderPolicy::DEFAULT_MAX_UNIGRAM_COUNT = 10000;
57 const int HeaderPolicy::DEFAULT_MAX_BIGRAM_COUNT = 10000;
58 
59 // Used for logging. Question mark is used to indicate that the key is not found.
readHeaderValueOrQuestionMark(const char * const key,int * outValue,int outValueSize) const60 void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
61         int outValueSize) const {
62     if (outValueSize <= 0) return;
63     if (outValueSize == 1) {
64         outValue[0] = '\0';
65         return;
66     }
67     std::vector<int> keyCodePointVector;
68     HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
69     DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
70             mAttributeMap.find(keyCodePointVector);
71     if (it == mAttributeMap.end()) {
72         // The key was not found.
73         outValue[0] = '?';
74         outValue[1] = '\0';
75         return;
76     }
77     const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
78     for (int i = 0; i < terminalIndex; ++i) {
79         outValue[i] = it->second[i];
80     }
81     outValue[terminalIndex] = '\0';
82 }
83 
readLocale() const84 const std::vector<int> HeaderPolicy::readLocale() const {
85     return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
86 }
87 
readMultipleWordCostMultiplier() const88 float HeaderPolicy::readMultipleWordCostMultiplier() const {
89     const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
90             MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
91     if (demotionRate <= 0) {
92         return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
93     }
94     return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
95 }
96 
readRequiresGermanUmlautProcessing() const97 bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
98     return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
99             REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
100 }
101 
fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,const int unigramCount,const int bigramCount,const int extendedRegionSize,BufferWithExtendableBuffer * const outBuffer) const102 bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
103         const int unigramCount, const int bigramCount,
104         const int extendedRegionSize, BufferWithExtendableBuffer *const outBuffer) const {
105     int writingPos = 0;
106     DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
107     fillInHeader(updatesLastDecayedTime, unigramCount, bigramCount,
108             extendedRegionSize, &attributeMapToWrite);
109     if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
110             &writingPos)) {
111         return false;
112     }
113     if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
114             &writingPos)) {
115         return false;
116     }
117     // Temporarily writes a dummy header size.
118     int headerSizeFieldPos = writingPos;
119     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
120             &writingPos)) {
121         return false;
122     }
123     if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
124             &writingPos)) {
125         return false;
126     }
127     // Writes the actual header size.
128     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
129             &headerSizeFieldPos)) {
130         return false;
131     }
132     return true;
133 }
134 
fillInHeader(const bool updatesLastDecayedTime,const int unigramCount,const int bigramCount,const int extendedRegionSize,DictionaryHeaderStructurePolicy::AttributeMap * outAttributeMap) const135 void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime, const int unigramCount,
136         const int bigramCount, const int extendedRegionSize,
137         DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
138     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, UNIGRAM_COUNT_KEY, unigramCount);
139     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, BIGRAM_COUNT_KEY, bigramCount);
140     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
141             extendedRegionSize);
142     // Set the current time as the generation time.
143     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
144             TimeKeeper::peekCurrentTime());
145     HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
146     if (updatesLastDecayedTime) {
147         // Set current time as the last updated time.
148         HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
149                 TimeKeeper::peekCurrentTime());
150     }
151 }
152 
153 /* static */ DictionaryHeaderStructurePolicy::AttributeMap
createAttributeMapAndReadAllAttributes(const uint8_t * const dictBuf)154         HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
155     DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
156     HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
157     return attributeMap;
158 }
159 
160 } // namespace latinime
161