1 /*
2  * Copyright (C) 2013, The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/header/header_policy.h"
18 
19 #include <algorithm>
20 
21 #include "utils/ngram_utils.h"
22 
23 namespace latinime {
24 
25 // Note that these are corresponding definitions in Java side in DictionaryHeader.
26 const char *const HeaderPolicy::MULTIPLE_WORDS_DEMOTION_RATE_KEY = "MULTIPLE_WORDS_DEMOTION_RATE";
27 const char *const HeaderPolicy::REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY =
28         "REQUIRES_GERMAN_UMLAUT_PROCESSING";
29 // TODO: Change attribute string to "IS_DECAYING_DICT".
30 const char *const HeaderPolicy::IS_DECAYING_DICT_KEY = "USES_FORGETTING_CURVE";
31 const char *const HeaderPolicy::DATE_KEY = "date";
32 const char *const HeaderPolicy::LAST_DECAYED_TIME_KEY = "LAST_DECAYED_TIME";
33 const char *const HeaderPolicy::NGRAM_COUNT_KEYS[] =
34         {"UNIGRAM_COUNT", "BIGRAM_COUNT", "TRIGRAM_COUNT", "QUADGRAM_COUNT"};
35 const char *const HeaderPolicy::MAX_NGRAM_COUNT_KEYS[] =
36         {"MAX_UNIGRAM_ENTRY_COUNT", "MAX_BIGRAM_ENTRY_COUNT", "MAX_TRIGRAM_ENTRY_COUNT",
37                 "MAX_QUADGRAM_ENTRY_COUNT"};
38 const int HeaderPolicy::DEFAULT_MAX_NGRAM_COUNTS[] = {10000, 30000, 30000, 30000};
39 const char *const HeaderPolicy::EXTENDED_REGION_SIZE_KEY = "EXTENDED_REGION_SIZE";
40 // Historical info is information that is needed to support decaying such as timestamp, level and
41 // count.
42 const char *const HeaderPolicy::HAS_HISTORICAL_INFO_KEY = "HAS_HISTORICAL_INFO";
43 const char *const HeaderPolicy::LOCALE_KEY = "locale"; // match Java declaration
44 const char *const HeaderPolicy::FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID_KEY =
45         "FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID";
46 
47 const int HeaderPolicy::DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE = 100;
48 const float HeaderPolicy::MULTIPLE_WORD_COST_MULTIPLIER_SCALE = 100.0f;
49 const int HeaderPolicy::DEFAULT_FORGETTING_CURVE_PROBABILITY_VALUES_TABLE_ID = 3;
50 
51 // Used for logging. Question mark is used to indicate that the key is not found.
readHeaderValueOrQuestionMark(const char * const key,int * outValue,int outValueSize) const52 void HeaderPolicy::readHeaderValueOrQuestionMark(const char *const key, int *outValue,
53         int outValueSize) const {
54     if (outValueSize <= 0) return;
55     if (outValueSize == 1) {
56         outValue[0] = '\0';
57         return;
58     }
59     std::vector<int> keyCodePointVector;
60     HeaderReadWriteUtils::insertCharactersIntoVector(key, &keyCodePointVector);
61     DictionaryHeaderStructurePolicy::AttributeMap::const_iterator it =
62             mAttributeMap.find(keyCodePointVector);
63     if (it == mAttributeMap.end()) {
64         // The key was not found.
65         outValue[0] = '?';
66         outValue[1] = '\0';
67         return;
68     }
69     const int terminalIndex = std::min(static_cast<int>(it->second.size()), outValueSize - 1);
70     for (int i = 0; i < terminalIndex; ++i) {
71         outValue[i] = it->second[i];
72     }
73     outValue[terminalIndex] = '\0';
74 }
75 
readLocale() const76 const std::vector<int> HeaderPolicy::readLocale() const {
77     return HeaderReadWriteUtils::readCodePointVectorAttributeValue(&mAttributeMap, LOCALE_KEY);
78 }
79 
readMultipleWordCostMultiplier() const80 float HeaderPolicy::readMultipleWordCostMultiplier() const {
81     const int demotionRate = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
82             MULTIPLE_WORDS_DEMOTION_RATE_KEY, DEFAULT_MULTIPLE_WORDS_DEMOTION_RATE);
83     if (demotionRate <= 0) {
84         return static_cast<float>(MAX_VALUE_FOR_WEIGHTING);
85     }
86     return MULTIPLE_WORD_COST_MULTIPLIER_SCALE / static_cast<float>(demotionRate);
87 }
88 
readRequiresGermanUmlautProcessing() const89 bool HeaderPolicy::readRequiresGermanUmlautProcessing() const {
90     return HeaderReadWriteUtils::readBoolAttributeValue(&mAttributeMap,
91             REQUIRES_GERMAN_UMLAUT_PROCESSING_KEY, false);
92 }
93 
fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,const EntryCounts & entryCounts,const int extendedRegionSize,BufferWithExtendableBuffer * const outBuffer) const94 bool HeaderPolicy::fillInAndWriteHeaderToBuffer(const bool updatesLastDecayedTime,
95         const EntryCounts &entryCounts, const int extendedRegionSize,
96         BufferWithExtendableBuffer *const outBuffer) const {
97     int writingPos = 0;
98     DictionaryHeaderStructurePolicy::AttributeMap attributeMapToWrite(mAttributeMap);
99     fillInHeader(updatesLastDecayedTime, entryCounts, extendedRegionSize, &attributeMapToWrite);
100     if (!HeaderReadWriteUtils::writeDictionaryVersion(outBuffer, mDictFormatVersion,
101             &writingPos)) {
102         return false;
103     }
104     if (!HeaderReadWriteUtils::writeDictionaryFlags(outBuffer, mDictionaryFlags,
105             &writingPos)) {
106         return false;
107     }
108     // Temporarily writes a dummy header size.
109     int headerSizeFieldPos = writingPos;
110     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, 0 /* size */,
111             &writingPos)) {
112         return false;
113     }
114     if (!HeaderReadWriteUtils::writeHeaderAttributes(outBuffer, &attributeMapToWrite,
115             &writingPos)) {
116         return false;
117     }
118     // Writes the actual header size.
119     if (!HeaderReadWriteUtils::writeDictionaryHeaderSize(outBuffer, writingPos,
120             &headerSizeFieldPos)) {
121         return false;
122     }
123     return true;
124 }
125 
126 namespace {
127 
getIndexFromNgramType(const NgramType ngramType)128 int getIndexFromNgramType(const NgramType ngramType) {
129     return static_cast<int>(ngramType);
130 }
131 
132 } // namespace
133 
fillInHeader(const bool updatesLastDecayedTime,const EntryCounts & entryCounts,const int extendedRegionSize,DictionaryHeaderStructurePolicy::AttributeMap * outAttributeMap) const134 void HeaderPolicy::fillInHeader(const bool updatesLastDecayedTime,
135         const EntryCounts &entryCounts, const int extendedRegionSize,
136         DictionaryHeaderStructurePolicy::AttributeMap *outAttributeMap) const {
137     for (const auto ngramType : AllNgramTypes::ASCENDING) {
138         HeaderReadWriteUtils::setIntAttribute(outAttributeMap,
139                 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)],
140                 entryCounts.getNgramCount(ngramType));
141     }
142     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, EXTENDED_REGION_SIZE_KEY,
143             extendedRegionSize);
144     // Set the current time as the generation time.
145     HeaderReadWriteUtils::setIntAttribute(outAttributeMap, DATE_KEY,
146             TimeKeeper::peekCurrentTime());
147     HeaderReadWriteUtils::setCodePointVectorAttribute(outAttributeMap, LOCALE_KEY, mLocale);
148     if (updatesLastDecayedTime) {
149         // Set current time as the last updated time.
150         HeaderReadWriteUtils::setIntAttribute(outAttributeMap, LAST_DECAYED_TIME_KEY,
151                 TimeKeeper::peekCurrentTime());
152     }
153 }
154 
155 /* static */ DictionaryHeaderStructurePolicy::AttributeMap
createAttributeMapAndReadAllAttributes(const uint8_t * const dictBuf)156         HeaderPolicy::createAttributeMapAndReadAllAttributes(const uint8_t *const dictBuf) {
157     DictionaryHeaderStructurePolicy::AttributeMap attributeMap;
158     HeaderReadWriteUtils::fetchAllHeaderAttributes(dictBuf, &attributeMap);
159     return attributeMap;
160 }
161 
readNgramCounts() const162 /* static */ const EntryCounts HeaderPolicy::readNgramCounts() const {
163     MutableEntryCounters entryCounters;
164     for (const auto ngramType : AllNgramTypes::ASCENDING) {
165         const int entryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
166                 NGRAM_COUNT_KEYS[getIndexFromNgramType(ngramType)], 0 /* defaultValue */);
167         entryCounters.setNgramCount(ngramType, entryCount);
168     }
169     return entryCounters.getEntryCounts();
170 }
171 
readMaxNgramCounts() const172 /* static */ const EntryCounts HeaderPolicy::readMaxNgramCounts() const {
173     MutableEntryCounters entryCounters;
174     for (const auto ngramType : AllNgramTypes::ASCENDING) {
175         const int index = getIndexFromNgramType(ngramType);
176         const int maxEntryCount = HeaderReadWriteUtils::readIntAttributeValue(&mAttributeMap,
177                 MAX_NGRAM_COUNT_KEYS[index], DEFAULT_MAX_NGRAM_COUNTS[index]);
178         entryCounters.setNgramCount(ngramType, maxEntryCount);
179     }
180     return entryCounters.getEntryCounts();
181 }
182 
183 } // namespace latinime
184