1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "dictionary/property/ngram_context.h"
18 
19 #include "dictionary/interface/dictionary_structure_with_buffer_policy.h"
20 #include "utils/char_utils.h"
21 
22 namespace latinime {
23 
NgramContext()24 NgramContext::NgramContext() : mPrevWordCount(0) {}
25 
// Copy constructor. Copies the code points, code point count and beginning-of-sentence flag of
// the first mPrevWordCount previous words from ngramContext.
NgramContext::NgramContext(const NgramContext &ngramContext)
        : mPrevWordCount(ngramContext.mPrevWordCount) {
    // Reset every slot before copying. Slots at or beyond mPrevWordCount are not copied below,
    // yet isValid() always inspects slot 0, so they must hold well-defined values even when
    // the source context has no previous words.
    clear();
    for (size_t i = 0; i < mPrevWordCount; ++i) {
        mPrevWordCodePointCount[i] = ngramContext.mPrevWordCodePointCount[i];
        // Only the code points that are actually in use are copied.
        memmove(mPrevWordCodePoints[i], ngramContext.mPrevWordCodePoints[i],
                sizeof(mPrevWordCodePoints[i][0]) * mPrevWordCodePointCount[i]);
        mIsBeginningOfSentence[i] = ngramContext.mIsBeginningOfSentence[i];
    }
}
35 
NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],const int * const prevWordCodePointCount,const bool * const isBeginningOfSentence,const size_t prevWordCount)36 NgramContext::NgramContext(const int prevWordCodePoints[][MAX_WORD_LENGTH],
37         const int *const prevWordCodePointCount, const bool *const isBeginningOfSentence,
38         const size_t prevWordCount)
39         : mPrevWordCount(std::min(NELEMS(mPrevWordCodePoints), prevWordCount)) {
40     clear();
41     for (size_t i = 0; i < mPrevWordCount; ++i) {
42         if (prevWordCodePointCount[i] < 0 || prevWordCodePointCount[i] > MAX_WORD_LENGTH) {
43             continue;
44         }
45         memmove(mPrevWordCodePoints[i], prevWordCodePoints[i],
46                 sizeof(mPrevWordCodePoints[i][0]) * prevWordCodePointCount[i]);
47         mPrevWordCodePointCount[i] = prevWordCodePointCount[i];
48         mIsBeginningOfSentence[i] = isBeginningOfSentence[i];
49     }
50 }
51 
NgramContext(const int * const prevWordCodePoints,const int prevWordCodePointCount,const bool isBeginningOfSentence)52 NgramContext::NgramContext(const int *const prevWordCodePoints, const int prevWordCodePointCount,
53         const bool isBeginningOfSentence) : mPrevWordCount(1) {
54     clear();
55     if (prevWordCodePointCount > MAX_WORD_LENGTH || !prevWordCodePoints) {
56         return;
57     }
58     memmove(mPrevWordCodePoints[0], prevWordCodePoints,
59             sizeof(mPrevWordCodePoints[0][0]) * prevWordCodePointCount);
60     mPrevWordCodePointCount[0] = prevWordCodePointCount;
61     mIsBeginningOfSentence[0] = isBeginningOfSentence;
62 }
63 
isValid() const64 bool NgramContext::isValid() const {
65     if (mPrevWordCodePointCount[0] > 0) {
66         return true;
67     }
68     if (mIsBeginningOfSentence[0]) {
69         return true;
70     }
71     return false;
72 }
73 
getNthPrevWordCodePoints(const size_t n) const74 const CodePointArrayView NgramContext::getNthPrevWordCodePoints(const size_t n) const {
75     if (n <= 0 || n > mPrevWordCount) {
76         return CodePointArrayView();
77     }
78     return CodePointArrayView(mPrevWordCodePoints[n - 1], mPrevWordCodePointCount[n - 1]);
79 }
80 
isNthPrevWordBeginningOfSentence(const size_t n) const81 bool NgramContext::isNthPrevWordBeginningOfSentence(const size_t n) const {
82     if (n <= 0 || n > mPrevWordCount) {
83         return false;
84     }
85     return mIsBeginningOfSentence[n - 1];
86 }
87 
getWordId(const DictionaryStructureWithBufferPolicy * const dictStructurePolicy,const int * const wordCodePoints,const int wordCodePointCount,const bool isBeginningOfSentence,const bool tryLowerCaseSearch)88 /* static */ int NgramContext::getWordId(
89         const DictionaryStructureWithBufferPolicy *const dictStructurePolicy,
90         const int *const wordCodePoints, const int wordCodePointCount,
91         const bool isBeginningOfSentence, const bool tryLowerCaseSearch) {
92     if (!dictStructurePolicy || !wordCodePoints || wordCodePointCount > MAX_WORD_LENGTH) {
93         return NOT_A_WORD_ID;
94     }
95     int codePoints[MAX_WORD_LENGTH];
96     int codePointCount = wordCodePointCount;
97     memmove(codePoints, wordCodePoints, sizeof(int) * codePointCount);
98     if (isBeginningOfSentence) {
99         codePointCount = CharUtils::attachBeginningOfSentenceMarker(codePoints, codePointCount,
100                 MAX_WORD_LENGTH);
101         if (codePointCount <= 0) {
102             return NOT_A_WORD_ID;
103         }
104     }
105     const CodePointArrayView codePointArrayView(codePoints, codePointCount);
106     const int wordId = dictStructurePolicy->getWordId(codePointArrayView,
107             false /* forceLowerCaseSearch */);
108     if (wordId != NOT_A_WORD_ID || !tryLowerCaseSearch) {
109         // Return the id when when the word was found or doesn't try lower case search.
110         return wordId;
111     }
112     // Check bigrams for lower-cased previous word if original was not found. Useful for
113     // auto-capitalized words like "The [current_word]".
114     return dictStructurePolicy->getWordId(codePointArrayView, true /* forceLowerCaseSearch */);
115 }
116 
clear()117 void NgramContext::clear() {
118     for (size_t i = 0; i < NELEMS(mPrevWordCodePoints); ++i) {
119         mPrevWordCodePointCount[i] = 0;
120         mIsBeginningOfSentence[i] = false;
121     }
122 }
123 } // namespace latinime
124