1 /*
2  * Copyright (C) 2010 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef LATINIME_CHAR_UTILS_H
18 #define LATINIME_CHAR_UTILS_H
19 
20 #include <cctype>
21 #include <cstring>
22 #include <vector>
23 
24 #include "defines.h"
25 
26 namespace latinime {
27 
28 class CharUtils {
29  public:
isAsciiUpper(int c)30     static AK_FORCE_INLINE bool isAsciiUpper(int c) {
31         // Note: isupper(...) reports false positives for some Cyrillic characters, causing them to
32         // be incorrectly lower-cased using toAsciiLower(...) rather than latin_tolower(...).
33         return (c >= 'A' && c <= 'Z');
34     }
35 
toAsciiLower(int c)36     static AK_FORCE_INLINE int toAsciiLower(int c) {
37         return c - 'A' + 'a';
38     }
39 
isAscii(int c)40     static AK_FORCE_INLINE bool isAscii(int c) {
41         return isascii(c) != 0;
42     }
43 
toLowerCase(const int c)44     static AK_FORCE_INLINE int toLowerCase(const int c) {
45         if (isAsciiUpper(c)) {
46             return toAsciiLower(c);
47         }
48         if (isAscii(c)) {
49             return c;
50         }
51         return static_cast<int>(latin_tolower(static_cast<unsigned short>(c)));
52     }
53 
toBaseLowerCase(const int c)54     static AK_FORCE_INLINE int toBaseLowerCase(const int c) {
55         return toLowerCase(toBaseCodePoint(c));
56     }
57 
isIntentionalOmissionCodePoint(const int codePoint)58     static AK_FORCE_INLINE bool isIntentionalOmissionCodePoint(const int codePoint) {
59         // TODO: Do not hardcode here
60         return codePoint == KEYCODE_SINGLE_QUOTE || codePoint == KEYCODE_HYPHEN_MINUS;
61     }
62 
getCodePointCount(const int arraySize,const int * const codePoints)63     static AK_FORCE_INLINE int getCodePointCount(const int arraySize, const int *const codePoints) {
64         int size = 0;
65         for (; size < arraySize; ++size) {
66             if (codePoints[size] == '\0') {
67                 break;
68             }
69         }
70         return size;
71     }
72 
toBaseCodePoint(int c)73     static AK_FORCE_INLINE int toBaseCodePoint(int c) {
74         if (c < BASE_CHARS_SIZE) {
75             return static_cast<int>(BASE_CHARS[c]);
76         }
77         return c;
78     }
79 
getSpaceCount(const int * const codePointBuffer,const int length)80     static AK_FORCE_INLINE int getSpaceCount(const int *const codePointBuffer, const int length) {
81         int spaceCount = 0;
82         for (int i = 0; i < length; ++i) {
83             if (codePointBuffer[i] == KEYCODE_SPACE) {
84                 ++spaceCount;
85             }
86         }
87         return spaceCount;
88     }
89 
isInUnicodeSpace(const int codePoint)90     static AK_FORCE_INLINE int isInUnicodeSpace(const int codePoint) {
91         return codePoint >= MIN_UNICODE_CODE_POINT && codePoint <= MAX_UNICODE_CODE_POINT;
92     }
93 
94     static unsigned short latin_tolower(const unsigned short c);
95     static const std::vector<int> EMPTY_STRING;
96 
97     // Returns updated code point count. Returns 0 when the code points cannot be marked as a
98     // Beginning-of-Sentence.
attachBeginningOfSentenceMarker(int * const codePoints,const int codePointCount,const int maxCodePoint)99     static AK_FORCE_INLINE int attachBeginningOfSentenceMarker(int *const codePoints,
100             const int codePointCount, const int maxCodePoint) {
101         if (codePointCount > 0 && codePoints[0] == CODE_POINT_BEGINNING_OF_SENTENCE) {
102             // Marker has already been attached.
103             return codePointCount;
104         }
105         if (codePointCount >= maxCodePoint) {
106             // the code points cannot be marked as a Beginning-of-Sentence.
107             return 0;
108         }
109         memmove(codePoints + 1, codePoints, sizeof(int) * codePointCount);
110         codePoints[0] = CODE_POINT_BEGINNING_OF_SENTENCE;
111         return codePointCount + 1;
112     }
113 
114  private:
115     DISALLOW_IMPLICIT_CONSTRUCTORS(CharUtils);
116 
117     static const int MIN_UNICODE_CODE_POINT;
118     static const int MAX_UNICODE_CODE_POINT;
119 
120     /**
121      * Table mapping most combined Latin, Greek, and Cyrillic characters
122      * to their base characters.  If c is in range, BASE_CHARS[c] == c
123      * if c is not a combined character, or the base character if it
124      * is combined.
125      */
126     static const int BASE_CHARS_SIZE = 0x0500;
127     static const unsigned short BASE_CHARS[BASE_CHARS_SIZE];
128 };
129 } // namespace latinime
130 #endif // LATINIME_CHAR_UTILS_H
131