1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * An implementation of Liang's hyphenation algorithm.
19  */
20 
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

#include "unicode/locid.h"
24 
25 #ifndef MINIKIN_HYPHENATOR_H
26 #define MINIKIN_HYPHENATOR_H
27 
28 namespace minikin {
29 
30 enum class HyphenationType : uint8_t {
31     // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0.
32 
33     // Do not break.
34     DONT_BREAK = 0,
35     // Break the line and insert a normal hyphen.
36     BREAK_AND_INSERT_HYPHEN = 1,
37     // Break the line and insert an Armenian hyphen (U+058A).
38     BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
39     // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
40     BREAK_AND_INSERT_MAQAF = 3,
41     // Break the line and insert a Canadian Syllabics hyphen (U+1400).
42     BREAK_AND_INSERT_UCAS_HYPHEN = 4,
43     // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
44     // present or the script does not use a hyphen (e.g. in Malayalam).
45     BREAK_AND_DONT_INSERT_HYPHEN = 5,
46     // Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates
47     // as "l-/l".
48     BREAK_AND_REPLACE_WITH_HYPHEN = 6,
49     // Break the line, and repeat the hyphen (which is the last character) at the beginning of the
50     // next line. Used in Polish, where "czerwono-niebieska" should hyphenate as
51     // "czerwono-/-niebieska".
52     BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
53     // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
54     // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
55     // behavior when a soft hyphen is used in Arabic script.
56     BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
57 };
58 
59 // The hyphen edit represents an edit to the string when a word is
60 // hyphenated. The most common hyphen edit is adding a "-" at the end
61 // of a syllable, but nonstandard hyphenation allows for more choices.
62 // Note that a HyphenEdit can hold two types of edits at the same time,
63 // One at the beginning of the string/line and one at the end.
64 class HyphenEdit {
65 public:
66     static const uint32_t NO_EDIT = 0x00;
67 
68     static const uint32_t INSERT_HYPHEN_AT_END = 0x01;
69     static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
70     static const uint32_t INSERT_MAQAF_AT_END = 0x03;
71     static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
72     static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
73     static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
74     static const uint32_t BREAK_AT_END = 0x07;
75 
76     static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
77     static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
78     static const uint32_t BREAK_AT_START = 0x03 << 3;
79 
80     // Keep in sync with the definitions in the Java code at:
81     // frameworks/base/graphics/java/android/graphics/Paint.java
82     static const uint32_t MASK_END_OF_LINE = 0x07;
83     static const uint32_t MASK_START_OF_LINE = 0x03 << 3;
84 
isReplacement(uint32_t hyph)85     inline static bool isReplacement(uint32_t hyph) {
86         return hyph == REPLACE_WITH_HYPHEN_AT_END;
87     }
88 
isInsertion(uint32_t hyph)89     inline static bool isInsertion(uint32_t hyph) {
90         return (hyph == INSERT_HYPHEN_AT_END
91                 || hyph == INSERT_ARMENIAN_HYPHEN_AT_END
92                 || hyph == INSERT_MAQAF_AT_END
93                 || hyph == INSERT_UCAS_HYPHEN_AT_END
94                 || hyph == INSERT_ZWJ_AND_HYPHEN_AT_END
95                 || hyph == INSERT_HYPHEN_AT_START
96                 || hyph == INSERT_ZWJ_AT_START);
97     }
98 
99     const static uint32_t* getHyphenString(uint32_t hyph);
100     static uint32_t editForThisLine(HyphenationType type);
101     static uint32_t editForNextLine(HyphenationType type);
102 
HyphenEdit()103     HyphenEdit() : hyphen(NO_EDIT) { }
HyphenEdit(uint32_t hyphenInt)104     HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) { }  // NOLINT(implicit)
getHyphen()105     uint32_t getHyphen() const { return hyphen; }
106     bool operator==(const HyphenEdit &other) const { return hyphen == other.hyphen; }
107 
getEnd()108     uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
getStart()109     uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }
110 
111 private:
112     uint32_t hyphen;
113 };
114 
115 // hyb file header; implementation details are in the .cpp file
116 struct Header;
117 
118 class Hyphenator {
119 public:
120     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
121     // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
122     // corresponding code unit offset in the word.
123     //
124     // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
125     // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
126     void hyphenate(std::vector<HyphenationType>* result, const uint16_t* word, size_t len,
127             const icu::Locale& locale);
128 
129     // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
130     // immediately after which line breaks are allowed, but words containing it should not be
131     // automatically hyphenated.
132     static bool isLineBreakingHyphen(uint32_t cp);
133 
134     // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
135     // the caller is responsible for ensuring that the lifetime of the pattern data is
136     // at least as long as the Hyphenator object.
137 
138     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
139     static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix);
140 
141 private:
142     // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
143     void hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
144             const icu::Locale& locale);
145 
146     // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
147     // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
148     // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
149     // Note that this method writes len+2 entries into alpha_codes (including start and stop)
150     HyphenationType alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
151 
152     // calculate hyphenation from patterns, assuming alphabet lookup has already been done
153     void hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
154             HyphenationType hyphenValue);
155 
156     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
157     // that temporary buffers can be stack-allocated without waste, which is a slightly
158     // different use case. It measures UTF-16 code units.
159     static const size_t MAX_HYPHENATED_SIZE = 64;
160 
161     const uint8_t* patternData;
162     size_t minPrefix, minSuffix;
163 
164     // accessors for binary data
getHeader()165     const Header* getHeader() const {
166         return reinterpret_cast<const Header*>(patternData);
167     }
168 
169 };
170 
171 }  // namespace minikin
172 
173 #endif   // MINIKIN_HYPHENATOR_H
174