1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package android.text;
18 
19 import android.annotation.Nullable;
20 import android.util.Log;
21 
22 import com.android.internal.annotations.GuardedBy;
23 
24 import java.io.File;
25 import java.io.IOException;
26 import java.io.RandomAccessFile;
27 import java.nio.ByteBuffer;
28 import java.nio.MappedByteBuffer;
29 import java.nio.channels.FileChannel;
30 import java.util.HashMap;
31 import java.util.Locale;
32 
33 /**
34  * Hyphenator is a wrapper class for a native implementation of automatic hyphenation,
35  * in essence finding valid hyphenation opportunities in a word.
36  *
37  * @hide
38  */
39 public class Hyphenator {
40     // This class has deliberately simple lifetime management (no finalizer) because in
41     // the common case a process will use a very small number of locales.
42 
43     private static String TAG = "Hyphenator";
44 
45     // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but
46     // that appears too small.
47     private static final int INDIC_MIN_PREFIX = 2;
48     private static final int INDIC_MIN_SUFFIX = 2;
49 
50     private final static Object sLock = new Object();
51 
52     @GuardedBy("sLock")
53     final static HashMap<Locale, Hyphenator> sMap = new HashMap<Locale, Hyphenator>();
54 
55     // Reasonable enough values for cases where we have no hyphenation patterns but may be able to
56     // do some automatic hyphenation based on characters. These values would be used very rarely.
57     private static final int DEFAULT_MIN_PREFIX = 2;
58     private static final int DEFAULT_MIN_SUFFIX = 2;
59     final static Hyphenator sEmptyHyphenator =
60             new Hyphenator(StaticLayout.nLoadHyphenator(
61                                    null, 0, DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX),
62                            null);
63 
64     final private long mNativePtr;
65 
66     // We retain a reference to the buffer to keep the memory mapping valid
67     @SuppressWarnings("unused")
68     final private ByteBuffer mBuffer;
69 
Hyphenator(long nativePtr, ByteBuffer b)70     private Hyphenator(long nativePtr, ByteBuffer b) {
71         mNativePtr = nativePtr;
72         mBuffer = b;
73     }
74 
getNativePtr()75     public long getNativePtr() {
76         return mNativePtr;
77     }
78 
get(@ullable Locale locale)79     public static Hyphenator get(@Nullable Locale locale) {
80         synchronized (sLock) {
81             Hyphenator result = sMap.get(locale);
82             if (result != null) {
83                 return result;
84             }
85 
86             // If there's a variant, fall back to language+variant only, if available
87             final String variant = locale.getVariant();
88             if (!variant.isEmpty()) {
89                 final Locale languageAndVariantOnlyLocale =
90                         new Locale(locale.getLanguage(), "", variant);
91                 result = sMap.get(languageAndVariantOnlyLocale);
92                 if (result != null) {
93                     sMap.put(locale, result);
94                     return result;
95                 }
96             }
97 
98             // Fall back to language-only, if available
99             final Locale languageOnlyLocale = new Locale(locale.getLanguage());
100             result = sMap.get(languageOnlyLocale);
101             if (result != null) {
102                 sMap.put(locale, result);
103                 return result;
104             }
105 
106             // Fall back to script-only, if available
107             final String script = locale.getScript();
108             if (!script.equals("")) {
109                 final Locale scriptOnlyLocale = new Locale.Builder()
110                         .setLanguage("und")
111                         .setScript(script)
112                         .build();
113                 result = sMap.get(scriptOnlyLocale);
114                 if (result != null) {
115                     sMap.put(locale, result);
116                     return result;
117                 }
118             }
119 
120             sMap.put(locale, sEmptyHyphenator);  // To remember we found nothing.
121         }
122         return sEmptyHyphenator;
123     }
124 
125     private static class HyphenationData {
126         final String mLanguageTag;
127         final int mMinPrefix, mMinSuffix;
HyphenationData(String languageTag, int minPrefix, int minSuffix)128         HyphenationData(String languageTag, int minPrefix, int minSuffix) {
129             this.mLanguageTag = languageTag;
130             this.mMinPrefix = minPrefix;
131             this.mMinSuffix = minSuffix;
132         }
133     }
134 
loadHyphenator(HyphenationData data)135     private static Hyphenator loadHyphenator(HyphenationData data) {
136         String patternFilename = "hyph-" + data.mLanguageTag.toLowerCase(Locale.US) + ".hyb";
137         File patternFile = new File(getSystemHyphenatorLocation(), patternFilename);
138         if (!patternFile.canRead()) {
139             Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable");
140             return null;
141         }
142         try {
143             RandomAccessFile f = new RandomAccessFile(patternFile, "r");
144             try {
145                 FileChannel fc = f.getChannel();
146                 MappedByteBuffer buf = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
147                 long nativePtr = StaticLayout.nLoadHyphenator(
148                         buf, 0, data.mMinPrefix, data.mMinSuffix);
149                 return new Hyphenator(nativePtr, buf);
150             } finally {
151                 f.close();
152             }
153         } catch (IOException e) {
154             Log.e(TAG, "error loading hyphenation " + patternFile, e);
155             return null;
156         }
157     }
158 
getSystemHyphenatorLocation()159     private static File getSystemHyphenatorLocation() {
160         return new File("/system/usr/hyphen-data");
161     }
162 
163     // This array holds pairs of language tags that are used to prefill the map from locale to
164     // hyphenation data: The hyphenation data for the first field will be prefilled from the
165     // hyphenation data for the second field.
166     //
167     // The aliases that are computable by the get() method above are not included.
168     private static final String[][] LOCALE_FALLBACK_DATA = {
169         // English locales that fall back to en-US. The data is
170         // from CLDR. It's all English locales, minus the locales whose
171         // parent is en-001 (from supplementalData.xml, under <parentLocales>).
172         // TODO: Figure out how to get this from ICU.
173         {"en-AS", "en-US"}, // English (American Samoa)
174         {"en-GU", "en-US"}, // English (Guam)
175         {"en-MH", "en-US"}, // English (Marshall Islands)
176         {"en-MP", "en-US"}, // English (Northern Mariana Islands)
177         {"en-PR", "en-US"}, // English (Puerto Rico)
178         {"en-UM", "en-US"}, // English (United States Minor Outlying Islands)
179         {"en-VI", "en-US"}, // English (Virgin Islands)
180 
181         // All English locales other than those falling back to en-US are mapped to en-GB.
182         {"en", "en-GB"},
183 
184         // For German, we're assuming the 1996 (and later) orthography by default.
185         {"de", "de-1996"},
186         // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
187         {"de-LI-1901", "de-CH-1901"},
188 
189         // Norwegian is very probably Norwegian Bokmål.
190         {"no", "nb"},
191 
192         // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
193         {"mn", "mn-Cyrl"}, // Mongolian
194 
195         // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
196         // Data is from CLDR's likelySubtags.xml.
197         // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
198         {"am", "und-Ethi"}, // Amharic
199         {"byn", "und-Ethi"}, // Blin
200         {"gez", "und-Ethi"}, // Geʻez
201         {"ti", "und-Ethi"}, // Tigrinya
202         {"wal", "und-Ethi"}, // Wolaytta
203     };
204 
205     private static final HyphenationData[] AVAILABLE_LANGUAGES = {
206         new HyphenationData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Assamese
207         new HyphenationData("bg", 2, 2), // Bulgarian
208         new HyphenationData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Bengali
209         new HyphenationData("cu", 1, 2), // Church Slavonic
210         new HyphenationData("cy", 2, 3), // Welsh
211         new HyphenationData("da", 2, 2), // Danish
212         new HyphenationData("de-1901", 2, 2), // German 1901 orthography
213         new HyphenationData("de-1996", 2, 2), // German 1996 orthography
214         new HyphenationData("de-CH-1901", 2, 2), // Swiss High German 1901 orthography
215         new HyphenationData("en-GB", 2, 3), // British English
216         new HyphenationData("en-US", 2, 3), // American English
217         new HyphenationData("es", 2, 2), // Spanish
218         new HyphenationData("et", 2, 3), // Estonian
219         new HyphenationData("eu", 2, 2), // Basque
220         new HyphenationData("fr", 2, 3), // French
221         new HyphenationData("ga", 2, 3), // Irish
222         new HyphenationData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Gujarati
223         new HyphenationData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Hindi
224         new HyphenationData("hr", 2, 2), // Croatian
225         new HyphenationData("hu", 2, 2), // Hungarian
226         // texhyphen sources say Armenian may be (1, 2), but that it needs confirmation.
227         // Going with a more conservative value of (2, 2) for now.
228         new HyphenationData("hy", 2, 2), // Armenian
229         new HyphenationData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Kannada
230         new HyphenationData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Malayalam
231         new HyphenationData("mn-Cyrl", 2, 2), // Mongolian in Cyrillic script
232         new HyphenationData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Marathi
233         new HyphenationData("nb", 2, 2), // Norwegian Bokmål
234         new HyphenationData("nn", 2, 2), // Norwegian Nynorsk
235         new HyphenationData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Oriya
236         new HyphenationData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Punjabi
237         new HyphenationData("pt", 2, 3), // Portuguese
238         new HyphenationData("sl", 2, 2), // Slovenian
239         new HyphenationData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Tamil
240         new HyphenationData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Telugu
241         new HyphenationData("tk", 2, 2), // Turkmen
242         new HyphenationData("und-Ethi", 1, 1), // Any language in Ethiopic script
243     };
244 
245     /**
246      * Load hyphenation patterns at initialization time. We want to have patterns
247      * for all locales loaded and ready to use so we don't have to do any file IO
248      * on the UI thread when drawing text in different locales.
249      *
250      * @hide
251      */
init()252     public static void init() {
253         sMap.put(null, null);
254 
255         for (int i = 0; i < AVAILABLE_LANGUAGES.length; i++) {
256             HyphenationData data = AVAILABLE_LANGUAGES[i];
257             Hyphenator h = loadHyphenator(data);
258             if (h != null) {
259                 sMap.put(Locale.forLanguageTag(data.mLanguageTag), h);
260             }
261         }
262 
263         for (int i = 0; i < LOCALE_FALLBACK_DATA.length; i++) {
264             String language = LOCALE_FALLBACK_DATA[i][0];
265             String fallback = LOCALE_FALLBACK_DATA[i][1];
266             sMap.put(Locale.forLanguageTag(language), sMap.get(Locale.forLanguageTag(fallback)));
267         }
268     }
269 }
270