1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ***************************************************************************
5  * Copyright (C) 2008-2016 International Business Machines Corporation
6  * and others. All Rights Reserved.
7  ***************************************************************************
8  *
9  * Unicode Spoof Detection
10  */
11 
12 package com.ibm.icu.text;
13 
14 import java.io.IOException;
15 import java.io.LineNumberReader;
16 import java.io.Reader;
17 import java.nio.ByteBuffer;
18 import java.text.ParseException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.BitSet;
22 import java.util.Collections;
23 import java.util.Comparator;
24 import java.util.HashSet;
25 import java.util.Hashtable;
26 import java.util.LinkedHashSet;
27 import java.util.Locale;
28 import java.util.MissingResourceException;
29 import java.util.Set;
30 import java.util.Vector;
31 import java.util.regex.Matcher;
32 import java.util.regex.Pattern;
33 
34 import com.ibm.icu.impl.ICUBinary;
35 import com.ibm.icu.impl.ICUBinary.Authenticate;
36 import com.ibm.icu.impl.Utility;
37 import com.ibm.icu.lang.UCharacter;
38 import com.ibm.icu.lang.UCharacterCategory;
39 import com.ibm.icu.lang.UProperty;
40 import com.ibm.icu.lang.UScript;
41 import com.ibm.icu.util.ULocale;
42 
43 /**
44  * <p>
45  * This class, based on <a href="http://unicode.org/reports/tr36">Unicode Technical Report #36</a> and
46  * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
47  *
48  * <ol>
49  * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desparejado" and
50  * "ԁеѕрагејаԁо".</li>
51  * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
52  * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
53  * </ol>
54  *
55  * <p>
56  * Although originally designed as a method for flagging suspicious identifier strings such as URLs,
57  * <code>SpoofChecker</code> has a number of other practical use cases, such as preventing attempts to evade bad-word
58  * content filters.
59  *
60  * <h2>Confusables</h2>
61  *
62  * <p>
63  * The following example shows how to use <code>SpoofChecker</code> to check for confusability between two strings:
64  *
65  * <pre>
66  * <code>
67  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
68  * int result = sc.areConfusable("desparejado", "ԁеѕрагејаԁо");
69  * System.out.println(result != 0);  // true
70  * </code>
71  * </pre>
72  *
73  * <p>
74  * <code>SpoofChecker</code> uses a builder paradigm: options are specified within the context of a lightweight
75  * {@link SpoofChecker.Builder} object, and upon calling {@link SpoofChecker.Builder#build}, expensive data loading
76  * operations are performed, and an immutable <code>SpoofChecker</code> is returned.
77  *
78  * <p>
79  * The first line of the example creates a <code>SpoofChecker</code> object with confusable-checking enabled; the second
80  * line performs the confusability test. For best performance, the instance should be created once (e.g., upon
81  * application startup), and the more efficient {@link SpoofChecker#areConfusable} method can be used at runtime.
82  *
83  * <p>
84  * UTS 39 defines two strings to be <em>confusable</em> if they map to the same skeleton. A <em>skeleton</em> is a
85  * sequence of families of confusable characters, where each family has a single exemplar character.
86  * {@link SpoofChecker#getSkeleton} computes the skeleton for a particular string, so the following snippet is
87  * equivalent to the example above:
88  *
89  * <pre>
90  * <code>
91  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
92  * boolean result = sc.getSkeleton("desparejado").equals(sc.getSkeleton("ԁеѕрагејаԁо"));
93  * System.out.println(result);  // true
94  * </code>
95  * </pre>
96  *
97  * <p>
98  * If you need to check if a string is confusable with any string in a dictionary of many strings, rather than calling
99  * {@link SpoofChecker#areConfusable} many times in a loop, {@link SpoofChecker#getSkeleton} can be used instead, as
100  * shown below:
101  *
102  * <pre>
103  * // Setup:
104  * String[] DICTIONARY = new String[]{ "lorem", "ipsum" }; // example
105  * SpoofChecker sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CONFUSABLE).build();
106  * HashSet&lt;String&gt; skeletons = new HashSet&lt;String&gt;();
107  * for (String word : DICTIONARY) {
108  *   skeletons.add(sc.getSkeleton(word));
109  * }
110  *
111  * // Live Check:
112  * boolean result = skeletons.contains(sc.getSkeleton("1orern"));
113  * System.out.println(result);  // true
114  * </pre>
115  *
116  * <p>
117  * <b>Note:</b> Since the Unicode confusables mapping table is frequently updated, confusable skeletons are <em>not</em>
118  * guaranteed to be the same between ICU releases. We therefore recommend that you always compute confusable skeletons
119  * at runtime and do not rely on creating a permanent, or difficult to update, database of skeletons.
120  *
121  * <h2>Spoof Detection</h2>
122  *
123  * <p>
124  * The following snippet shows a minimal example of using <code>SpoofChecker</code> to perform spoof detection on a
125  * string:
126  *
127  * <pre>
128  * SpoofChecker sc = new SpoofChecker.Builder()
129  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
130  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
131  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
132  *     .build();
133  * boolean result = sc.failsChecks("pаypаl");  // with Cyrillic 'а' characters
134  * System.out.println(result);  // true
135  * </pre>
136  *
137  * <p>
138  * As in the case for confusability checking, it is good practice to create one <code>SpoofChecker</code> instance at
139  * startup, and call the cheaper {@link SpoofChecker#failsChecks} online. In the second line, we specify the set of
140  * allowed characters to be those with type RECOMMENDED or INCLUSION, according to the recommendation in UTS 39. In the
141  * third line, the CONFUSABLE checks are disabled. It is good practice to disable them if you won't be using the
142  * instance to perform confusability checking.
143  *
144  * <p>
145  * To get more details on why a string failed the checks, use a {@link SpoofChecker.CheckResult}:
146  *
147  * <pre>
148  * <code>
149  * SpoofChecker sc = new SpoofChecker.Builder()
150  *     .setAllowedChars(SpoofChecker.RECOMMENDED.cloneAsThawed().addAll(SpoofChecker.INCLUSION))
151  *     .setRestrictionLevel(SpoofChecker.RestrictionLevel.MODERATELY_RESTRICTIVE)
152  *     .setChecks(SpoofChecker.ALL_CHECKS &~ SpoofChecker.CONFUSABLE)
153  *     .build();
154  * SpoofChecker.CheckResult checkResult = new SpoofChecker.CheckResult();
155  * boolean result = sc.failsChecks("pаypаl", checkResult);
156  * System.out.println(checkResult.checks);  // 16
157  * </code>
158  * </pre>
159  *
160  * <p>
161  * The return value is a bitmask of the checks that failed. In this case, there was one check that failed:
162  * {@link SpoofChecker#RESTRICTION_LEVEL}, corresponding to the fifth bit (16). The possible checks are:
163  *
164  * <ul>
165  * <li><code>RESTRICTION_LEVEL</code>: flags strings that violate the
166  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">Restriction Level</a> test as specified in UTS
167  * 39; in most cases, this means flagging strings that contain characters from multiple different scripts.</li>
168  * <li><code>INVISIBLE</code>: flags strings that contain invisible characters, such as zero-width spaces, or character
169  * sequences that are likely not to display, such as multiple occurrences of the same non-spacing mark.</li>
170  * <li><code>CHAR_LIMIT</code>: flags strings that contain characters outside of a specified set of acceptable
171  * characters. See {@link SpoofChecker.Builder#setAllowedChars} and {@link SpoofChecker.Builder#setAllowedLocales}.</li>
172  * <li><code>MIXED_NUMBERS</code>: flags strings that contain digits from multiple different numbering systems.</li>
173  * </ul>
174  *
175  * <p>
176  * These checks can be enabled independently of each other. For example, if you were interested in checking for only the
177  * INVISIBLE and MIXED_NUMBERS conditions, you could do:
178  *
179  * <pre>
180  * <code>
181  * SpoofChecker sc = new SpoofChecker.Builder()
182  *     .setChecks(SpoofChecker.INVISIBLE | SpoofChecker.MIXED_NUMBERS)
183  *     .build();
184  * boolean result = sc.failsChecks("৪8");
185  * System.out.println(result);  // true
186  * </code>
187  * </pre>
188  *
189  * <p>
190  * <b>Note:</b> The Restriction Level is the most powerful of the checks. The full logic is documented in
191  * <a href="http://unicode.org/reports/tr39/#Restriction_Level_Detection">UTS 39</a>, but the basic idea is that strings
192  * are restricted to contain characters from only a single script, <em>except</em> that most scripts are allowed to have
193  * Latin characters interspersed. Although the default restriction level is <code>HIGHLY_RESTRICTIVE</code>, it is
194  * recommended that users set their restriction level to <code>MODERATELY_RESTRICTIVE</code>, which allows Latin mixed
195  * with all other scripts except Cyrillic, Greek, and Cherokee, with which it is often confusable. For more details on
196  * the levels, see UTS 39 or {@link SpoofChecker.RestrictionLevel}. The Restriction Level test is aware of the set of
197  * allowed characters set in {@link SpoofChecker.Builder#setAllowedChars}. Note that characters which have script code
198  * COMMON or INHERITED, such as numbers and punctuation, are ignored when computing whether a string has multiple
199  * scripts.
200  *
201  * <h2>Additional Information</h2>
202  *
203  * <p>
204  * A <code>SpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
205  *
206  * <p>
207  * <b>Thread Safety:</b> The methods on <code>SpoofChecker</code> objects are thread safe. The test functions for
 * checking a single identifier, or for testing whether two identifiers are potentially confusable, may be called
209  * concurrently from multiple threads using the same <code>SpoofChecker</code> instance.
210  *
211  * @stable ICU 4.6
212  */
213 public class SpoofChecker {
214 
215     /**
216      * Constants from UTS 39 for use in setRestrictionLevel.
217      *
218      * @stable ICU 53
219      */
    public enum RestrictionLevel {
        // The constants are declared from most restrictive (ASCII) to least restrictive (UNRESTRICTIVE). As the
        // descriptions below state, each level also accepts every string accepted by the level declared before it.

        /**
         * All characters in the string are in the identifier profile and all characters in the string are in the ASCII
         * range.
         *
         * @stable ICU 53
         */
        ASCII,
        /**
         * The string classifies as ASCII-Only, or all characters in the string are in the identifier profile and the
         * string is single-script, according to the definition in UTS 39 section 5.1.
         *
         * @stable ICU 53
         */
        SINGLE_SCRIPT_RESTRICTIVE,
        /**
         * The string classifies as Single Script, or all characters in the string are in the identifier profile and the
         * string is covered by any of the following sets of scripts, according to the definition in UTS 39 section 5.1:
         * <ul>
         * <li>Latin + Han + Bopomofo (or equivalently: Latn + Hanb)</li>
         * <li>Latin + Han + Hiragana + Katakana (or equivalently: Latn + Jpan)</li>
         * <li>Latin + Han + Hangul (or equivalently: Latn + Kore)</li>
         * </ul>
         *
         * @stable ICU 53
         */
        HIGHLY_RESTRICTIVE,
        /**
         * The string classifies as Highly Restrictive, or all characters in the string are in the identifier profile
         * and the string is covered by Latin and any one other Recommended or Aspirational script, except Cyrillic,
         * Greek, and Cherokee.
         *
         * @stable ICU 53
         */
        MODERATELY_RESTRICTIVE,
        /**
         * All characters in the string are in the identifier profile. Allow arbitrary mixtures of scripts, such as
         * Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us.
         *
         * @stable ICU 53
         */
        MINIMALLY_RESTRICTIVE,
        /**
         * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
         *
         * @stable ICU 53
         */
        UNRESTRICTIVE,
    }
269 
270     /**
271      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
272      *
273      * @stable ICU 58
274      */
275     public static final UnicodeSet INCLUSION = new UnicodeSet(
276             "['\\-.\\:\\u00B7\\u0375\\u058A\\u05F3\\u05F4\\u06FD\\u06FE\\u0F0B\\u200C"
277             + "\\u200D\\u2010\\u2019\\u2027\\u30A0\\u30FB]"
278             ).freeze();
279     // Note: data from IdentifierStatus.txt & IdentifierType.txt
280     // There is tooling to generate this constant in the unicodetools project:
281     //      org.unicode.text.tools.RecommendedSetGenerator
282     // It will print the Java and C++ code to the console for easy copy-paste into this file.
283 
284     /**
285      * Security Profile constant from UTS 39 for use in {@link SpoofChecker.Builder#setAllowedChars}.
286      *
287      * @stable ICU 58
288      */
289     public static final UnicodeSet RECOMMENDED = new UnicodeSet(
290             "[0-9A-Z_a-z\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u0131\\u0134-\\u013E"
291             + "\\u0141-\\u0148\\u014A-\\u017E\\u018F\\u01A0\\u01A1\\u01AF\\u01B0\\u01CD-"
292             + "\\u01DC\\u01DE-\\u01E3\\u01E6-\\u01F0\\u01F4\\u01F5\\u01F8-\\u021B\\u021E"
293             + "\\u021F\\u0226-\\u0233\\u0259\\u02BB\\u02BC\\u02EC\\u0300-\\u0304\\u0306-"
294             + "\\u030C\\u030F-\\u0311\\u0313\\u0314\\u031B\\u0323-\\u0328\\u032D\\u032E"
295             + "\\u0330\\u0331\\u0335\\u0338\\u0339\\u0342\\u0345\\u037B-\\u037D\\u0386"
296             + "\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03FC-\\u045F\\u048A-"
297             + "\\u0529\\u052E\\u052F\\u0531-\\u0556\\u0559\\u0560-\\u0586\\u0588\\u05B4"
298             + "\\u05D0-\\u05EA\\u05EF-\\u05F2\\u0620-\\u063F\\u0641-\\u0655\\u0660-\\u0669"
299             + "\\u0670-\\u0672\\u0674\\u0679-\\u068D\\u068F-\\u06D3\\u06D5\\u06E5\\u06E6"
300             + "\\u06EE-\\u06FC\\u06FF\\u0750-\\u07B1\\u08A0-\\u08AC\\u08B2\\u08B6-\\u08BD"
301             + "\\u0901-\\u094D\\u094F\\u0950\\u0956\\u0957\\u0960-\\u0963\\u0966-\\u096F"
302             + "\\u0971-\\u0977\\u0979-\\u097F\\u0981-\\u0983\\u0985-\\u098C\\u098F\\u0990"
303             + "\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BC-\\u09C4\\u09C7"
304             + "\\u09C8\\u09CB-\\u09CE\\u09D7\\u09E0-\\u09E3\\u09E6-\\u09F1\\u09FC\\u09FE"
305             + "\\u0A01-\\u0A03\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30"
306             + "\\u0A32\\u0A35\\u0A38\\u0A39\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-"
307             + "\\u0A4D\\u0A5C\\u0A66-\\u0A74\\u0A81-\\u0A83\\u0A85-\\u0A8D\\u0A8F-\\u0A91"
308             + "\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABC-\\u0AC5"
309             + "\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0AD0\\u0AE0-\\u0AE3\\u0AE6-\\u0AEF\\u0AFA-"
310             + "\\u0AFF\\u0B01-\\u0B03\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-"
311             + "\\u0B30\\u0B32\\u0B33\\u0B35-\\u0B39\\u0B3C-\\u0B43\\u0B47\\u0B48\\u0B4B-"
312             + "\\u0B4D\\u0B56\\u0B57\\u0B5F-\\u0B61\\u0B66-\\u0B6F\\u0B71\\u0B82\\u0B83"
313             + "\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E"
314             + "\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BBE-\\u0BC2\\u0BC6-"
315             + "\\u0BC8\\u0BCA-\\u0BCD\\u0BD0\\u0BD7\\u0BE6-\\u0BEF\\u0C01-\\u0C0C\\u0C0E-"
316             + "\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D-\\u0C44\\u0C46-"
317             + "\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C60\\u0C61\\u0C66-\\u0C6F\\u0C80"
318             + "\\u0C82\\u0C83\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3"
319             + "\\u0CB5-\\u0CB9\\u0CBC-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6"
320             + "\\u0CE0-\\u0CE3\\u0CE6-\\u0CEF\\u0CF1\\u0CF2\\u0D00\\u0D02\\u0D03\\u0D05-"
321             + "\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4E\\u0D54-"
322             + "\\u0D57\\u0D60\\u0D61\\u0D66-\\u0D6F\\u0D7A-\\u0D7F\\u0D82\\u0D83\\u0D85-"
323             + "\\u0D8E\\u0D91-\\u0D96\\u0D9A-\\u0DA5\\u0DA7-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD"
324             + "\\u0DC0-\\u0DC6\\u0DCA\\u0DCF-\\u0DD4\\u0DD6\\u0DD8-\\u0DDE\\u0DF2\\u0E01-"
325             + "\\u0E32\\u0E34-\\u0E3A\\u0E40-\\u0E4E\\u0E50-\\u0E59\\u0E81\\u0E82\\u0E84"
326             + "\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3"
327             + "\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD-\\u0EB2\\u0EB4-\\u0EB9\\u0EBB-\\u0EBD"
328             + "\\u0EC0-\\u0EC4\\u0EC6\\u0EC8-\\u0ECD\\u0ED0-\\u0ED9\\u0EDE\\u0EDF\\u0F00"
329             + "\\u0F20-\\u0F29\\u0F35\\u0F37\\u0F3E-\\u0F42\\u0F44-\\u0F47\\u0F49-\\u0F4C"
330             + "\\u0F4E-\\u0F51\\u0F53-\\u0F56\\u0F58-\\u0F5B\\u0F5D-\\u0F68\\u0F6A-\\u0F6C"
331             + "\\u0F71\\u0F72\\u0F74\\u0F7A-\\u0F80\\u0F82-\\u0F84\\u0F86-\\u0F92\\u0F94-"
332             + "\\u0F97\\u0F99-\\u0F9C\\u0F9E-\\u0FA1\\u0FA3-\\u0FA6\\u0FA8-\\u0FAB\\u0FAD-"
333             + "\\u0FB8\\u0FBA-\\u0FBC\\u0FC6\\u1000-\\u1049\\u1050-\\u109D\\u10C7\\u10CD"
334             + "\\u10D0-\\u10F0\\u10F7-\\u10FA\\u10FD-\\u10FF\\u1200-\\u1248\\u124A-\\u124D"
335             + "\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-"
336             + "\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6"
337             + "\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u135D-\\u135F\\u1380-\\u138F"
338             + "\\u1780-\\u17A2\\u17A5-\\u17A7\\u17A9-\\u17B3\\u17B6-\\u17CA\\u17D2\\u17D7"
339             + "\\u17DC\\u17E0-\\u17E9\\u1C80-\\u1C88\\u1C90-\\u1CBA\\u1CBD-\\u1CBF\\u1E00-"
340             + "\\u1E99\\u1E9E\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45"
341             + "\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F70\\u1F72"
342             + "\\u1F74\\u1F76\\u1F78\\u1F7A\\u1F7C\\u1F80-\\u1FB4\\u1FB6-\\u1FBA\\u1FBC"
343             + "\\u1FC2-\\u1FC4\\u1FC6-\\u1FC8\\u1FCA\\u1FCC\\u1FD0-\\u1FD2\\u1FD6-\\u1FDA"
344             + "\\u1FE0-\\u1FE2\\u1FE4-\\u1FEA\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FF8\\u1FFA"
345             + "\\u1FFC\\u2D27\\u2D2D\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-"
346             + "\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-"
347             + "\\u2DDE\\u3005-\\u3007\\u3041-\\u3096\\u3099\\u309A\\u309D\\u309E\\u30A1-"
348             + "\\u30FA\\u30FC-\\u30FE\\u3105-\\u312F\\u31A0-\\u31BA\\u3400-\\u4DB5\\u4E00-"
349             + "\\u9FEF\\uA660\\uA661\\uA674-\\uA67B\\uA67F\\uA69F\\uA717-\\uA71F\\uA788"
350             + "\\uA78D\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7AE\\uA7AF\\uA7B8\\uA7B9"
351             + "\\uA7FA\\uA9E7-\\uA9FE\\uAA60-\\uAA76\\uAA7A-\\uAA7F\\uAB01-\\uAB06\\uAB09-"
352             + "\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uAC00-\\uD7A3\\uFA0E"
353             + "\\uFA0F\\uFA11\\uFA13\\uFA14\\uFA1F\\uFA21\\uFA23\\uFA24\\uFA27-\\uFA29"
354             + "\\U0001133B\\U0001B002-\\U0001B11E\\U00020000-\\U0002A6D6\\U0002A700-"
355             + "\\U0002B734\\U0002B740-\\U0002B81D\\U0002B820-\\U0002CEA1\\U0002CEB0-"
356             + "\\U0002EBE0]"
357             ).freeze();
358     // Note: data from IdentifierStatus.txt & IdentifierType.txt
359     // There is tooling to generate this constant in the unicodetools project:
360     //      org.unicode.text.tools.RecommendedSetGenerator
361     // It will print the Java and C++ code to the console for easy copy-paste into this file.
362 
363     /**
     * Constants for the kinds of checks that SpoofChecker can perform. These values are used both to select the set of
365      * checks that will be performed, and to report results from the check function.
366      *
367      */
368 
369     /**
370      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
371      * that the two strings are visually confusable and that they are from the same script, according to UTS 39 section
372      * 4.
373      *
374      * @stable ICU 4.6
375      */
376     public static final int SINGLE_SCRIPT_CONFUSABLE = 1;
377 
378     /**
379      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
380      * that the two strings are visually confusable and that they are <b>not</b> from the same script, according to UTS
381      * 39 section 4.
382      *
383      * @stable ICU 4.6
384      */
385     public static final int MIXED_SCRIPT_CONFUSABLE = 2;
386 
387     /**
388      * When performing the two-string {@link SpoofChecker#areConfusable} test, this flag in the return value indicates
389      * that the two strings are visually confusable and that they are not from the same script but both of them are
390      * single-script strings, according to UTS 39 section 4.
391      *
392      * @stable ICU 4.6
393      */
394     public static final int WHOLE_SCRIPT_CONFUSABLE = 4;
395 
396     /**
397      * Enable this flag in {@link SpoofChecker.Builder#setChecks} to turn on all types of confusables. You may set the
398      * checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to make
399      * {@link SpoofChecker#areConfusable} return only those types of confusables.
400      *
401      * @stable ICU 58
402      */
403     public static final int CONFUSABLE = SINGLE_SCRIPT_CONFUSABLE | MIXED_SCRIPT_CONFUSABLE | WHOLE_SCRIPT_CONFUSABLE;
404 
405     /**
406      * This flag is deprecated and no longer affects the behavior of SpoofChecker.
407      *
408      * @deprecated ICU 58 Any case confusable mappings were removed from UTS 39; the corresponding ICU API was
409      * deprecated.
410      */
411     @Deprecated
412     public static final int ANY_CASE = 8;
413 
414     /**
415      * Check that an identifier satisfies the requirements for the restriction level specified in
416      * {@link SpoofChecker.Builder#setRestrictionLevel}. The default restriction level is
417      * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}.
418      *
419      * @stable ICU 58
420      */
421     public static final int RESTRICTION_LEVEL = 16;
422 
423     /**
424      * Check that an identifier contains only characters from a single script (plus chars from the common and inherited
425      * scripts.) Applies to checks of a single identifier check only.
426      *
427      * @deprecated ICU 51 Use RESTRICTION_LEVEL
428      */
429     @Deprecated
430     public static final int SINGLE_SCRIPT = RESTRICTION_LEVEL;
431 
432     /**
433      * Check an identifier for the presence of invisible characters, such as zero-width spaces, or character sequences
434      * that are likely not to display, such as multiple occurrences of the same non-spacing mark. This check does not
435      * test the input string as a whole for conformance to any particular syntax for identifiers.
436      *
437      * @stable ICU 4.6
438      */
439     public static final int INVISIBLE = 32;
440 
441     /**
442      * Check that an identifier contains only characters from a specified set of acceptable characters. See
443      * {@link Builder#setAllowedChars} and {@link Builder#setAllowedLocales}. Note that a string that fails this check
444      * will also fail the {@link #RESTRICTION_LEVEL} check.
445      *
446      * @stable ICU 4.6
447      */
448     public static final int CHAR_LIMIT = 64;
449 
450     /**
451      * Check that an identifier does not mix numbers from different numbering systems. For more information, see UTS 39
452      * section 5.3.
453      *
454      * @stable ICU 58
455      */
456     public static final int MIXED_NUMBERS = 128;
457 
458     /**
459      * Check that an identifier does not have a combining character following a character in which that
460      * combining character would be hidden; for example 'i' followed by a U+0307 combining dot.
461      * <p>
462      * More specifically, the following characters are forbidden from preceding a U+0307:
463      * <ul>
464      * <li>Those with the Soft_Dotted Unicode property (which includes 'i' and 'j')</li>
465      * <li>Latin lowercase letter 'l'</li>
466      * <li>Dotless 'i' and 'j' ('ı' and 'ȷ', U+0131 and U+0237)</li>
467      * <li>Any character whose confusable prototype ends with such a character
468      * (Soft_Dotted, 'l', 'ı', or 'ȷ')</li>
469      * </ul>
470      * In addition, combining characters are allowed between the above characters and U+0307 except those
471      * with combining class 0 or combining class "Above" (230, same class as U+0307).
472      * <p>
     * This list and the number of combining characters considered by this check may grow over time.
474      *
475      * @draft ICU 62
476      * @provisional This API might change or be removed in a future release.
477      */
478     public static final int HIDDEN_OVERLAY = 256;
479 
480     // Update CheckResult.toString() when a new check is added.
481 
482     /**
483      * Enable all spoof checks.
484      *
485      * @stable ICU 4.6
486      */
487     public static final int ALL_CHECKS = 0xFFFFFFFF;
488 
489     // Used for checking for ASCII-Only restriction level
490     static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
491 
492     /**
493      * private constructor: a SpoofChecker has to be built by the builder
494      */
    private SpoofChecker() {
        // Intentionally empty: Builder.build() creates the instance with this constructor and then
        // populates its fields directly.
    }
497 
498     /**
499      * SpoofChecker Builder. To create a SpoofChecker, first instantiate a SpoofChecker.Builder, set the desired
500      * checking options on the builder, then call the build() function to create a SpoofChecker instance.
501      *
502      * @stable ICU 4.6
503      */
504     public static class Builder {
        int fChecks; // Bit vector of checks to perform.
        SpoofData fSpoofData; // Confusable data; null in a fresh builder until setData() or build() assigns it.
        // The UnicodeSet of allowed characters for this Spoof Checker. Defaults to all chars.
        final UnicodeSet fAllowedCharsSet = new UnicodeSet(0, 0x10ffff);
        // The list of allowed locales, kept in insertion order; empty by default.
        final Set<ULocale> fAllowedLocales = new LinkedHashSet<>();
        // Restriction level copied into the built checker; the no-arg constructor sets HIGHLY_RESTRICTIVE.
        private RestrictionLevel fRestrictionLevel;
511 
512         /**
         * Constructor: Create a default Unicode Spoof Checker Builder with all checks enabled
         * ({@link SpoofChecker#ALL_CHECKS}), every code point allowed, no allowed locales, and a restriction level of
         * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Note that additional checks may be added in the future,
         * resulting in changes to the default checking behavior.
516          *
517          * @stable ICU 4.6
518          */
Builder()519         public Builder() {
520             fChecks = ALL_CHECKS;
521             fSpoofData = null;
522             fRestrictionLevel = RestrictionLevel.HIGHLY_RESTRICTIVE;
523         }
524 
525         /**
526          * Constructor: Create a Spoof Checker Builder, and set the configuration from an existing SpoofChecker.
527          *
528          * @param src
529          *            The existing checker.
530          * @stable ICU 4.6
531          */
Builder(SpoofChecker src)532         public Builder(SpoofChecker src) {
533             fChecks = src.fChecks;
534             fSpoofData = src.fSpoofData; // For the data, we will either use the source data
535                                          // as-is, or drop the builder's reference to it
536                                          // and generate new data, depending on what our
537                                          // caller does with the builder.
538             fAllowedCharsSet.set(src.fAllowedCharsSet);
539             fAllowedLocales.addAll(src.fAllowedLocales);
540             fRestrictionLevel = src.fRestrictionLevel;
541         }
542 
543         /**
544          * Create a SpoofChecker with current configuration.
545          *
546          * @return SpoofChecker
547          * @stable ICU 4.6
548          */
build()549         public SpoofChecker build() {
550             // TODO: Make this data loading be lazy (see #12696).
551             if (fSpoofData == null) {
552                 // read binary file
553                 fSpoofData = SpoofData.getDefault();
554             }
555 
556             // Copy all state from the builder to the new SpoofChecker.
557             // Make sure that everything is either cloned or copied, so
558             // that subsequent re-use of the builder won't modify the built
559             // SpoofChecker.
560             //
561             // One exception to this: the SpoofData is just assigned.
562             // If the builder subsequently needs to modify fSpoofData
563             // it will create a new SpoofData object first.
564 
565             SpoofChecker result = new SpoofChecker();
566             result.fChecks = this.fChecks;
567             result.fSpoofData = this.fSpoofData;
568             result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
569             result.fAllowedCharsSet.freeze();
570             result.fAllowedLocales = new HashSet<>(this.fAllowedLocales);
571             result.fRestrictionLevel = this.fRestrictionLevel;
572             return result;
573         }
574 
575         /**
         * Specify the source form of the spoof data for this Spoof Checker. The input corresponds to the Unicode data
         * file confusables.txt as described in Unicode UTS 39. The syntax of the source data is as described in UTS 39
         * for this file, and the content of this file is acceptable input.
579          *
580          * @param confusables
581          *            the Reader of confusable characters definitions, as found in file confusables.txt from
582          *            unicode.org.
         * @return self
         * @throws ParseException
         *             To report syntax errors in the input.
         * @throws IOException
         *             If an I/O error occurs while reading the input.
585          *
586          * @stable ICU 58
587          */
setData(Reader confusables)588         public Builder setData(Reader confusables) throws ParseException, IOException {
589 
590             // Compile the binary data from the source (text) format.
591             // Drop the builder's reference to any pre-existing data, which may
592             // be in use in an already-built checker.
593 
594             fSpoofData = new SpoofData();
595             ConfusabledataBuilder.buildConfusableData(confusables, fSpoofData);
596             return this;
597         }
598 
599         /**
600          * Deprecated as of ICU 58; use {@link SpoofChecker.Builder#setData(Reader confusables)} instead.
601          *
602          * @param confusables
603          *            the Reader of confusable characters definitions, as found in file confusables.txt from
604          *            unicode.org.
605          * @param confusablesWholeScript
606          *            No longer supported.
607          * @throws ParseException
608          *             To report syntax errors in the input.
609          *
610          * @deprecated ICU 58
611          */
612         @Deprecated
setData(Reader confusables, Reader confusablesWholeScript)613         public Builder setData(Reader confusables, Reader confusablesWholeScript) throws ParseException, IOException {
614             setData(confusables);
615             return this;
616         }
617 
618         /**
619          * Specify the bitmask of checks that will be performed by {@link SpoofChecker#failsChecks}. Calling this method
620          * overwrites any checks that may have already been enabled. By default, all checks are enabled.
621          *
622          * To enable specific checks and disable all others, the "whitelisted" checks should be ORed together. For
623          * example, to fail strings containing characters outside of the set specified by {@link #setAllowedChars} and
624          * also strings that contain digits from mixed numbering systems:
625          *
626          * <pre>
627          * {@code
628          * builder.setChecks(SpoofChecker.CHAR_LIMIT | SpoofChecker.MIXED_NUMBERS);
629          * }
630          * </pre>
631          *
632          * To disable specific checks and enable all others, the "blacklisted" checks should be ANDed away from
633          * ALL_CHECKS. For example, if you are not planning to use the {@link SpoofChecker#areConfusable} functionality,
634          * it is good practice to disable the CONFUSABLE check:
635          *
636          * <pre>
637          * {@code
638          * builder.setChecks(SpoofChecker.ALL_CHECKS & ~SpoofChecker.CONFUSABLE);
639          * }
640          * </pre>
641          *
642          * Note that methods such as {@link #setAllowedChars}, {@link #setAllowedLocales}, and
643          * {@link #setRestrictionLevel} will enable certain checks when called. Those methods will OR the check they
644          * enable onto the existing bitmask specified by this method. For more details, see the documentation of those
645          * methods.
646          *
647          * @param checks
648          *            The set of checks that this spoof checker will perform. The value is an 'or' of the desired
649          *            checks.
650          * @return self
651          * @stable ICU 4.6
652          */
setChecks(int checks)653         public Builder setChecks(int checks) {
654             // Verify that the requested checks are all ones (bits) that
655             // are acceptable, known values.
656             if (0 != (checks & ~SpoofChecker.ALL_CHECKS)) {
657                 throw new IllegalArgumentException("Bad Spoof Checks value.");
658             }
659             this.fChecks = (checks & SpoofChecker.ALL_CHECKS);
660             return this;
661         }
662 
663         /**
664          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
665          * associated with the specified locales. Any previously specified list of locales is replaced by the new
666          * settings.
667          *
668          * A set of languages is determined from the locale(s), and from those a set of acceptable Unicode scripts is
669          * determined. Characters from this set of scripts, along with characters from the "common" and "inherited"
670          * Unicode Script categories will be permitted.
671          *
         * Supplying an empty set removes all restrictions; characters from any script will be allowed.
673          *
674          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker when calling this function with a
675          * non-empty list of locales.
676          *
677          * The Unicode Set of characters that will be allowed is accessible via the {@link #getAllowedChars} function.
678          * setAllowedLocales() will <i>replace</i> any previously applied set of allowed characters.
679          *
680          * Adjustments, such as additions or deletions of certain classes of characters, can be made to the result of
681          * {@link #setAllowedChars} by fetching the resulting set with {@link #getAllowedChars}, manipulating it with
682          * the Unicode Set API, then resetting the spoof detectors limits with {@link #setAllowedChars}.
683          *
684          * @param locales
685          *            A Set of ULocales, from which the language and associated script are extracted. If the locales Set
686          *            is null, no restrictions will be placed on the allowed characters.
687          *
688          * @return self
689          * @stable ICU 4.6
690          */
setAllowedLocales(Set<ULocale> locales)691         public Builder setAllowedLocales(Set<ULocale> locales) {
692             fAllowedCharsSet.clear();
693 
694             for (ULocale locale : locales) {
695                 // Add the script chars for this locale to the accumulating set
696                 // of allowed chars.
697                 addScriptChars(locale, fAllowedCharsSet);
698             }
699 
700             // If our caller provided an empty list of locales, we disable the
701             // allowed characters checking
702             fAllowedLocales.clear();
703             if (locales.size() == 0) {
704                 fAllowedCharsSet.add(0, 0x10ffff);
705                 fChecks &= ~CHAR_LIMIT;
706                 return this;
707             }
708 
709             // Add all common and inherited characters to the set of allowed
710             // chars.
711             UnicodeSet tempSet = new UnicodeSet();
712             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.COMMON);
713             fAllowedCharsSet.addAll(tempSet);
714             tempSet.applyIntPropertyValue(UProperty.SCRIPT, UScript.INHERITED);
715             fAllowedCharsSet.addAll(tempSet);
716 
717             // Store the updated spoof checker state.
718             fAllowedLocales.clear();
719             fAllowedLocales.addAll(locales);
720             fChecks |= CHAR_LIMIT;
721             return this;
722         }
723 
724         /**
725          * Limit characters that are acceptable in identifiers being checked to those normally used with the languages
726          * associated with the specified locales. Any previously specified list of locales is replaced by the new
727          * settings.
728          *
729          * @param locales
730          *            A Set of Locales, from which the language and associated script are extracted. If the locales Set
731          *            is null, no restrictions will be placed on the allowed characters.
732          *
733          * @return self
734          * @stable ICU 54
735          */
setAllowedJavaLocales(Set<Locale> locales)736         public Builder setAllowedJavaLocales(Set<Locale> locales) {
737             HashSet<ULocale> ulocales = new HashSet<>(locales.size());
738             for (Locale locale : locales) {
739                 ulocales.add(ULocale.forLocale(locale));
740             }
741             return setAllowedLocales(ulocales);
742         }
743 
744         // Add (union) to the UnicodeSet all of the characters for the scripts
745         // used for the specified locale. Part of the implementation of
746         // setAllowedLocales.
addScriptChars(ULocale locale, UnicodeSet allowedChars)747         private void addScriptChars(ULocale locale, UnicodeSet allowedChars) {
748             int scripts[] = UScript.getCode(locale);
749             if (scripts != null) {
750                 UnicodeSet tmpSet = new UnicodeSet();
751                 for (int i = 0; i < scripts.length; i++) {
752                     tmpSet.applyIntPropertyValue(UProperty.SCRIPT, scripts[i]);
753                     allowedChars.addAll(tmpSet);
754                 }
755             }
756             // else it's an unknown script.
757             // Maybe they asked for the script of "zxx", which refers to no linguistic content.
758             // Maybe they asked for the script of a newer locale that we don't know in the older version of ICU.
759         }
760 
761         /**
762          * Limit the acceptable characters to those specified by a Unicode Set. Any previously specified character limit
         * is replaced by the new settings. This includes limits on characters that were set with the
764          * setAllowedLocales() function. Note that the RESTRICTED set is useful.
765          *
766          * The {@link #CHAR_LIMIT} test is automatically enabled for this SpoofChecker by this function.
767          *
768          * @param chars
769          *            A Unicode Set containing the list of characters that are permitted. The incoming set is cloned by
770          *            this function, so there are no restrictions on modifying or deleting the UnicodeSet after calling
771          *            this function. Note that this clears the allowedLocales set.
772          * @return self
773          * @stable ICU 4.6
774          */
setAllowedChars(UnicodeSet chars)775         public Builder setAllowedChars(UnicodeSet chars) {
776             fAllowedCharsSet.set(chars);
777             fAllowedLocales.clear();
778             fChecks |= CHAR_LIMIT;
779             return this;
780         }
781 
782         /**
783          * Set the loosest restriction level allowed for strings. The default if this is not called is
784          * {@link RestrictionLevel#HIGHLY_RESTRICTIVE}. Calling this method enables the {@link #RESTRICTION_LEVEL} and
785          * {@link #MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
786          * to be performed by {@link SpoofChecker#failsChecks}, see {@link #setChecks}.
787          *
788          * @param restrictionLevel
789          *            The loosest restriction level allowed.
790          * @return self
791          * @provisional This API might change or be removed in a future release.
792          * @stable ICU 58
793          */
setRestrictionLevel(RestrictionLevel restrictionLevel)794         public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
795             fRestrictionLevel = restrictionLevel;
796             fChecks |= RESTRICTION_LEVEL | MIXED_NUMBERS;
797             return this;
798         }
799 
800         /*
801          * *****************************************************************************
         * Internal classes for compiling confusable data into its binary (runtime) form.
803          * *****************************************************************************
804          */
805         // ---------------------------------------------------------------------
806         //
807         // buildConfusableData Compile the source confusable data, as defined by
808         // the Unicode data file confusables.txt, into the binary
809         // structures used by the confusable detector.
810         //
811         // The binary structures are described in uspoof_impl.h
812         //
813         // 1. parse the data, making a hash table mapping from a codepoint to a String.
814         //
815         // 2. Sort all of the strings encountered by length, since they will need to
816         // be stored in that order in the final string table.
817         // TODO: Sorting these strings by length is no longer needed since the removal of
818         // the string lengths table.  This logic can be removed to save processing time
819         // when building confusables data.
820         //
821         // 3. Build a list of keys (UChar32s) from the mapping table. Sort the
822         // list because that will be the ordering of our runtime table.
823         //
824         // 4. Generate the run time string table. This is generated before the key & value
825         // table because we need the string indexes when building those tables.
826         //
827         // 5. Build the run-time key and value table. These are parallel tables, and
828         // are built at the same time
829 
830         // class ConfusabledataBuilder
831         // An instance of this class exists while the confusable data is being built from source.
832         // It encapsulates the intermediate data structures that are used for building.
833         // It exports one static function, to do a confusable data build.
834         private static class ConfusabledataBuilder {
835 
836             private Hashtable<Integer, SPUString> fTable;
837             private UnicodeSet fKeySet; // A set of all keys (UChar32s) that go into the
838                                         // four mapping tables.
839 
840             // The compiled data is first assembled into the following four collections,
841             // then output to the builder's SpoofData object.
842             private StringBuffer fStringTable;
843             private ArrayList<Integer> fKeyVec;
844             private ArrayList<Integer> fValueVec;
845             private SPUStringPool stringPool;
846             private Pattern fParseLine;
847             private Pattern fParseHexNum;
848             private int fLineNum;
849 
ConfusabledataBuilder()850             ConfusabledataBuilder() {
851                 fTable = new Hashtable<>();
852                 fKeySet = new UnicodeSet();
853                 fKeyVec = new ArrayList<>();
854                 fValueVec = new ArrayList<>();
855                 stringPool = new SPUStringPool();
856             }
857 
build(Reader confusables, SpoofData dest)858             void build(Reader confusables, SpoofData dest) throws ParseException, java.io.IOException {
859                 StringBuffer fInput = new StringBuffer();
860 
861                 // Convert the user input data from UTF-8 to char (UTF-16)
862                 LineNumberReader lnr = new LineNumberReader(confusables);
863                 do {
864                     String line = lnr.readLine();
865                     if (line == null) {
866                         break;
867                     }
868                     fInput.append(line);
869                     fInput.append('\n');
870                 } while (true);
871 
872                 // Regular Expression to parse a line from Confusables.txt. The expression will match
873                 // any line. What was matched is determined by examining which capture groups have a match.
874                 // Capture Group 1: the source char
875                 // Capture Group 2: the replacement chars
876                 // Capture Group 3-6 the table type, SL, SA, ML, or MA (deprecated)
877                 // Capture Group 7: A blank or comment only line.
878                 // Capture Group 8: A syntactically invalid line. Anything that didn't match before.
879                 // Example Line from the confusables.txt source file:
880                 // "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
881                 fParseLine = Pattern.compile("(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" + // Match the source char
882                         "[ \\t]*([0-9A-Fa-f]+" + // Match the replacement char(s)
883                         "(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" + // (continued)
884                         "\\s*(?:(SL)|(SA)|(ML)|(MA))" + // Match the table type
885                         "[ \\t]*(?:#.*?)?$" + // Match any trailing #comment
886                         "|^([ \\t]*(?:#.*?)?)$" + // OR match empty lines or lines with only a #comment
887                         "|^(.*?)$"); // OR match any line, which catches illegal lines.
888 
889                 // Regular expression for parsing a hex number out of a space-separated list of them.
890                 // Capture group 1 gets the number, with spaces removed.
891                 fParseHexNum = Pattern.compile("\\s*([0-9A-F]+)");
892 
893                 // Zap any Byte Order Mark at the start of input. Changing it to a space
894                 // is benign given the syntax of the input.
895                 if (fInput.charAt(0) == 0xfeff) {
896                     fInput.setCharAt(0, (char) 0x20);
897                 }
898 
899                 // Parse the input, one line per iteration of this loop.
900                 Matcher matcher = fParseLine.matcher(fInput);
901                 while (matcher.find()) {
902                     fLineNum++;
903                     if (matcher.start(7) >= 0) {
904                         // this was a blank or comment line.
905                         continue;
906                     }
907                     if (matcher.start(8) >= 0) {
908                         // input file syntax error.
909                         // status = U_PARSE_ERROR;
910                         throw new ParseException(
911                                 "Confusables, line " + fLineNum + ": Unrecognized Line: " + matcher.group(8),
912                                 matcher.start(8));
913                     }
914 
915                     // We have a good input line. Extract the key character and mapping
916                     // string, and
917                     // put them into the appropriate mapping table.
918                     int keyChar = Integer.parseInt(matcher.group(1), 16);
919                     if (keyChar > 0x10ffff) {
920                         throw new ParseException(
921                                 "Confusables, line " + fLineNum + ": Bad code point: " + matcher.group(1),
922                                 matcher.start(1));
923                     }
924                     Matcher m = fParseHexNum.matcher(matcher.group(2));
925 
926                     StringBuilder mapString = new StringBuilder();
927                     while (m.find()) {
928                         int c = Integer.parseInt(m.group(1), 16);
929                         if (c > 0x10ffff) {
930                             throw new ParseException(
931                                     "Confusables, line " + fLineNum + ": Bad code point: " + Integer.toString(c, 16),
932                                     matcher.start(2));
933                         }
934                         mapString.appendCodePoint(c);
935                     }
936                     assert (mapString.length() >= 1);
937 
938                     // Put the map (value) string into the string pool
939                     // This a little like a Java intern() - any duplicates will be
940                     // eliminated.
941                     SPUString smapString = stringPool.addString(mapString.toString());
942 
943                     // Add the char . string mapping to the table.
944                     // For Unicode 8, the SL, SA and ML tables have been discontinued.
945                     // All input data from confusables.txt is tagged MA.
946                     fTable.put(keyChar, smapString);
947 
948                     fKeySet.add(keyChar);
949                 }
950 
951                 // Input data is now all parsed and collected.
952                 // Now create the run-time binary form of the data.
953                 //
954                 // This is done in two steps. First the data is assembled into vectors and strings,
955                 // for ease of construction, then the contents of these collections are copied
956                 // into the actual SpoofData object.
957 
958                 // Build up the string array, and record the index of each string therein
959                 // in the (build time only) string pool.
960                 // Strings of length one are not entered into the strings array.
961                 // (Strings in the table are sorted by length)
962 
963                 stringPool.sort();
964                 fStringTable = new StringBuffer();
965                 int poolSize = stringPool.size();
966                 int i;
967                 for (i = 0; i < poolSize; i++) {
968                     SPUString s = stringPool.getByIndex(i);
969                     int strLen = s.fStr.length();
970                     int strIndex = fStringTable.length();
971                     if (strLen == 1) {
972                         // strings of length one do not get an entry in the string table.
973                         // Keep the single string character itself here, which is the same
974                         // convention that is used in the final run-time string table index.
975                         s.fCharOrStrTableIndex = s.fStr.charAt(0);
976                     } else {
977                         s.fCharOrStrTableIndex = strIndex;
978                         fStringTable.append(s.fStr);
979                     }
980                 }
981 
982                 // Construct the compile-time Key and Value table.
983                 //
984                 // The keys in the Key table follow the format described in uspoof.h for the
985                 // Cfu confusables data structure.
986                 //
987                 // Starting in ICU 58, each code point has exactly one entry in the data
988                 // structure.
989 
990                 for (String keyCharStr : fKeySet) {
991                     int keyChar = keyCharStr.codePointAt(0);
992                     SPUString targetMapping = fTable.get(keyChar);
993                     assert targetMapping != null;
994 
995                     // Throw a sane exception if trying to consume a long string.  Otherwise,
996                     // codePointAndLengthToKey will throw an assertion error.
997                     if (targetMapping.fStr.length() > 256) {
998                         throw new IllegalArgumentException("Confusable prototypes cannot be longer than 256 entries.");
999                     }
1000 
1001                     int key = ConfusableDataUtils.codePointAndLengthToKey(keyChar, targetMapping.fStr.length());
1002                     int value = targetMapping.fCharOrStrTableIndex;
1003 
1004                     fKeyVec.add(key);
1005                     fValueVec.add(value);
1006                 }
1007 
1008                 // Put the assembled data into the destination SpoofData object.
1009 
1010                 // The Key Table
1011                 // While copying the keys to the output array,
1012                 // also sanity check that the keys are sorted.
1013                 int numKeys = fKeyVec.size();
1014                 dest.fCFUKeys = new int[numKeys];
1015                 int previousCodePoint = 0;
1016                 for (i = 0; i < numKeys; i++) {
1017                     int key = fKeyVec.get(i);
1018                     int codePoint = ConfusableDataUtils.keyToCodePoint(key);
1019                     // strictly greater because there can be only one entry per code point
1020                     assert codePoint > previousCodePoint;
1021                     dest.fCFUKeys[i] = key;
1022                     previousCodePoint = codePoint;
1023                 }
1024 
1025                 // The Value Table, parallels the key table
1026                 int numValues = fValueVec.size();
1027                 assert (numKeys == numValues);
1028                 dest.fCFUValues = new short[numValues];
1029                 i = 0;
1030                 for (int value : fValueVec) {
1031                     assert (value < 0xffff);
1032                     dest.fCFUValues[i++] = (short) value;
1033                 }
1034 
1035                 // The Strings Table.
1036                 dest.fCFUStrings = fStringTable.toString();
1037             }
1038 
1039             public static void buildConfusableData(Reader confusables, SpoofData dest)
1040                     throws java.io.IOException, ParseException {
1041                 ConfusabledataBuilder builder = new ConfusabledataBuilder();
1042                 builder.build(confusables, dest);
1043             }
1044 
1045             /*
1046              * *****************************************************************************
1047              * Internal classes for compiling confusable data into its binary (runtime) form.
1048              * *****************************************************************************
1049              */
1050             // SPUString
1051             // Holds a string that is the result of one of the mappings defined
1052             // by the confusable mapping data (confusables.txt from Unicode.org)
1053             // Instances of SPUString exist during the compilation process only.
1054 
1055             private static class SPUString {
1056                 String fStr; // The actual string.
1057                 int fCharOrStrTableIndex; // Index into the final runtime data for this string.
1058                 // (or, for length 1, the single string char itself,
1059                 // there being no string table entry for it.)
1060 
1061                 SPUString(String s) {
1062                     fStr = s;
1063                     fCharOrStrTableIndex = 0;
1064                 }
1065             }
1066 
1067             // Comparison function for ordering strings in the string pool.
1068             // Compare by length first, then, within a group of the same length,
1069             // by code point order.
1070 
1071             private static class SPUStringComparator implements Comparator<SPUString> {
1072                 @Override
1073                 public int compare(SPUString sL, SPUString sR) {
1074                     int lenL = sL.fStr.length();
1075                     int lenR = sR.fStr.length();
1076                     if (lenL < lenR) {
1077                         return -1;
1078                     } else if (lenL > lenR) {
1079                         return 1;
1080                     } else {
1081                         return sL.fStr.compareTo(sR.fStr);
1082                     }
1083                 }
1084 
1085                 final static SPUStringComparator INSTANCE = new SPUStringComparator();
1086             }
1087 
1088             // String Pool A utility class for holding the strings that are the result of
1089             // the spoof mappings. These strings will utimately end up in the
1090             // run-time String Table.
1091             // This is sort of like a sorted set of strings, except that ICU's anemic
1092             // built-in collections don't support those, so it is implemented with a
1093             // combination of a uhash and a Vector.
1094             private static class SPUStringPool {
1095                 public SPUStringPool() {
1096                     fVec = new Vector<>();
1097                     fHash = new Hashtable<>();
1098                 }
1099 
1100                 public int size() {
1101                     return fVec.size();
1102                 }
1103 
1104                 // Get the n-th string in the collection.
1105                 public SPUString getByIndex(int index) {
1106                     SPUString retString = fVec.elementAt(index);
1107                     return retString;
1108                 }
1109 
1110                 // Add a string. Return the string from the table.
1111                 // If the input parameter string is already in the table, delete the
1112                 // input parameter and return the existing string.
1113                 public SPUString addString(String src) {
1114                     SPUString hashedString = fHash.get(src);
1115                     if (hashedString == null) {
1116                         hashedString = new SPUString(src);
1117                         fHash.put(src, hashedString);
1118                         fVec.addElement(hashedString);
1119                     }
1120                     return hashedString;
1121                 }
1122 
1123                 // Sort the contents; affects the ordering of getByIndex().
1124                 public void sort() {
1125                     Collections.sort(fVec, SPUStringComparator.INSTANCE);
1126                 }
1127 
1128                 private Vector<SPUString> fVec; // Elements are SPUString *
1129                 private Hashtable<String, SPUString> fHash; // Key: Value:
1130             }
1131 
1132         }
1133     }
1134 
1135     /**
1136      * Get the Restriction Level that is being tested.
1137      *
1138      * @return The restriction level
1139      * @internal
1140      * @deprecated This API is ICU internal only.
1141      */
1142     @Deprecated
1143     public RestrictionLevel getRestrictionLevel() {
1144         return fRestrictionLevel;
1145     }
1146 
1147     /**
1148      * Get the set of checks that this Spoof Checker has been configured to perform.
1149      *
1150      * @return The set of checks that this spoof checker will perform.
1151      * @stable ICU 4.6
1152      */
1153     public int getChecks() {
1154         return fChecks;
1155     }
1156 
1157     /**
1158      * Get a read-only set of locales for the scripts that are acceptable in strings to be checked. If no limitations on
1159      * scripts have been specified, an empty set will be returned.
1160      *
1161      * setAllowedChars() will reset the list of allowed locales to be empty.
1162      *
1163      * The returned set may not be identical to the originally specified set that is supplied to setAllowedLocales();
1164      * the information other than languages from the originally specified locales may be omitted.
1165      *
1166      * @return A set of locales corresponding to the acceptable scripts.
1167      *
1168      * @stable ICU 4.6
1169      */
1170     public Set<ULocale> getAllowedLocales() {
1171         return Collections.unmodifiableSet(fAllowedLocales);
1172     }
1173 
1174     /**
1175      * Get a set of {@link java.util.Locale} instances for the scripts that are acceptable in strings to be checked. If
1176      * no limitations on scripts have been specified, an empty set will be returned.
1177      *
1178      * @return A set of locales corresponding to the acceptable scripts.
1179      * @stable ICU 54
1180      */
1181     public Set<Locale> getAllowedJavaLocales() {
1182         HashSet<Locale> locales = new HashSet<>(fAllowedLocales.size());
1183         for (ULocale uloc : fAllowedLocales) {
1184             locales.add(uloc.toLocale());
1185         }
1186         return locales;
1187     }
1188 
1189     /**
1190      * Get a UnicodeSet for the characters permitted in an identifier. This corresponds to the limits imposed by the Set
1191      * Allowed Characters functions. Limitations imposed by other checks will not be reflected in the set returned by
1192      * this function.
1193      *
1194      * The returned set will be frozen, meaning that it cannot be modified by the caller.
1195      *
1196      * @return A UnicodeSet containing the characters that are permitted by the CHAR_LIMIT test.
1197      * @stable ICU 4.6
1198      */
1199     public UnicodeSet getAllowedChars() {
1200         return fAllowedCharsSet;
1201     }
1202 
1203     /**
1204      * A struct-like class to hold the results of a Spoof Check operation. Tells which check(s) have failed.
1205      *
1206      * @stable ICU 4.6
1207      */
1208     public static class CheckResult {
1209         /**
1210          * Indicates which of the spoof check(s) have failed. The value is a bitwise OR of the constants for the tests
1211          * in question: RESTRICTION_LEVEL, CHAR_LIMIT, and so on.
1212          *
1213          * @stable ICU 4.6
1214          * @see Builder#setChecks
1215          */
1216         public int checks;
1217 
1218         /**
1219          * The index of the first string position that failed a check.
1220          *
1221          * @deprecated ICU 51. No longer supported. Always set to zero.
1222          */
1223         @Deprecated
1224         public int position;
1225 
1226         /**
1227          * The numerics found in the string, if MIXED_NUMBERS was set; otherwise null.  The set will contain the zero
1228          * digit from each decimal number system found in the input string.
1229          *
1230          * @stable ICU 58
1231          */
1232         public UnicodeSet numerics;
1233 
1234         /**
1235          * The restriction level that the text meets, if RESTRICTION_LEVEL is set; otherwise null.
1236          *
1237          * @stable ICU 58
1238          */
1239         public RestrictionLevel restrictionLevel;
1240 
1241         /**
1242          * Default constructor
1243          *
1244          * @stable ICU 4.6
1245          */
1246         public CheckResult() {
1247             checks = 0;
1248             position = 0;
1249         }
1250 
1251         /**
1252          * {@inheritDoc}
1253          *
1254          * @stable ICU 4.6
1255          */
1256         @Override
1257         public String toString() {
1258             StringBuilder sb = new StringBuilder();
1259             sb.append("checks:");
1260             if (checks == 0) {
1261                 sb.append(" none");
1262             } else if (checks == ALL_CHECKS) {
1263                 sb.append(" all");
1264             } else {
1265                 if ((checks & SINGLE_SCRIPT_CONFUSABLE) != 0) {
1266                     sb.append(" SINGLE_SCRIPT_CONFUSABLE");
1267                 }
1268                 if ((checks & MIXED_SCRIPT_CONFUSABLE) != 0) {
1269                     sb.append(" MIXED_SCRIPT_CONFUSABLE");
1270                 }
1271                 if ((checks & WHOLE_SCRIPT_CONFUSABLE) != 0) {
1272                     sb.append(" WHOLE_SCRIPT_CONFUSABLE");
1273                 }
1274                 if ((checks & ANY_CASE) != 0) {
1275                     sb.append(" ANY_CASE");
1276                 }
1277                 if ((checks & RESTRICTION_LEVEL) != 0) {
1278                     sb.append(" RESTRICTION_LEVEL");
1279                 }
1280                 if ((checks & INVISIBLE) != 0) {
1281                     sb.append(" INVISIBLE");
1282                 }
1283                 if ((checks & CHAR_LIMIT) != 0) {
1284                     sb.append(" CHAR_LIMIT");
1285                 }
1286                 if ((checks & MIXED_NUMBERS) != 0) {
1287                     sb.append(" MIXED_NUMBERS");
1288                 }
1289             }
1290             sb.append(", numerics: ").append(numerics.toPattern(false));
1291             sb.append(", position: ").append(position);
1292             sb.append(", restrictionLevel: ").append(restrictionLevel);
1293             return sb.toString();
1294         }
1295     }
1296 
1297     /**
1298      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1299      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1300      *
1301      * @param text
1302      *            A String to be checked for possible security issues.
1303      * @param checkResult
1304      *            Output parameter, indicates which specific tests failed. May be null if the information is not wanted.
1305      * @return True there any issue is found with the input string.
1306      * @stable ICU 4.8
1307      */
1308     public boolean failsChecks(String text, CheckResult checkResult) {
1309         int length = text.length();
1310 
1311         int result = 0;
1312         if (checkResult != null) {
1313             checkResult.position = 0;
1314             checkResult.numerics = null;
1315             checkResult.restrictionLevel = null;
1316         }
1317 
1318         if (0 != (this.fChecks & RESTRICTION_LEVEL)) {
1319             RestrictionLevel textRestrictionLevel = getRestrictionLevel(text);
1320             if (textRestrictionLevel.compareTo(fRestrictionLevel) > 0) {
1321                 result |= RESTRICTION_LEVEL;
1322             }
1323             if (checkResult != null) {
1324                 checkResult.restrictionLevel = textRestrictionLevel;
1325             }
1326         }
1327 
1328         if (0 != (this.fChecks & MIXED_NUMBERS)) {
1329             UnicodeSet numerics = new UnicodeSet();
1330             getNumerics(text, numerics);
1331             if (numerics.size() > 1) {
1332                 result |= MIXED_NUMBERS;
1333             }
1334             if (checkResult != null) {
1335                 checkResult.numerics = numerics;
1336             }
1337         }
1338 
1339         if (0 != (this.fChecks & HIDDEN_OVERLAY)) {
1340             int index = findHiddenOverlay(text);
1341             if (index != -1) {
1342                 result |= HIDDEN_OVERLAY;
1343             }
1344         }
1345 
1346         if (0 != (this.fChecks & CHAR_LIMIT)) {
1347             int i;
1348             int c;
1349             for (i = 0; i < length;) {
1350                 // U16_NEXT(text, i, length, c);
1351                 c = Character.codePointAt(text, i);
1352                 i = Character.offsetByCodePoints(text, i, 1);
1353                 if (!this.fAllowedCharsSet.contains(c)) {
1354                     result |= CHAR_LIMIT;
1355                     break;
1356                 }
1357             }
1358         }
1359 
1360         if (0 != (this.fChecks & INVISIBLE)) {
1361             // This check needs to be done on NFD input
1362             String nfdText = nfdNormalizer.normalize(text);
1363 
1364             // scan for more than one occurrence of the same non-spacing mark
1365             // in a sequence of non-spacing marks.
1366             int i;
1367             int c;
1368             int firstNonspacingMark = 0;
1369             boolean haveMultipleMarks = false;
1370             UnicodeSet marksSeenSoFar = new UnicodeSet(); // Set of combining marks in a
1371                                                           // single combining sequence.
1372             for (i = 0; i < length;) {
1373                 c = Character.codePointAt(nfdText, i);
1374                 i = Character.offsetByCodePoints(nfdText, i, 1);
1375                 if (Character.getType(c) != UCharacterCategory.NON_SPACING_MARK) {
1376                     firstNonspacingMark = 0;
1377                     if (haveMultipleMarks) {
1378                         marksSeenSoFar.clear();
1379                         haveMultipleMarks = false;
1380                     }
1381                     continue;
1382                 }
1383                 if (firstNonspacingMark == 0) {
1384                     firstNonspacingMark = c;
1385                     continue;
1386                 }
1387                 if (!haveMultipleMarks) {
1388                     marksSeenSoFar.add(firstNonspacingMark);
1389                     haveMultipleMarks = true;
1390                 }
1391                 if (marksSeenSoFar.contains(c)) {
1392                     // report the error, and stop scanning.
1393                     // No need to find more than the first failure.
1394                     result |= INVISIBLE;
1395                     break;
1396                 }
1397                 marksSeenSoFar.add(c);
1398             }
1399         }
1400         if (checkResult != null) {
1401             checkResult.checks = result;
1402         }
1403         return (0 != result);
1404     }
1405 
1406     /**
1407      * Check the specified string for possible security issues. The text to be checked will typically be an identifier
1408      * of some sort. The set of checks to be performed was specified when building the SpoofChecker.
1409      *
1410      * @param text
1411      *            A String to be checked for possible security issues.
1412      * @return True there any issue is found with the input string.
1413      * @stable ICU 4.8
1414      */
failsChecks(String text)1415     public boolean failsChecks(String text) {
1416         return failsChecks(text, null);
1417     }
1418 
1419     /**
1420      * Check the whether two specified strings are visually confusable. The types of confusability to be tested - single
1421      * script, mixed script, or whole script - are determined by the check options set for the SpoofChecker.
1422      *
1423      * The tests to be performed are controlled by the flags SINGLE_SCRIPT_CONFUSABLE MIXED_SCRIPT_CONFUSABLE
1424      * WHOLE_SCRIPT_CONFUSABLE At least one of these tests must be selected.
1425      *
1426      * ANY_CASE is a modifier for the tests. Select it if the identifiers may be of mixed case. If identifiers are case
1427      * folded for comparison and display to the user, do not select the ANY_CASE option.
1428      *
1429      *
1430      * @param s1
1431      *            The first of the two strings to be compared for confusability.
1432      * @param s2
1433      *            The second of the two strings to be compared for confusability.
1434      * @return Non-zero if s1 and s1 are confusable. If not 0, the value will indicate the type(s) of confusability
1435      *         found, as defined by spoof check test constants.
1436      * @stable ICU 4.6
1437      */
areConfusable(String s1, String s2)1438     public int areConfusable(String s1, String s2) {
1439         //
1440         // See section 4 of UAX 39 for the algorithm for checking whether two strings are confusable,
1441         // and for definitions of the types (single, whole, mixed-script) of confusables.
1442 
1443         // We only care about a few of the check flags. Ignore the others.
1444         // If no tests relevant to this function have been specified, signal an error.
1445         // TODO: is this really the right thing to do? It's probably an error on
1446         // the caller's part, but logically we would just return 0 (no error).
1447         if ((this.fChecks & CONFUSABLE) == 0) {
1448             throw new IllegalArgumentException("No confusable checks are enabled.");
1449         }
1450 
1451         // Compute the skeletons and check for confusability.
1452         String s1Skeleton = getSkeleton(s1);
1453         String s2Skeleton = getSkeleton(s2);
1454         if (!s1Skeleton.equals(s2Skeleton)) {
1455             return 0;
1456         }
1457 
1458         // If we get here, the strings are confusable. Now we just need to set the flags for the appropriate classes
1459         // of confusables according to UTS 39 section 4.
1460         // Start by computing the resolved script sets of s1 and s2.
1461         ScriptSet s1RSS = new ScriptSet();
1462         getResolvedScriptSet(s1, s1RSS);
1463         ScriptSet s2RSS = new ScriptSet();
1464         getResolvedScriptSet(s2, s2RSS);
1465 
1466         // Turn on all applicable flags
1467         int result = 0;
1468         if (s1RSS.intersects(s2RSS)) {
1469             result |= SINGLE_SCRIPT_CONFUSABLE;
1470         } else {
1471             result |= MIXED_SCRIPT_CONFUSABLE;
1472             if (!s1RSS.isEmpty() && !s2RSS.isEmpty()) {
1473                 result |= WHOLE_SCRIPT_CONFUSABLE;
1474             }
1475         }
1476 
1477         // Turn off flags that the user doesn't want
1478         result &= fChecks;
1479 
1480         return result;
1481     }
1482 
1483     /**
1484      * Get the "skeleton" for an identifier string. Skeletons are a transformation of the input string; Two strings are
1485      * confusable if their skeletons are identical. See Unicode UAX 39 for additional information.
1486      *
1487      * Using skeletons directly makes it possible to quickly check whether an identifier is confusable with any of some
1488      * large set of existing identifiers, by creating an efficiently searchable collection of the skeletons.
1489      *
1490      * Skeletons are computed using the algorithm and data described in Unicode UAX 39.
1491      *
1492      * @param str
1493      *            The input string whose skeleton will be generated.
1494      * @return The output skeleton string.
1495      *
1496      * @stable ICU 58
1497      */
getSkeleton(CharSequence str)1498     public String getSkeleton(CharSequence str) {
1499         // Apply the skeleton mapping to the NFD normalized input string
1500         // Accumulate the skeleton, possibly unnormalized, in a String.
1501         String nfdId = nfdNormalizer.normalize(str);
1502         int normalizedLen = nfdId.length();
1503         StringBuilder skelSB = new StringBuilder();
1504         for (int inputIndex = 0; inputIndex < normalizedLen;) {
1505             int c = Character.codePointAt(nfdId, inputIndex);
1506             inputIndex += Character.charCount(c);
1507             this.fSpoofData.confusableLookup(c, skelSB);
1508         }
1509         String skelStr = skelSB.toString();
1510         skelStr = nfdNormalizer.normalize(skelStr);
1511         return skelStr;
1512     }
1513 
1514     /**
1515      * Calls {@link SpoofChecker#getSkeleton(CharSequence id)}. Starting with ICU 55, the "type" parameter has been
1516      * ignored, and starting with ICU 58, this function has been deprecated.
1517      *
1518      * @param type
1519      *            No longer supported. Prior to ICU 55, was used to specify the mapping table SL, SA, ML, or MA.
1520      * @param id
1521      *            The input identifier whose skeleton will be generated.
1522      * @return The output skeleton string.
1523      *
1524      * @deprecated ICU 58
1525      */
1526     @Deprecated
getSkeleton(int type, String id)1527     public String getSkeleton(int type, String id) {
1528         return getSkeleton(id);
1529     }
1530 
1531     /**
1532      * Equality function. Return true if the two SpoofChecker objects incorporate the same confusable data and have
1533      * enabled the same set of checks.
1534      *
1535      * @param other
1536      *            the SpoofChecker being compared with.
1537      * @return true if the two SpoofCheckers are equal.
1538      * @stable ICU 4.6
1539      */
1540     @Override
equals(Object other)1541     public boolean equals(Object other) {
1542         if (!(other instanceof SpoofChecker)) {
1543             return false;
1544         }
1545         SpoofChecker otherSC = (SpoofChecker) other;
1546         if (fSpoofData != otherSC.fSpoofData && fSpoofData != null && !fSpoofData.equals(otherSC.fSpoofData)) {
1547             return false;
1548         }
1549         if (fChecks != otherSC.fChecks) {
1550             return false;
1551         }
1552         if (fAllowedLocales != otherSC.fAllowedLocales && fAllowedLocales != null
1553                 && !fAllowedLocales.equals(otherSC.fAllowedLocales)) {
1554             return false;
1555         }
1556         if (fAllowedCharsSet != otherSC.fAllowedCharsSet && fAllowedCharsSet != null
1557                 && !fAllowedCharsSet.equals(otherSC.fAllowedCharsSet)) {
1558             return false;
1559         }
1560         if (fRestrictionLevel != otherSC.fRestrictionLevel) {
1561             return false;
1562         }
1563         return true;
1564     }
1565 
1566     /**
1567      * Overrides {@link Object#hashCode()}.
1568      * @stable ICU 4.6
1569      */
1570     @Override
hashCode()1571     public int hashCode() {
1572         return fChecks
1573                 ^ fSpoofData.hashCode()
1574                 ^ fAllowedLocales.hashCode()
1575                 ^ fAllowedCharsSet.hashCode()
1576                 ^ fRestrictionLevel.ordinal();
1577     }
1578 
1579     /**
1580      * Computes the augmented script set for a code point, according to UTS 39 section 5.1.
1581      */
getAugmentedScriptSet(int codePoint, ScriptSet result)1582     private static void getAugmentedScriptSet(int codePoint, ScriptSet result) {
1583         result.clear();
1584         UScript.getScriptExtensions(codePoint, result);
1585 
1586         // Section 5.1 step 1
1587         if (result.get(UScript.HAN)) {
1588             result.set(UScript.HAN_WITH_BOPOMOFO);
1589             result.set(UScript.JAPANESE);
1590             result.set(UScript.KOREAN);
1591         }
1592         if (result.get(UScript.HIRAGANA)) {
1593             result.set(UScript.JAPANESE);
1594         }
1595         if (result.get(UScript.KATAKANA)) {
1596             result.set(UScript.JAPANESE);
1597         }
1598         if (result.get(UScript.HANGUL)) {
1599             result.set(UScript.KOREAN);
1600         }
1601         if (result.get(UScript.BOPOMOFO)) {
1602             result.set(UScript.HAN_WITH_BOPOMOFO);
1603         }
1604 
1605         // Section 5.1 step 2
1606         if (result.get(UScript.COMMON) || result.get(UScript.INHERITED)) {
1607             result.setAll();
1608         }
1609     }
1610 
1611     /**
1612      * Computes the resolved script set for a string, according to UTS 39 section 5.1.
1613      */
getResolvedScriptSet(CharSequence input, ScriptSet result)1614     private void getResolvedScriptSet(CharSequence input, ScriptSet result) {
1615         getResolvedScriptSetWithout(input, UScript.CODE_LIMIT, result);
1616     }
1617 
1618     /**
1619      * Computes the resolved script set for a string, omitting characters having the specified script. If
1620      * UScript.CODE_LIMIT is passed as the second argument, all characters are included.
1621      */
getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result)1622     private void getResolvedScriptSetWithout(CharSequence input, int script, ScriptSet result) {
1623         result.setAll();
1624 
1625         ScriptSet temp = new ScriptSet();
1626         for (int utf16Offset = 0; utf16Offset < input.length();) {
1627             int codePoint = Character.codePointAt(input, utf16Offset);
1628             utf16Offset += Character.charCount(codePoint);
1629 
1630             // Compute the augmented script set for the character
1631             getAugmentedScriptSet(codePoint, temp);
1632 
1633             // Intersect the augmented script set with the resolved script set, but only if the character doesn't
1634             // have the script specified in the function call
1635             if (script == UScript.CODE_LIMIT || !temp.get(script)) {
1636                 result.and(temp);
1637             }
1638         }
1639     }
1640 
1641     /**
1642      * Computes the set of numerics for a string, according to UTS 39 section 5.3.
1643      */
getNumerics(String input, UnicodeSet result)1644     private void getNumerics(String input, UnicodeSet result) {
1645         result.clear();
1646 
1647         for (int utf16Offset = 0; utf16Offset < input.length();) {
1648             int codePoint = Character.codePointAt(input, utf16Offset);
1649             utf16Offset += Character.charCount(codePoint);
1650 
1651             // Store a representative character for each kind of decimal digit
1652             if (UCharacter.getType(codePoint) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
1653                 // Store the zero character as a representative for comparison.
1654                 // Unicode guarantees it is codePoint - value
1655                 result.add(codePoint - UCharacter.getNumericValue(codePoint));
1656             }
1657         }
1658     }
1659 
1660     /**
1661      * Computes the restriction level of a string, according to UTS 39 section 5.2.
1662      */
getRestrictionLevel(String input)1663     private RestrictionLevel getRestrictionLevel(String input) {
1664         // Section 5.2 step 1:
1665         if (!fAllowedCharsSet.containsAll(input)) {
1666             return RestrictionLevel.UNRESTRICTIVE;
1667         }
1668 
1669         // Section 5.2 step 2:
1670         if (ASCII.containsAll(input)) {
1671             return RestrictionLevel.ASCII;
1672         }
1673 
1674         // Section 5.2 steps 3:
1675         ScriptSet resolvedScriptSet = new ScriptSet();
1676         getResolvedScriptSet(input, resolvedScriptSet);
1677 
1678         // Section 5.2 step 4:
1679         if (!resolvedScriptSet.isEmpty()) {
1680             return RestrictionLevel.SINGLE_SCRIPT_RESTRICTIVE;
1681         }
1682 
1683         // Section 5.2 step 5:
1684         ScriptSet resolvedNoLatn = new ScriptSet();
1685         getResolvedScriptSetWithout(input, UScript.LATIN, resolvedNoLatn);
1686 
1687         // Section 5.2 step 6:
1688         if (resolvedNoLatn.get(UScript.HAN_WITH_BOPOMOFO) || resolvedNoLatn.get(UScript.JAPANESE)
1689                 || resolvedNoLatn.get(UScript.KOREAN)) {
1690             return RestrictionLevel.HIGHLY_RESTRICTIVE;
1691         }
1692 
1693         // Section 5.2 step 7:
1694         if (!resolvedNoLatn.isEmpty() && !resolvedNoLatn.get(UScript.CYRILLIC) && !resolvedNoLatn.get(UScript.GREEK)
1695                 && !resolvedNoLatn.get(UScript.CHEROKEE)) {
1696             return RestrictionLevel.MODERATELY_RESTRICTIVE;
1697         }
1698 
1699         // Section 5.2 step 8:
1700         return RestrictionLevel.MINIMALLY_RESTRICTIVE;
1701     }
1702 
findHiddenOverlay(String input)1703     int findHiddenOverlay(String input) {
1704         boolean sawLeadCharacter = false;
1705         StringBuilder sb = new StringBuilder();
1706         for (int i=0; i<input.length();) {
1707             int cp = input.codePointAt(i);
1708             if (sawLeadCharacter && cp == 0x0307) {
1709                 return i;
1710             }
1711             int combiningClass = UCharacter.getCombiningClass(cp);
1712             // Skip over characters except for those with combining class 0 (non-combining characters) or with
1713             // combining class 230 (same class as U+0307)
1714             assert UCharacter.getCombiningClass(0x0307) == 230;
1715             if (combiningClass == 0 || combiningClass == 230) {
1716                 sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp, sb);
1717             }
1718             i += UCharacter.charCount(cp);
1719         }
1720         return -1;
1721     }
1722 
isIllegalCombiningDotLeadCharacterNoLookup(int cp)1723     boolean isIllegalCombiningDotLeadCharacterNoLookup(int cp) {
1724         return cp == 'i' || cp == 'j' || cp == 'ı' || cp == 'ȷ' || cp == 'l' ||
1725                UCharacter.hasBinaryProperty(cp, UProperty.SOFT_DOTTED);
1726     }
1727 
isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb)1728     boolean isIllegalCombiningDotLeadCharacter(int cp, StringBuilder sb) {
1729         if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) {
1730             return true;
1731         }
1732         sb.setLength(0);
1733         fSpoofData.confusableLookup(cp, sb);
1734         int finalCp = UCharacter.codePointBefore(sb, sb.length());
1735         if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) {
1736             return true;
1737         }
1738         return false;
1739     }
1740 
1741     // Data Members
1742     private int fChecks; // Bit vector of checks to perform.
1743     private SpoofData fSpoofData;
1744     private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
1745     private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
1746     private RestrictionLevel fRestrictionLevel;
1747 
1748     private static Normalizer2 nfdNormalizer = Normalizer2.getNFDInstance();
1749 
1750     // Confusable Mappings Data Structures, version 2.0
1751     //
1752     // This description and the corresponding implementation are to be kept
1753     // in-sync with the copy in icu4c uspoof_impl.h.
1754     //
1755     // For the confusable data, we are essentially implementing a map,
1756     //     key: a code point
1757     //     value: a string. Most commonly one char in length, but can be more.
1758     //
1759     // The keys are stored as a sorted array of 32 bit ints.
1760     //         bits 0-23 a code point value
1761     //         bits 24-31 length of value string, in UChars (between 1 and 256 UChars).
1762     //     The key table is sorted in ascending code point order. (not on the
1763     //     32 bit int value, the flag bits do not participate in the sorting.)
1764     //
1765     //     Lookup is done by means of a binary search in the key table.
1766     //
1767     // The corresponding values are kept in a parallel array of 16 bit ints.
1768     //     If the value string is of length 1, it is literally in the value array.
1769     //     For longer strings, the value array contains an index into the strings
1770     //     table.
1771     //
1772     // String Table:
1773     //     The strings table contains all of the value strings (those of length two or greater)
1774     //     concatenated together into one long char (UTF-16) array.
1775     //
1776     //     There is no nul character or other mark between adjacent strings.
1777     //
1778     //----------------------------------------------------------------------------
1779     //
1780     //  Changes from format version 1 to format version 2:
1781     //        1) Removal of the whole-script confusable data tables.
1782     //        2) Removal of the SL/SA/ML/MA and multi-table flags in the key bitmask.
1783     //        3) Expansion of string length value in the key bitmask from 2 bits to 8 bits.
1784     //        4) Removal of the string lengths table since 8 bits is sufficient for the
1785     //           lengths of all entries in confusables.txt.
1786     //
1787     private static final class ConfusableDataUtils {
1788         public static final int FORMAT_VERSION = 2; // version for ICU 58
1789 
keyToCodePoint(int key)1790         public static final int keyToCodePoint(int key) {
1791             return key & 0x00ffffff;
1792         }
1793 
keyToLength(int key)1794         public static final int keyToLength(int key) {
1795             return ((key & 0xff000000) >> 24) + 1;
1796         }
1797 
codePointAndLengthToKey(int codePoint, int length)1798         public static final int codePointAndLengthToKey(int codePoint, int length) {
1799             assert (codePoint & 0x00ffffff) == codePoint;
1800             assert length <= 256;
1801             return codePoint | ((length - 1) << 24);
1802         }
1803     }
1804 
1805     // -------------------------------------------------------------------------------------
1806     //
1807     // SpoofData
1808     //
1809     // This class corresponds to the ICU SpoofCheck data.
1810     //
1811     // The data can originate with the Binary ICU data that is generated in ICU4C,
1812     // or it can originate from source rules that are compiled in ICU4J.
1813     //
1814     // This class does not include the set of checks to be performed, but only
1815     // data that is serialized into the ICU binary data.
1816     //
1817     // Because Java cannot easily wrap binary data like ICU4C, the binary data is
1818     // copied into Java structures that are convenient for use by the run time code.
1819     //
1820     // ---------------------------------------------------------------------------------------
1821     private static class SpoofData {
1822 
1823         // Java data structures that hold the confusable data.
1824         int[] fCFUKeys;
1825         short[] fCFUValues;
1826         String fCFUStrings;
1827 
1828         private static final int DATA_FORMAT = 0x43667520; // "Cfu "
1829 
1830         private static final class IsAcceptable implements Authenticate {
1831             @Override
isDataVersionAcceptable(byte version[])1832             public boolean isDataVersionAcceptable(byte version[]) {
1833                 return version[0] == ConfusableDataUtils.FORMAT_VERSION || version[1] != 0 || version[2] != 0
1834                         || version[3] != 0;
1835             }
1836         }
1837 
1838         private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
1839 
1840         private static final class DefaultData {
1841             private static SpoofData INSTANCE = null;
1842             private static IOException EXCEPTION = null;
1843 
1844             static {
1845                 // Note: Although this is static, the Java runtime can delay execution of this block until
1846                 // the data is actually requested via SpoofData.getDefault().
1847                 try {
1848                     INSTANCE = new SpoofData(ICUBinary.getRequiredData("confusables.cfu"));
1849                 } catch (IOException e) {
1850                     EXCEPTION = e;
1851                 }
1852             }
1853         }
1854 
1855         /**
1856          * @return instance for Unicode standard data
1857          */
getDefault()1858         public static SpoofData getDefault() {
1859             if (DefaultData.EXCEPTION != null) {
1860                 throw new MissingResourceException(
1861                         "Could not load default confusables data: " + DefaultData.EXCEPTION.getMessage(),
1862                         "SpoofChecker", "");
1863             }
1864             return DefaultData.INSTANCE;
1865         }
1866 
/**
 * SpoofChecker data constructor for use from the data builder. Creates a new, empty data area whose
 * fields are left unset so that they can be populated later.
 */
private SpoofData() {
    super();
}
1871 
/**
 * Constructor for use when creating from prebuilt default data. A ByteBuffer is what the ICU internal
 * data loading functions provide.
 *
 * @param bytes
 *            buffer positioned at the ICU data header of the spoof data.
 * @throws java.io.IOException
 *             if the header or the data cannot be read.
 */
private SpoofData(ByteBuffer bytes) throws java.io.IOException {
    // Validate and consume the ICU data header first.
    ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
    // Mark the position just past the header; readData() resets to this mark before seeking to each table.
    bytes.mark();
    this.readData(bytes);
}
1879 
1880         @Override
equals(Object other)1881         public boolean equals(Object other) {
1882             if (!(other instanceof SpoofData)) {
1883                 return false;
1884             }
1885             SpoofData otherData = (SpoofData) other;
1886             if (!Arrays.equals(fCFUKeys, otherData.fCFUKeys))
1887                 return false;
1888             if (!Arrays.equals(fCFUValues, otherData.fCFUValues))
1889                 return false;
1890             if (!Utility.sameObjects(fCFUStrings, otherData.fCFUStrings) && fCFUStrings != null
1891                     && !fCFUStrings.equals(otherData.fCFUStrings))
1892                 return false;
1893             return true;
1894         }
1895 
1896         @Override
hashCode()1897         public int hashCode() {
1898             return Arrays.hashCode(fCFUKeys)
1899                     ^ Arrays.hashCode(fCFUValues)
1900                     ^ fCFUStrings.hashCode();
1901         }
1902 
1903         // Set the SpoofChecker data from pre-built binary data in a byte buffer.
1904         // The binary data format is as described for ICU4C spoof data.
1905         //
readData(ByteBuffer bytes)1906         private void readData(ByteBuffer bytes) throws java.io.IOException {
1907             int magic = bytes.getInt();
1908             if (magic != 0x3845fdef) {
1909                 throw new IllegalArgumentException("Bad Spoof Check Data.");
1910             }
1911             @SuppressWarnings("unused")
1912             int dataFormatVersion = bytes.getInt();
1913             @SuppressWarnings("unused")
1914             int dataLength = bytes.getInt();
1915 
1916             int CFUKeysOffset = bytes.getInt();
1917             int CFUKeysSize = bytes.getInt();
1918 
1919             int CFUValuesOffset = bytes.getInt();
1920             int CFUValuesSize = bytes.getInt();
1921 
1922             int CFUStringTableOffset = bytes.getInt();
1923             int CFUStringTableSize = bytes.getInt();
1924 
1925             // We have now read the file header, and obtained the position for each
1926             // of the data items. Now read each in turn, first seeking the
1927             // input stream to the position of the data item.
1928 
1929             bytes.reset();
1930             ICUBinary.skipBytes(bytes, CFUKeysOffset);
1931             fCFUKeys = ICUBinary.getInts(bytes, CFUKeysSize, 0);
1932 
1933             bytes.reset();
1934             ICUBinary.skipBytes(bytes, CFUValuesOffset);
1935             fCFUValues = ICUBinary.getShorts(bytes, CFUValuesSize, 0);
1936 
1937             bytes.reset();
1938             ICUBinary.skipBytes(bytes, CFUStringTableOffset);
1939             fCFUStrings = ICUBinary.getString(bytes, CFUStringTableSize, 0);
1940         }
1941 
1942         /**
1943          * Append the confusable skeleton transform for a single code point to a StringBuilder. The string to be
1944          * appended will between 1 and 18 characters as of Unicode 9.
1945          *
1946          * This is the heart of the confusable skeleton generation implementation.
1947          */
confusableLookup(int inChar, StringBuilder dest)1948         public void confusableLookup(int inChar, StringBuilder dest) {
1949             // Perform a binary search.
1950             // [lo, hi), i.e lo is inclusive, hi is exclusive.
1951             // The result after the loop will be in lo.
1952             int lo = 0;
1953             int hi = length();
1954             do {
1955                 int mid = (lo + hi) / 2;
1956                 if (codePointAt(mid) > inChar) {
1957                     hi = mid;
1958                 } else if (codePointAt(mid) < inChar) {
1959                     lo = mid;
1960                 } else {
1961                     // Found result. Break early.
1962                     lo = mid;
1963                     break;
1964                 }
1965             } while (hi - lo > 1);
1966 
1967             // Did we find an entry? If not, the char maps to itself.
1968             if (codePointAt(lo) != inChar) {
1969                 dest.appendCodePoint(inChar);
1970                 return;
1971             }
1972 
1973             // Add the element to the string builder and return.
1974             appendValueTo(lo, dest);
1975             return;
1976         }
1977 
1978         /**
1979          * Return the number of confusable entries in this SpoofData.
1980          *
1981          * @return The number of entries.
1982          */
length()1983         public int length() {
1984             return fCFUKeys.length;
1985         }
1986 
1987         /**
1988          * Return the code point (key) at the specified index.
1989          *
1990          * @param index
1991          *            The index within the SpoofData.
1992          * @return The code point.
1993          */
codePointAt(int index)1994         public int codePointAt(int index) {
1995             return ConfusableDataUtils.keyToCodePoint(fCFUKeys[index]);
1996         }
1997 
1998         /**
1999          * Append the confusable skeleton at the specified index to the StringBuilder dest.
2000          *
2001          * @param index
2002          *            The index within the SpoofData.
2003          * @param dest
2004          *            The StringBuilder to which to append the skeleton.
2005          */
appendValueTo(int index, StringBuilder dest)2006         public void appendValueTo(int index, StringBuilder dest) {
2007             int stringLength = ConfusableDataUtils.keyToLength(fCFUKeys[index]);
2008 
2009             // Value is either a char (for strings of length 1) or
2010             // an index into the string table (for longer strings)
2011             short value = fCFUValues[index];
2012             if (stringLength == 1) {
2013                 dest.append((char) value);
2014             } else {
2015                 dest.append(fCFUStrings, value, value + stringLength);
2016             }
2017         }
2018     }
2019 
2020     // -------------------------------------------------------------------------------
2021     //
2022     // ScriptSet - Script code bit sets.
2023     // Extends Java BitSet with input/output support and a few helper methods.
2024     // Note: The I/O is not currently being used, so it has been commented out. If
2025     // it is needed again, the code can be restored.
2026     //
2027     // -------------------------------------------------------------------------------
2028     static class ScriptSet extends BitSet {
2029 
2030         // Eclipse default value to quell warnings:
2031         private static final long serialVersionUID = 1L;
2032 
2033         // // The serialized version of this class can hold INT_CAPACITY * 32 scripts.
2034         // private static final int INT_CAPACITY = 6;
2035         // private static final long serialVersionUID = INT_CAPACITY;
2036         // static {
2037         // assert ScriptSet.INT_CAPACITY * Integer.SIZE <= UScript.CODE_LIMIT;
2038         // }
2039         //
2040         // public ScriptSet() {
2041         // }
2042         //
2043         // public ScriptSet(ByteBuffer bytes) throws java.io.IOException {
2044         // for (int i = 0; i < INT_CAPACITY; i++) {
2045         // int bits = bytes.getInt();
2046         // for (int j = 0; j < Integer.SIZE; j++) {
2047         // if ((bits & (1 << j)) != 0) {
2048         // set(i * Integer.SIZE + j);
2049         // }
2050         // }
2051         // }
2052         // }
2053         //
2054         // public void output(DataOutputStream os) throws java.io.IOException {
2055         // for (int i = 0; i < INT_CAPACITY; i++) {
2056         // int bits = 0;
2057         // for (int j = 0; j < Integer.SIZE; j++) {
2058         // if (get(i * Integer.SIZE + j)) {
2059         // bits |= (1 << j);
2060         // }
2061         // }
2062         // os.writeInt(bits);
2063         // }
2064         // }
2065 
and(int script)2066         public void and(int script) {
2067             this.clear(0, script);
2068             this.clear(script + 1, UScript.CODE_LIMIT);
2069         }
2070 
setAll()2071         public void setAll() {
2072             this.set(0, UScript.CODE_LIMIT);
2073         }
2074 
isFull()2075         public boolean isFull() {
2076             return cardinality() == UScript.CODE_LIMIT;
2077         }
2078 
appendStringTo(StringBuilder sb)2079         public void appendStringTo(StringBuilder sb) {
2080             sb.append("{ ");
2081             if (isEmpty()) {
2082                 sb.append("- ");
2083             } else if (isFull()) {
2084                 sb.append("* ");
2085             } else {
2086                 for (int script = 0; script < UScript.CODE_LIMIT; script++) {
2087                     if (get(script)) {
2088                         sb.append(UScript.getShortName(script));
2089                         sb.append(" ");
2090                     }
2091                 }
2092             }
2093             sb.append("}");
2094         }
2095 
2096         @Override
toString()2097         public String toString() {
2098             StringBuilder sb = new StringBuilder();
2099             sb.append("<ScriptSet ");
2100             appendStringTo(sb);
2101             sb.append(">");
2102             return sb.toString();
2103         }
2104     }
2105 }
2106