1 package org.unicode.cldr.draft;
2 
3 import java.util.Map;
4 import java.util.TreeMap;
5 
6 import org.unicode.cldr.util.Timer;
7 
8 import com.ibm.icu.text.Normalizer;
9 import com.ibm.icu.text.UnicodeSet;
10 
11 public class NormalizedIdentifierParser {
12     enum Status {
13         NotNameChar, NameContinue, Name, UNKNOWN, DONE
14     }
15 
16     enum CharType {
17         Illegal, NameContinueNFC, NameContinueOther, NameStartNFC, NameStartOther, Whitespace, Other
18     }
19 
20     static final UnicodeSet XmlNameStartChar = new UnicodeSet("[\\: A-Z _ a-z " +
21         "\\u00C0-\\u00D6 \\u00D8-\\u00F6 \\u00F8-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF \\u200C-\\u200D" +
22         "\\u2070-\\u218F \\u2C00-\\u2FEF \\u3001-\\uD7FF \\uF900-\\uFDCF \\uFDF0-\\uFFFD \\U00010000-\\U000EFFFF]")
23             .freeze();
24     static final UnicodeSet XmlNameContinueChar = new UnicodeSet(
25         "[- . 0-9 \\u00B7 \\u0300-\\u036F \\u203F-\\u2040]").freeze();
26     static final UnicodeSet XmlNameChar = new UnicodeSet(XmlNameStartChar).addAll(XmlNameContinueChar)
27         .freeze();
28     static final UnicodeSet XmlWhiteSpace = new UnicodeSet("[\\u0009\\u000D\\u000A\\u0020]").freeze();
29     static final UnicodeSet XmlIllegal = new UnicodeSet(
30         "[^\\u0009\\u000D\\u000A\\u0020-\uD7FF\\uE000-\\uFFFD\\U00010000-\\U000EFFFF]").freeze();
31     static final UnicodeSet NfcSafe = new UnicodeSet("[:nfkcqc=yes:]").freeze();
32 
33     private String input;
34     private int endPosition;
35     private int startPosition;
36     private Status status;
37     private boolean knownNfc;
38 
set(String input, int position)39     public NormalizedIdentifierParser set(String input, int position) {
40         this.input = input;
41         startPosition = endPosition = position;
42         status = Status.UNKNOWN;
43         return this;
44     }
45 
next()46     public Status next() {
47         startPosition = endPosition;
48         if (endPosition >= input.length()) {
49             return status = Status.DONE;
50         }
51         // since the vast majority of characters by frequency are Latin-1 (<FF),
52         // this can also be optimized by having a special loop for Latin-1
53         // and only dropping into the full check if a non-Latin-1 characters
54 
55         // check the first character specially
56         int codePoint = input.codePointAt(endPosition);
57         CharType type = getCharType(codePoint);
58         endPosition += codePoint < 0x10000 ? 1 : 2;
59         switch (type) {
60         case NameStartNFC:
61             knownNfc = true;
62             status = Status.Name;
63             break;
64         case NameStartOther:
65             knownNfc = false;
66             status = Status.Name;
67             break;
68         case NameContinueNFC:
69             knownNfc = true;
70             status = Status.NameContinue;
71             break;
72         case NameContinueOther:
73             knownNfc = false;
74             status = Status.NameContinue;
75             break;
76         default:
77             knownNfc = NfcSafe.contains(codePoint); // we don't care about the value, so production code wouldn't check
78             return Status.NotNameChar;
79         }
80 
81         loop: while (endPosition < input.length()) {
82             codePoint = input.codePointAt(endPosition);
83             type = getCharType(codePoint);
84             switch (type) {
85             case NameStartOther:
86             case NameContinueOther:
87                 knownNfc = false;
88                 break;
89             case NameStartNFC:
90             case NameContinueNFC:
91                 break;
92             default:
93                 break loop;
94             }
95             endPosition += codePoint < 0x10000 ? 1 : 2;
96         }
97         return status;
98     }
99 
100     public CharType getCharType(int codePoint) {
101         // Normally this would just be a trie lookup, but we simulate it here
102         if (XmlNameContinueChar.contains(codePoint)) {
103             return NfcSafe.contains(codePoint) ? CharType.NameContinueNFC : CharType.NameContinueOther;
104         } else if (XmlNameStartChar.contains(codePoint)) {
105             return NfcSafe.contains(codePoint) ? CharType.NameStartNFC : CharType.NameStartOther;
106         } else if (XmlIllegal.contains(codePoint)) {
107             return CharType.Illegal;
108         } else if (XmlWhiteSpace.contains(codePoint)) {
109             return CharType.Whitespace;
110         } else {
111             return CharType.Other;
112         }
113     }
114 
115     public String getToken() {
116         return input.substring(startPosition, endPosition);
117     }
118 
119     public Status getStatus() {
120         return status;
121     }
122 
123     public int getEndPosition() {
124         return endPosition;
125     }
126 
127     public int getStartPosition() {
128         return startPosition;
129     }
130 
131     public boolean isKnownNfc() {
132         return knownNfc;
133     }
134 
135     Status getIdStatus(String s) {
136         set(s, 0).next();
137         if (endPosition != s.length()) {
138             return Status.NotNameChar;
139         }
140         return getStatus();
141     }
142 
143     static void showDiffs() {
144         Map<Status, Map<Status, UnicodeSet>> map = new TreeMap<>();
145         NormalizedIdentifierParser parser = new NormalizedIdentifierParser();
146         for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint) {
147             String source = new StringBuilder().appendCodePoint(codePoint).toString();
148             Status sourceStatus = parser.getIdStatus(source);
149             String target = Normalizer.normalize(codePoint, Normalizer.NFC);
150             Status targetStatus = parser.getIdStatus(target);
151             if (sourceStatus == targetStatus) {
152                 continue;
153             }
154             Map<Status, UnicodeSet> map2 = map.get(sourceStatus);
155             if (map2 == null) {
156                 map.put(sourceStatus, map2 = new TreeMap<>());
157             }
158             UnicodeSet set = map2.get(targetStatus);
159             if (set == null) {
160                 map2.put(targetStatus, set = new UnicodeSet());
161             }
162             set.add(codePoint);
163         }
164         for (Status sourceStatus : map.keySet()) {
165             Map<Status, UnicodeSet> map2 = map.get(sourceStatus);
166             for (Status targetStatus : map2.keySet()) {
167                 UnicodeSet set = map2.get(targetStatus);
168                 System.out.println(sourceStatus + "\t=>\t" + targetStatus + "\t" + set);
169             }
170         }
171     }
172 
173     public static void main(String[] args) {
174         System.out.println("NameStart: " + XmlNameStartChar);
175         System.out.println("NameContinue: " + XmlNameContinueChar);
176         System.out.println("Whitespace: " + XmlWhiteSpace);
177         System.out.println("Illegal: " + XmlIllegal);
178 
179         compareNormalizer();
180 
181         NormalizedIdentifierParser parser = new NormalizedIdentifierParser();
182         parser.set("\u0308ghi)j\u0308$abc+def*(", 0);
183         for (Status status = parser.next(); status != Status.DONE; status = parser.next()) {
184             System.out.println(status + ": \t" + parser.getToken() + (!parser.isKnownNfc() ? "\tNot Known NFC!" : ""));
185         }
186     }
187 
188     private static void compareNormalizer() {
189         final int iterations = 100000000;
190         compareNormalizer("nörmalization", iterations);
191         compareNormalizer("No\u0308rmalization", iterations);
192     }
193 
194     private static void compareNormalizer(String test, int iterations) {
195         String s;
196         s = test.toLowerCase();
197         s = Normalizer.normalize(test, Normalizer.NFC);
198         Timer timer = new Timer();
199         timer.start();
200         for (int i = 0; i < iterations; ++i) {
201             s = test.toLowerCase();
202         }
203         timer.stop();
204         final long lowercaseDuration = timer.getDuration();
205         System.out.println("Java Lowercasing: " + lowercaseDuration * 1000.0d / iterations
206             + "µs; for " + test);
207 
208         timer.start();
209         for (int i = 0; i < iterations; ++i) {
210             s = Normalizer.normalize(test, Normalizer.NFC);
211         }
212         timer.stop();
213         final long nfcDuration = timer.getDuration();
214         System.out.println("ICU Normalizing: " + nfcDuration * 1000.0d / iterations
215             + "µs = " + (nfcDuration * 100.0d / lowercaseDuration - 1)
216             + "%; for " + test);
217     }
218 }
219