package org.unicode.cldr.draft;

import java.util.Map;
import java.util.TreeMap;

import org.unicode.cldr.util.Timer;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UnicodeSet;

/**
 * Draft tokenizer that walks a string and splits it into runs of XML name
 * characters and single non-name code points, while tracking whether every
 * code point of the current token is in {@link #NfcSafe}.
 *
 * <p>Typical use: call {@link #set(String, int)} once, then call
 * {@link #next()} repeatedly until it returns {@link Status#DONE}; after each
 * call {@link #getToken()}, {@link #getStatus()} and {@link #isKnownNfc()}
 * describe the token just scanned.
 *
 * <p>Instances hold mutable scanning state in plain fields and perform no
 * synchronization.
 */
public class NormalizedIdentifierParser {

    /** Classification of the token most recently produced by {@link #next()}. */
    enum Status {
        /** The token is a single code point that is neither a name-start nor a name-continue character. */
        NotNameChar,
        /** The token is a run of name characters whose first code point is a name-continue character. */
        NameContinue,
        /** The token is a run of name characters whose first code point is a name-start character. */
        Name,
        /** {@link #set(String, int)} has been called but {@link #next()} has not. */
        UNKNOWN,
        /** The end of the input has been reached. */
        DONE
    }

    /** Per-code-point classification returned by {@link #getCharType(int)}. */
    enum CharType {
        Illegal, NameContinueNFC, NameContinueOther, NameStartNFC, NameStartOther, Whitespace, Other
    }

    /** Code points accepted as the first character of a name. */
    static final UnicodeSet XmlNameStartChar = new UnicodeSet("[\\: A-Z _ a-z " +
        "\\u00C0-\\u00D6 \\u00D8-\\u00F6 \\u00F8-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF \\u200C-\\u200D" +
        "\\u2070-\\u218F \\u2C00-\\u2FEF \\u3001-\\uD7FF \\uF900-\\uFDCF \\uFDF0-\\uFFFD \\U00010000-\\U000EFFFF]")
        .freeze();

    /** Additional code points accepted after the first character of a name. */
    static final UnicodeSet XmlNameContinueChar = new UnicodeSet(
        "[- . 0-9 \\u00B7 \\u0300-\\u036F \\u203F-\\u2040]").freeze();

    /** Union of {@link #XmlNameStartChar} and {@link #XmlNameContinueChar}. */
    static final UnicodeSet XmlNameChar = new UnicodeSet(XmlNameStartChar).addAll(XmlNameContinueChar)
        .freeze();

    /** Tab, carriage return, line feed and space. */
    static final UnicodeSet XmlWhiteSpace = new UnicodeSet("[\\u0009\\u000D\\u000A\\u0020]").freeze();

    /**
     * Complement of the listed legal ranges. The D7FF endpoint is written with a
     * single backslash, so the Java compiler (not UnicodeSet) resolves that one
     * escape; the literal is kept exactly as it was.
     */
    static final UnicodeSet XmlIllegal = new UnicodeSet(
        "[^\\u0009\\u000D\\u000A\\u0020-\uD7FF\\uE000-\\uFFFD\\U00010000-\\U000EFFFF]").freeze();

    /**
     * Code points treated as "known normalized" by this parser.
     * NOTE(review): the name says NFC but the property queried is nfkcqc
     * (the NFKC quick-check); confirm whether nfcqc was intended.
     */
    static final UnicodeSet NfcSafe = new UnicodeSet("[:nfkcqc=yes:]").freeze();

    private String input;
    private int endPosition;
    private int startPosition;
    private Status status;
    private boolean knownNfc;

    /**
     * Points the parser at a new input string and scan position.
     *
     * @param input the text to scan
     * @param position the UTF-16 index at which the first token starts
     * @return this parser, to allow call chaining
     */
    public NormalizedIdentifierParser set(String input, int position) {
        this.input = input;
        startPosition = endPosition = position;
        status = Status.UNKNOWN;
        return this;
    }

    /**
     * Scans the next token, starting where the previous one ended.
     *
     * <p>A token is either a maximal run of name-start/name-continue code
     * points, or a single code point of any other type. The returned value is
     * also stored, so {@link #getStatus()} reports the same value afterwards.
     *
     * @return {@link Status#DONE} if the scan position is at or past the end of
     *         the input; {@link Status#Name} or {@link Status#NameContinue} for a
     *         run of name characters, depending on the type of its first code
     *         point; {@link Status#NotNameChar} otherwise
     */
    public Status next() {
        startPosition = endPosition;
        if (endPosition >= input.length()) {
            return status = Status.DONE;
        }
        // since the vast majority of characters by frequency are Latin-1 (<FF),
        // this can also be optimized by having a special loop for Latin-1
        // and only dropping into the full check if a non-Latin-1 character is encountered

        // check the first character specially
        int codePoint = input.codePointAt(endPosition);
        CharType type = getCharType(codePoint);
        endPosition += Character.charCount(codePoint);
        switch (type) {
        case NameStartNFC:
            knownNfc = true;
            status = Status.Name;
            break;
        case NameStartOther:
            knownNfc = false;
            status = Status.Name;
            break;
        case NameContinueNFC:
            knownNfc = true;
            status = Status.NameContinue;
            break;
        case NameContinueOther:
            knownNfc = false;
            status = Status.NameContinue;
            break;
        default:
            knownNfc = NfcSafe.contains(codePoint); // we don't care about the value, so production code wouldn't check
            // Record the result as well as returning it; otherwise getStatus()
            // would keep reporting the status of the previous token.
            return status = Status.NotNameChar;
        }

        // Extend the token over every following name character. A single
        // code point outside NfcSafe clears knownNfc for the whole token.
        loop: while (endPosition < input.length()) {
            codePoint = input.codePointAt(endPosition);
            type = getCharType(codePoint);
            switch (type) {
            case NameStartOther:
            case NameContinueOther:
                knownNfc = false;
                break;
            case NameStartNFC:
            case NameContinueNFC:
                break;
            default:
                // Not a name character: leave it for the next call.
                break loop;
            }
            endPosition += Character.charCount(codePoint);
        }
        return status;
    }

    /**
     * Classifies a single code point. Name-continue membership is tested before
     * name-start, then illegal, then whitespace.
     *
     * @param codePoint the code point to classify
     * @return the character type; the NFC/Other split of the name types reflects
     *         membership in {@link #NfcSafe}
     */
    public CharType getCharType(int codePoint) {
        // Normally this would just be a trie lookup, but we simulate it here
        if (XmlNameContinueChar.contains(codePoint)) {
            return NfcSafe.contains(codePoint) ? CharType.NameContinueNFC : CharType.NameContinueOther;
        } else if (XmlNameStartChar.contains(codePoint)) {
            return NfcSafe.contains(codePoint) ? CharType.NameStartNFC : CharType.NameStartOther;
        } else if (XmlIllegal.contains(codePoint)) {
            return CharType.Illegal;
        } else if (XmlWhiteSpace.contains(codePoint)) {
            return CharType.Whitespace;
        } else {
            return CharType.Other;
        }
    }

    /** @return the substring of the input covered by the most recent token */
    public String getToken() {
        return input.substring(startPosition, endPosition);
    }

    /** @return the status stored by the most recent {@link #set} or {@link #next} call */
    public Status getStatus() {
        return status;
    }

    /** @return the UTF-16 index just past the most recent token */
    public int getEndPosition() {
        return endPosition;
    }

    /** @return the UTF-16 index at which the most recent token starts */
    public int getStartPosition() {
        return startPosition;
    }

    /**
     * @return for a name token, whether every code point in it is in
     *         {@link #NfcSafe}; for a non-name token, whether its single code
     *         point is
     */
    public boolean isKnownNfc() {
        return knownNfc;
    }

    /**
     * Classifies a whole string as one identifier. This resets the parser onto
     * {@code s}, discarding any scan in progress.
     *
     * @param s the string to classify
     * @return the status of the first token if that token covers all of
     *         {@code s} (or {@link Status#DONE} for an empty string);
     *         {@link Status#NotNameChar} if the first token stops short of the end
     */
    Status getIdStatus(String s) {
        set(s, 0).next();
        if (endPosition != s.length()) {
            return Status.NotNameChar;
        }
        return getStatus();
    }

    /**
     * Prints, for every code point from 0 to 0x10FFFF, the cases where the
     * identifier status of the code point differs from the status of its NFC
     * normalization, grouped by (source status, target status).
     */
    static void showDiffs() {
        Map<Status, Map<Status, UnicodeSet>> map = new TreeMap<>();
        NormalizedIdentifierParser parser = new NormalizedIdentifierParser();
        for (int codePoint = 0; codePoint <= 0x10FFFF; ++codePoint) {
            String source = new StringBuilder().appendCodePoint(codePoint).toString();
            Status sourceStatus = parser.getIdStatus(source);
            String target = Normalizer.normalize(codePoint, Normalizer.NFC);
            Status targetStatus = parser.getIdStatus(target);
            if (sourceStatus == targetStatus) {
                continue;
            }
            Map<Status, UnicodeSet> byTarget = map.get(sourceStatus);
            if (byTarget == null) {
                byTarget = new TreeMap<>();
                map.put(sourceStatus, byTarget);
            }
            UnicodeSet set = byTarget.get(targetStatus);
            if (set == null) {
                set = new UnicodeSet();
                byTarget.put(targetStatus, set);
            }
            set.add(codePoint);
        }
        for (Map.Entry<Status, Map<Status, UnicodeSet>> bySource : map.entrySet()) {
            for (Map.Entry<Status, UnicodeSet> byTarget : bySource.getValue().entrySet()) {
                System.out.println(bySource.getKey() + "\t=>\t" + byTarget.getKey() + "\t" + byTarget.getValue());
            }
        }
    }

    /**
     * Prints the character sets, runs the lowercase-vs-normalize timing
     * comparison (100,000,000 iterations per loop, so this takes a while), and
     * then tokenizes a sample string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("NameStart: " + XmlNameStartChar);
        System.out.println("NameContinue: " + XmlNameContinueChar);
        System.out.println("Whitespace: " + XmlWhiteSpace);
        System.out.println("Illegal: " + XmlIllegal);

        compareNormalizer();

        NormalizedIdentifierParser parser = new NormalizedIdentifierParser();
        parser.set("\u0308ghi)j\u0308$abc+def*(", 0);
        for (Status status = parser.next(); status != Status.DONE; status = parser.next()) {
            System.out.println(status + ": \t" + parser.getToken() + (!parser.isKnownNfc() ? "\tNot Known NFC!" : ""));
        }
    }

    /** Runs the timing comparison on a precomposed and a decomposed sample word. */
    private static void compareNormalizer() {
        final int iterations = 100000000;
        compareNormalizer("nörmalization", iterations);
        compareNormalizer("No\u0308rmalization", iterations);
    }

    /**
     * Times {@code iterations} calls of {@link String#toLowerCase()} against the
     * same number of NFC normalizations of {@code test} and prints both results.
     *
     * @param test the string to lowercase / normalize
     * @param iterations how many times each operation is repeated
     */
    private static void compareNormalizer(String test, int iterations) {
        // One untimed call of each operation before measuring (presumably a warm-up).
        // The results are assigned to s but never read.
        String s;
        s = test.toLowerCase();
        s = Normalizer.normalize(test, Normalizer.NFC);
        Timer timer = new Timer();
        timer.start();
        for (int i = 0; i < iterations; ++i) {
            s = test.toLowerCase();
        }
        timer.stop();
        final long lowercaseDuration = timer.getDuration();
        // NOTE(review): the "* 1000.0d ... µs" scaling assumes getDuration() is in
        // milliseconds; confirm against org.unicode.cldr.util.Timer.
        System.out.println("Java Lowercasing: " + lowercaseDuration * 1000.0d / iterations
            + "µs; for " + test);

        timer.start();
        for (int i = 0; i < iterations; ++i) {
            s = Normalizer.normalize(test, Normalizer.NFC);
        }
        timer.stop();
        final long nfcDuration = timer.getDuration();
        // NOTE(review): the percentage below is (100 * nfc / lowercase) - 1, which is
        // neither the plain ratio nor the relative increase; confirm what was intended.
        System.out.println("ICU Normalizing: " + nfcDuration * 1000.0d / iterations
            + "µs = " + (nfcDuration * 100.0d / lowercaseDuration - 1)
            + "%; for " + test);
    }
}