1 package org.unicode.cldr.util; 2 3 import java.util.Collection; 4 import java.util.Comparator; 5 import java.util.HashSet; 6 import java.util.Set; 7 import java.util.TreeSet; 8 9 import org.unicode.cldr.util.XEquivalenceClass.SetMaker; 10 11 import com.ibm.icu.lang.UCharacter; 12 import com.ibm.icu.text.Normalizer; 13 import com.ibm.icu.text.UTF16; 14 import com.ibm.icu.text.UnicodeSet; 15 import com.ibm.icu.text.UnicodeSetIterator; 16 import com.ibm.icu.util.ULocale; 17 18 public class VariantFolder { 19 private AlternateFetcher alternateFetcher; 20 public static SetMaker mySetMaker = new SetMaker() { 21 Comparator c = new UTF16.StringComparator(true, false, 0); 22 Comparator bestIsLowest = new Comparator() { 23 @Override 24 public int compare(Object o1, Object o2) { 25 String s1 = o1.toString(); 26 String s2 = o2.toString(); 27 final boolean casefold1 = UCharacter.foldCase(s1, true).equals(s1); 28 final boolean casefold2 = UCharacter.foldCase(s2, true).equals(s2); 29 if (casefold1 != casefold2) { 30 return casefold1 ? -1 : 1; 31 } 32 final boolean canonical1 = Normalizer.isNormalized(s1, Normalizer.COMPOSE, 0); 33 final boolean canonical2 = Normalizer.isNormalized(s2, Normalizer.COMPOSE, 0); 34 if (canonical1 != canonical2) { 35 return canonical1 ? -1 : 1; 36 } 37 int len1 = s1.codePointCount(0, s1.length()); 38 int len2 = s2.codePointCount(0, s2.length()); 39 if (len1 != len2) { 40 return len1 - len2; 41 } 42 return c.compare(s1, s2); 43 } 44 45 }; 46 47 @Override 48 public Set make() { 49 return new TreeSet(bestIsLowest); 50 } 51 }; 52 public static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 53 54 //private String source; 55 56 //private Set<String> result; 57 58 public interface AlternateFetcher { 59 /** 60 * The input string MUST be in the output set. Note that the results must be 61 * valid even if input string might not be on even code point boundaries. 62 * For example, if the input is "XabY" where X and Y are have surrogates, 63 * and the alternates are by case, then the results have to be {"XabY", 64 * "XAbY", "XaBY", "XABY"}. 65 * <p> 66 * The caller must never modify the set. 67 * 68 * @param item 69 * @return 70 */ getAlternates(String item, Set<String> output)71 Set<String> getAlternates(String item, Set<String> output); 72 } 73 74 public static class CompatibilityFolder implements VariantFolder.AlternateFetcher { 75 private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 76 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 77 static { 78 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) { 79 String item = it.getString(); equivalents.add(item, Normalizer.decompose(item, true))80 equivalents.add(item, Normalizer.decompose(item, true)); equivalents.add(item, Normalizer.compose(item, true))81 equivalents.add(item, Normalizer.compose(item, true)); 82 } 83 } 84 85 @Override getAlternates(String item, Set<String> output)86 public Set<String> getAlternates(String item, Set<String> output) { 87 output.add(item); 88 return equivalents.getEquivalences(item); 89 } 90 91 } 92 93 public static class CanonicalFolder implements VariantFolder.AlternateFetcher { 94 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 95 static { 96 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) { 97 String item = it.getString(); equivalents.add(item, Normalizer.decompose(item, false))98 equivalents.add(item, Normalizer.decompose(item, false)); equivalents.add(item, Normalizer.compose(item, false))99 equivalents.add(item, Normalizer.compose(item, false)); 100 } 101 } 102 103 @Override getAlternates(String item, Set<String> output)104 public Set<String> getAlternates(String item, Set<String> output) { 105 output.add(item); 106 return equivalents.getEquivalences(item); 107 } 108 109 } 110 111 public static class CaseVariantFolder implements VariantFolder.AlternateFetcher { 112 private static final UnicodeSet NORMAL_CHARS = new UnicodeSet("[^[:c:]]"); 113 static XEquivalenceClass equivalents = new XEquivalenceClass("none", mySetMaker); 114 static { 115 for (UnicodeSetIterator it = new UnicodeSetIterator(NORMAL_CHARS); it.next();) { 116 String item = it.getString(); equivalents.add(item, UCharacter.toLowerCase(item))117 equivalents.add(item, UCharacter.toLowerCase(item)); equivalents.add(item, UCharacter.toUpperCase(item))118 equivalents.add(item, UCharacter.toUpperCase(item)); equivalents.add(item, UCharacter.foldCase(item, true))119 equivalents.add(item, UCharacter.foldCase(item, true)); equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null))120 equivalents.add(item, UCharacter.toTitleCase(ULocale.ROOT, item, null)); 121 } 122 } 123 124 @Override getAlternates(String item, Set<String> output)125 public Set<String> getAlternates(String item, Set<String> output) { 126 output.add(item); 127 return equivalents.getEquivalences(item); 128 } 129 } 130 131 /** 132 * The class is designed to be immutable, at least as far as Java allows. That is, if the alternateFetcher is, then 133 * it will be. 134 * 135 * @param alternateFetcher 136 */ VariantFolder(AlternateFetcher alternateFetcher)137 public VariantFolder(AlternateFetcher alternateFetcher) { 138 this.alternateFetcher = alternateFetcher; 139 } 140 141 // We keep track of the alternates for each combination of start,len 142 // so with a length of 3 we have the following structure 143 // {{0,1}, {1,2}, {2,3} -- length of 1 144 // {0,2}, {1,3} 145 // {0,3}} 146 147 @SuppressWarnings("unchecked") getClosure(String source)148 public Set<String> getClosure(String source) { 149 int stringLength = source.length(); 150 if (stringLength == 0) { 151 Set<String> result = new HashSet<>(); 152 result.add(source); 153 return result; 154 } 155 Set<String>[][] combos = new Set[stringLength][]; 156 for (int i = 0; i < stringLength; ++i) { 157 combos[i] = new Set[stringLength - i]; 158 } 159 for (int i = 0; i < stringLength; ++i) { 160 combos[0][i] = alternateFetcher.getAlternates(source.substring(i, i + 1), 161 new HashSet<String>()); 162 } 163 for (int level = 1; level < stringLength; ++level) { 164 // at each level, we add strings of that length (plus 1) 165 for (int start = 0; start < stringLength - level; ++start) { 166 int limit = start + level + 1; 167 // System.out.println(start + ", " + limit); 168 // we first add any longer alternates 169 Collection<String> current = combos[level][start] = new HashSet<>(); 170 current.addAll(alternateFetcher.getAlternates(source.substring(start, 171 limit), new HashSet<String>())); 172 // then we add the cross product of shorter strings 173 for (int breakPoint = start + 1; breakPoint < limit; ++breakPoint) { 174 addCrossProduct(combos[breakPoint - start - 1][start], combos[limit 175 - breakPoint - 1][breakPoint], current); 176 } 177 } 178 } 179 return combos[combos.length - 1][0]; 180 } 181 addCrossProduct(Collection<String> source1, Collection<String> source2, Collection<String> output)182 private void addCrossProduct(Collection<String> source1, 183 Collection<String> source2, Collection<String> output) { 184 for (String x : source1) { 185 for (String y : source2) { 186 output.add(x + y); 187 } 188 } 189 } 190 getClosure(UnicodeSet input)191 public UnicodeSet getClosure(UnicodeSet input) { 192 UnicodeSet result = new UnicodeSet(); 193 for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { 194 Set<String> temp = getClosure(it.getString()); 195 for (String s : temp) { 196 result.add(s); 197 } 198 } 199 return result; 200 } 201 reduce(String s)202 public String reduce(String s) { 203 Set<String> temp = getClosure(s); 204 return temp.iterator().next(); 205 } 206 reduce(UnicodeSet input)207 public UnicodeSet reduce(UnicodeSet input) { 208 UnicodeSet result = new UnicodeSet(); 209 for (UnicodeSetIterator it = new UnicodeSetIterator(input); it.next();) { 210 final String reduce = reduce(it.getString()); 211 result.add(reduce); 212 } 213 return result; 214 } 215 }