1 /**
2  *******************************************************************************
3  * Copyright (C) 1996-2012, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  **********************************************************************
6  * Author: Mark Davis
7  **********************************************************************
8  */
9 
10 package org.unicode.cldr.util;
11 
12 import java.io.IOException;
13 import java.text.FieldPosition;
14 import java.util.Comparator;
15 import java.util.TreeSet;
16 
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.text.StringTransform;
20 import com.ibm.icu.text.UTF16;
21 import com.ibm.icu.text.UTF16.StringComparator;
22 import com.ibm.icu.text.UnicodeSet;
23 import com.ibm.icu.text.UnicodeSetIterator;
24 import com.ibm.icu.util.ICUUncheckedIOException;
25 
26 /** Provides more flexible formatting of UnicodeSet patterns.
27  */
28 public class UnicodeSetPrettyPrinter {
29     private static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true, false, 0);
30     private static final UnicodeSet PATTERN_WHITESPACE = (UnicodeSet) new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();
31     private static final UnicodeSet SORT_AT_END = (UnicodeSet) new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
32     private static final UnicodeSet QUOTED_SYNTAX = (UnicodeSet) new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();
33 
34     private boolean first = true;
35     private StringBuffer target = new StringBuffer();
36     private int firstCodePoint = -2;
37     private int lastCodePoint = -2;
38     private boolean compressRanges = true;
39     private String lastString = "";
40     private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
41     private StringTransform quoter = null;
42 
43     private Comparator<String> ordering;
44     private Comparator<String> spaceComp;
45 
UnicodeSetPrettyPrinter()46     public UnicodeSetPrettyPrinter() {
47     }
48 
getQuoter()49     public StringTransform getQuoter() {
50         return quoter;
51     }
52 
setQuoter(StringTransform quoter)53     public UnicodeSetPrettyPrinter setQuoter(StringTransform quoter) {
54         this.quoter = quoter;
55         return this; // for chaining
56     }
57 
isCompressRanges()58     public boolean isCompressRanges() {
59         return compressRanges;
60     }
61 
62     /**
63      * @param compressRanges if you want abcde instead of a-e, make this false
64      * @return
65      */
setCompressRanges(boolean compressRanges)66     public UnicodeSetPrettyPrinter setCompressRanges(boolean compressRanges) {
67         this.compressRanges = compressRanges;
68         return this;
69     }
70 
getOrdering()71     public Comparator<String> getOrdering() {
72         return ordering;
73     }
74 
75     /**
76      * @param ordering the resulting  ordering of the list of characters in the pattern
77      * @return
78      */
setOrdering(Comparator ordering)79     public UnicodeSetPrettyPrinter setOrdering(Comparator ordering) {
80         this.ordering = ordering == null ? CODEPOINT_ORDER : new org.unicode.cldr.util.MultiComparator<String>(ordering, CODEPOINT_ORDER);
81         return this;
82     }
83 
getSpaceComparator()84     public Comparator<String> getSpaceComparator() {
85         return spaceComp;
86     }
87 
88     /**
89      * @param spaceComp if the comparison returns non-zero, then a space will be inserted between characters
90      * @return this, for chaining
91      */
setSpaceComparator(Comparator spaceComp)92     public UnicodeSetPrettyPrinter setSpaceComparator(Comparator spaceComp) {
93         this.spaceComp = spaceComp;
94         return this;
95     }
96 
getToQuote()97     public UnicodeSet getToQuote() {
98         return toQuote;
99     }
100 
101     /**
102      * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically quote pattern whitespace)
103      * @param toQuote
104      */
setToQuote(UnicodeSet toQuote)105     public UnicodeSetPrettyPrinter setToQuote(UnicodeSet toQuote) {
106         if (toQuote != null) {
107             toQuote = (UnicodeSet) toQuote.cloneAsThawed();
108             toQuote.addAll(PATTERN_WHITESPACE);
109             this.toQuote = toQuote;
110         }
111         return this;
112     }
113 
114     /**
115      * Get the pattern for a particular set.
116      * @param uset
117      * @return formatted UnicodeSet
118      */
format(UnicodeSet uset)119     public String format(UnicodeSet uset) {
120         first = true;
121         UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(SORT_AT_END); // remove all the unassigned gorp for now
122         // make sure that comparison separates all strings, even canonically equivalent ones
123         TreeSet<String> orderedStrings = new TreeSet<String>(ordering);
124         for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) {
125             if (it.codepoint == UnicodeSetIterator.IS_STRING) {
126                 orderedStrings.add(it.string);
127             } else {
128                 for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
129                     if (!putAtEnd.contains(i)) {
130                         orderedStrings.add(UTF16.valueOf(i));
131                     }
132                 }
133             }
134         }
135         target.setLength(0);
136         target.append("[");
137         for (String item : orderedStrings) {
138             appendUnicodeSetItem(item);
139         }
140         for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next();) { // add back the unassigned gorp
141             appendUnicodeSetItem(it.codepoint); // we know that these are only codepoints, not strings, so this is safe
142         }
143         flushLast();
144         target.append("]");
145         String sresult = target.toString();
146 
147         // double check the results. This can be removed once we have more tests.
148         //        try {
149         //            UnicodeSet  doubleCheck = new UnicodeSet(sresult);
150         //            if (!uset.equals(doubleCheck)) {
151         //                throw new IllegalStateException("Failure to round-trip in pretty-print " + uset + " => " + sresult + Utility.LINE_SEPARATOR + " source-result: " + new UnicodeSet(uset).removeAll(doubleCheck) +  Utility.LINE_SEPARATOR + " result-source: " + new UnicodeSet(doubleCheck).removeAll(uset));
152         //            }
153         //        } catch (RuntimeException e) {
154         //            throw (RuntimeException) new IllegalStateException("Failure to round-trip in pretty-print " + uset).initCause(e);
155         //        }
156         return sresult;
157     }
158 
appendUnicodeSetItem(String s)159     private UnicodeSetPrettyPrinter appendUnicodeSetItem(String s) {
160         if (UTF16.hasMoreCodePointsThan(s, 1)) {
161             flushLast();
162             addSpaceAsNeededBefore(s);
163             appendQuoted(s);
164             lastString = s;
165         } else {
166             appendUnicodeSetItem(UTF16.charAt(s, 0));
167         }
168         return this;
169     }
170 
appendUnicodeSetItem(int cp)171     private void appendUnicodeSetItem(int cp) {
172         if (!compressRanges)
173             flushLast();
174         if (cp == lastCodePoint + 1) {
175             lastCodePoint = cp; // continue range
176         } else { // start range
177             flushLast();
178             firstCodePoint = lastCodePoint = cp;
179         }
180     }
181 
182     /**
183      *
184      */
addSpaceAsNeededBefore(String s)185     private void addSpaceAsNeededBefore(String s) {
186         if (first) {
187             first = false;
188         } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
189             target.append(' ');
190         } else {
191             int cp = UTF16.charAt(s, 0);
192             if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
193                 int type = UCharacter.getType(cp);
194                 if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
195                     target.append(' ');
196                 } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
197                     target.append(' '); // make sure we don't accidentally merge two surrogates
198                 }
199             }
200         }
201     }
202 
addSpaceAsNeededBefore(int codepoint)203     private void addSpaceAsNeededBefore(int codepoint) {
204         addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
205     }
206 
flushLast()207     private void flushLast() {
208         if (lastCodePoint >= 0) {
209             addSpaceAsNeededBefore(firstCodePoint);
210             if (firstCodePoint != lastCodePoint) {
211                 appendQuoted(firstCodePoint);
212                 if (firstCodePoint + 1 != lastCodePoint) {
213                     target.append('-');
214                 } else {
215                     addSpaceAsNeededBefore(lastCodePoint);
216                 }
217             }
218             appendQuoted(lastCodePoint);
219             lastString = UTF16.valueOf(lastCodePoint);
220             firstCodePoint = lastCodePoint = -2;
221         }
222     }
223 
appendQuoted(String s)224     private void appendQuoted(String s) {
225         if (toQuote.containsSome(s) && quoter != null) {
226             target.append(quoter.transform(s));
227         } else {
228             int cp;
229             target.append("{");
230             for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
231                 appendQuoted(cp = UTF16.charAt(s, i));
232             }
233             target.append("}");
234         }
235     }
236 
appendQuoted(int codePoint)237     UnicodeSetPrettyPrinter appendQuoted(int codePoint) {
238         if (toQuote.contains(codePoint)) {
239             if (quoter != null) {
240                 target.append(quoter.transform(UTF16.valueOf(codePoint)));
241                 return this;
242             }
243             if (codePoint > 0xFFFF) {
244                 target.append("\\U");
245                 target.append(Utility.hex(codePoint, 8));
246             } else {
247                 target.append("\\u");
248                 target.append(Utility.hex(codePoint, 4));
249             }
250             return this;
251         }
252         switch (codePoint) {
253         case '[': // SET_OPEN:
254         case ']': // SET_CLOSE:
255         case '-': // HYPHEN:
256         case '^': // COMPLEMENT:
257         case '&': // INTERSECTION:
258         case '\\': //BACKSLASH:
259         case '{':
260         case '}':
261         case '$':
262         case ':':
263             target.append('\\');
264             break;
265         default:
266             // Escape whitespace
267             if (PATTERN_WHITESPACE.contains(codePoint)) {
268                 target.append('\\');
269             }
270             break;
271         }
272         UTF16.append(target, codePoint);
273         return this;
274     }
275     //  Appender append(String s) {
276     //  target.append(s);
277     //  return this;
278     //  }
279     //  public String toString() {
280     //  return target.toString();
281     //  }
282 
format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos)283     public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
284         try {
285             return toAppendTo.append(format(obj));
286         } catch (IOException e) {
287             throw new ICUUncheckedIOException(e);
288         }
289     }
290 }
291