1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2006-2009, Google, International Business Machines Corporation *
6  * and others. All Rights Reserved.                                            *
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import com.ibm.icu.text.UTF16;
12 import com.ibm.icu.text.UnicodeSet;
13 
14 /**
15  * A simple parsing class for patterns and rules. Handles '...' quotations, \\uxxxx and \\Uxxxxxxxx, and symple syntax.
16  * The '' (two quotes) is treated as a single quote, inside or outside a quote
17  * <ul>
18  * <li>Any ignorable characters are ignored in parsing.</li>
19  * <li>Any syntax characters are broken into separate tokens</li>
20  * <li>Quote characters can be specified: '...', "...", and \x </li>
21  * <li>Other characters are treated as literals</li>
22  * </ul>
23  */
24 public class PatternTokenizer {
25     // settings used in the interpretation of the pattern
26     private UnicodeSet ignorableCharacters = new UnicodeSet();
27     private UnicodeSet syntaxCharacters = new UnicodeSet();
28     private UnicodeSet extraQuotingCharacters = new UnicodeSet();
29     private UnicodeSet escapeCharacters = new UnicodeSet();
30     private boolean usingSlash = false;
31     private boolean usingQuote = false;
32 
33     // transient data, set when needed. Null it out for any changes in the above fields.
34     private transient UnicodeSet needingQuoteCharacters = null;
35 
36     // data about the current pattern being parsed. start gets moved as we go along.
37     private int start;
38     private int limit;
39     private String pattern;
40 
getIgnorableCharacters()41     public UnicodeSet getIgnorableCharacters() {
42         return (UnicodeSet) ignorableCharacters.clone();
43     }
44     /**
45      * Sets the characters to be ignored in parsing, eg new UnicodeSet("[:pattern_whitespace:]");
46      * @param ignorableCharacters Characters to be ignored.
47      * @return A PatternTokenizer object in which characters are specified as ignored characters.
48      */
setIgnorableCharacters(UnicodeSet ignorableCharacters)49     public PatternTokenizer setIgnorableCharacters(UnicodeSet ignorableCharacters) {
50         this.ignorableCharacters = (UnicodeSet) ignorableCharacters.clone();
51         needingQuoteCharacters = null;
52         return this;
53     }
getSyntaxCharacters()54     public UnicodeSet getSyntaxCharacters() {
55         return (UnicodeSet) syntaxCharacters.clone();
56     }
getExtraQuotingCharacters()57     public UnicodeSet getExtraQuotingCharacters() {
58         return (UnicodeSet) extraQuotingCharacters.clone();
59     }
60     /**
61      *  Sets the characters to be interpreted as syntax characters in parsing, eg new UnicodeSet("[:pattern_syntax:]")
62      * @param syntaxCharacters Characters to be set as syntax characters.
63      * @return A PatternTokenizer object in which characters are specified as syntax characters.
64      */
setSyntaxCharacters(UnicodeSet syntaxCharacters)65     public PatternTokenizer setSyntaxCharacters(UnicodeSet syntaxCharacters) {
66         this.syntaxCharacters = (UnicodeSet) syntaxCharacters.clone();
67         needingQuoteCharacters = null;
68         return this;
69     }
70     /**
71      *  Sets the extra characters to be quoted in literals
72      * @param syntaxCharacters Characters to be set as extra quoting characters.
73      * @return A PatternTokenizer object in which characters are specified as extra quoting characters.
74      */
setExtraQuotingCharacters(UnicodeSet syntaxCharacters)75     public PatternTokenizer setExtraQuotingCharacters(UnicodeSet syntaxCharacters) {
76         this.extraQuotingCharacters = (UnicodeSet) syntaxCharacters.clone();
77         needingQuoteCharacters = null;
78         return this;
79     }
80 
getEscapeCharacters()81     public UnicodeSet getEscapeCharacters() {
82         return (UnicodeSet) escapeCharacters.clone();
83     }
84     /**
85      * Set characters to be escaped in literals, in quoteLiteral and normalize, eg new UnicodeSet("[^\\u0020-\\u007E]");
86      * @param escapeCharacters Characters to be set as escape characters.
87      * @return A PatternTokenizer object in which characters are specified as escape characters.
88      */
setEscapeCharacters(UnicodeSet escapeCharacters)89     public PatternTokenizer setEscapeCharacters(UnicodeSet escapeCharacters) {
90         this.escapeCharacters = (UnicodeSet) escapeCharacters.clone();
91         return this;
92     }
isUsingQuote()93     public boolean isUsingQuote() {
94         return usingQuote;
95     }
setUsingQuote(boolean usingQuote)96     public PatternTokenizer setUsingQuote(boolean usingQuote) {
97         this.usingQuote = usingQuote;
98         needingQuoteCharacters = null;
99         return this;
100     }
isUsingSlash()101     public boolean isUsingSlash() {
102         return usingSlash;
103     }
setUsingSlash(boolean usingSlash)104     public PatternTokenizer setUsingSlash(boolean usingSlash) {
105         this.usingSlash = usingSlash;
106         needingQuoteCharacters = null;
107         return this;
108     }
109     //    public UnicodeSet getQuoteCharacters() {
110 //  return (UnicodeSet) quoteCharacters.clone();
111 //  }
112 //  public PatternTokenizer setQuoteCharacters(UnicodeSet quoteCharacters) {
113 //  this.quoteCharacters = (UnicodeSet) quoteCharacters.clone();
114 //  needingQuoteCharacters = null;
115 //  return this;
116 //  }
getLimit()117     public int getLimit() {
118         return limit;
119     }
setLimit(int limit)120     public PatternTokenizer setLimit(int limit) {
121         this.limit = limit;
122         return this;
123     }
getStart()124     public int getStart() {
125         return start;
126     }
setStart(int start)127     public PatternTokenizer setStart(int start) {
128         this.start = start;
129         return this;
130     }
131 
setPattern(CharSequence pattern)132     public PatternTokenizer setPattern(CharSequence pattern) {
133         return setPattern(pattern.toString());
134     }
135 
setPattern(String pattern)136     public PatternTokenizer setPattern(String pattern) {
137         if (pattern == null) {
138             throw new IllegalArgumentException("Inconsistent arguments");
139         }
140         this.start = 0;
141         this.limit = pattern.length();
142         this.pattern = pattern;
143         return this;
144     }
145 
146     public static final char SINGLE_QUOTE = '\'';
147     public static final char BACK_SLASH = '\\';
148     private static int NO_QUOTE = -1, IN_QUOTE = -2;
149 
quoteLiteral(CharSequence string)150     public String quoteLiteral(CharSequence string) {
151         return quoteLiteral(string.toString());
152     }
153 
154     /**
155      * Quote a literal string, using the available settings. Thus syntax characters, quote characters, and ignorable characters will be put into quotes.
156      * @param string String passed to quote a literal string.
157      * @return A string using the available settings will place syntax, quote, or ignorable characters into quotes.
158      */
quoteLiteral(String string)159     public String quoteLiteral(String string) {
160         if (needingQuoteCharacters == null) {
161             needingQuoteCharacters = new UnicodeSet().addAll(syntaxCharacters).addAll(ignorableCharacters).addAll(extraQuotingCharacters); // .addAll(quoteCharacters)
162             if (usingSlash) needingQuoteCharacters.add(BACK_SLASH);
163             if (usingQuote) needingQuoteCharacters.add(SINGLE_QUOTE);
164         }
165         StringBuffer result = new StringBuffer();
166         int quotedChar = NO_QUOTE;
167         int cp;
168         for (int i = 0; i < string.length(); i += UTF16.getCharCount(cp)) {
169             cp = UTF16.charAt(string, i);
170             if (escapeCharacters.contains(cp)) {
171                 // we may have to fix up previous characters
172                 if (quotedChar == IN_QUOTE) {
173                     result.append(SINGLE_QUOTE);
174                     quotedChar = NO_QUOTE;
175                 }
176                 appendEscaped(result, cp);
177                 continue;
178             }
179 
180             if (needingQuoteCharacters.contains(cp)) {
181                 // if we have already started a quote
182                 if (quotedChar == IN_QUOTE) {
183                     UTF16.append(result, cp);
184                     if (usingQuote && cp == SINGLE_QUOTE) { // double it
185                         result.append(SINGLE_QUOTE);
186                     }
187                     continue;
188                 }
189                 // otherwise not already in quote
190                 if (usingSlash) {
191                     result.append(BACK_SLASH);
192                     UTF16.append(result, cp);
193                     continue;
194                 }
195                 if (usingQuote) {
196                     if (cp == SINGLE_QUOTE) { // double it and continue
197                         result.append(SINGLE_QUOTE);
198                         result.append(SINGLE_QUOTE);
199                         continue;
200                     }
201                     result.append(SINGLE_QUOTE);
202                     UTF16.append(result, cp);
203                     quotedChar = IN_QUOTE;
204                     continue;
205                 }
206                 // we have no choice but to use \\u or \\U
207                 appendEscaped(result, cp);
208                 continue;
209             }
210             // otherwise cp doesn't need quoting
211             // we may have to fix up previous characters
212             if (quotedChar == IN_QUOTE) {
213                 result.append(SINGLE_QUOTE);
214                 quotedChar = NO_QUOTE;
215             }
216             UTF16.append(result, cp);
217         }
218         // all done.
219         // we may have to fix up previous characters
220         if (quotedChar == IN_QUOTE) {
221             result.append(SINGLE_QUOTE);
222         }
223         return result.toString();
224     }
225 
appendEscaped(StringBuffer result, int cp)226     private void appendEscaped(StringBuffer result, int cp) {
227         if (cp <= 0xFFFF) {
228             result.append("\\u").append(Utility.hex(cp,4));
229         } else {
230             result.append("\\U").append(Utility.hex(cp,8));
231         }
232     }
233 
normalize()234     public String normalize() {
235         int oldStart = start;
236         StringBuffer result = new StringBuffer();
237         StringBuffer buffer = new StringBuffer();
238         while (true) {
239             buffer.setLength(0);
240             int status = next(buffer);
241             if (status == DONE) {
242                 start = oldStart;
243                 return result.toString();
244             }
245             if (status != SYNTAX) {
246                 result.append(quoteLiteral(buffer));
247             } else {
248                 result.append(buffer);
249             }
250         }
251     }
252 
253     public static final int DONE = 0, SYNTAX = 1, LITERAL = 2, BROKEN_QUOTE = 3, BROKEN_ESCAPE = 4, UNKNOWN = 5;
254 
255     private static final int AFTER_QUOTE = -1, NONE = 0, START_QUOTE = 1, NORMAL_QUOTE = 2, SLASH_START = 3, HEX = 4;
256 
next(StringBuffer buffer)257     public int next(StringBuffer buffer) {
258         if (start >= limit) return DONE;
259         int status = UNKNOWN;
260         int lastQuote = UNKNOWN;
261         int quoteStatus = NONE;
262         int hexCount = 0;
263         int hexValue = 0;
264         int cp;
265         main:
266             for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
267                 cp = UTF16.charAt(pattern, i);
268                 // if we are in a quote, then handle it.
269                 switch (quoteStatus) {
270                 case SLASH_START:
271                     switch (cp) {
272                     case 'u':
273                         quoteStatus = HEX;
274                         hexCount = 4;
275                         hexValue = 0;
276                         continue main;
277                     case 'U':
278                         quoteStatus = HEX;
279                         hexCount = 8;
280                         hexValue = 0;
281                         continue main;
282                     default:
283                         if (usingSlash) {
284                             UTF16.append(buffer, cp);
285                             quoteStatus = NONE;
286                             continue main;
287                         } else {
288                             buffer.append(BACK_SLASH);
289                             quoteStatus = NONE;
290                         }
291                     }
292                     break; // fall through to NONE
293                 case HEX:
294                     hexValue <<= 4;
295                     hexValue += cp;
296                     switch (cp) {
297                     case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
298                         hexValue -= '0'; break;
299                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
300                         hexValue -= 'a' - 10; break;
301                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
302                         hexValue -= 'A' - 10; break;
303                     default:
304                         start = i;
305                     return BROKEN_ESCAPE;
306                     }
307                     --hexCount;
308                     if (hexCount == 0) {
309                         quoteStatus = NONE;
310                         UTF16.append(buffer, hexValue);
311                     }
312                     continue main;
313                 case AFTER_QUOTE:
314                     // see if we get another quote character
315                     // if we just ended a quote BUT the following character is the lastQuote character, then we have a situation like '...''...', so we restart the quote
316                     if (cp == lastQuote) {
317                         UTF16.append(buffer, cp);
318                         quoteStatus = NORMAL_QUOTE;
319                         continue main;
320                     }
321                     quoteStatus = NONE;
322                     break; // fall through to NONE
323                 case START_QUOTE:
324                     // if we are at the very start of a quote, and we hit another quote mark then we emit a literal quote character and end the quote
325                     if (cp == lastQuote) {
326                         UTF16.append(buffer, cp);
327                         quoteStatus = NONE; // get out of quote, with no trace remaining
328                         continue;
329                     }
330                     // otherwise get into quote
331                     UTF16.append(buffer, cp);
332                     quoteStatus = NORMAL_QUOTE;
333                     continue main;
334                 case NORMAL_QUOTE:
335                     if (cp == lastQuote) {
336                         quoteStatus = AFTER_QUOTE; // get out of quote
337                         continue main;
338                     }
339                     UTF16.append(buffer, cp);
340                     continue main;
341                 }
342 
343                 if (ignorableCharacters.contains(cp)) {
344                     continue;
345                 }
346                 // do syntax characters
347                 if (syntaxCharacters.contains(cp)) {
348                     if (status == UNKNOWN) {
349                         UTF16.append(buffer, cp);
350                         start = i + UTF16.getCharCount(cp);
351                         return SYNTAX;
352                     } else { // LITERAL, so back up and break
353                         start = i;
354                         return status;
355                     }
356                 }
357                 // otherwise it is a literal; keep on going
358                 status = LITERAL;
359                 if (cp == BACK_SLASH) {
360                     quoteStatus = SLASH_START;
361                     continue;
362                 } else if (usingQuote && cp == SINGLE_QUOTE) {
363                     lastQuote = cp;
364                     quoteStatus = START_QUOTE;
365                     continue;
366                 }
367                 // normal literals
368                 UTF16.append(buffer, cp);
369             }
370         // handle final cleanup
371         start = limit;
372         switch (quoteStatus) {
373         case HEX:
374             status = BROKEN_ESCAPE;
375             break;
376         case SLASH_START:
377             if (usingSlash) {
378                 status = BROKEN_ESCAPE;
379             } else {
380                 buffer.append(BACK_SLASH);
381             }
382             break;
383         case START_QUOTE: case NORMAL_QUOTE:
384             status = BROKEN_QUOTE;
385             break;
386         }
387         return status;
388     }
389 
390 
391 }
392 //eof
393