// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Freezable;

/**
 * Contains utilities to supplement the JDK Regex, since it doesn't handle
 * Unicode well.
 *
 * <p>TODO: Move to com.ibm.icu.dev.somewhere.
 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
 *
 * @author markdavis
 */
public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    // Note: the only state held so far is the symbol table and the BNF parsing
    // settings; more is intended in the future, particularly for the regex
    // style supported.

    private SymbolTable symbolTable;

    /**
     * Get the symbol table for internal processing
     * @internal
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }

    /**
     * Set the symbol table for internal processing
     *
     * @param symbolTable the symbol table handed to UnicodeSet when parsing
     *            character classes; may be null
     * @return this object, to allow chaining
     * @internal
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }

    /**
     * Adds full Unicode property support, with the latest version of Unicode,
     * to Java Regex, bringing it up to Level 1 (see
     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     * regex pattern string and interpreting the character classes (\p{...},
     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     * this utility, Java regex expressions can be updated to work with the
     * latest version of Unicode, and with all Unicode properties. Note that the
     * UnicodeSet syntax has not yet, however, been updated to be completely
     * consistent with Java regex, so be careful of the differences.
     * <p>Not thread-safe; create a separate copy for different threads.
     * <p>In the future, we may extend this to support other regex packages.
     *
     * @param regex A modified Java regex pattern, as in the input to
     *            Pattern.compile(), except that all "character classes" are
     *            processed as if they were UnicodeSet patterns. Example:
     *            "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
     * @return A processed Java regex pattern, suitable for input to
     *         Pattern.compile().
     * @throws IllegalArgumentException if a character class cannot be parsed
     *             as a UnicodeSet pattern
     */
    @Override
    public String transform(String regex) {
        StringBuilder result = new StringBuilder();
        UnicodeSet temp = new UnicodeSet();
        ParsePosition pos = new ParsePosition(0);
        // Quoting state:
        //   0 = normal text
        //   1 = just after a backslash
        //   2 = inside a \Q... quoted run
        //   3 = inside a \Q... quoted run, just after a backslash
        int state = 0;

        // We add each character unmodified to the output, unless we have a
        // UnicodeSet. Note that we don't worry about supplementary characters,
        // since none of the syntax uses them.

        for (int i = 0; i < regex.length(); ++i) {
            // look for UnicodeSets, allowing for quoting with \ and \Q
            char ch = regex.charAt(i);
            switch (state) {
            case 0: // we only care about \, and '['.
                if (ch == '\\') {
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        // should only happen with \p
                        i = processSet(regex, i, result, temp, pos);
                        continue; // the set was appended; skip the append below
                    }
                    state = 1;
                } else if (ch == '[') {
                    // if we have what looks like a UnicodeSet
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        i = processSet(regex, i, result, temp, pos);
                        continue; // the set was appended; skip the append below
                    }
                }
                break;

            case 1: // we are after a \
                if (ch == 'Q') {
                    state = 2;
                } else {
                    state = 0;
                }
                break;

            case 2: // we are in a \Q...
                if (ch == '\\') {
                    state = 3;
                }
                break;

            case 3: // we are in a \Q...\
                if (ch == 'E') {
                    state = 0;
                } else if (ch != '\\') {
                    state = 2;
                }
                // another backslash keeps us in state 3
                break;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Convenience static function, using standard parameters.
     * @param regex as in transform()
     * @return processed regex pattern, as in transform()
     */
    public static String fix(String regex) {
        return STANDARD.transform(regex);
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @return Pattern
     */
    public static Pattern compile(String regex) {
        return Pattern.compile(STANDARD.transform(regex));
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @param options Match flags, passed unchanged to Pattern.compile(String, int).
     * @return Pattern
     */
    public static Pattern compile(String regex, int options) {
        return Pattern.compile(STANDARD.transform(regex), options);
    }

    /**
     * Compile a composed string from a set of BNF lines; see the List version for more information.
     *
     * @param bnfLines Series of BNF lines, separated by CR, LF or CRLF.
     * @return The composed string (not a compiled Pattern); it can be used as input for fix().
     */
    public String compileBnf(String bnfLines) {
        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    }

    /**
     * Compile a composed string from a set of BNF lines, such as for composing a regex
     * expression. The lines can be in any order, but there must not be any
     * cycles. The result can be used as input for fix().
     * <p>
     * Example:
     * <pre>
     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
     * scheme = reserved+;
     * host = // reserved+;
     * query = [\\=reserved]+;
     * fragment = reserved+;
     * reserved = [[:ascii:][:alphabetic:]];
     * </pre>
     * <p>
     * Caveats: at this point the parsing is simple; for example, # cannot be
     * quoted (use \\u0023); you can set the comment string to null to disable
     * comment handling. Variables are substituted by plain text replacement, so
     * a variable name is replaced wherever it occurs as a substring of another
     * definition. The equality sign and a few others can be reset with
     * setBnfX().
     *
     * @param lines Series of lines that represent a BNF expression. The lines contain
     *  a series of statements that of the form x=y;. A statement can take
     *  multiple lines, but there can't be multiple statements on a line.
     *  A hash quotes to the end of the line.
     * @return The definition of the single root variable (the one that no other
     *  definition refers to), with all other variables substituted into it.
     * @throws IllegalArgumentException if the lines are malformed, or if there is
     *  not exactly one variable that is unused by the other definitions.
     */
    public String compileBnf(List<String> lines) {
        Map<String, String> variables = getVariables(lines);
        // Variables that have not (yet) been found inside another definition.
        Set<String> unused = new LinkedHashSet<String>(variables.keySet());
        // brute force replacement; do twice to allow for different order
        // later on can optimize
        for (int i = 0; i < 2; ++i) {
            for (Entry<String, String> entry : variables.entrySet()) {
                String variable = entry.getKey(),
                    definition = entry.getValue();

                for (Entry<String, String> entry2 : variables.entrySet()) {
                    String variable2 = entry2.getKey(),
                        definition2 = entry2.getValue();
                    if (variable.equals(variable2)) {
                        continue;
                    }
                    String altered2 = definition2.replace(variable, definition);
                    if (!altered2.equals(definition2)) {
                        unused.remove(variable);
                        // The key already exists, so this only replaces the value
                        // and does not invalidate the iterators in use.
                        variables.put(variable2, altered2);
                    }
                }
            }
        }
        if (unused.size() != 1) {
            throw new IllegalArgumentException("Not a single root: " + unused);
        }
        return variables.get(unused.iterator().next());
    }

    /**
     * @return the string that starts a comment in BNF lines ("#" unless changed);
     *         null means comment handling is disabled
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }

    /**
     * @param bnfCommentString the string that starts a comment running to the end
     *            of a BNF line; null disables comment handling
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }

    /**
     * @return the string that separates a variable name from its definition in a
     *         BNF statement ("=" unless changed)
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }

    /**
     * @param bnfVariableInfix the string that separates a variable name from its
     *            definition in a BNF statement
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }

    /**
     * @return the string inserted between the lines of a BNF definition that
     *         spans several input lines ("\n" unless changed)
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }

    /**
     * @param bnfLineSeparator the string inserted between the lines of a BNF
     *            definition that spans several input lines
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }

    /**
     * Utility for loading lines from a file. The file is closed before returning.
     * @param result The result of the appended lines.
     * @param file The file to have an input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws IOException If there were problems opening the file for input stream.
     */
    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
        InputStream is = new FileInputStream(file);
        try {
            return appendLines(result, is, encoding);
        } finally {
            is.close();
        }
    }

    /**
     * Utility for loading lines from an input stream. The stream is read to its
     * end but is not closed here.
     * @param result The result of the appended lines.
     * @param inputStream The input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws UnsupportedEncodingException If the named encoding is not supported.
     * @throws IOException If there were problems opening the input stream for reading.
     */
    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
            throws UnsupportedEncodingException, IOException {
        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
        while (true) {
            String line = in.readLine();
            if (line == null) break;
            result.add(line);
        }
        return result;
    }



    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#cloneAsThawed()
     */
    @Override
    public UnicodeRegex cloneAsThawed() {
        try {
            return (UnicodeRegex)clone();
        } catch (CloneNotSupportedException e) {
            // should never happen, since this class implements Cloneable;
            // keep the cause so that it is diagnosable if it ever does
            throw new IllegalArgumentException(e);
        }
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#freeze()
     */
    @Override
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#isFrozen()
     */
    @Override
    public boolean isFrozen() {
        // at this point, always true
        // (note that the setters in this class do not consult this value)
        return true;
    }

    // ===== PRIVATES =====

    /**
     * Parses the UnicodeSet pattern that starts at index i of regex and appends
     * its pattern string to result.
     *
     * @param regex the full pattern being transformed
     * @param i index in regex at which the set starts
     * @param result receives the pattern string of the parsed set
     * @param temp scratch set, cleared and reused on every call
     * @param pos scratch parse position, reused on every call
     * @return index of the last character consumed, so that the caller's loop
     *         increment moves on to the first character after the set
     * @throws IllegalArgumentException if the set cannot be parsed; the
     *         underlying exception is attached as the cause
     */
    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
        try {
            pos.setIndex(i);
            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
            x.complement().complement(); // hack to fix toPattern
            result.append(x.toPattern(false));
            i = pos.getIndex() - 1; // allow for the loop increment
            return i;
        } catch (Exception e) {
            throw new IllegalArgumentException("Error in " + regex, e);
        }
    }

    private static final UnicodeRegex STANDARD = new UnicodeRegex();
    private String bnfCommentString = "#";
    private String bnfVariableInfix = "=";
    private String bnfLineSeparator = "\n";

    /**
     * Orders by the length of the toString() value, longest first, and then by
     * the natural order of the strings. With this order longer variable names
     * are substituted before shorter names that may be contained in them.
     */
    private Comparator<Object> LongestFirst = new Comparator<Object>() {
        @Override
        public int compare(Object obj0, Object obj1) {
            String arg0 = obj0.toString();
            String arg1 = obj1.toString();
            int len0 = arg0.length();
            int len1 = arg1.length();
            // lengths are never negative, so the subtraction cannot overflow
            if (len0 != len1) return len1 - len0;
            return arg0.compareTo(arg1);
        }
    };

    /**
     * Parses BNF lines into a map from variable name to its raw definition.
     * A statement has the form {@code name <infix> definition ;} and may
     * continue over several lines; continuation lines are joined with the BNF
     * line separator. Empty lines, a leading BOM and comments are dropped.
     *
     * @param lines the BNF source lines
     * @return the variables, ordered longest name first
     * @throws IllegalArgumentException if a statement lacks its infix or its
     *         terminating ';', or if a variable is defined twice
     */
    private Map<String, String> getVariables(List<String> lines) {
        Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
        String variable = null; // non-null while inside an unterminated statement
        StringBuilder definition = new StringBuilder();
        int count = 0; // 1-based line number, for error messages
        for (String line : lines) {
            ++count;
            // remove initial bom, comments
            if (line.length() == 0) continue;
            if (line.charAt(0) == '\uFEFF') line = line.substring(1);

            if (bnfCommentString != null) {
                int hashPos = line.indexOf(bnfCommentString);
                if (hashPos >= 0) line = line.substring(0, hashPos);
            }
            String trimline = line.trim();
            if (trimline.length() == 0) continue;

            String linePart = line;
            boolean terminated = trimline.endsWith(";");
            if (terminated) {
                linePart = linePart.substring(0, linePart.lastIndexOf(';'));
            }
            int equalsPos = linePart.indexOf(bnfVariableInfix);
            if (equalsPos >= 0) {
                if (variable != null) {
                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
                }
                variable = linePart.substring(0, equalsPos).trim();
                if (variables.containsKey(variable)) {
                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
                }
                // Skip the whole infix, which may be longer than one character
                // (for example ":=" or "::=").
                definition.append(linePart.substring(equalsPos + bnfVariableInfix.length()).trim());
            } else { // no equals, so
                if (variable == null) {
                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
                }
                definition.append(bnfLineSeparator).append(linePart);
            }
            // the statement is complete if the line ends with a ;
            if (terminated) {
                variables.put(variable, definition.toString());
                variable = null; // signal we have no variable
                definition.setLength(0);
            }
        }
        if (variable != null) {
            throw new IllegalArgumentException("Missing ';' at end");
        }
        return variables;
    }
}