1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import com.ibm.icu.impl.Utility; 12 13 /** 14 * A transliteration rule used by 15 * <code>RuleBasedTransliterator</code>. 16 * <code>TransliterationRule</code> is an immutable object. 17 * 18 * <p>A rule consists of an input pattern and an output string. When 19 * the input pattern is matched, the output string is emitted. The 20 * input pattern consists of zero or more characters which are matched 21 * exactly (the key) and optional context. Context must match if it 22 * is specified. Context may be specified before the key, after the 23 * key, or both. The key, preceding context, and following context 24 * may contain variables. Variables represent a set of Unicode 25 * characters, such as the letters <i>a</i> through <i>z</i>. 26 * Variables are detected by looking up each character in a supplied 27 * variable list to see if it has been so defined. 28 * 29 * <p>A rule may contain segments in its input string and segment 30 * references in its output string. A segment is a substring of the 31 * input pattern, indicated by an offset and limit. The segment may 32 * be in the preceding or following context. It may not span a 33 * context boundary. A segment reference is a special character in 34 * the output string that causes a segment of the input string (not 35 * the input pattern) to be copied to the output string. The range of 36 * special characters that represent segment references is defined by 37 * RuleBasedTransliterator.Data. 38 * 39 * <p>Example: The rule "([a-z]) . ([0-9]) > $2 . $1" will change the input 40 * string "abc.123" to "ab1.c23". 41 * 42 * <p>Copyright © IBM Corporation 1999. All rights reserved. 43 * 44 * @author Alan Liu 45 */ 46 class TransliterationRule { 47 48 // TODO Eliminate the pattern and keyLength data members. They 49 // are used only by masks() and getIndexValue() which are called 50 // only during build time, not during run-time. Perhaps these 51 // methods and pattern/keyLength can be isolated into a separate 52 // object. 53 54 /** 55 * The match that must occur before the key, or null if there is no 56 * preceding context. 57 */ 58 private StringMatcher anteContext; 59 60 /** 61 * The matcher object for the key. If null, then the key is empty. 62 */ 63 private StringMatcher key; 64 65 /** 66 * The match that must occur after the key, or null if there is no 67 * following context. 68 */ 69 private StringMatcher postContext; 70 71 /** 72 * The object that performs the replacement if the key, 73 * anteContext, and postContext are matched. Never null. 74 */ 75 private UnicodeReplacer output; 76 77 /** 78 * The string that must be matched, consisting of the anteContext, key, 79 * and postContext, concatenated together, in that order. Some components 80 * may be empty (zero length). 81 * @see anteContextLength 82 * @see keyLength 83 */ 84 private String pattern; 85 86 /** 87 * An array of matcher objects corresponding to the input pattern 88 * segments. If there are no segments this is null. N.B. This is 89 * a UnicodeMatcher for generality, but in practice it is always a 90 * StringMatcher. In the future we may generalize this, but for 91 * now we sometimes cast down to StringMatcher. 92 */ 93 UnicodeMatcher[] segments; 94 95 /** 96 * The length of the string that must match before the key. If 97 * zero, then there is no matching requirement before the key. 98 * Substring [0,anteContextLength) of pattern is the anteContext. 99 */ 100 private int anteContextLength; 101 102 /** 103 * The length of the key. Substring [anteContextLength, 104 * anteContextLength + keyLength) is the key. 105 */ 106 private int keyLength; 107 108 /** 109 * Miscellaneous attributes. 110 */ 111 byte flags; 112 113 /** 114 * Flag attributes. 115 */ 116 static final int ANCHOR_START = 1; 117 static final int ANCHOR_END = 2; 118 119 /** 120 * An alias pointer to the data for this rule. The data provides 121 * lookup services for matchers and segments. 122 */ 123 private final RuleBasedTransliterator.Data data; 124 125 126 /** 127 * Construct a new rule with the given input, output text, and other 128 * attributes. A cursor position may be specified for the output text. 129 * @param input input string, including key and optional ante and 130 * post context 131 * @param anteContextPos offset into input to end of ante context, or -1 if 132 * none. Must be <= input.length() if not -1. 133 * @param postContextPos offset into input to start of post context, or -1 134 * if none. Must be <= input.length() if not -1, and must be >= 135 * anteContextPos. 136 * @param output output string 137 * @param cursorPos offset into output at which cursor is located, or -1 if 138 * none. If less than zero, then the cursor is placed after the 139 * <code>output</code>; that is, -1 is equivalent to 140 * <code>output.length()</code>. If greater than 141 * <code>output.length()</code> then an exception is thrown. 142 * @param cursorOffset an offset to be added to cursorPos to position the 143 * cursor either in the ante context, if < 0, or in the post context, if > 144 * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to 145 * "xyz" and moves the cursor to before "a". It would have a cursorOffset 146 * of -3. 147 * @param segs array of UnicodeMatcher corresponding to input pattern 148 * segments, or null if there are none 149 * @param anchorStart true if the the rule is anchored on the left to 150 * the context start 151 * @param anchorEnd true if the rule is anchored on the right to the 152 * context limit 153 */ TransliterationRule(String input, int anteContextPos, int postContextPos, String output, int cursorPos, int cursorOffset, UnicodeMatcher[] segs, boolean anchorStart, boolean anchorEnd, RuleBasedTransliterator.Data theData)154 public TransliterationRule(String input, 155 int anteContextPos, int postContextPos, 156 String output, 157 int cursorPos, int cursorOffset, 158 UnicodeMatcher[] segs, 159 boolean anchorStart, boolean anchorEnd, 160 RuleBasedTransliterator.Data theData) { 161 data = theData; 162 163 // Do range checks only when warranted to save time 164 if (anteContextPos < 0) { 165 anteContextLength = 0; 166 } else { 167 if (anteContextPos > input.length()) { 168 throw new IllegalArgumentException("Invalid ante context"); 169 } 170 anteContextLength = anteContextPos; 171 } 172 if (postContextPos < 0) { 173 keyLength = input.length() - anteContextLength; 174 } else { 175 if (postContextPos < anteContextLength || 176 postContextPos > input.length()) { 177 throw new IllegalArgumentException("Invalid post context"); 178 } 179 keyLength = postContextPos - anteContextLength; 180 } 181 if (cursorPos < 0) { 182 cursorPos = output.length(); 183 } else if (cursorPos > output.length()) { 184 throw new IllegalArgumentException("Invalid cursor position"); 185 } 186 187 // We don't validate the segments array. The caller must 188 // guarantee that the segments are well-formed (that is, that 189 // all $n references in the output refer to indices of this 190 // array, and that no array elements are null). 191 this.segments = segs; 192 193 pattern = input; 194 flags = 0; 195 if (anchorStart) { 196 flags |= ANCHOR_START; 197 } 198 if (anchorEnd) { 199 flags |= ANCHOR_END; 200 } 201 202 anteContext = null; 203 if (anteContextLength > 0) { 204 anteContext = new StringMatcher(pattern.substring(0, anteContextLength), 205 0, data); 206 } 207 208 key = null; 209 if (keyLength > 0) { 210 key = new StringMatcher(pattern.substring(anteContextLength, anteContextLength + keyLength), 211 0, data); 212 } 213 214 int postContextLength = pattern.length() - keyLength - anteContextLength; 215 postContext = null; 216 if (postContextLength > 0) { 217 postContext = new StringMatcher(pattern.substring(anteContextLength + keyLength), 218 0, data); 219 } 220 221 this.output = new StringReplacer(output, cursorPos + cursorOffset, data); 222 } 223 224 /** 225 * Return the preceding context length. This method is needed to 226 * support the <code>Transliterator</code> method 227 * <code>getMaximumContextLength()</code>. 228 */ getAnteContextLength()229 public int getAnteContextLength() { 230 return anteContextLength + (((flags & ANCHOR_START) != 0) ? 1 : 0); 231 } 232 233 /** 234 * Internal method. Returns 8-bit index value for this rule. 235 * This is the low byte of the first character of the key, 236 * unless the first character of the key is a set. If it's a 237 * set, or otherwise can match multiple keys, the index value is -1. 238 */ getIndexValue()239 final int getIndexValue() { 240 if (anteContextLength == pattern.length()) { 241 // A pattern with just ante context {such as foo)>bar} can 242 // match any key. 243 return -1; 244 } 245 int c = UTF16.charAt(pattern, anteContextLength); 246 return data.lookupMatcher(c) == null ? (c & 0xFF) : -1; 247 } 248 249 /** 250 * Internal method. Returns true if this rule matches the given 251 * index value. The index value is an 8-bit integer, 0..255, 252 * representing the low byte of the first character of the key. 253 * It matches this rule if it matches the first character of the 254 * key, or if the first character of the key is a set, and the set 255 * contains any character with a low byte equal to the index 256 * value. If the rule contains only ante context, as in foo)>bar, 257 * then it will match any key. 258 */ matchesIndexValue(int v)259 final boolean matchesIndexValue(int v) { 260 // Delegate to the key, or if there is none, to the postContext. 261 // If there is neither then we match any key; return true. 262 UnicodeMatcher m = (key != null) ? key : postContext; 263 return (m != null) ? m.matchesIndexValue(v) : true; 264 } 265 266 /** 267 * Return true if this rule masks another rule. If r1 masks r2 then 268 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 269 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 270 * "[c]a>x" masks "[dc]a>y". 271 */ masks(TransliterationRule r2)272 public boolean masks(TransliterationRule r2) { 273 /* Rule r1 masks rule r2 if the string formed of the 274 * antecontext, key, and postcontext overlaps in the following 275 * way: 276 * 277 * r1: aakkkpppp 278 * r2: aaakkkkkpppp 279 * ^ 280 * 281 * The strings must be aligned at the first character of the 282 * key. The length of r1 to the left of the alignment point 283 * must be <= the length of r2 to the left; ditto for the 284 * right. The characters of r1 must equal (or be a superset 285 * of) the corresponding characters of r2. The superset 286 * operation should be performed to check for UnicodeSet 287 * masking. 288 * 289 * Anchors: Two patterns that differ only in anchors only 290 * mask one another if they are exactly equal, and r2 has 291 * all the anchors r1 has (optionally, plus some). Here Y 292 * means the row masks the column, N means it doesn't. 293 * 294 * ab ^ab ab$ ^ab$ 295 * ab Y Y Y Y 296 * ^ab N Y N Y 297 * ab$ N N Y Y 298 * ^ab$ N N N Y 299 * 300 * Post context: {a}b masks ab, but not vice versa, since {a}b 301 * matches everything ab matches, and {a}b matches {|a|}b but ab 302 * does not. Pre context is different (a{b} does not align with 303 * ab). 304 */ 305 306 /* LIMITATION of the current mask algorithm: Some rule 307 * maskings are currently not detected. For example, 308 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 309 */ 310 311 int len = pattern.length(); 312 int left = anteContextLength; 313 int left2 = r2.anteContextLength; 314 int right = pattern.length() - left; 315 int right2 = r2.pattern.length() - left2; 316 317 // TODO Clean this up -- some logic might be combinable with the 318 // next statement. 319 320 // Test for anchor masking 321 if (left == left2 && right == right2 && 322 keyLength <= r2.keyLength && 323 r2.pattern.regionMatches(0, pattern, 0, len)) { 324 // The following boolean logic implements the table above 325 return (flags == r2.flags) || 326 (!((flags & ANCHOR_START) != 0) && !((flags & ANCHOR_END) != 0)) || 327 (((r2.flags & ANCHOR_START) != 0) && ((r2.flags & ANCHOR_END) != 0)); 328 } 329 330 return left <= left2 && 331 (right < right2 || 332 (right == right2 && keyLength <= r2.keyLength)) && 333 r2.pattern.regionMatches(left2 - left, pattern, 0, len); 334 } 335 posBefore(Replaceable str, int pos)336 static final int posBefore(Replaceable str, int pos) { 337 return (pos > 0) ? 338 pos - UTF16.getCharCount(str.char32At(pos-1)) : 339 pos - 1; 340 } 341 posAfter(Replaceable str, int pos)342 static final int posAfter(Replaceable str, int pos) { 343 return (pos >= 0 && pos < str.length()) ? 344 pos + UTF16.getCharCount(str.char32At(pos)) : 345 pos + 1; 346 } 347 348 /** 349 * Attempt a match and replacement at the given position. Return 350 * the degree of match between this rule and the given text. The 351 * degree of match may be mismatch, a partial match, or a full 352 * match. A mismatch means at least one character of the text 353 * does not match the context or key. A partial match means some 354 * context and key characters match, but the text is not long 355 * enough to match all of them. A full match means all context 356 * and key characters match. 357 * 358 * If a full match is obtained, perform a replacement, update pos, 359 * and return U_MATCH. Otherwise both text and pos are unchanged. 360 * 361 * @param text the text 362 * @param pos the position indices 363 * @param incremental if TRUE, test for partial matches that may 364 * be completed by additional text inserted at pos.limit. 365 * @return one of <code>U_MISMATCH</code>, 366 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 367 * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 368 */ matchAndReplace(Replaceable text, Transliterator.Position pos, boolean incremental)369 public int matchAndReplace(Replaceable text, 370 Transliterator.Position pos, 371 boolean incremental) { 372 // Matching and replacing are done in one method because the 373 // replacement operation needs information obtained during the 374 // match. Another way to do this is to have the match method 375 // create a match result struct with relevant offsets, and to pass 376 // this into the replace method. 377 378 // ============================ MATCH =========================== 379 380 // Reset segment match data 381 if (segments != null) { 382 for (int i=0; i<segments.length; ++i) { 383 ((StringMatcher) segments[i]).resetMatch(); 384 } 385 } 386 387 int keyLimit; 388 int[] intRef = new int[1]; 389 390 // ------------------------ Ante Context ------------------------ 391 392 // A mismatch in the ante context, or with the start anchor, 393 // is an outright U_MISMATCH regardless of whether we are 394 // incremental or not. 395 int oText; // offset into 'text' 396 int minOText; 397 398 // Note (1): We process text in 16-bit code units, rather than 399 // 32-bit code points. This works because stand-ins are 400 // always in the BMP and because we are doing a literal match 401 // operation, which can be done 16-bits at a time. 402 403 int anteLimit = posBefore(text, pos.contextStart); 404 405 int match; 406 407 // Start reverse match at char before pos.start 408 intRef[0] = posBefore(text, pos.start); 409 410 if (anteContext != null) { 411 match = anteContext.matches(text, intRef, anteLimit, false); 412 if (match != UnicodeMatcher.U_MATCH) { 413 return UnicodeMatcher.U_MISMATCH; 414 } 415 } 416 417 oText = intRef[0]; 418 419 minOText = posAfter(text, oText); 420 421 // ------------------------ Start Anchor ------------------------ 422 423 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 424 return UnicodeMatcher.U_MISMATCH; 425 } 426 427 // -------------------- Key and Post Context -------------------- 428 429 intRef[0] = pos.start; 430 431 if (key != null) { 432 match = key.matches(text, intRef, pos.limit, incremental); 433 if (match != UnicodeMatcher.U_MATCH) { 434 return match; 435 } 436 } 437 438 keyLimit = intRef[0]; 439 440 if (postContext != null) { 441 if (incremental && keyLimit == pos.limit) { 442 // The key matches just before pos.limit, and there is 443 // a postContext. Since we are in incremental mode, 444 // we must assume more characters may be inserted at 445 // pos.limit -- this is a partial match. 446 return UnicodeMatcher.U_PARTIAL_MATCH; 447 } 448 449 match = postContext.matches(text, intRef, pos.contextLimit, incremental); 450 if (match != UnicodeMatcher.U_MATCH) { 451 return match; 452 } 453 } 454 455 oText = intRef[0]; 456 457 // ------------------------- Stop Anchor ------------------------ 458 459 if (((flags & ANCHOR_END)) != 0) { 460 if (oText != pos.contextLimit) { 461 return UnicodeMatcher.U_MISMATCH; 462 } 463 if (incremental) { 464 return UnicodeMatcher.U_PARTIAL_MATCH; 465 } 466 } 467 468 // =========================== REPLACE ========================== 469 470 // We have a full match. The key is between pos.start and 471 // keyLimit. 472 473 int newLength = output.replace(text, pos.start, keyLimit, intRef); 474 int lenDelta = newLength - (keyLimit - pos.start); 475 int newStart = intRef[0]; 476 477 oText += lenDelta; 478 pos.limit += lenDelta; 479 pos.contextLimit += lenDelta; 480 // Restrict new value of start to [minOText, min(oText, pos.limit)]. 481 pos.start = Math.max(minOText, Math.min(Math.min(oText, pos.limit), newStart)); 482 return UnicodeMatcher.U_MATCH; 483 } 484 485 /** 486 * Create a source string that represents this rule. Append it to the 487 * given string. 488 */ toRule(boolean escapeUnprintable)489 public String toRule(boolean escapeUnprintable) { 490 // int i; 491 492 StringBuffer rule = new StringBuffer(); 493 494 // Accumulate special characters (and non-specials following them) 495 // into quoteBuf. Append quoteBuf, within single quotes, when 496 // a non-quoted element must be inserted. 497 StringBuffer quoteBuf = new StringBuffer(); 498 499 // Do not emit the braces '{' '}' around the pattern if there 500 // is neither anteContext nor postContext. 501 boolean emitBraces = 502 (anteContext != null) || (postContext != null); 503 504 // Emit start anchor 505 if ((flags & ANCHOR_START) != 0) { 506 rule.append('^'); 507 } 508 509 // Emit the input pattern 510 Utility.appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 511 512 if (emitBraces) { 513 Utility.appendToRule(rule, '{', true, escapeUnprintable, quoteBuf); 514 } 515 516 Utility.appendToRule(rule, key, escapeUnprintable, quoteBuf); 517 518 if (emitBraces) { 519 Utility.appendToRule(rule, '}', true, escapeUnprintable, quoteBuf); 520 } 521 522 Utility.appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 523 524 // Emit end anchor 525 if ((flags & ANCHOR_END) != 0) { 526 rule.append('$'); 527 } 528 529 Utility.appendToRule(rule, " > ", true, escapeUnprintable, quoteBuf); 530 531 // Emit the output pattern 532 533 Utility.appendToRule(rule, output.toReplacerPattern(escapeUnprintable), 534 true, escapeUnprintable, quoteBuf); 535 536 Utility.appendToRule(rule, ';', true, escapeUnprintable, quoteBuf); 537 538 return rule.toString(); 539 } 540 541 /** 542 * Return a string representation of this object. 543 * @return string representation of this object 544 */ 545 @Override toString()546 public String toString() { 547 return '{' + toRule(true) + '}'; 548 } 549 550 /** 551 * Find the source and target sets, subject to the input filter. 552 * There is a known issue with filters containing multiple characters. 553 */ 554 // TODO: Problem: the rule is [{ab}]c > x 555 // The filter is [a{bc}]. 556 // If the input is abc, then the rule will work. 557 // However, following code applying the filter won't catch that case. 558 addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting)559 void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet, UnicodeSet revisiting) { 560 int limit = anteContextLength + keyLength; 561 UnicodeSet tempSource = new UnicodeSet(); 562 UnicodeSet temp = new UnicodeSet(); 563 564 // We need to walk through the pattern. 565 // Iff some of the characters at ALL of the the positions are matched by the filter, then we add temp to toUnionTo 566 for (int i=anteContextLength; i<limit; ) { 567 int ch = UTF16.charAt(pattern, i); 568 i += UTF16.getCharCount(ch); 569 UnicodeMatcher matcher = data.lookupMatcher(ch); 570 if (matcher == null) { 571 if (!filter.contains(ch)) { 572 return; 573 } 574 tempSource.add(ch); 575 } else { 576 try { 577 if (!filter.containsSome((UnicodeSet) matcher)) { 578 return; 579 } 580 matcher.addMatchSetTo(tempSource); 581 } catch (ClassCastException e) { // if the matcher is not a UnicodeSet 582 temp.clear(); 583 matcher.addMatchSetTo(temp); 584 if (!filter.containsSome(temp)) { 585 return; 586 } 587 tempSource.addAll(temp); 588 } 589 } 590 } 591 // if we made our way through the gauntlet, add to source/target 592 sourceSet.addAll(tempSource); 593 output.addReplacementSetTo(targetSet); 594 } 595 } 596