1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2014, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl.text; 11 12 import java.util.HashMap; 13 import java.util.Map; 14 15 import com.ibm.icu.impl.ICUDebug; 16 import com.ibm.icu.text.CollationElementIterator; 17 import com.ibm.icu.text.Collator; 18 import com.ibm.icu.text.RbnfLenientScanner; 19 import com.ibm.icu.text.RbnfLenientScannerProvider; 20 import com.ibm.icu.text.RuleBasedCollator; 21 import com.ibm.icu.util.ULocale; 22 23 /** 24 * Returns RbnfLenientScanners that use the old RuleBasedNumberFormat 25 * implementation behind setLenientParseMode, which is based on Collator. 26 * @internal 27 * @deprecated This API is ICU internal only. 28 */ 29 @Deprecated 30 public class RbnfScannerProviderImpl implements RbnfLenientScannerProvider { 31 private static final boolean DEBUG = ICUDebug.enabled("rbnf"); 32 private Map<String, RbnfLenientScanner> cache; 33 34 /** 35 * @internal 36 * @deprecated This API is ICU internal only. 37 */ 38 @Deprecated RbnfScannerProviderImpl()39 public RbnfScannerProviderImpl() { 40 cache = new HashMap<String, RbnfLenientScanner>(); 41 } 42 43 /** 44 * Returns a collation-based scanner. 45 * 46 * Only primary differences are treated as significant. This means that case 47 * differences, accent differences, alternate spellings of the same letter 48 * (e.g., ae and a-umlaut in German), ignorable characters, etc. are ignored in 49 * matching the text. In many cases, numerals will be accepted in place of words 50 * or phrases as well. 51 * 52 * For example, all of the following will correctly parse as 255 in English in 53 * lenient-parse mode: 54 * <br>"two hundred fifty-five" 55 * <br>"two hundred fifty five" 56 * <br>"TWO HUNDRED FIFTY-FIVE" 57 * <br>"twohundredfiftyfive" 58 * <br>"2 hundred fifty-5" 59 * 60 * The Collator used is determined by the locale that was 61 * passed to this object on construction. The description passed to this object 62 * on construction may supply additional collation rules that are appended to the 63 * end of the default collator for the locale, enabling additional equivalences 64 * (such as adding more ignorable characters or permitting spelled-out version of 65 * symbols; see the demo program for examples). 66 * 67 * It's important to emphasize that even strict parsing is relatively lenient: it 68 * will accept some text that it won't produce as output. In English, for example, 69 * it will correctly parse "two hundred zero" and "fifteen hundred". 70 * 71 * @internal 72 * @deprecated This API is ICU internal only. 73 */ 74 @Deprecated get(ULocale locale, String extras)75 public RbnfLenientScanner get(ULocale locale, String extras) { 76 RbnfLenientScanner result = null; 77 String key = locale.toString() + "/" + extras; 78 synchronized(cache) { 79 result = cache.get(key); 80 if (result != null) { 81 return result; 82 } 83 } 84 result = createScanner(locale, extras); 85 synchronized(cache) { 86 cache.put(key, result); 87 } 88 return result; 89 } 90 91 /** 92 * @internal 93 * @deprecated This API is ICU internal only. 94 */ 95 @Deprecated createScanner(ULocale locale, String extras)96 protected RbnfLenientScanner createScanner(ULocale locale, String extras) { 97 RuleBasedCollator collator = null; 98 try { 99 // create a default collator based on the locale, 100 // then pull out that collator's rules, append any additional 101 // rules specified in the description, and create a _new_ 102 // collator based on the combination of those rules 103 collator = (RuleBasedCollator)Collator.getInstance(locale.toLocale()); 104 if (extras != null) { 105 String rules = collator.getRules() + extras; 106 collator = new RuleBasedCollator(rules); 107 } 108 collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION); 109 } 110 catch (Exception e) { 111 // If we get here, it means we have a malformed set of 112 // collation rules, which hopefully won't happen 113 ///CLOVER:OFF 114 if (DEBUG){ // debug hook 115 e.printStackTrace(); System.out.println("++++"); 116 } 117 collator = null; 118 ///CLOVER:ON 119 } 120 121 return new RbnfLenientScannerImpl(collator); 122 } 123 124 private static class RbnfLenientScannerImpl implements RbnfLenientScanner { 125 private final RuleBasedCollator collator; 126 RbnfLenientScannerImpl(RuleBasedCollator rbc)127 private RbnfLenientScannerImpl(RuleBasedCollator rbc) { 128 this.collator = rbc; 129 } 130 allIgnorable(String s)131 public boolean allIgnorable(String s) { 132 CollationElementIterator iter = collator.getCollationElementIterator(s); 133 134 int o = iter.next(); 135 while (o != CollationElementIterator.NULLORDER 136 && CollationElementIterator.primaryOrder(o) == 0) { 137 o = iter.next(); 138 } 139 return o == CollationElementIterator.NULLORDER; 140 } 141 findText(String str, String key, int startingAt)142 public int[] findText(String str, String key, int startingAt) { 143 int p = startingAt; 144 int keyLen = 0; 145 146 // basically just isolate smaller and smaller substrings of 147 // the target string (each running to the end of the string, 148 // and with the first one running from startingAt to the end) 149 // and then use prefixLength() to see if the search key is at 150 // the beginning of each substring. This is excruciatingly 151 // slow, but it will locate the key and tell use how long the 152 // matching text was. 153 while (p < str.length() && keyLen == 0) { 154 keyLen = prefixLength(str.substring(p), key); 155 if (keyLen != 0) { 156 return new int[] { p, keyLen }; 157 } 158 ++p; 159 } 160 // if we make it to here, we didn't find it. Return -1 for the 161 // location. The length should be ignored, but set it to 0, 162 // which should be "safe" 163 return new int[] { -1, 0 }; 164 } 165 166 ///CLOVER:OFF 167 // The following method contains the same signature as findText 168 // and has never been used by anything once. 169 @SuppressWarnings("unused") findText2(String str, String key, int startingAt)170 public int[] findText2(String str, String key, int startingAt) { 171 172 CollationElementIterator strIter = collator.getCollationElementIterator(str); 173 CollationElementIterator keyIter = collator.getCollationElementIterator(key); 174 175 int keyStart = -1; 176 177 strIter.setOffset(startingAt); 178 179 int oStr = strIter.next(); 180 int oKey = keyIter.next(); 181 while (oKey != CollationElementIterator.NULLORDER) { 182 while (oStr != CollationElementIterator.NULLORDER && 183 CollationElementIterator.primaryOrder(oStr) == 0) { 184 oStr = strIter.next(); 185 } 186 187 while (oKey != CollationElementIterator.NULLORDER && 188 CollationElementIterator.primaryOrder(oKey) == 0) { 189 oKey = keyIter.next(); 190 } 191 192 if (oStr == CollationElementIterator.NULLORDER) { 193 return new int[] { -1, 0 }; 194 } 195 196 if (oKey == CollationElementIterator.NULLORDER) { 197 break; 198 } 199 200 if (CollationElementIterator.primaryOrder(oStr) == 201 CollationElementIterator.primaryOrder(oKey)) { 202 keyStart = strIter.getOffset(); 203 oStr = strIter.next(); 204 oKey = keyIter.next(); 205 } else { 206 if (keyStart != -1) { 207 keyStart = -1; 208 keyIter.reset(); 209 } else { 210 oStr = strIter.next(); 211 } 212 } 213 } 214 215 return new int[] { keyStart, strIter.getOffset() - keyStart }; 216 } 217 ///CLOVER:ON 218 prefixLength(String str, String prefix)219 public int prefixLength(String str, String prefix) { 220 // Create two collation element iterators, one over the target string 221 // and another over the prefix. 222 // 223 // Previous code was matching "fifty-" against " fifty" and leaving 224 // the number " fifty-7" to parse as 43 (50 - 7). 225 // Also it seems that if we consume the entire prefix, that's ok even 226 // if we've consumed the entire string, so I switched the logic to 227 // reflect this. 228 229 CollationElementIterator strIter = collator.getCollationElementIterator(str); 230 CollationElementIterator prefixIter = collator.getCollationElementIterator(prefix); 231 232 // match collation elements between the strings 233 int oStr = strIter.next(); 234 int oPrefix = prefixIter.next(); 235 236 while (oPrefix != CollationElementIterator.NULLORDER) { 237 // skip over ignorable characters in the target string 238 while (CollationElementIterator.primaryOrder(oStr) == 0 && oStr != 239 CollationElementIterator.NULLORDER) { 240 oStr = strIter.next(); 241 } 242 243 // skip over ignorable characters in the prefix 244 while (CollationElementIterator.primaryOrder(oPrefix) == 0 && oPrefix != 245 CollationElementIterator.NULLORDER) { 246 oPrefix = prefixIter.next(); 247 } 248 249 // if skipping over ignorables brought to the end of 250 // the prefix, we DID match: drop out of the loop 251 if (oPrefix == CollationElementIterator.NULLORDER) { 252 break; 253 } 254 255 // if skipping over ignorables brought us to the end 256 // of the target string, we didn't match and return 0 257 if (oStr == CollationElementIterator.NULLORDER) { 258 return 0; 259 } 260 261 // match collation elements from the two strings 262 // (considering only primary differences). If we 263 // get a mismatch, dump out and return 0 264 if (CollationElementIterator.primaryOrder(oStr) != 265 CollationElementIterator.primaryOrder(oPrefix)) { 266 return 0; 267 } 268 269 // otherwise, advance to the next character in each string 270 // and loop (we drop out of the loop when we exhaust 271 // collation elements in the prefix) 272 273 oStr = strIter.next(); 274 oPrefix = prefixIter.next(); 275 } 276 277 int result = strIter.getOffset(); 278 if (oStr != CollationElementIterator.NULLORDER) { 279 --result; 280 } 281 return result; 282 } 283 } 284 } 285