1 /* 2 ******************************************************************************* 3 * Copyright (C) 2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * created on: 2011feb25 7 * created by: Markus W. Scherer 8 */ 9 10 package com.ibm.icu.impl; 11 12 /** 13 * Implements the immutable Unicode properties Pattern_Syntax and Pattern_White_Space. 14 * Hardcodes these properties, does not load data, does not depend on other ICU classes. 15 * <p> 16 * Note: Both properties include ASCII as well as non-ASCII, non-Latin-1 code points, 17 * and both properties only include BMP code points (no supplementary ones). 18 * Pattern_Syntax includes some unassigned code points. 19 * <p> 20 * [:Pattern_White_Space:] = 21 * [\u0009-\u000D\ \u0085\u200E\u200F\u2028\u2029] 22 * <p> 23 * [:Pattern_Syntax:] = 24 * [!-/\:-@\[-\^`\{-~\u00A1-\u00A7\u00A9\u00AB\u00AC\u00AE 25 * \u00B0\u00B1\u00B6\u00BB\u00BF\u00D7\u00F7 26 * \u2010-\u2027\u2030-\u203E\u2041-\u2053\u2055-\u205E 27 * \u2190-\u245F\u2500-\u2775\u2794-\u2BFF\u2E00-\u2E7F 28 * \u3001-\u3003\u3008-\u3020\u3030\uFD3E\uFD3F\uFE45\uFE46] 29 * @author mscherer 30 */ 31 public final class PatternProps { 32 /** 33 * @return true if c is a Pattern_Syntax code point. 34 */ isSyntax(int c)35 public static boolean isSyntax(int c) { 36 if(c<0) { 37 return false; 38 } else if(c<=0xff) { 39 return latin1[c]==3; 40 } else if(c<0x2010) { 41 return false; 42 } else if(c<=0x3030) { 43 int bits=syntax2000[index2000[(c-0x2000)>>5]]; 44 return ((bits>>(c&0x1f))&1)!=0; 45 } else if(0xfd3e<=c && c<=0xfe46) { 46 return c<=0xfd3f || 0xfe45<=c; 47 } else { 48 return false; 49 } 50 } 51 52 /** 53 * @return true if c is a Pattern_Syntax or Pattern_White_Space code point. 54 */ isSyntaxOrWhiteSpace(int c)55 public static boolean isSyntaxOrWhiteSpace(int c) { 56 if(c<0) { 57 return false; 58 } else if(c<=0xff) { 59 return latin1[c]!=0; 60 } else if(c<0x200e) { 61 return false; 62 } else if(c<=0x3030) { 63 int bits=syntaxOrWhiteSpace2000[index2000[(c-0x2000)>>5]]; 64 return ((bits>>(c&0x1f))&1)!=0; 65 } else if(0xfd3e<=c && c<=0xfe46) { 66 return c<=0xfd3f || 0xfe45<=c; 67 } else { 68 return false; 69 } 70 } 71 72 /** 73 * @return true if c is a Pattern_White_Space character. 74 */ isWhiteSpace(int c)75 public static boolean isWhiteSpace(int c) { 76 if(c<0) { 77 return false; 78 } else if(c<=0xff) { 79 return latin1[c]==5; 80 } else if(0x200e<=c && c<=0x2029) { 81 return c<=0x200f || 0x2028<=c; 82 } else { 83 return false; 84 } 85 } 86 87 /** 88 * Skips over Pattern_White_Space starting at index i of the CharSequence. 89 * @return The smallest index at or after i with a non-white space character. 90 */ skipWhiteSpace(CharSequence s, int i)91 public static int skipWhiteSpace(CharSequence s, int i) { 92 while(i<s.length() && isWhiteSpace(s.charAt(i))) { 93 ++i; 94 } 95 return i; 96 } 97 98 /** 99 * @return s except with leading and trailing Pattern_White_Space removed. 100 */ trimWhiteSpace(String s)101 public static String trimWhiteSpace(String s) { 102 if(s.length()==0 || (!isWhiteSpace(s.charAt(0)) && !isWhiteSpace(s.charAt(s.length()-1)))) { 103 return s; 104 } 105 int start=0; 106 int limit=s.length(); 107 while(start<limit && isWhiteSpace(s.charAt(start))) { 108 ++start; 109 } 110 if(start<limit) { 111 // There is non-white space at start; we will not move limit below that, 112 // so we need not test start<limit in the loop. 113 while(isWhiteSpace(s.charAt(limit-1))) { 114 --limit; 115 } 116 } 117 return s.substring(start, limit); 118 } 119 120 /** 121 * Tests whether the CharSequence contains a "pattern identifier", that is, 122 * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters. 123 * @return true if there are no Pattern_White_Space or Pattern_Syntax characters in s. 124 */ isIdentifier(CharSequence s)125 public static boolean isIdentifier(CharSequence s) { 126 int limit=s.length(); 127 if(limit==0) { 128 return false; 129 } 130 int start=0; 131 do { 132 if(isSyntaxOrWhiteSpace(s.charAt(start++))) { 133 return false; 134 } 135 } while(start<limit); 136 return true; 137 } 138 139 /** 140 * Tests whether the CharSequence contains a "pattern identifier", that is, 141 * whether it contains only non-Pattern_White_Space, non-Pattern_Syntax characters. 142 * @return true if there are no Pattern_White_Space or Pattern_Syntax characters 143 * in s between start and (exclusive) limit. 144 */ isIdentifier(CharSequence s, int start, int limit)145 public static boolean isIdentifier(CharSequence s, int start, int limit) { 146 if(start>=limit) { 147 return false; 148 } 149 do { 150 if(isSyntaxOrWhiteSpace(s.charAt(start++))) { 151 return false; 152 } 153 } while(start<limit); 154 return true; 155 } 156 157 /** 158 * Skips over a "pattern identifier" starting at index i of the CharSequence. 159 * @return The smallest index at or after i with 160 * a Pattern_White_Space or Pattern_Syntax character. 161 */ skipIdentifier(CharSequence s, int i)162 public static int skipIdentifier(CharSequence s, int i) { 163 while(i<s.length() && !isSyntaxOrWhiteSpace(s.charAt(i))) { 164 ++i; 165 } 166 return i; 167 } 168 169 /* 170 * One byte per Latin-1 character. 171 * Bit 0 is set if either Pattern property is true, 172 * bit 1 if Pattern_Syntax is true, 173 * bit 2 if Pattern_White_Space is true. 174 * That is, Pattern_Syntax is encoded as 3 and Pattern_White_Space as 5. 175 */ 176 private static final byte latin1[]=new byte[] { // 256 177 // WS: 9..D 178 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 0, 0, 179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 180 // WS: 20 Syntax: 21..2F 181 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 182 // Syntax: 3A..40 183 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 184 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 185 // Syntax: 5B..5E 186 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 187 // Syntax: 60 188 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 189 // Syntax: 7B..7E 190 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 191 // WS: 85 192 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 193 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 194 // Syntax: A1..A7, A9, AB, AC, AE 195 0, 3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 0, 3, 0, 196 // Syntax: B0, B1, B6, BB, BF 197 3, 3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 0, 3, 198 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 199 // Syntax: D7 200 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 201 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 202 // Syntax: F7 203 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0 204 }; 205 206 /* 207 * One byte per 32 characters from U+2000..U+303F indexing into 208 * a small table of 32-bit data words. 209 * The first two data words are all-zeros and all-ones. 210 */ 211 private static final byte index2000[]=new byte[] { // 130 212 2, 3, 4, 0, 0, 0, 0, 0, // 20xx 213 0, 0, 0, 0, 5, 1, 1, 1, // 21xx 214 1, 1, 1, 1, 1, 1, 1, 1, // 22xx 215 1, 1, 1, 1, 1, 1, 1, 1, // 23xx 216 1, 1, 1, 0, 0, 0, 0, 0, // 24xx 217 1, 1, 1, 1, 1, 1, 1, 1, // 25xx 218 1, 1, 1, 1, 1, 1, 1, 1, // 26xx 219 1, 1, 1, 6, 7, 1, 1, 1, // 27xx 220 1, 1, 1, 1, 1, 1, 1, 1, // 28xx 221 1, 1, 1, 1, 1, 1, 1, 1, // 29xx 222 1, 1, 1, 1, 1, 1, 1, 1, // 2Axx 223 1, 1, 1, 1, 1, 1, 1, 1, // 2Bxx 224 0, 0, 0, 0, 0, 0, 0, 0, // 2Cxx 225 0, 0, 0, 0, 0, 0, 0, 0, // 2Dxx 226 1, 1, 1, 1, 0, 0, 0, 0, // 2Exx 227 0, 0, 0, 0, 0, 0, 0, 0, // 2Fxx 228 8, 9 // 3000..303F 229 }; 230 231 /* 232 * One 32-bit integer per 32 characters. Ranges of all-false and all-true 233 * are mapped to the first two values, other ranges map to appropriate bit patterns. 234 */ 235 private static final int syntax2000[]=new int[] { 236 0, 237 -1, 238 0xffff0000, // 2: 2010..201F 239 0x7fff00ff, // 3: 2020..2027, 2030..203E 240 0x7feffffe, // 4: 2041..2053, 2055..205E 241 0xffff0000, // 5: 2190..219F 242 0x003fffff, // 6: 2760..2775 243 0xfff00000, // 7: 2794..279F 244 0xffffff0e, // 8: 3001..3003, 3008..301F 245 0x00010001 // 9: 3020, 3030 246 }; 247 248 /* 249 * Same as syntax2000, but with additional bits set for the 250 * Pattern_White_Space characters 200E 200F 2028 2029. 251 */ 252 private static final int syntaxOrWhiteSpace2000[]=new int[] { 253 0, 254 -1, 255 0xffffc000, // 2: 200E..201F 256 0x7fff03ff, // 3: 2020..2029, 2030..203E 257 0x7feffffe, // 4: 2041..2053, 2055..205E 258 0xffff0000, // 5: 2190..219F 259 0x003fffff, // 6: 2760..2775 260 0xfff00000, // 7: 2794..279F 261 0xffffff0e, // 8: 3001..3003, 3008..301F 262 0x00010001 // 9: 3020, 3030 263 }; 264 } 265