1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2011, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucharstrie.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010nov14 14 * created by: Markus W. Scherer 15 */ 16 17 #include "unicode/utypes.h" 18 #include "unicode/appendable.h" 19 #include "unicode/ucharstrie.h" 20 #include "unicode/uobject.h" 21 #include "unicode/utf16.h" 22 #include "cmemory.h" 23 #include "uassert.h" 24 25 U_NAMESPACE_BEGIN 26 27 UCharsTrie::~UCharsTrie() { 28 uprv_free(ownedArray_); 29 } 30 31 UStringTrieResult 32 UCharsTrie::current() const { 33 const UChar *pos=pos_; 34 if(pos==NULL) { 35 return USTRINGTRIE_NO_MATCH; 36 } else { 37 int32_t node; 38 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 39 valueResult(node) : USTRINGTRIE_NO_VALUE; 40 } 41 } 42 43 UStringTrieResult 44 UCharsTrie::firstForCodePoint(UChar32 cp) { 45 return cp<=0xffff ? 46 first(cp) : 47 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? 48 next(U16_TRAIL(cp)) : 49 USTRINGTRIE_NO_MATCH); 50 } 51 52 UStringTrieResult 53 UCharsTrie::nextForCodePoint(UChar32 cp) { 54 return cp<=0xffff ? 55 next(cp) : 56 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? 57 next(U16_TRAIL(cp)) : 58 USTRINGTRIE_NO_MATCH); 59 } 60 61 UStringTrieResult 62 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { 63 // Branch according to the current unit. 64 if(length==0) { 65 length=*pos++; 66 } 67 ++length; 68 // The length of the branch is the number of units to select from. 69 // The data structure encodes a binary search. 70 while(length>kMaxBranchLinearSubNodeLength) { 71 if(uchar<*pos++) { 72 length>>=1; 73 pos=jumpByDelta(pos); 74 } else { 75 length=length-(length>>1); 76 pos=skipDelta(pos); 77 } 78 } 79 // Drop down to linear search for the last few units. 80 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 81 // and divides length by 2. 82 do { 83 if(uchar==*pos++) { 84 UStringTrieResult result; 85 int32_t node=*pos; 86 if(node&kValueIsFinal) { 87 // Leave the final value for getValue() to read. 88 result=USTRINGTRIE_FINAL_VALUE; 89 } else { 90 // Use the non-final value as the jump delta. 91 ++pos; 92 // int32_t delta=readValue(pos, node); 93 int32_t delta; 94 if(node<kMinTwoUnitValueLead) { 95 delta=node; 96 } else if(node<kThreeUnitValueLead) { 97 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; 98 } else { 99 delta=(pos[0]<<16)|pos[1]; 100 pos+=2; 101 } 102 // end readValue() 103 pos+=delta; 104 node=*pos; 105 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 106 } 107 pos_=pos; 108 return result; 109 } 110 --length; 111 pos=skipValue(pos); 112 } while(length>1); 113 if(uchar==*pos++) { 114 pos_=pos; 115 int32_t node=*pos; 116 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 117 } else { 118 stop(); 119 return USTRINGTRIE_NO_MATCH; 120 } 121 } 122 123 UStringTrieResult 124 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { 125 int32_t node=*pos++; 126 for(;;) { 127 if(node<kMinLinearMatch) { 128 return branchNext(pos, node, uchar); 129 } else if(node<kMinValueLead) { 130 // Match the first of length+1 units. 131 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 132 if(uchar==*pos++) { 133 remainingMatchLength_=--length; 134 pos_=pos; 135 return (length<0 && (node=*pos)>=kMinValueLead) ? 136 valueResult(node) : USTRINGTRIE_NO_VALUE; 137 } else { 138 // No match. 139 break; 140 } 141 } else if(node&kValueIsFinal) { 142 // No further matching units. 143 break; 144 } else { 145 // Skip intermediate value. 146 pos=skipNodeValue(pos, node); 147 node&=kNodeTypeMask; 148 } 149 } 150 stop(); 151 return USTRINGTRIE_NO_MATCH; 152 } 153 154 UStringTrieResult 155 UCharsTrie::next(int32_t uchar) { 156 const UChar *pos=pos_; 157 if(pos==NULL) { 158 return USTRINGTRIE_NO_MATCH; 159 } 160 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 161 if(length>=0) { 162 // Remaining part of a linear-match node. 163 if(uchar==*pos++) { 164 remainingMatchLength_=--length; 165 pos_=pos; 166 int32_t node; 167 return (length<0 && (node=*pos)>=kMinValueLead) ? 168 valueResult(node) : USTRINGTRIE_NO_VALUE; 169 } else { 170 stop(); 171 return USTRINGTRIE_NO_MATCH; 172 } 173 } 174 return nextImpl(pos, uchar); 175 } 176 177 UStringTrieResult 178 UCharsTrie::next(ConstChar16Ptr ptr, int32_t sLength) { 179 const UChar *s=ptr; 180 if(sLength<0 ? *s==0 : sLength==0) { 181 // Empty input. 182 return current(); 183 } 184 const UChar *pos=pos_; 185 if(pos==NULL) { 186 return USTRINGTRIE_NO_MATCH; 187 } 188 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 189 for(;;) { 190 // Fetch the next input unit, if there is one. 191 // Continue a linear-match node without rechecking sLength<0. 192 int32_t uchar; 193 if(sLength<0) { 194 for(;;) { 195 if((uchar=*s++)==0) { 196 remainingMatchLength_=length; 197 pos_=pos; 198 int32_t node; 199 return (length<0 && (node=*pos)>=kMinValueLead) ? 200 valueResult(node) : USTRINGTRIE_NO_VALUE; 201 } 202 if(length<0) { 203 remainingMatchLength_=length; 204 break; 205 } 206 if(uchar!=*pos) { 207 stop(); 208 return USTRINGTRIE_NO_MATCH; 209 } 210 ++pos; 211 --length; 212 } 213 } else { 214 for(;;) { 215 if(sLength==0) { 216 remainingMatchLength_=length; 217 pos_=pos; 218 int32_t node; 219 return (length<0 && (node=*pos)>=kMinValueLead) ? 220 valueResult(node) : USTRINGTRIE_NO_VALUE; 221 } 222 uchar=*s++; 223 --sLength; 224 if(length<0) { 225 remainingMatchLength_=length; 226 break; 227 } 228 if(uchar!=*pos) { 229 stop(); 230 return USTRINGTRIE_NO_MATCH; 231 } 232 ++pos; 233 --length; 234 } 235 } 236 int32_t node=*pos++; 237 for(;;) { 238 if(node<kMinLinearMatch) { 239 UStringTrieResult result=branchNext(pos, node, uchar); 240 if(result==USTRINGTRIE_NO_MATCH) { 241 return USTRINGTRIE_NO_MATCH; 242 } 243 // Fetch the next input unit, if there is one. 244 if(sLength<0) { 245 if((uchar=*s++)==0) { 246 return result; 247 } 248 } else { 249 if(sLength==0) { 250 return result; 251 } 252 uchar=*s++; 253 --sLength; 254 } 255 if(result==USTRINGTRIE_FINAL_VALUE) { 256 // No further matching units. 257 stop(); 258 return USTRINGTRIE_NO_MATCH; 259 } 260 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 261 node=*pos++; 262 } else if(node<kMinValueLead) { 263 // Match length+1 units. 264 length=node-kMinLinearMatch; // Actual match length minus 1. 265 if(uchar!=*pos) { 266 stop(); 267 return USTRINGTRIE_NO_MATCH; 268 } 269 ++pos; 270 --length; 271 break; 272 } else if(node&kValueIsFinal) { 273 // No further matching units. 274 stop(); 275 return USTRINGTRIE_NO_MATCH; 276 } else { 277 // Skip intermediate value. 278 pos=skipNodeValue(pos, node); 279 node&=kNodeTypeMask; 280 } 281 } 282 } 283 } 284 285 const UChar * 286 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, 287 UBool haveUniqueValue, int32_t &uniqueValue) { 288 while(length>kMaxBranchLinearSubNodeLength) { 289 ++pos; // ignore the comparison unit 290 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 291 return NULL; 292 } 293 length=length-(length>>1); 294 pos=skipDelta(pos); 295 } 296 do { 297 ++pos; // ignore a comparison unit 298 // handle its value 299 int32_t node=*pos++; 300 UBool isFinal=(UBool)(node>>15); 301 node&=0x7fff; 302 int32_t value=readValue(pos, node); 303 pos=skipValue(pos, node); 304 if(isFinal) { 305 if(haveUniqueValue) { 306 if(value!=uniqueValue) { 307 return NULL; 308 } 309 } else { 310 uniqueValue=value; 311 haveUniqueValue=TRUE; 312 } 313 } else { 314 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 315 return NULL; 316 } 317 haveUniqueValue=TRUE; 318 } 319 } while(--length>1); 320 return pos+1; // ignore the last comparison unit 321 } 322 323 UBool 324 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 325 int32_t node=*pos++; 326 for(;;) { 327 if(node<kMinLinearMatch) { 328 if(node==0) { 329 node=*pos++; 330 } 331 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 332 if(pos==NULL) { 333 return FALSE; 334 } 335 haveUniqueValue=TRUE; 336 node=*pos++; 337 } else if(node<kMinValueLead) { 338 // linear-match node 339 pos+=node-kMinLinearMatch+1; // Ignore the match units. 340 node=*pos++; 341 } else { 342 UBool isFinal=(UBool)(node>>15); 343 int32_t value; 344 if(isFinal) { 345 value=readValue(pos, node&0x7fff); 346 } else { 347 value=readNodeValue(pos, node); 348 } 349 if(haveUniqueValue) { 350 if(value!=uniqueValue) { 351 return FALSE; 352 } 353 } else { 354 uniqueValue=value; 355 haveUniqueValue=TRUE; 356 } 357 if(isFinal) { 358 return TRUE; 359 } 360 pos=skipNodeValue(pos, node); 361 node&=kNodeTypeMask; 362 } 363 } 364 } 365 366 int32_t 367 UCharsTrie::getNextUChars(Appendable &out) const { 368 const UChar *pos=pos_; 369 if(pos==NULL) { 370 return 0; 371 } 372 if(remainingMatchLength_>=0) { 373 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. 374 return 1; 375 } 376 int32_t node=*pos++; 377 if(node>=kMinValueLead) { 378 if(node&kValueIsFinal) { 379 return 0; 380 } else { 381 pos=skipNodeValue(pos, node); 382 node&=kNodeTypeMask; 383 } 384 } 385 if(node<kMinLinearMatch) { 386 if(node==0) { 387 node=*pos++; 388 } 389 out.reserveAppendCapacity(++node); 390 getNextBranchUChars(pos, node, out); 391 return node; 392 } else { 393 // First unit of the linear-match node. 394 out.appendCodeUnit(*pos); 395 return 1; 396 } 397 } 398 399 void 400 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { 401 while(length>kMaxBranchLinearSubNodeLength) { 402 ++pos; // ignore the comparison unit 403 getNextBranchUChars(jumpByDelta(pos), length>>1, out); 404 length=length-(length>>1); 405 pos=skipDelta(pos); 406 } 407 do { 408 out.appendCodeUnit(*pos++); 409 pos=skipValue(pos); 410 } while(--length>1); 411 out.appendCodeUnit(*pos); 412 } 413 414 U_NAMESPACE_END 415