1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /******************************************************************** 4 * COPYRIGHT: 5 * Copyright (c) 1999-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ********************************************************************/ 8 /************************************************************************ 9 * Date Name Description 10 * 12/15/99 Madhu Creation. 11 * 01/12/2000 Madhu Updated for changed API and added new tests 12 ************************************************************************/ 13 14 #include "unicode/utypes.h" 15 #if !UCONFIG_NO_BREAK_ITERATION 16 17 #include <stdio.h> 18 #include <stdlib.h> 19 #include <string.h> 20 21 #include "unicode/brkiter.h" 22 #include "unicode/localpointer.h" 23 #include "unicode/numfmt.h" 24 #include "unicode/rbbi.h" 25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 26 #include "unicode/regex.h" 27 #endif 28 #include "unicode/schriter.h" 29 #include "unicode/uchar.h" 30 #include "unicode/utf16.h" 31 #include "unicode/ucnv.h" 32 #include "unicode/uniset.h" 33 #include "unicode/uscript.h" 34 #include "unicode/ustring.h" 35 #include "unicode/utext.h" 36 37 #include "charstr.h" 38 #include "cmemory.h" 39 #include "intltest.h" 40 #include "rbbitst.h" 41 #include "utypeinfo.h" // for 'typeid' to work 42 #include "uvector.h" 43 #include "uvectr32.h" 44 45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION 46 #include "unicode/filteredbrk.h" 47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION 48 49 #define TEST_ASSERT(x) {if (!(x)) { \ 50 errln("Failure in file %s, line %d", __FILE__, __LINE__);}} 51 52 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \ 53 errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}} 54 55 56 //--------------------------------------------- 57 // runIndexedTest 58 //--------------------------------------------- 59 60 61 // Note: Before adding new tests to this file, check whether the desired test data can 62 // simply be added to the file testdata/rbbitest.txt. In most cases it can, 63 // it's much less work than writing a new test, diagnostic output in the event of failures 64 // is good, and the test data file will is shared with ICU4J, so eventually the test 65 // will run there as well, without additional effort. 66 runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params ) 68 { 69 if (exec) logln("TestSuite RuleBasedBreakIterator: "); 70 fTestParams = params; 71 72 TESTCASE_AUTO_BEGIN; 73 #if !UCONFIG_NO_FILE_IO 74 TESTCASE_AUTO(TestBug4153072); 75 #endif 76 TESTCASE_AUTO(TestStatusReturn); 77 #if !UCONFIG_NO_FILE_IO 78 TESTCASE_AUTO(TestUnicodeFiles); 79 TESTCASE_AUTO(TestEmptyString); 80 #endif 81 TESTCASE_AUTO(TestGetAvailableLocales); 82 TESTCASE_AUTO(TestGetDisplayName); 83 #if !UCONFIG_NO_FILE_IO 84 TESTCASE_AUTO(TestEndBehaviour); 85 TESTCASE_AUTO(TestWordBreaks); 86 TESTCASE_AUTO(TestWordBoundary); 87 TESTCASE_AUTO(TestLineBreaks); 88 TESTCASE_AUTO(TestSentBreaks); 89 TESTCASE_AUTO(TestExtended); 90 #endif 91 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO 92 TESTCASE_AUTO(TestMonkey); 93 #endif 94 #if !UCONFIG_NO_FILE_IO 95 TESTCASE_AUTO(TestBug3818); 96 #endif 97 TESTCASE_AUTO(TestDebug); 98 #if !UCONFIG_NO_FILE_IO 99 TESTCASE_AUTO(TestBug5775); 100 #endif 101 TESTCASE_AUTO(TestBug9983); 102 TESTCASE_AUTO(TestDictRules); 103 TESTCASE_AUTO(TestBug5532); 104 TESTCASE_AUTO(TestBug7547); 105 TESTCASE_AUTO(TestBug12797); 106 TESTCASE_AUTO(TestBug12918); 107 TESTCASE_AUTO_END; 108 } 109 110 111 //--------------------------------------------------------------------------- 112 // 113 // class BITestData Holds a set of Break iterator test data and results 114 // Includes 115 // - the string data to be broken 116 // - a vector of the expected break positions. 117 // - a vector of source line numbers for the data, 118 // (to help see where errors occured.) 119 // - The expected break tag values. 120 // - Vectors of actual break positions and tag values. 121 // - Functions for comparing actual with expected and 122 // reporting errors. 123 // 124 //---------------------------------------------------------------------------- 125 class BITestData { 126 public: 127 UnicodeString fDataToBreak; 128 UVector fExpectedBreakPositions; 129 UVector fExpectedTags; 130 UVector fLineNum; 131 UVector fActualBreakPositions; // Test Results. 132 UVector fActualTags; 133 134 BITestData(UErrorCode &status); 135 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status); 136 void checkResults(const char *heading, RBBITest *test); 137 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx); 138 void clearResults(); 139 }; 140 141 // 142 // Constructor. 143 // BITestData(UErrorCode & status)144 BITestData::BITestData(UErrorCode &status) 145 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status), 146 fActualTags(status) 147 { 148 } 149 150 // 151 // addDataChunk. Add a section (non-breaking) piece if data to the test data. 152 // The macro form collects the line number, which is helpful 153 // when tracking down failures. 154 // 155 // A null data item is inserted at the start of each test's data 156 // to put the starting zero into the data list. The position saved for 157 // each non-null item is its ending position. 158 // 159 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status); addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)160 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) { 161 if (U_FAILURE(status)) {return;} 162 if (data != NULL) { 163 fDataToBreak.append(CharsToUnicodeString(data)); 164 } 165 fExpectedBreakPositions.addElement(fDataToBreak.length(), status); 166 fExpectedTags.addElement(tag, status); 167 fLineNum.addElement(lineNum, status); 168 } 169 170 171 // 172 // checkResults. Compare the actual and expected break positions, report any differences. 173 // checkResults(const char * heading,RBBITest * test)174 void BITestData::checkResults(const char *heading, RBBITest *test) { 175 int32_t expectedIndex = 0; 176 int32_t actualIndex = 0; 177 178 for (;;) { 179 // If we've run through both the expected and actual results vectors, we're done. 180 // break out of the loop. 181 if (expectedIndex >= fExpectedBreakPositions.size() && 182 actualIndex >= fActualBreakPositions.size()) { 183 break; 184 } 185 186 187 if (expectedIndex >= fExpectedBreakPositions.size()) { 188 err(heading, test, expectedIndex-1, actualIndex); 189 actualIndex++; 190 continue; 191 } 192 193 if (actualIndex >= fActualBreakPositions.size()) { 194 err(heading, test, expectedIndex, actualIndex-1); 195 expectedIndex++; 196 continue; 197 } 198 199 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) { 200 err(heading, test, expectedIndex, actualIndex); 201 // Try to resync the positions of the indices, to avoid a rash of spurious erros. 202 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) { 203 actualIndex++; 204 } else { 205 expectedIndex++; 206 } 207 continue; 208 } 209 210 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) { 211 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d", 212 heading, fLineNum.elementAt(expectedIndex), 213 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex)); 214 } 215 216 actualIndex++; 217 expectedIndex++; 218 } 219 } 220 221 // 222 // err - An error was found. Report it, along with information about where the 223 // incorrectly broken test data appeared in the source file. 224 // err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)225 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx) 226 { 227 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx); 228 int32_t actual = fActualBreakPositions.elementAti(actualIdx); 229 int32_t o = 0; 230 int32_t line = fLineNum.elementAti(expectedIdx); 231 if (expectedIdx > 0) { 232 // The line numbers are off by one because a premature break occurs somewhere 233 // within the previous item, rather than at the start of the current (expected) item. 234 // We want to report the offset of the unexpected break from the start of 235 // this previous item. 236 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1); 237 } 238 if (actual < expected) { 239 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected); 240 } else { 241 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected); 242 } 243 } 244 245 clearResults()246 void BITestData::clearResults() { 247 fActualBreakPositions.removeAllElements(); 248 fActualTags.removeAllElements(); 249 } 250 251 252 //-------------------------------------------------------------------------------------- 253 // 254 // RBBITest constructor and destructor 255 // 256 //-------------------------------------------------------------------------------------- 257 RBBITest()258 RBBITest::RBBITest() { 259 fTestParams = NULL; 260 } 261 262 ~RBBITest()263 RBBITest::~RBBITest() { 264 } 265 266 //----------------------------------------------------------------------------------- 267 // 268 // Test for status {tag} return value from break rules. 269 // TODO: a more thorough test. 270 // 271 //----------------------------------------------------------------------------------- TestStatusReturn()272 void RBBITest::TestStatusReturn() { 273 UnicodeString rulesString1("$Letters = [:L:];\n" 274 "$Numbers = [:N:];\n" 275 "$Letters+{1};\n" 276 "$Numbers+{2};\n" 277 "Help\\ /me\\!{4};\n" 278 "[^$Letters $Numbers];\n" 279 "!.*;\n", -1, US_INV); 280 UnicodeString testString1 = "abc123..abc Help me Help me!"; 281 // 01234567890123456789012345678 282 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1}; 283 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1}; 284 285 UErrorCode status=U_ZERO_ERROR; 286 UParseError parseError; 287 288 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status)); 289 if(U_FAILURE(status)) { 290 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status)); 291 return; 292 } 293 int32_t pos; 294 int32_t i = 0; 295 bi->setText(testString1); 296 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) { 297 if (pos != bounds1[i]) { 298 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos); 299 break; 300 } 301 302 int tag = bi->getRuleStatus(); 303 if (tag != brkStatus[i]) { 304 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag); 305 break; 306 } 307 i++; 308 } 309 } 310 311 printStringBreaks(UText * tstr,int expected[],int expectedCount)312 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) { 313 UErrorCode status = U_ZERO_ERROR; 314 char name[100]; 315 printf("code alpha extend alphanum type word sent line name\n"); 316 int nextExpectedIndex = 0; 317 utext_setNativeIndex(tstr, 0); 318 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) { 319 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) { 320 printf("------------------------------------------------ %d\n", j); 321 ++nextExpectedIndex; 322 } 323 324 UChar32 c = utext_next32(tstr); 325 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status); 326 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c, 327 u_isUAlphabetic(c), 328 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND), 329 u_isalnum(c), 330 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY, 331 u_charType(c), 332 U_SHORT_PROPERTY_NAME), 333 u_getPropertyValueName(UCHAR_WORD_BREAK, 334 u_getIntPropertyValue(c, 335 UCHAR_WORD_BREAK), 336 U_SHORT_PROPERTY_NAME), 337 u_getPropertyValueName(UCHAR_SENTENCE_BREAK, 338 u_getIntPropertyValue(c, 339 UCHAR_SENTENCE_BREAK), 340 U_SHORT_PROPERTY_NAME), 341 u_getPropertyValueName(UCHAR_LINE_BREAK, 342 u_getIntPropertyValue(c, 343 UCHAR_LINE_BREAK), 344 U_SHORT_PROPERTY_NAME), 345 name); 346 } 347 } 348 349 printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)350 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) { 351 UErrorCode status = U_ZERO_ERROR; 352 UText *tstr = NULL; 353 tstr = utext_openConstUnicodeString(NULL, &ustr, &status); 354 if (U_FAILURE(status)) { 355 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status)); 356 return; 357 } 358 printStringBreaks(tstr, expected, expectedCount); 359 utext_close(tstr); 360 } 361 362 TestBug3818()363 void RBBITest::TestBug3818() { 364 UErrorCode status = U_ZERO_ERROR; 365 366 // Four Thai words... 367 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 368 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 }; 369 UnicodeString thaiStr(thaiWordData); 370 371 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status); 372 if (U_FAILURE(status) || bi == NULL) { 373 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status)); 374 return; 375 } 376 bi->setText(thaiStr); 377 378 int32_t startOfSecondWord = bi->following(1); 379 if (startOfSecondWord != 4) { 380 errln("Fail at file %s, line %d expected start of word at 4, got %d", 381 __FILE__, __LINE__, startOfSecondWord); 382 } 383 startOfSecondWord = bi->following(0); 384 if (startOfSecondWord != 4) { 385 errln("Fail at file %s, line %d expected start of word at 4, got %d", 386 __FILE__, __LINE__, startOfSecondWord); 387 } 388 delete bi; 389 } 390 391 //---------------------------------------------------------------------------- 392 // 393 // generalIteratorTest Given a break iterator and a set of test data, 394 // Run the tests and report the results. 395 // 396 //---------------------------------------------------------------------------- generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)397 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td) 398 { 399 400 bi.setText(td.fDataToBreak); 401 402 testFirstAndNext(bi, td); 403 404 testLastAndPrevious(bi, td); 405 406 testFollowing(bi, td); 407 testPreceding(bi, td); 408 testIsBoundary(bi, td); 409 doMultipleSelectionTest(bi, td); 410 } 411 412 413 // 414 // testFirstAndNext. Run the iterator forwards in the obvious first(), next() 415 // kind of loop. 416 // testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)417 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td) 418 { 419 UErrorCode status = U_ZERO_ERROR; 420 int32_t p; 421 int32_t lastP = -1; 422 int32_t tag; 423 424 logln("Test first and next"); 425 bi.setText(td.fDataToBreak); 426 td.clearResults(); 427 428 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) { 429 td.fActualBreakPositions.addElement(p, status); // Save result. 430 tag = bi.getRuleStatus(); 431 td.fActualTags.addElement(tag, status); 432 if (p <= lastP) { 433 // If the iterator is not making forward progress, stop. 434 // No need to raise an error here, it'll be detected in the normal check of results. 435 break; 436 } 437 lastP = p; 438 } 439 td.checkResults("testFirstAndNext", this); 440 } 441 442 443 // 444 // TestLastAndPrevious. Run the iterator backwards, starting with last(). 445 // testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)446 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td) 447 { 448 UErrorCode status = U_ZERO_ERROR; 449 int32_t p; 450 int32_t lastP = 0x7ffffffe; 451 int32_t tag; 452 453 logln("Test last and previous"); 454 bi.setText(td.fDataToBreak); 455 td.clearResults(); 456 457 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) { 458 // Save break position. Insert it at start of vector of results, shoving 459 // already-saved results further towards the end. 460 td.fActualBreakPositions.insertElementAt(p, 0, status); 461 // bi.previous(); // TODO: Why does this fix things up???? 462 // bi.next(); 463 tag = bi.getRuleStatus(); 464 td.fActualTags.insertElementAt(tag, 0, status); 465 if (p >= lastP) { 466 // If the iterator is not making progress, stop. 467 // No need to raise an error here, it'll be detected in the normal check of results. 468 break; 469 } 470 lastP = p; 471 } 472 td.checkResults("testLastAndPrevious", this); 473 } 474 475 testFollowing(RuleBasedBreakIterator & bi,BITestData & td)476 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td) 477 { 478 UErrorCode status = U_ZERO_ERROR; 479 int32_t p; 480 int32_t tag; 481 int32_t lastP = -2; // A value that will never be returned as a break position. 482 // cannot be -1; that is returned for DONE. 483 int i; 484 485 logln("testFollowing():"); 486 bi.setText(td.fDataToBreak); 487 td.clearResults(); 488 489 // Save the starting point, since we won't get that out of following. 490 p = bi.first(); 491 td.fActualBreakPositions.addElement(p, status); // Save result. 492 tag = bi.getRuleStatus(); 493 td.fActualTags.addElement(tag, status); 494 495 for (i = 0; i <= td.fDataToBreak.length()+1; i++) { 496 p = bi.following(i); 497 if (p != lastP) { 498 if (p == RuleBasedBreakIterator::DONE) { 499 break; 500 } 501 // We've reached a new break position. Save it. 502 td.fActualBreakPositions.addElement(p, status); // Save result. 503 tag = bi.getRuleStatus(); 504 td.fActualTags.addElement(tag, status); 505 lastP = p; 506 } 507 } 508 // The loop normally exits by means of the break in the middle. 509 // Make sure that the index was at the correct position for the break iterator to have 510 // returned DONE. 511 if (i != td.fDataToBreak.length()) { 512 errln("testFollowing(): iterator returned DONE prematurely."); 513 } 514 515 // Full check of all results. 516 td.checkResults("testFollowing", this); 517 } 518 519 520 testPreceding(RuleBasedBreakIterator & bi,BITestData & td)521 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) { 522 UErrorCode status = U_ZERO_ERROR; 523 int32_t p; 524 int32_t tag; 525 int32_t lastP = 0x7ffffffe; 526 int i; 527 528 logln("testPreceding():"); 529 bi.setText(td.fDataToBreak); 530 td.clearResults(); 531 532 p = bi.last(); 533 td.fActualBreakPositions.addElement(p, status); 534 tag = bi.getRuleStatus(); 535 td.fActualTags.addElement(tag, status); 536 537 for (i = td.fDataToBreak.length(); i>=-1; i--) { 538 p = bi.preceding(i); 539 if (p != lastP) { 540 if (p == RuleBasedBreakIterator::DONE) { 541 break; 542 } 543 // We've reached a new break position. Save it. 544 td.fActualBreakPositions.insertElementAt(p, 0, status); 545 lastP = p; 546 tag = bi.getRuleStatus(); 547 td.fActualTags.insertElementAt(tag, 0, status); 548 } 549 } 550 // The loop normally exits by means of the break in the middle. 551 // Make sure that the index was at the correct position for the break iterator to have 552 // returned DONE. 553 if (i != 0) { 554 errln("testPreceding(): iterator returned DONE prematurely."); 555 } 556 557 // Full check of all results. 558 td.checkResults("testPreceding", this); 559 } 560 561 562 testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)563 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) { 564 UErrorCode status = U_ZERO_ERROR; 565 int i; 566 int32_t tag; 567 568 logln("testIsBoundary():"); 569 bi.setText(td.fDataToBreak); 570 td.clearResults(); 571 572 for (i = 0; i <= td.fDataToBreak.length(); i++) { 573 if (bi.isBoundary(i)) { 574 td.fActualBreakPositions.addElement(i, status); // Save result. 575 tag = bi.getRuleStatus(); 576 td.fActualTags.addElement(tag, status); 577 } 578 } 579 td.checkResults("testIsBoundary: ", this); 580 } 581 582 583 doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)584 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td) 585 { 586 iterator.setText(td.fDataToBreak); 587 588 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone(); 589 int32_t offset = iterator.first(); 590 int32_t testOffset; 591 int32_t count = 0; 592 593 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length()); 594 595 if (*testIterator != iterator) 596 errln("clone() or operator!= failed: two clones compared unequal"); 597 598 do { 599 testOffset = testIterator->first(); 600 testOffset = testIterator->next(count); 601 if (offset != testOffset) 602 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 603 604 if (offset != RuleBasedBreakIterator::DONE) { 605 count++; 606 offset = iterator.next(); 607 608 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) { 609 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset); 610 if (count > 10000 || offset == -1) { 611 errln("operator== failed too many times. Stopping test."); 612 if (offset == -1) { 613 errln("Does (RuleBasedBreakIterator::DONE == -1)?"); 614 } 615 return; 616 } 617 } 618 } 619 } while (offset != RuleBasedBreakIterator::DONE); 620 621 // now do it backwards... 622 offset = iterator.last(); 623 count = 0; 624 625 do { 626 testOffset = testIterator->last(); 627 testOffset = testIterator->next(count); // next() with a negative arg is same as previous 628 if (offset != testOffset) 629 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset); 630 631 if (offset != RuleBasedBreakIterator::DONE) { 632 count--; 633 offset = iterator.previous(); 634 } 635 } while (offset != RuleBasedBreakIterator::DONE); 636 637 delete testIterator; 638 } 639 640 641 //--------------------------------------------- 642 // 643 // other tests 644 // 645 //--------------------------------------------- TestEmptyString()646 void RBBITest::TestEmptyString() 647 { 648 UnicodeString text = ""; 649 UErrorCode status = U_ZERO_ERROR; 650 651 BITestData x(status); 652 ADD_DATACHUNK(x, "", 0, status); // Break at start of data 653 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status); 654 if (U_FAILURE(status)) 655 { 656 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status)); 657 return; 658 } 659 generalIteratorTest(*bi, x); 660 delete bi; 661 } 662 TestGetAvailableLocales()663 void RBBITest::TestGetAvailableLocales() 664 { 665 int32_t locCount = 0; 666 const Locale* locList = BreakIterator::getAvailableLocales(locCount); 667 668 if (locCount == 0) 669 dataerrln("getAvailableLocales() returned an empty list!"); 670 // Just make sure that it's returning good memory. 671 int32_t i; 672 for (i = 0; i < locCount; ++i) { 673 logln(locList[i].getName()); 674 } 675 } 676 677 //Testing the BreakIterator::getDisplayName() function TestGetDisplayName()678 void RBBITest::TestGetDisplayName() 679 { 680 UnicodeString result; 681 682 BreakIterator::getDisplayName(Locale::getUS(), result); 683 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)") 684 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \"" 685 + result); 686 687 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result); 688 if (result != "French (France)") 689 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \"" 690 + result); 691 } 692 /** 693 * Test End Behaviour 694 * @bug 4068137 695 */ TestEndBehaviour()696 void RBBITest::TestEndBehaviour() 697 { 698 UErrorCode status = U_ZERO_ERROR; 699 UnicodeString testString("boo."); 700 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status); 701 if (U_FAILURE(status)) 702 { 703 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status)); 704 return; 705 } 706 wb->setText(testString); 707 708 if (wb->first() != 0) 709 errln("Didn't get break at beginning of string."); 710 if (wb->next() != 3) 711 errln("Didn't get break before period in \"boo.\""); 712 if (wb->current() != 4 && wb->next() != 4) 713 errln("Didn't get break at end of string."); 714 delete wb; 715 } 716 /* 717 * @bug 4153072 718 */ TestBug4153072()719 void RBBITest::TestBug4153072() { 720 UErrorCode status = U_ZERO_ERROR; 721 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status); 722 if (U_FAILURE(status)) 723 { 724 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status)); 725 return; 726 } 727 UnicodeString str("...Hello, World!..."); 728 int32_t begin = 3; 729 int32_t end = str.length() - 3; 730 UBool onBoundary; 731 732 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin); 733 iter->adoptText(textIterator); 734 int index; 735 // Note: with the switch to UText, there is no way to restrict the 736 // iteration range to begin at an index other than zero. 737 // String character iterators created with a non-zero bound are 738 // treated by RBBI as being empty. 739 for (index = -1; index < begin + 1; ++index) { 740 onBoundary = iter->isBoundary(index); 741 if (index == 0? !onBoundary : onBoundary) { 742 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index + 743 " and begin index = " + begin); 744 } 745 } 746 delete iter; 747 } 748 749 750 // 751 // Test for problem reported by Ashok Matoria on 9 July 2007 752 // One.<kSoftHyphen><kSpace>Two. 753 // 754 // Sentence break at start (0) and then on calling next() it breaks at 755 // 'T' of "Two". Now, at this point if I do next() and 756 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two". 757 // TestBug5775()758 void RBBITest::TestBug5775() { 759 UErrorCode status = U_ZERO_ERROR; 760 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 761 TEST_ASSERT_SUCCESS(status); 762 if (U_FAILURE(status)) { 763 return; 764 } 765 // Check for status first for better handling of no data errors. 766 TEST_ASSERT(bi != NULL); 767 if (bi == NULL) { 768 return; 769 } 770 771 UnicodeString s("One.\\u00ad Two.", -1, US_INV); 772 // 01234 56789 773 s = s.unescape(); 774 bi->setText(s); 775 int pos = bi->next(); 776 TEST_ASSERT(pos == 6); 777 pos = bi->next(); 778 TEST_ASSERT(pos == 10); 779 pos = bi->previous(); 780 TEST_ASSERT(pos == 6); 781 delete bi; 782 } 783 784 785 786 //------------------------------------------------------------------------------ 787 // 788 // RBBITest::Extended Run RBBI Tests from an external test data file 789 // 790 //------------------------------------------------------------------------------ 791 792 struct TestParams { 793 BreakIterator *bi; // Break iterator is set while parsing test source. 794 // Changed out whenever test data changes break type. 795 796 UnicodeString dataToBreak; // Data that is built up while parsing the test. 797 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString. 798 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak. 799 UVector32 *srcCol; 800 801 UText *textToBreak; // UText, could be UTF8 or UTF16. 802 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets. 803 CharString utf8String; // UTF-8 form of text to break. 804 TestParamsTestParams805 TestParams(UErrorCode &status) : dataToBreak() { 806 bi = NULL; 807 expectedBreaks = new UVector32(status); 808 srcLine = new UVector32(status); 809 srcCol = new UVector32(status); 810 textToBreak = NULL; 811 textMap = new UVector32(status); 812 } 813 ~TestParamsTestParams814 ~TestParams() { 815 delete bi; 816 delete expectedBreaks; 817 delete srcLine; 818 delete srcCol; 819 utext_close(textToBreak); 820 delete textMap; 821 } 822 823 int32_t getSrcLine(int32_t bp); 824 int32_t getExpectedBreak(int32_t bp); 825 int32_t getSrcCol(int32_t bp); 826 827 void setUTF16(UErrorCode &status); 828 void setUTF8(UErrorCode &status); 829 }; 830 831 // Append a UnicodeString to a CharString with UTF-8 encoding. 832 // Substitute any invalid chars. 833 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted. CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)834 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) { 835 if (U_FAILURE(status)) { 836 return; 837 } 838 int32_t utf8Length; 839 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight. 840 src.getBuffer(), src.length(), // UTF-16 data 841 0xfffd, NULL, // Substitution char, number of subs. 842 &status); 843 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 844 return; 845 } 846 status = U_ZERO_ERROR; 847 int32_t capacity; 848 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status); 849 u_strToUTF8WithSub(buffer, utf8Length, NULL, 850 src.getBuffer(), src.length(), 851 0xfffd, NULL, &status); 852 dest.append(buffer, utf8Length, status); 853 } 854 855 setUTF16(UErrorCode & status)856 void TestParams::setUTF16(UErrorCode &status) { 857 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status); 858 textMap->removeAllElements(); 859 for (int32_t i=0; i<dataToBreak.length(); i++) { 860 if (i == dataToBreak.getChar32Start(i)) { 861 textMap->addElement(i, status); 862 } else { 863 textMap->addElement(-1, status); 864 } 865 } 866 textMap->addElement(dataToBreak.length(), status); 867 U_ASSERT(dataToBreak.length() + 1 == textMap->size()); 868 } 869 870 setUTF8(UErrorCode & status)871 void TestParams::setUTF8(UErrorCode &status) { 872 if (U_FAILURE(status)) { 873 return; 874 } 875 utf8String.clear(); 876 CharStringAppend(utf8String, dataToBreak, status); 877 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status); 878 if (U_FAILURE(status)) { 879 return; 880 } 881 882 textMap->removeAllElements(); 883 int32_t utf16Index = 0; 884 for (;;) { 885 textMap->addElement(utf16Index, status); 886 UChar32 c32 = utext_current32(textToBreak); 887 if (c32 < 0) { 888 break; 889 } 890 utf16Index += U16_LENGTH(c32); 891 utext_next32(textToBreak); 892 while (textMap->size() < utext_getNativeIndex(textToBreak)) { 893 textMap->addElement(-1, status); 894 } 895 } 896 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size()); 897 } 898 899 getSrcLine(int32_t bp)900 int32_t TestParams::getSrcLine(int32_t bp) { 901 if (bp >= textMap->size()) { 902 bp = textMap->size() - 1; 903 } 904 int32_t i = 0; 905 for(; bp >= 0 ; --bp) { 906 // Move to a character boundary if we are not on one already. 907 i = textMap->elementAti(bp); 908 if (i >= 0) { 909 break; 910 } 911 } 912 return srcLine->elementAti(i); 913 } 914 915 getExpectedBreak(int32_t bp)916 int32_t TestParams::getExpectedBreak(int32_t bp) { 917 if (bp >= textMap->size()) { 918 return 0; 919 } 920 int32_t i = textMap->elementAti(bp); 921 int32_t retVal = 0; 922 if (i >= 0) { 923 retVal = expectedBreaks->elementAti(i); 924 } 925 return retVal; 926 } 927 928 getSrcCol(int32_t bp)929 int32_t TestParams::getSrcCol(int32_t bp) { 930 if (bp >= textMap->size()) { 931 bp = textMap->size() - 1; 932 } 933 int32_t i = 0; 934 for(; bp >= 0; --bp) { 935 // Move bp to a character boundary if we are not on one already. 936 i = textMap->elementAti(bp); 937 if (i >= 0) { 938 break; 939 } 940 } 941 return srcCol->elementAti(i); 942 } 943 944 executeTest(TestParams * t,UErrorCode & status)945 void RBBITest::executeTest(TestParams *t, UErrorCode &status) { 946 int32_t bp; 947 int32_t prevBP; 948 int32_t i; 949 950 TEST_ASSERT_SUCCESS(status); 951 if (U_FAILURE(status)) { 952 return; 953 } 954 955 if (t->bi == NULL) { 956 return; 957 } 958 959 t->bi->setText(t->textToBreak, status); 960 // 961 // Run the iterator forward 962 // 963 prevBP = -1; 964 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) { 965 if (prevBP == bp) { 966 // Fail for lack of forward progress. 967 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d", 968 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 969 break; 970 } 971 972 // Check that there we didn't miss an expected break between the last one 973 // and this one. 974 for (i=prevBP+1; i<bp; i++) { 975 if (t->getExpectedBreak(i) != 0) { 976 int expected[] = {0, i}; 977 printStringBreaks(t->dataToBreak, expected, 2); 978 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 979 i, t->getSrcLine(i), t->getSrcCol(i)); 980 } 981 } 982 983 // Check that the break we did find was expected 984 if (t->getExpectedBreak(bp) == 0) { 985 int expected[] = {0, bp}; 986 printStringBreaks(t->textToBreak, expected, 2); 987 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 988 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 989 } else { 990 // The break was expected. 991 // Check that the {nnn} tag value is correct. 992 int32_t expectedTagVal = t->getExpectedBreak(bp); 993 if (expectedTagVal == -1) { 994 expectedTagVal = 0; 995 } 996 int32_t line = t->getSrcLine(bp); 997 int32_t rs = t->bi->getRuleStatus(); 998 if (rs != expectedTagVal) { 999 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n" 1000 " Actual, Expected status = %4d, %4d", 1001 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1002 } 1003 } 1004 1005 prevBP = bp; 1006 } 1007 1008 // Verify that there were no missed expected breaks after the last one found 1009 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) { 1010 if (t->getExpectedBreak(i) != 0) { 1011 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1012 i, t->getSrcLine(i), t->getSrcCol(i)); 1013 } 1014 } 1015 1016 // 1017 // Run the iterator backwards, verify that the same breaks are found. 1018 // 1019 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen. 1020 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) { 1021 if (prevBP == bp) { 1022 // Fail for lack of progress. 1023 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d", 1024 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1025 break; 1026 } 1027 1028 // Check that we didn't miss an expected break between the last one 1029 // and this one. (UVector returns zeros for index out of bounds.) 1030 for (i=prevBP-1; i>bp; i--) { 1031 if (t->getExpectedBreak(i) != 0) { 1032 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1033 i, t->getSrcLine(i), t->getSrcCol(i)); 1034 } 1035 } 1036 1037 // Check that the break we did find was expected 1038 if (t->getExpectedBreak(bp) == 0) { 1039 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d", 1040 bp, t->getSrcLine(bp), t->getSrcCol(bp)); 1041 } else { 1042 // The break was expected. 1043 // Check that the {nnn} tag value is correct. 1044 int32_t expectedTagVal = t->getExpectedBreak(bp); 1045 if (expectedTagVal == -1) { 1046 expectedTagVal = 0; 1047 } 1048 int line = t->getSrcLine(bp); 1049 int32_t rs = t->bi->getRuleStatus(); 1050 if (rs != expectedTagVal) { 1051 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n" 1052 " Actual, Expected status = %4d, %4d", 1053 bp, line, t->getSrcCol(bp), rs, expectedTagVal); 1054 } 1055 } 1056 1057 prevBP = bp; 1058 } 1059 1060 // Verify that there were no missed breaks prior to the last one found 1061 for (i=prevBP-1; i>=0; i--) { 1062 if (t->getExpectedBreak(i) != 0) { 1063 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d", 1064 i, t->getSrcLine(i), t->getSrcCol(i)); 1065 } 1066 } 1067 1068 // Check isBoundary() 1069 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1070 UBool boundaryExpected = (t->getExpectedBreak(i) != 0); 1071 UBool boundaryFound = t->bi->isBoundary(i); 1072 if (boundaryExpected != boundaryFound) { 1073 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n" 1074 " Expected, Actual= %s, %s", 1075 i, t->getSrcLine(i), t->getSrcCol(i), 1076 boundaryExpected ? "true":"false", boundaryFound? "true" : "false"); 1077 } 1078 } 1079 1080 // Check following() 1081 for (i=0; i < utext_nativeLength(t->textToBreak); i++) { 1082 int32_t actualBreak = t->bi->following(i); 1083 int32_t expectedBreak = BreakIterator::DONE; 1084 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) { 1085 if (t->getExpectedBreak(j) != 0) { 1086 expectedBreak = j; 1087 break; 1088 } 1089 } 1090 if (expectedBreak != actualBreak) { 1091 errln("following(%d) incorrect. File line,col= %4d,%4d\n" 1092 " Expected, Actual= %d, %d", 1093 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1094 } 1095 } 1096 1097 // Check preceding() 1098 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) { 1099 int32_t actualBreak = t->bi->preceding(i); 1100 int32_t expectedBreak = BreakIterator::DONE; 1101 1102 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent. 1103 // preceding(trailing byte) will return the index of some preceding code point, 1104 // not the lead byte of the current code point, even though that has a smaller index. 1105 // Therefore, start looking at the expected break data not at i-1, but at 1106 // the start of code point index - 1. 1107 utext_setNativeIndex(t->textToBreak, i); 1108 int32_t j = utext_getNativeIndex(t->textToBreak) - 1; 1109 for (; j >= 0; j--) { 1110 if (t->getExpectedBreak(j) != 0) { 1111 expectedBreak = j; 1112 break; 1113 } 1114 } 1115 if (expectedBreak != actualBreak) { 1116 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n" 1117 " Expected, Actual= %d, %d", 1118 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak); 1119 } 1120 } 1121 } 1122 1123 TestExtended()1124 void RBBITest::TestExtended() { 1125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1126 UErrorCode status = U_ZERO_ERROR; 1127 Locale locale(""); 1128 1129 UnicodeString rules; 1130 TestParams tp(status); 1131 1132 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status); 1133 if (U_FAILURE(status)) { 1134 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status)); 1135 } 1136 1137 1138 // 1139 // Open and read the test data file. 1140 // 1141 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1142 char testFileName[1000]; 1143 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1144 errln("Can't open test data. Path too long."); 1145 return; 1146 } 1147 strcpy(testFileName, testDataDirectory); 1148 strcat(testFileName, "rbbitst.txt"); 1149 1150 int len; 1151 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1152 if (U_FAILURE(status)) { 1153 return; /* something went wrong, error already output */ 1154 } 1155 1156 1157 bool skipTest = false; // Skip this test? 1158 1159 // 1160 // Put the test data into a UnicodeString 1161 // 1162 UnicodeString testString(FALSE, testFile, len); 1163 1164 enum EParseState{ 1165 PARSE_COMMENT, 1166 PARSE_TAG, 1167 PARSE_DATA, 1168 PARSE_NUM 1169 } 1170 parseState = PARSE_TAG; 1171 1172 EParseState savedState = PARSE_TAG; 1173 1174 static const UChar CH_LF = 0x0a; 1175 static const UChar CH_CR = 0x0d; 1176 static const UChar CH_HASH = 0x23; 1177 /*static const UChar CH_PERIOD = 0x2e;*/ 1178 static const UChar CH_LT = 0x3c; 1179 static const UChar CH_GT = 0x3e; 1180 static const UChar CH_BACKSLASH = 0x5c; 1181 static const UChar CH_BULLET = 0x2022; 1182 1183 int32_t lineNum = 1; 1184 int32_t colStart = 0; 1185 int32_t column = 0; 1186 int32_t charIdx = 0; 1187 1188 int32_t tagValue = 0; // The numeric value of a <nnn> tag. 1189 1190 for (charIdx = 0; charIdx < len; ) { 1191 status = U_ZERO_ERROR; 1192 UChar c = testString.charAt(charIdx); 1193 charIdx++; 1194 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) { 1195 // treat CRLF as a unit 1196 c = CH_LF; 1197 charIdx++; 1198 } 1199 if (c == CH_LF || c == CH_CR) { 1200 lineNum++; 1201 colStart = charIdx; 1202 } 1203 column = charIdx - colStart + 1; 1204 1205 switch (parseState) { 1206 case PARSE_COMMENT: 1207 if (c == 0x0a || c == 0x0d) { 1208 parseState = savedState; 1209 } 1210 break; 1211 1212 case PARSE_TAG: 1213 { 1214 if (c == CH_HASH) { 1215 parseState = PARSE_COMMENT; 1216 savedState = PARSE_TAG; 1217 break; 1218 } 1219 if (u_isUWhiteSpace(c)) { 1220 break; 1221 } 1222 if (testString.compare(charIdx-1, 6, "<word>") == 0) { 1223 delete tp.bi; 1224 tp.bi = BreakIterator::createWordInstance(locale, status); 1225 skipTest = false; 1226 charIdx += 5; 1227 break; 1228 } 1229 if (testString.compare(charIdx-1, 6, "<char>") == 0) { 1230 delete tp.bi; 1231 tp.bi = BreakIterator::createCharacterInstance(locale, status); 1232 skipTest = false; 1233 charIdx += 5; 1234 break; 1235 } 1236 if (testString.compare(charIdx-1, 6, "<line>") == 0) { 1237 delete tp.bi; 1238 tp.bi = BreakIterator::createLineInstance(locale, status); 1239 skipTest = false; 1240 charIdx += 5; 1241 break; 1242 } 1243 if (testString.compare(charIdx-1, 6, "<sent>") == 0) { 1244 delete tp.bi; 1245 tp.bi = BreakIterator::createSentenceInstance(locale, status); 1246 skipTest = false; 1247 charIdx += 5; 1248 break; 1249 } 1250 if (testString.compare(charIdx-1, 7, "<title>") == 0) { 1251 delete tp.bi; 1252 tp.bi = BreakIterator::createTitleInstance(locale, status); 1253 charIdx += 6; 1254 break; 1255 } 1256 1257 // <locale loc_name> 1258 localeMatcher.reset(testString); 1259 if (localeMatcher.lookingAt(charIdx-1, status)) { 1260 UnicodeString localeName = localeMatcher.group(1, status); 1261 char localeName8[100]; 1262 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0); 1263 locale = Locale::createFromName(localeName8); 1264 charIdx += localeMatcher.group(0, status).length() - 1; 1265 TEST_ASSERT_SUCCESS(status); 1266 break; 1267 } 1268 if (testString.compare(charIdx-1, 6, "<data>") == 0) { 1269 parseState = PARSE_DATA; 1270 charIdx += 5; 1271 tp.dataToBreak = ""; 1272 tp.expectedBreaks->removeAllElements(); 1273 tp.srcCol ->removeAllElements(); 1274 tp.srcLine->removeAllElements(); 1275 break; 1276 } 1277 1278 errln("line %d: Tag expected in test file.", lineNum); 1279 parseState = PARSE_COMMENT; 1280 savedState = PARSE_DATA; 1281 goto end_test; // Stop the test. 1282 } 1283 break; 1284 1285 case PARSE_DATA: 1286 if (c == CH_BULLET) { 1287 int32_t breakIdx = tp.dataToBreak.length(); 1288 tp.expectedBreaks->setSize(breakIdx+1); 1289 tp.expectedBreaks->setElementAt(-1, breakIdx); 1290 tp.srcLine->setSize(breakIdx+1); 1291 tp.srcLine->setElementAt(lineNum, breakIdx); 1292 tp.srcCol ->setSize(breakIdx+1); 1293 tp.srcCol ->setElementAt(column, breakIdx); 1294 break; 1295 } 1296 1297 if (testString.compare(charIdx-1, 7, "</data>") == 0) { 1298 // Add final entry to mappings from break location to source file position. 1299 // Need one extra because last break position returned is after the 1300 // last char in the data, not at the last char. 1301 tp.srcLine->addElement(lineNum, status); 1302 tp.srcCol ->addElement(column, status); 1303 1304 parseState = PARSE_TAG; 1305 charIdx += 6; 1306 1307 if (!skipTest) { 1308 // RUN THE TEST! 1309 status = U_ZERO_ERROR; 1310 tp.setUTF16(status); 1311 executeTest(&tp, status); 1312 TEST_ASSERT_SUCCESS(status); 1313 1314 // Run again, this time with UTF-8 text wrapped in a UText. 1315 status = U_ZERO_ERROR; 1316 tp.setUTF8(status); 1317 TEST_ASSERT_SUCCESS(status); 1318 executeTest(&tp, status); 1319 } 1320 break; 1321 } 1322 1323 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) { 1324 // Named character, e.g. \N{COMBINING GRAVE ACCENT} 1325 // Get the code point from the name and insert it into the test data. 1326 // (Damn, no API takes names in Unicode !!! 1327 // we've got to take it back to char *) 1328 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx); 1329 int32_t nameLength = nameEndIdx - (charIdx+2); 1330 char charNameBuf[200]; 1331 UChar32 theChar = -1; 1332 if (nameEndIdx != -1) { 1333 UErrorCode status = U_ZERO_ERROR; 1334 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf)); 1335 charNameBuf[sizeof(charNameBuf)-1] = 0; 1336 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status); 1337 if (U_FAILURE(status)) { 1338 theChar = -1; 1339 } 1340 } 1341 if (theChar == -1) { 1342 errln("Error in named character in test file at line %d, col %d", 1343 lineNum, column); 1344 } else { 1345 // Named code point was recognized. Insert it 1346 // into the test data. 1347 tp.dataToBreak.append(theChar); 1348 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1349 tp.srcLine->addElement(lineNum, status); 1350 tp.srcCol ->addElement(column, status); 1351 } 1352 } 1353 if (nameEndIdx > charIdx) { 1354 charIdx = nameEndIdx+1; 1355 1356 } 1357 break; 1358 } 1359 1360 1361 1362 1363 if (testString.compare(charIdx-1, 2, "<>") == 0) { 1364 charIdx++; 1365 int32_t breakIdx = tp.dataToBreak.length(); 1366 tp.expectedBreaks->setSize(breakIdx+1); 1367 tp.expectedBreaks->setElementAt(-1, breakIdx); 1368 tp.srcLine->setSize(breakIdx+1); 1369 tp.srcLine->setElementAt(lineNum, breakIdx); 1370 tp.srcCol ->setSize(breakIdx+1); 1371 tp.srcCol ->setElementAt(column, breakIdx); 1372 break; 1373 } 1374 1375 if (c == CH_LT) { 1376 tagValue = 0; 1377 parseState = PARSE_NUM; 1378 break; 1379 } 1380 1381 if (c == CH_HASH && column==3) { // TODO: why is column off so far? 1382 parseState = PARSE_COMMENT; 1383 savedState = PARSE_DATA; 1384 break; 1385 } 1386 1387 if (c == CH_BACKSLASH) { 1388 // Check for \ at end of line, a line continuation. 1389 // Advance over (discard) the newline 1390 UChar32 cp = testString.char32At(charIdx); 1391 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) { 1392 // We have a CR LF 1393 // Need an extra increment of the input ptr to move over both of them 1394 charIdx++; 1395 } 1396 if (cp == CH_LF || cp == CH_CR) { 1397 lineNum++; 1398 colStart = charIdx; 1399 charIdx++; 1400 break; 1401 } 1402 1403 // Let unescape handle the back slash. 1404 cp = testString.unescapeAt(charIdx); 1405 if (cp != -1) { 1406 // Escape sequence was recognized. Insert the char 1407 // into the test data. 1408 tp.dataToBreak.append(cp); 1409 while (tp.dataToBreak.length() > tp.srcLine->size()) { 1410 tp.srcLine->addElement(lineNum, status); 1411 tp.srcCol ->addElement(column, status); 1412 } 1413 break; 1414 } 1415 1416 1417 // Not a recognized backslash escape sequence. 1418 // Take the next char as a literal. 1419 // TODO: Should this be an error? 1420 c = testString.charAt(charIdx); 1421 charIdx = testString.moveIndex32(charIdx, 1); 1422 } 1423 1424 // Normal, non-escaped data char. 1425 tp.dataToBreak.append(c); 1426 1427 // Save the mapping from offset in the data to line/column numbers in 1428 // the original input file. Will be used for better error messages only. 1429 // If there's an expected break before this char, the slot in the mapping 1430 // vector will already be set for this char; don't overwrite it. 1431 if (tp.dataToBreak.length() > tp.srcLine->size()) { 1432 tp.srcLine->addElement(lineNum, status); 1433 tp.srcCol ->addElement(column, status); 1434 } 1435 break; 1436 1437 1438 case PARSE_NUM: 1439 // We are parsing an expected numeric tag value, like <1234>, 1440 // within a chunk of data. 1441 if (u_isUWhiteSpace(c)) { 1442 break; 1443 } 1444 1445 if (c == CH_GT) { 1446 // Finished the number. Add the info to the expected break data, 1447 // and switch parse state back to doing plain data. 1448 parseState = PARSE_DATA; 1449 if (tagValue == 0) { 1450 tagValue = -1; 1451 } 1452 int32_t breakIdx = tp.dataToBreak.length(); 1453 tp.expectedBreaks->setSize(breakIdx+1); 1454 tp.expectedBreaks->setElementAt(tagValue, breakIdx); 1455 tp.srcLine->setSize(breakIdx+1); 1456 tp.srcLine->setElementAt(lineNum, breakIdx); 1457 tp.srcCol ->setSize(breakIdx+1); 1458 tp.srcCol ->setElementAt(column, breakIdx); 1459 break; 1460 } 1461 1462 if (u_isdigit(c)) { 1463 tagValue = tagValue*10 + u_charDigitValue(c); 1464 break; 1465 } 1466 1467 errln("Syntax Error in test file at line %d, col %d", 1468 lineNum, column); 1469 parseState = PARSE_COMMENT; 1470 goto end_test; // Stop the test 1471 break; 1472 } 1473 1474 1475 if (U_FAILURE(status)) { 1476 dataerrln("ICU Error %s while parsing test file at line %d.", 1477 u_errorName(status), lineNum); 1478 status = U_ZERO_ERROR; 1479 goto end_test; // Stop the test 1480 } 1481 1482 } 1483 1484 end_test: 1485 delete [] testFile; 1486 #endif 1487 } 1488 1489 1490 //------------------------------------------------------------------------------- 1491 // 1492 // TestDictRules create a break iterator from source rules that includes a 1493 // dictionary range. Regression for bug #7130. Source rules 1494 // do not declare a break iterator type (word, line, sentence, etc. 1495 // but the dictionary code, without a type, would loop. 1496 // 1497 //------------------------------------------------------------------------------- TestDictRules()1498 void RBBITest::TestDictRules() { 1499 const char *rules = "$dictionary = [a-z]; \n" 1500 "!!forward; \n" 1501 "$dictionary $dictionary; \n" 1502 "!!reverse; \n" 1503 "$dictionary $dictionary; \n"; 1504 const char *text = "aa"; 1505 UErrorCode status = U_ZERO_ERROR; 1506 UParseError parseError; 1507 1508 RuleBasedBreakIterator bi(rules, parseError, status); 1509 if (U_SUCCESS(status)) { 1510 UnicodeString utext = text; 1511 bi.setText(utext); 1512 int32_t position; 1513 int32_t loops; 1514 for (loops = 0; loops<10; loops++) { 1515 position = bi.next(); 1516 if (position == RuleBasedBreakIterator::DONE) { 1517 break; 1518 } 1519 } 1520 TEST_ASSERT(loops == 1); 1521 } else { 1522 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status)); 1523 } 1524 } 1525 1526 1527 1528 //------------------------------------------------------------------------------- 1529 // 1530 // ReadAndConvertFile Read a text data file, convert it to UChars, and 1531 // return the data in one big UChar * buffer, which the caller must delete. 1532 // 1533 // parameters: 1534 // fileName: the name of the file, with no directory part. The test data directory 1535 // is assumed. 1536 // ulen an out parameter, receives the actual length (in UChars) of the file data. 1537 // encoding The file encoding. If the file contains a BOM, that will override the encoding 1538 // specified here. The BOM, if it exists, will be stripped from the returned data. 1539 // Pass NULL for the system default encoding. 1540 // status 1541 // returns: 1542 // The file data, converted to UChar. 1543 // The caller must delete this when done with 1544 // delete [] theBuffer; 1545 // 1546 // TODO: This is a clone of RegexTest::ReadAndConvertFile. 1547 // Move this function to some common place. 1548 // 1549 //-------------------------------------------------------------------------------- ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1550 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) { 1551 UChar *retPtr = NULL; 1552 char *fileBuf = NULL; 1553 UConverter* conv = NULL; 1554 FILE *f = NULL; 1555 1556 ulen = 0; 1557 if (U_FAILURE(status)) { 1558 return retPtr; 1559 } 1560 1561 // 1562 // Open the file. 1563 // 1564 f = fopen(fileName, "rb"); 1565 if (f == 0) { 1566 dataerrln("Error opening test data file %s\n", fileName); 1567 status = U_FILE_ACCESS_ERROR; 1568 return NULL; 1569 } 1570 // 1571 // Read it in 1572 // 1573 int fileSize; 1574 int amt_read; 1575 1576 fseek( f, 0, SEEK_END); 1577 fileSize = ftell(f); 1578 fileBuf = new char[fileSize]; 1579 fseek(f, 0, SEEK_SET); 1580 amt_read = fread(fileBuf, 1, fileSize, f); 1581 if (amt_read != fileSize || fileSize <= 0) { 1582 errln("Error reading test data file."); 1583 goto cleanUpAndReturn; 1584 } 1585 1586 // 1587 // Look for a Unicode Signature (BOM) on the data just read 1588 // 1589 int32_t signatureLength; 1590 const char * fileBufC; 1591 const char* bomEncoding; 1592 1593 fileBufC = fileBuf; 1594 bomEncoding = ucnv_detectUnicodeSignature( 1595 fileBuf, fileSize, &signatureLength, &status); 1596 if(bomEncoding!=NULL ){ 1597 fileBufC += signatureLength; 1598 fileSize -= signatureLength; 1599 encoding = bomEncoding; 1600 } 1601 1602 // 1603 // Open a converter to take the rule file to UTF-16 1604 // 1605 conv = ucnv_open(encoding, &status); 1606 if (U_FAILURE(status)) { 1607 goto cleanUpAndReturn; 1608 } 1609 1610 // 1611 // Convert the rules to UChar. 1612 // Preflight first to determine required buffer size. 1613 // 1614 ulen = ucnv_toUChars(conv, 1615 NULL, // dest, 1616 0, // destCapacity, 1617 fileBufC, 1618 fileSize, 1619 &status); 1620 if (status == U_BUFFER_OVERFLOW_ERROR) { 1621 // Buffer Overflow is expected from the preflight operation. 1622 status = U_ZERO_ERROR; 1623 1624 retPtr = new UChar[ulen+1]; 1625 ucnv_toUChars(conv, 1626 retPtr, // dest, 1627 ulen+1, 1628 fileBufC, 1629 fileSize, 1630 &status); 1631 } 1632 1633 cleanUpAndReturn: 1634 fclose(f); 1635 delete []fileBuf; 1636 ucnv_close(conv); 1637 if (U_FAILURE(status)) { 1638 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 1639 delete []retPtr; 1640 retPtr = 0; 1641 ulen = 0; 1642 }; 1643 return retPtr; 1644 } 1645 1646 1647 1648 //-------------------------------------------------------------------------------------------- 1649 // 1650 // Run tests from each of the boundary test data files distributed by the Unicode Consortium 1651 // 1652 //------------------------------------------------------------------------------------------- TestUnicodeFiles()1653 void RBBITest::TestUnicodeFiles() { 1654 RuleBasedBreakIterator *bi; 1655 UErrorCode status = U_ZERO_ERROR; 1656 1657 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 1658 TEST_ASSERT_SUCCESS(status); 1659 if (U_SUCCESS(status)) { 1660 runUnicodeTestData("GraphemeBreakTest.txt", bi); 1661 } 1662 delete bi; 1663 1664 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status); 1665 TEST_ASSERT_SUCCESS(status); 1666 if (U_SUCCESS(status)) { 1667 runUnicodeTestData("WordBreakTest.txt", bi); 1668 } 1669 delete bi; 1670 1671 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status); 1672 TEST_ASSERT_SUCCESS(status); 1673 if (U_SUCCESS(status)) { 1674 runUnicodeTestData("SentenceBreakTest.txt", bi); 1675 } 1676 delete bi; 1677 1678 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status); 1679 TEST_ASSERT_SUCCESS(status); 1680 if (U_SUCCESS(status)) { 1681 runUnicodeTestData("LineBreakTest.txt", bi); 1682 } 1683 delete bi; 1684 } 1685 1686 1687 // Check for test cases from the Unicode test data files that are known to fail 1688 // and should be skipped because ICU is not yet able to fully implement the spec. 1689 // See ticket #7270. 1690 testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1691 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { 1692 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file. 1693 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198 1694 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202 1695 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214 1696 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246 1697 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298 1698 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302 1699 }; 1700 if (strcmp(fileName, "LineBreakTest.txt") != 0) { 1701 return FALSE; 1702 } 1703 1704 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) { 1705 if (testCase == UnicodeString(badTestCases[i])) { 1706 return logKnownIssue("7270"); 1707 } 1708 } 1709 return FALSE; 1710 } 1711 1712 1713 //-------------------------------------------------------------------------------------------- 1714 // 1715 // Run tests from one of the boundary test data files distributed by the Unicode Consortium 1716 // 1717 //------------------------------------------------------------------------------------------- runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1718 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) { 1719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1720 UErrorCode status = U_ZERO_ERROR; 1721 1722 // 1723 // Open and read the test data file, put it into a UnicodeString. 1724 // 1725 const char *testDataDirectory = IntlTest::getSourceTestData(status); 1726 char testFileName[1000]; 1727 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) { 1728 dataerrln("Can't open test data. Path too long."); 1729 return; 1730 } 1731 strcpy(testFileName, testDataDirectory); 1732 strcat(testFileName, fileName); 1733 1734 logln("Opening data file %s\n", fileName); 1735 1736 int len; 1737 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status); 1738 if (status != U_FILE_ACCESS_ERROR) { 1739 TEST_ASSERT_SUCCESS(status); 1740 TEST_ASSERT(testFile != NULL); 1741 } 1742 if (U_FAILURE(status) || testFile == NULL) { 1743 return; /* something went wrong, error already output */ 1744 } 1745 UnicodeString testFileAsString(TRUE, testFile, len); 1746 1747 // 1748 // Parse the test data file using a regular expression. 1749 // Each kind of token is recognized in its own capture group; what type of item was scanned 1750 // is identified by which group had a match. 1751 // 1752 // Caputure Group # 1 2 3 4 5 1753 // Parses this item: divide x hex digits comment \n unrecognized \n 1754 // 1755 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV); 1756 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status); 1757 UnicodeString testString; 1758 UVector32 breakPositions(status); 1759 int lineNumber = 1; 1760 TEST_ASSERT_SUCCESS(status); 1761 if (U_FAILURE(status)) { 1762 return; 1763 } 1764 1765 // 1766 // Scan through each test case, building up the string to be broken in testString, 1767 // and the positions that should be boundaries in the breakPositions vector. 1768 // 1769 int spin = 0; 1770 while (tokenMatcher.find()) { 1771 if(tokenMatcher.hitEnd()) { 1772 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for. 1773 This occurred when the text file was corrupt (wasn't marked as UTF-8) 1774 and caused an infinite loop here on EBCDIC systems! 1775 */ 1776 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin); 1777 // return; 1778 } 1779 if (tokenMatcher.start(1, status) >= 0) { 1780 // Scanned a divide sign, indicating a break position in the test data. 1781 if (testString.length()>0) { 1782 breakPositions.addElement(testString.length(), status); 1783 } 1784 } 1785 else if (tokenMatcher.start(2, status) >= 0) { 1786 // Scanned an 'x', meaning no break at this position in the test data 1787 // Nothing to be done here. 1788 } 1789 else if (tokenMatcher.start(3, status) >= 0) { 1790 // Scanned Hex digits. Convert them to binary, append to the character data string. 1791 const UnicodeString &hexNumber = tokenMatcher.group(3, status); 1792 int length = hexNumber.length(); 1793 if (length<=8) { 1794 char buf[10]; 1795 hexNumber.extract (0, length, buf, sizeof(buf), US_INV); 1796 UChar32 c = (UChar32)strtol(buf, NULL, 16); 1797 if (c<=0x10ffff) { 1798 testString.append(c); 1799 } else { 1800 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n", 1801 fileName, lineNumber); 1802 } 1803 } else { 1804 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n", 1805 fileName, lineNumber); 1806 } 1807 } 1808 else if (tokenMatcher.start(4, status) >= 0) { 1809 // Scanned to end of a line, possibly skipping over a comment in the process. 1810 // If the line from the file contained test data, run the test now. 1811 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) { 1812 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi); 1813 } 1814 1815 // Clear out this test case. 1816 // The string and breakPositions vector will be refilled as the next 1817 // test case is parsed. 1818 testString.remove(); 1819 breakPositions.removeAllElements(); 1820 lineNumber++; 1821 } else { 1822 // Scanner catchall. Something unrecognized appeared on the line. 1823 char token[16]; 1824 UnicodeString uToken = tokenMatcher.group(0, status); 1825 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token)); 1826 token[sizeof(token)-1] = 0; 1827 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token); 1828 1829 // Clean up, in preparation for continuing with the next line. 1830 testString.remove(); 1831 breakPositions.removeAllElements(); 1832 lineNumber++; 1833 } 1834 TEST_ASSERT_SUCCESS(status); 1835 if (U_FAILURE(status)) { 1836 break; 1837 } 1838 } 1839 1840 delete [] testFile; 1841 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 1842 } 1843 1844 //-------------------------------------------------------------------------------------------- 1845 // 1846 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium 1847 // test data files. Do only a simple, forward-only check - 1848 // this test is mostly to check that ICU and the Unicode 1849 // data agree with each other. 1850 // 1851 //-------------------------------------------------------------------------------------------- checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1852 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber, 1853 const UnicodeString &testString, // Text data to be broken 1854 UVector32 *breakPositions, // Positions where breaks should be found. 1855 RuleBasedBreakIterator *bi) { 1856 int32_t pos; // Break Position in the test string 1857 int32_t expectedI = 0; // Index of expected break position in the vector of expected results. 1858 int32_t expectedPos; // Expected break position (index into test string) 1859 1860 bi->setText(testString); 1861 pos = bi->first(); 1862 pos = bi->next(); 1863 1864 while (pos != BreakIterator::DONE) { 1865 if (expectedI >= breakPositions->size()) { 1866 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1867 testFileName, lineNumber, pos); 1868 break; 1869 } 1870 expectedPos = breakPositions->elementAti(expectedI); 1871 if (pos < expectedPos) { 1872 errln("Test file \"%s\", line %d, unexpected break found at position %d", 1873 testFileName, lineNumber, pos); 1874 break; 1875 } 1876 if (pos > expectedPos) { 1877 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1878 testFileName, lineNumber, expectedPos); 1879 break; 1880 } 1881 pos = bi->next(); 1882 expectedI++; 1883 } 1884 1885 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) { 1886 errln("Test file \"%s\", line %d, failed to find expected break at position %d", 1887 testFileName, lineNumber, breakPositions->elementAti(expectedI)); 1888 } 1889 } 1890 1891 1892 1893 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 1894 //--------------------------------------------------------------------------------------- 1895 // 1896 // classs RBBIMonkeyKind 1897 // 1898 // Monkey Test for Break Iteration 1899 // Abstract interface class. Concrete derived classes independently 1900 // implement the break rules for different iterator types. 1901 // 1902 // The Monkey Test itself uses doesn't know which type of break iterator it is 1903 // testing, but works purely in terms of the interface defined here. 1904 // 1905 //--------------------------------------------------------------------------------------- 1906 class RBBIMonkeyKind { 1907 public: 1908 // Return a UVector of UnicodeSets, representing the character classes used 1909 // for this type of iterator. 1910 virtual UVector *charClasses() = 0; 1911 1912 // Set the test text on which subsequent calls to next() will operate 1913 virtual void setText(const UnicodeString &s) = 0; 1914 1915 // Find the next break postion, starting from the prev break position, or from zero. 1916 // Return -1 after reaching end of string. 1917 virtual int32_t next(int32_t i) = 0; 1918 1919 virtual ~RBBIMonkeyKind(); 1920 UErrorCode deferredStatus; 1921 1922 1923 protected: 1924 RBBIMonkeyKind(); 1925 1926 private: 1927 }; 1928 RBBIMonkeyKind()1929 RBBIMonkeyKind::RBBIMonkeyKind() { 1930 deferredStatus = U_ZERO_ERROR; 1931 } 1932 ~RBBIMonkeyKind()1933 RBBIMonkeyKind::~RBBIMonkeyKind() { 1934 } 1935 1936 1937 //---------------------------------------------------------------------------------------- 1938 // 1939 // Random Numbers. Similar to standard lib rand() and srand() 1940 // Not using library to 1941 // 1. Get same results on all platforms. 1942 // 2. Get access to current seed, to more easily reproduce failures. 1943 // 1944 //--------------------------------------------------------------------------------------- 1945 static uint32_t m_seed = 1; 1946 m_rand()1947 static uint32_t m_rand() 1948 { 1949 m_seed = m_seed * 1103515245 + 12345; 1950 return (uint32_t)(m_seed/65536) % 32768; 1951 } 1952 1953 1954 // 1955 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773 1956 // 1957 static const char *gExtended_Pict = "[" 1958 "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" 1959 "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" 1960 "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" 1961 "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" 1962 "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" 1963 "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" 1964 "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" 1965 "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" 1966 "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" 1967 "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" 1968 "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" 1969 "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" 1970 "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" 1971 "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" 1972 "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" 1973 "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" 1974 "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" 1975 "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" 1976 "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" 1977 "]"; 1978 1979 //------------------------------------------------------------------------------------------ 1980 // 1981 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation 1982 // of RBBIMonkeyKind. 1983 // 1984 //------------------------------------------------------------------------------------------ 1985 class RBBICharMonkey: public RBBIMonkeyKind { 1986 public: 1987 RBBICharMonkey(); 1988 virtual ~RBBICharMonkey(); 1989 virtual UVector *charClasses(); 1990 virtual void setText(const UnicodeString &s); 1991 virtual int32_t next(int32_t i); 1992 private: 1993 UVector *fSets; 1994 1995 UnicodeSet *fCRLFSet; 1996 UnicodeSet *fControlSet; 1997 UnicodeSet *fExtendSet; 1998 UnicodeSet *fZWJSet; 1999 UnicodeSet *fRegionalIndicatorSet; 2000 UnicodeSet *fPrependSet; 2001 UnicodeSet *fSpacingSet; 2002 UnicodeSet *fLSet; 2003 UnicodeSet *fVSet; 2004 UnicodeSet *fTSet; 2005 UnicodeSet *fLVSet; 2006 UnicodeSet *fLVTSet; 2007 UnicodeSet *fHangulSet; 2008 UnicodeSet *fEmojiBaseSet; 2009 UnicodeSet *fEmojiModifierSet; 2010 UnicodeSet *fExtendedPictSet; 2011 UnicodeSet *fEBGSet; 2012 UnicodeSet *fEmojiNRKSet; 2013 UnicodeSet *fAnySet; 2014 2015 const UnicodeString *fText; 2016 }; 2017 2018 RBBICharMonkey()2019 RBBICharMonkey::RBBICharMonkey() { 2020 UErrorCode status = U_ZERO_ERROR; 2021 2022 fText = NULL; 2023 2024 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status); 2025 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status); 2026 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status); 2027 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status); 2028 fRegionalIndicatorSet = 2029 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status); 2030 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status); 2031 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status); 2032 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status); 2033 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status); 2034 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status); 2035 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status); 2036 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status); 2037 fHangulSet = new UnicodeSet(); 2038 fHangulSet->addAll(*fLSet); 2039 fHangulSet->addAll(*fVSet); 2040 fHangulSet->addAll(*fTSet); 2041 fHangulSet->addAll(*fLVSet); 2042 fHangulSet->addAll(*fLVTSet); 2043 2044 fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status); 2045 fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status); 2046 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status); 2047 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status); 2048 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE( 2049 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); 2050 fAnySet = new UnicodeSet(0, 0x10ffff); 2051 2052 fSets = new UVector(status); 2053 fSets->addElement(fCRLFSet, status); 2054 fSets->addElement(fControlSet, status); 2055 fSets->addElement(fExtendSet, status); 2056 fSets->addElement(fRegionalIndicatorSet, status); 2057 if (!fPrependSet->isEmpty()) { 2058 fSets->addElement(fPrependSet, status); 2059 } 2060 fSets->addElement(fSpacingSet, status); 2061 fSets->addElement(fHangulSet, status); 2062 fSets->addElement(fAnySet, status); 2063 fSets->addElement(fEmojiBaseSet, status); 2064 fSets->addElement(fEmojiModifierSet, status); 2065 fSets->addElement(fZWJSet, status); 2066 fSets->addElement(fExtendedPictSet, status); 2067 fSets->addElement(fEBGSet, status); 2068 fSets->addElement(fEmojiNRKSet,status); 2069 if (U_FAILURE(status)) { 2070 deferredStatus = status; 2071 } 2072 } 2073 2074 setText(const UnicodeString & s)2075 void RBBICharMonkey::setText(const UnicodeString &s) { 2076 fText = &s; 2077 } 2078 2079 2080 next(int32_t prevPos)2081 int32_t RBBICharMonkey::next(int32_t prevPos) { 2082 int p0, p1, p2, p3; // Indices of the significant code points around the 2083 // break position being tested. The candidate break 2084 // location is before p2. 2085 2086 int breakPos = -1; 2087 2088 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2089 UChar32 cBase; // for (X Extend*) patterns, the X character. 2090 2091 if (U_FAILURE(deferredStatus)) { 2092 return -1; 2093 } 2094 2095 // Previous break at end of string. return DONE. 2096 if (prevPos >= fText->length()) { 2097 return -1; 2098 } 2099 p0 = p1 = p2 = p3 = prevPos; 2100 c3 = fText->char32At(prevPos); 2101 c0 = c1 = c2 = cBase = 0; 2102 (void)p0; // suppress set but not used warning. 2103 (void)c0; 2104 2105 // Loop runs once per "significant" character position in the input text. 2106 for (;;) { 2107 // Move all of the positions forward in the input string. 2108 p0 = p1; c0 = c1; 2109 p1 = p2; c1 = c2; 2110 p2 = p3; c2 = c3; 2111 2112 // Advancd p3 by one codepoint 2113 p3 = fText->moveIndex32(p3, 1); 2114 c3 = fText->char32At(p3); 2115 2116 if (p1 == p2) { 2117 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2118 continue; 2119 } 2120 if (p2 == fText->length()) { 2121 // Reached end of string. Always a break position. 2122 break; 2123 } 2124 2125 // Rule GB3 CR x LF 2126 // No Extend or Format characters may appear between the CR and LF, 2127 // which requires the additional check for p2 immediately following p1. 2128 // 2129 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 2130 continue; 2131 } 2132 2133 // Rule (GB4). ( Control | CR | LF ) <break> 2134 if (fControlSet->contains(c1) || 2135 c1 == 0x0D || 2136 c1 == 0x0A) { 2137 break; 2138 } 2139 2140 // Rule (GB5) <break> ( Control | CR | LF ) 2141 // 2142 if (fControlSet->contains(c2) || 2143 c2 == 0x0D || 2144 c2 == 0x0A) { 2145 break; 2146 } 2147 2148 2149 // Rule (GB6) L x ( L | V | LV | LVT ) 2150 if (fLSet->contains(c1) && 2151 (fLSet->contains(c2) || 2152 fVSet->contains(c2) || 2153 fLVSet->contains(c2) || 2154 fLVTSet->contains(c2))) { 2155 continue; 2156 } 2157 2158 // Rule (GB7) ( LV | V ) x ( V | T ) 2159 if ((fLVSet->contains(c1) || fVSet->contains(c1)) && 2160 (fVSet->contains(c2) || fTSet->contains(c2))) { 2161 continue; 2162 } 2163 2164 // Rule (GB8) ( LVT | T) x T 2165 if ((fLVTSet->contains(c1) || fTSet->contains(c1)) && 2166 fTSet->contains(c2)) { 2167 continue; 2168 } 2169 2170 // Rule (GB9) x (Extend | ZWJ) 2171 if (fExtendSet->contains(c2) || fZWJSet->contains(c2)) { 2172 if (!fExtendSet->contains(c1)) { 2173 cBase = c1; 2174 } 2175 continue; 2176 } 2177 2178 // Rule (GB9a) x SpacingMark 2179 if (fSpacingSet->contains(c2)) { 2180 continue; 2181 } 2182 2183 // Rule (GB9b) Prepend x 2184 if (fPrependSet->contains(c1)) { 2185 continue; 2186 } 2187 2188 // Rule (GB10) (Emoji_Base | EBG) Extend * x Emoji_Modifier 2189 if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) { 2190 continue; 2191 } 2192 if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) && 2193 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) { 2194 continue; 2195 } 2196 2197 // Rule (GB11) (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji) 2198 if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) && 2199 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { 2200 continue; 2201 } 2202 2203 // Rule (GB12-13) Regional_Indicator x Regional_Indicator 2204 // Note: The first if condition is a little tricky. We only need to force 2205 // a break if there are three or more contiguous RIs. If there are 2206 // only two, a break following will occur via other rules, and will include 2207 // any trailing extend characters, which is needed behavior. 2208 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1) 2209 && fRegionalIndicatorSet->contains(c2)) { 2210 break; 2211 } 2212 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2213 continue; 2214 } 2215 2216 // Rule (GB999) Any <break> Any 2217 break; 2218 } 2219 2220 breakPos = p2; 2221 return breakPos; 2222 } 2223 2224 2225 charClasses()2226 UVector *RBBICharMonkey::charClasses() { 2227 return fSets; 2228 } 2229 2230 ~RBBICharMonkey()2231 RBBICharMonkey::~RBBICharMonkey() { 2232 delete fSets; 2233 delete fCRLFSet; 2234 delete fControlSet; 2235 delete fExtendSet; 2236 delete fRegionalIndicatorSet; 2237 delete fPrependSet; 2238 delete fSpacingSet; 2239 delete fLSet; 2240 delete fVSet; 2241 delete fTSet; 2242 delete fLVSet; 2243 delete fLVTSet; 2244 delete fHangulSet; 2245 delete fAnySet; 2246 delete fEmojiBaseSet; 2247 delete fEmojiModifierSet; 2248 delete fZWJSet; 2249 delete fExtendedPictSet; 2250 delete fEBGSet; 2251 delete fEmojiNRKSet; 2252 } 2253 2254 //------------------------------------------------------------------------------------------ 2255 // 2256 // class RBBIWordMonkey Word Break specific implementation 2257 // of RBBIMonkeyKind. 2258 // 2259 //------------------------------------------------------------------------------------------ 2260 class RBBIWordMonkey: public RBBIMonkeyKind { 2261 public: 2262 RBBIWordMonkey(); 2263 virtual ~RBBIWordMonkey(); 2264 virtual UVector *charClasses(); 2265 virtual void setText(const UnicodeString &s); 2266 virtual int32_t next(int32_t i); 2267 private: 2268 UVector *fSets; 2269 2270 UnicodeSet *fCRSet; 2271 UnicodeSet *fLFSet; 2272 UnicodeSet *fNewlineSet; 2273 UnicodeSet *fRegionalIndicatorSet; 2274 UnicodeSet *fKatakanaSet; 2275 UnicodeSet *fHebrew_LetterSet; 2276 UnicodeSet *fALetterSet; 2277 UnicodeSet *fSingle_QuoteSet; 2278 UnicodeSet *fDouble_QuoteSet; 2279 UnicodeSet *fMidNumLetSet; 2280 UnicodeSet *fMidLetterSet; 2281 UnicodeSet *fMidNumSet; 2282 UnicodeSet *fNumericSet; 2283 UnicodeSet *fFormatSet; 2284 UnicodeSet *fOtherSet; 2285 UnicodeSet *fExtendSet; 2286 UnicodeSet *fExtendNumLetSet; 2287 UnicodeSet *fDictionarySet; 2288 UnicodeSet *fEBaseSet; 2289 UnicodeSet *fEBGSet; 2290 UnicodeSet *fEModifierSet; 2291 UnicodeSet *fZWJSet; 2292 UnicodeSet *fExtendedPictSet; 2293 UnicodeSet *fEmojiNRKSet; 2294 2295 const UnicodeString *fText; 2296 }; 2297 2298 RBBIWordMonkey()2299 RBBIWordMonkey::RBBIWordMonkey() 2300 { 2301 UErrorCode status = U_ZERO_ERROR; 2302 2303 fSets = new UVector(status); 2304 2305 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status); 2306 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status); 2307 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status); 2308 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status); 2309 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status); 2310 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status); 2311 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status); 2312 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status); 2313 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status); 2314 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status); 2315 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status); 2316 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status); 2317 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status); 2318 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status); 2319 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status); 2320 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status); 2321 2322 fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE( 2323 "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status); 2324 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status); 2325 fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status); 2326 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status); 2327 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status); 2328 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE( 2329 "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); 2330 2331 fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status); 2332 fDictionarySet->addAll(*fKatakanaSet); 2333 fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status)); 2334 2335 fALetterSet->removeAll(*fDictionarySet); 2336 2337 fOtherSet = new UnicodeSet(); 2338 if(U_FAILURE(status)) { 2339 deferredStatus = status; 2340 return; 2341 } 2342 2343 fOtherSet->complement(); 2344 fOtherSet->removeAll(*fCRSet); 2345 fOtherSet->removeAll(*fLFSet); 2346 fOtherSet->removeAll(*fNewlineSet); 2347 fOtherSet->removeAll(*fKatakanaSet); 2348 fOtherSet->removeAll(*fHebrew_LetterSet); 2349 fOtherSet->removeAll(*fALetterSet); 2350 fOtherSet->removeAll(*fSingle_QuoteSet); 2351 fOtherSet->removeAll(*fDouble_QuoteSet); 2352 fOtherSet->removeAll(*fMidLetterSet); 2353 fOtherSet->removeAll(*fMidNumSet); 2354 fOtherSet->removeAll(*fNumericSet); 2355 fOtherSet->removeAll(*fExtendNumLetSet); 2356 fOtherSet->removeAll(*fFormatSet); 2357 fOtherSet->removeAll(*fExtendSet); 2358 fOtherSet->removeAll(*fRegionalIndicatorSet); 2359 fOtherSet->removeAll(*fEBaseSet); 2360 fOtherSet->removeAll(*fEBGSet); 2361 fOtherSet->removeAll(*fEModifierSet); 2362 fOtherSet->removeAll(*fZWJSet); 2363 fOtherSet->removeAll(*fExtendedPictSet); 2364 fOtherSet->removeAll(*fEmojiNRKSet); 2365 2366 // Inhibit dictionary characters from being tested at all. 2367 fOtherSet->removeAll(*fDictionarySet); 2368 2369 fSets->addElement(fCRSet, status); 2370 fSets->addElement(fLFSet, status); 2371 fSets->addElement(fNewlineSet, status); 2372 fSets->addElement(fRegionalIndicatorSet, status); 2373 fSets->addElement(fHebrew_LetterSet, status); 2374 fSets->addElement(fALetterSet, status); 2375 fSets->addElement(fSingle_QuoteSet, status); 2376 fSets->addElement(fDouble_QuoteSet, status); 2377 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters 2378 // from the test data. They are all in the dictionary set, 2379 // which this (old, to be retired) monkey test cannot handle. 2380 fSets->addElement(fMidLetterSet, status); 2381 fSets->addElement(fMidNumLetSet, status); 2382 fSets->addElement(fMidNumSet, status); 2383 fSets->addElement(fNumericSet, status); 2384 fSets->addElement(fFormatSet, status); 2385 fSets->addElement(fExtendSet, status); 2386 fSets->addElement(fOtherSet, status); 2387 fSets->addElement(fExtendNumLetSet, status); 2388 2389 fSets->addElement(fEBaseSet, status); 2390 fSets->addElement(fEBGSet, status); 2391 fSets->addElement(fEModifierSet, status); 2392 fSets->addElement(fZWJSet, status); 2393 fSets->addElement(fExtendedPictSet, status); 2394 fSets->addElement(fEmojiNRKSet, status); 2395 2396 if (U_FAILURE(status)) { 2397 deferredStatus = status; 2398 } 2399 } 2400 setText(const UnicodeString & s)2401 void RBBIWordMonkey::setText(const UnicodeString &s) { 2402 fText = &s; 2403 } 2404 2405 next(int32_t prevPos)2406 int32_t RBBIWordMonkey::next(int32_t prevPos) { 2407 int p0, p1, p2, p3; // Indices of the significant code points around the 2408 // break position being tested. The candidate break 2409 // location is before p2. 2410 2411 int breakPos = -1; 2412 2413 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2414 2415 if (U_FAILURE(deferredStatus)) { 2416 return -1; 2417 } 2418 2419 // Prev break at end of string. return DONE. 2420 if (prevPos >= fText->length()) { 2421 return -1; 2422 } 2423 p0 = p1 = p2 = p3 = prevPos; 2424 c3 = fText->char32At(prevPos); 2425 c0 = c1 = c2 = 0; 2426 (void)p0; // Suppress set but not used warning. 2427 2428 // Loop runs once per "significant" character position in the input text. 2429 for (;;) { 2430 // Move all of the positions forward in the input string. 2431 p0 = p1; c0 = c1; 2432 p1 = p2; c1 = c2; 2433 p2 = p3; c2 = c3; 2434 2435 // Advancd p3 by X(Extend | Format)* Rule 4 2436 // But do not advance over Extend & Format following a new line. (Unicode 5.1 change) 2437 do { 2438 p3 = fText->moveIndex32(p3, 1); 2439 c3 = fText->char32At(p3); 2440 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2441 break; 2442 }; 2443 } 2444 while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3)); 2445 2446 2447 if (p1 == p2) { 2448 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2449 continue; 2450 } 2451 if (p2 == fText->length()) { 2452 // Reached end of string. Always a break position. 2453 break; 2454 } 2455 2456 // Rule (3) CR x LF 2457 // No Extend or Format characters may appear between the CR and LF, 2458 // which requires the additional check for p2 immediately following p1. 2459 // 2460 if (c1==0x0D && c2==0x0A) { 2461 continue; 2462 } 2463 2464 // Rule (3a) Break before and after newlines (including CR and LF) 2465 // 2466 if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) { 2467 break; 2468 }; 2469 if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) { 2470 break; 2471 }; 2472 2473 // Rule (3c) ZWJ x (Glue_after_ZWJ | EmojiNRK). 2474 // Not ignoring extend chars, so peek into input text to 2475 // get the potential ZWJ, the character immediately preceding c2. 2476 // Sloppy UChar32 indexing: p2-1 may reference trail half 2477 // but char32At will get the full code point. 2478 if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) { 2479 continue; 2480 } 2481 2482 // Rule (5). (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter) 2483 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2484 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2485 continue; 2486 } 2487 2488 // Rule (6) (ALetter | Hebrew_Letter) x (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter) 2489 // 2490 if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2491 (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2492 (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) { 2493 continue; 2494 } 2495 2496 // Rule (7) (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) x (ALetter | Hebrew_Letter) 2497 if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) && 2498 (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2499 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2500 continue; 2501 } 2502 2503 // Rule (7a) Hebrew_Letter x Single_Quote 2504 if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) { 2505 continue; 2506 } 2507 2508 // Rule (7b) Hebrew_Letter x Double_Quote Hebrew_Letter 2509 if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) { 2510 continue; 2511 } 2512 2513 // Rule (7c) Hebrew_Letter Double_Quote x Hebrew_Letter 2514 if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) { 2515 continue; 2516 } 2517 2518 // Rule (8) Numeric x Numeric 2519 if (fNumericSet->contains(c1) && 2520 fNumericSet->contains(c2)) { 2521 continue; 2522 } 2523 2524 // Rule (9) (ALetter | Hebrew_Letter) x Numeric 2525 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) && 2526 fNumericSet->contains(c2)) { 2527 continue; 2528 } 2529 2530 // Rule (10) Numeric x (ALetter | Hebrew_Letter) 2531 if (fNumericSet->contains(c1) && 2532 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) { 2533 continue; 2534 } 2535 2536 // Rule (11) Numeric (MidNum | MidNumLet | Single_Quote) x Numeric 2537 if (fNumericSet->contains(c0) && 2538 (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) && 2539 fNumericSet->contains(c2)) { 2540 continue; 2541 } 2542 2543 // Rule (12) Numeric x (MidNum | MidNumLet | SingleQuote) Numeric 2544 if (fNumericSet->contains(c1) && 2545 (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) && 2546 fNumericSet->contains(c3)) { 2547 continue; 2548 } 2549 2550 // Rule (13) Katakana x Katakana 2551 // Note: matches UAX 29 rules, but doesn't come into play for ICU because 2552 // all Katakana are handled by the dictionary breaker. 2553 if (fKatakanaSet->contains(c1) && 2554 fKatakanaSet->contains(c2)) { 2555 continue; 2556 } 2557 2558 // Rule 13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet 2559 if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) || 2560 fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) && 2561 fExtendNumLetSet->contains(c2)) { 2562 continue; 2563 } 2564 2565 // Rule 13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana) 2566 if (fExtendNumLetSet->contains(c1) && 2567 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) || 2568 fNumericSet->contains(c2) || fKatakanaSet->contains(c2))) { 2569 continue; 2570 } 2571 2572 // WB 14 (E_Base | EBG) x E_Modifier 2573 if ((fEBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) { 2574 continue; 2575 } 2576 2577 // Rule 15 - 17 Group pairs of Regional Indicators. 2578 if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) { 2579 break; 2580 } 2581 if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) { 2582 continue; 2583 } 2584 2585 // Rule 999. Break found here. 2586 break; 2587 } 2588 2589 breakPos = p2; 2590 return breakPos; 2591 } 2592 2593 charClasses()2594 UVector *RBBIWordMonkey::charClasses() { 2595 return fSets; 2596 } 2597 2598 ~RBBIWordMonkey()2599 RBBIWordMonkey::~RBBIWordMonkey() { 2600 delete fSets; 2601 delete fCRSet; 2602 delete fLFSet; 2603 delete fNewlineSet; 2604 delete fKatakanaSet; 2605 delete fHebrew_LetterSet; 2606 delete fALetterSet; 2607 delete fSingle_QuoteSet; 2608 delete fDouble_QuoteSet; 2609 delete fMidNumLetSet; 2610 delete fMidLetterSet; 2611 delete fMidNumSet; 2612 delete fNumericSet; 2613 delete fFormatSet; 2614 delete fExtendSet; 2615 delete fExtendNumLetSet; 2616 delete fRegionalIndicatorSet; 2617 delete fDictionarySet; 2618 delete fOtherSet; 2619 delete fEBaseSet; 2620 delete fEBGSet; 2621 delete fEModifierSet; 2622 delete fZWJSet; 2623 delete fExtendedPictSet; 2624 delete fEmojiNRKSet; 2625 } 2626 2627 2628 2629 2630 //------------------------------------------------------------------------------------------ 2631 // 2632 // class RBBISentMonkey Sentence Break specific implementation 2633 // of RBBIMonkeyKind. 2634 // 2635 //------------------------------------------------------------------------------------------ 2636 class RBBISentMonkey: public RBBIMonkeyKind { 2637 public: 2638 RBBISentMonkey(); 2639 virtual ~RBBISentMonkey(); 2640 virtual UVector *charClasses(); 2641 virtual void setText(const UnicodeString &s); 2642 virtual int32_t next(int32_t i); 2643 private: 2644 int moveBack(int posFrom); 2645 int moveForward(int posFrom); 2646 UChar32 cAt(int pos); 2647 2648 UVector *fSets; 2649 2650 UnicodeSet *fSepSet; 2651 UnicodeSet *fFormatSet; 2652 UnicodeSet *fSpSet; 2653 UnicodeSet *fLowerSet; 2654 UnicodeSet *fUpperSet; 2655 UnicodeSet *fOLetterSet; 2656 UnicodeSet *fNumericSet; 2657 UnicodeSet *fATermSet; 2658 UnicodeSet *fSContinueSet; 2659 UnicodeSet *fSTermSet; 2660 UnicodeSet *fCloseSet; 2661 UnicodeSet *fOtherSet; 2662 UnicodeSet *fExtendSet; 2663 2664 const UnicodeString *fText; 2665 2666 }; 2667 RBBISentMonkey()2668 RBBISentMonkey::RBBISentMonkey() 2669 { 2670 UErrorCode status = U_ZERO_ERROR; 2671 2672 fSets = new UVector(status); 2673 2674 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 2675 // set and made into character classes of their own. For the monkey impl, 2676 // they remain in SEP, since Sep always appears with CR and LF in the rules. 2677 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status); 2678 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status); 2679 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status); 2680 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status); 2681 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status); 2682 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status); 2683 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status); 2684 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status); 2685 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status); 2686 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status); 2687 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status); 2688 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status); 2689 fOtherSet = new UnicodeSet(); 2690 2691 if(U_FAILURE(status)) { 2692 deferredStatus = status; 2693 return; 2694 } 2695 2696 fOtherSet->complement(); 2697 fOtherSet->removeAll(*fSepSet); 2698 fOtherSet->removeAll(*fFormatSet); 2699 fOtherSet->removeAll(*fSpSet); 2700 fOtherSet->removeAll(*fLowerSet); 2701 fOtherSet->removeAll(*fUpperSet); 2702 fOtherSet->removeAll(*fOLetterSet); 2703 fOtherSet->removeAll(*fNumericSet); 2704 fOtherSet->removeAll(*fATermSet); 2705 fOtherSet->removeAll(*fSContinueSet); 2706 fOtherSet->removeAll(*fSTermSet); 2707 fOtherSet->removeAll(*fCloseSet); 2708 fOtherSet->removeAll(*fExtendSet); 2709 2710 fSets->addElement(fSepSet, status); 2711 fSets->addElement(fFormatSet, status); 2712 fSets->addElement(fSpSet, status); 2713 fSets->addElement(fLowerSet, status); 2714 fSets->addElement(fUpperSet, status); 2715 fSets->addElement(fOLetterSet, status); 2716 fSets->addElement(fNumericSet, status); 2717 fSets->addElement(fATermSet, status); 2718 fSets->addElement(fSContinueSet, status); 2719 fSets->addElement(fSTermSet, status); 2720 fSets->addElement(fCloseSet, status); 2721 fSets->addElement(fOtherSet, status); 2722 fSets->addElement(fExtendSet, status); 2723 2724 if (U_FAILURE(status)) { 2725 deferredStatus = status; 2726 } 2727 } 2728 2729 2730 setText(const UnicodeString & s)2731 void RBBISentMonkey::setText(const UnicodeString &s) { 2732 fText = &s; 2733 } 2734 charClasses()2735 UVector *RBBISentMonkey::charClasses() { 2736 return fSets; 2737 } 2738 2739 2740 // moveBack() Find the "significant" code point preceding the index i. 2741 // Skips over ($Extend | $Format)* . 2742 // moveBack(int i)2743 int RBBISentMonkey::moveBack(int i) { 2744 if (i <= 0) { 2745 return -1; 2746 } 2747 UChar32 c; 2748 int32_t j = i; 2749 do { 2750 j = fText->moveIndex32(j, -1); 2751 c = fText->char32At(j); 2752 } 2753 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c))); 2754 return j; 2755 2756 } 2757 2758 moveForward(int i)2759 int RBBISentMonkey::moveForward(int i) { 2760 if (i>=fText->length()) { 2761 return fText->length(); 2762 } 2763 UChar32 c; 2764 int32_t j = i; 2765 do { 2766 j = fText->moveIndex32(j, 1); 2767 c = cAt(j); 2768 } 2769 while (fFormatSet->contains(c) || fExtendSet->contains(c)); 2770 return j; 2771 } 2772 cAt(int pos)2773 UChar32 RBBISentMonkey::cAt(int pos) { 2774 if (pos<0 || pos>=fText->length()) { 2775 return -1; 2776 } else { 2777 return fText->char32At(pos); 2778 } 2779 } 2780 next(int32_t prevPos)2781 int32_t RBBISentMonkey::next(int32_t prevPos) { 2782 int p0, p1, p2, p3; // Indices of the significant code points around the 2783 // break position being tested. The candidate break 2784 // location is before p2. 2785 2786 int breakPos = -1; 2787 2788 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 2789 UChar32 c; 2790 2791 if (U_FAILURE(deferredStatus)) { 2792 return -1; 2793 } 2794 2795 // Prev break at end of string. return DONE. 2796 if (prevPos >= fText->length()) { 2797 return -1; 2798 } 2799 p0 = p1 = p2 = p3 = prevPos; 2800 c3 = fText->char32At(prevPos); 2801 c0 = c1 = c2 = 0; 2802 (void)p0; // Suppress set but not used warning. 2803 2804 // Loop runs once per "significant" character position in the input text. 2805 for (;;) { 2806 // Move all of the positions forward in the input string. 2807 p0 = p1; c0 = c1; 2808 p1 = p2; c1 = c2; 2809 p2 = p3; c2 = c3; 2810 2811 // Advancd p3 by X(Extend | Format)* Rule 4 2812 p3 = moveForward(p3); 2813 c3 = cAt(p3); 2814 2815 // Rule (3) CR x LF 2816 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 2817 continue; 2818 } 2819 2820 // Rule (4). Sep <break> 2821 if (fSepSet->contains(c1)) { 2822 p2 = p1+1; // Separators don't combine with Extend or Format. 2823 break; 2824 } 2825 2826 if (p2 >= fText->length()) { 2827 // Reached end of string. Always a break position. 2828 break; 2829 } 2830 2831 if (p2 == prevPos) { 2832 // Still warming up the loop. (won't work with zero length strings, but we don't care) 2833 continue; 2834 } 2835 2836 // Rule (6). ATerm x Numeric 2837 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) { 2838 continue; 2839 } 2840 2841 // Rule (7). (Upper | Lower) ATerm x Uppper 2842 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) && 2843 fATermSet->contains(c1) && fUpperSet->contains(c2)) { 2844 continue; 2845 } 2846 2847 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower 2848 // Note: STerm | ATerm are added to the negated part of the expression by a 2849 // note to the Unicode 5.0 documents. 2850 int p8 = p1; 2851 while (fSpSet->contains(cAt(p8))) { 2852 p8 = moveBack(p8); 2853 } 2854 while (fCloseSet->contains(cAt(p8))) { 2855 p8 = moveBack(p8); 2856 } 2857 if (fATermSet->contains(cAt(p8))) { 2858 p8=p2; 2859 for (;;) { 2860 c = cAt(p8); 2861 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) || 2862 fLowerSet->contains(c) || fSepSet->contains(c) || 2863 fATermSet->contains(c) || fSTermSet->contains(c)) { 2864 break; 2865 } 2866 p8 = moveForward(p8); 2867 } 2868 if (fLowerSet->contains(cAt(p8))) { 2869 continue; 2870 } 2871 } 2872 2873 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm); 2874 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) { 2875 p8 = p1; 2876 while (fSpSet->contains(cAt(p8))) { 2877 p8 = moveBack(p8); 2878 } 2879 while (fCloseSet->contains(cAt(p8))) { 2880 p8 = moveBack(p8); 2881 } 2882 c = cAt(p8); 2883 if (fSTermSet->contains(c) || fATermSet->contains(c)) { 2884 continue; 2885 } 2886 } 2887 2888 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 2889 int p9 = p1; 2890 while (fCloseSet->contains(cAt(p9))) { 2891 p9 = moveBack(p9); 2892 } 2893 c = cAt(p9); 2894 if ((fSTermSet->contains(c) || fATermSet->contains(c))) { 2895 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) { 2896 continue; 2897 } 2898 } 2899 2900 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 2901 int p10 = p1; 2902 while (fSpSet->contains(cAt(p10))) { 2903 p10 = moveBack(p10); 2904 } 2905 while (fCloseSet->contains(cAt(p10))) { 2906 p10 = moveBack(p10); 2907 } 2908 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) { 2909 if (fSpSet->contains(c2) || fSepSet->contains(c2)) { 2910 continue; 2911 } 2912 } 2913 2914 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break> 2915 int p11 = p1; 2916 if (fSepSet->contains(cAt(p11))) { 2917 p11 = moveBack(p11); 2918 } 2919 while (fSpSet->contains(cAt(p11))) { 2920 p11 = moveBack(p11); 2921 } 2922 while (fCloseSet->contains(cAt(p11))) { 2923 p11 = moveBack(p11); 2924 } 2925 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) { 2926 break; 2927 } 2928 2929 // Rule (12) Any x Any 2930 continue; 2931 } 2932 breakPos = p2; 2933 return breakPos; 2934 } 2935 ~RBBISentMonkey()2936 RBBISentMonkey::~RBBISentMonkey() { 2937 delete fSets; 2938 delete fSepSet; 2939 delete fFormatSet; 2940 delete fSpSet; 2941 delete fLowerSet; 2942 delete fUpperSet; 2943 delete fOLetterSet; 2944 delete fNumericSet; 2945 delete fATermSet; 2946 delete fSContinueSet; 2947 delete fSTermSet; 2948 delete fCloseSet; 2949 delete fOtherSet; 2950 delete fExtendSet; 2951 } 2952 2953 2954 2955 //------------------------------------------------------------------------------------------- 2956 // 2957 // RBBILineMonkey 2958 // 2959 //------------------------------------------------------------------------------------------- 2960 2961 class RBBILineMonkey: public RBBIMonkeyKind { 2962 public: 2963 RBBILineMonkey(); 2964 virtual ~RBBILineMonkey(); 2965 virtual UVector *charClasses(); 2966 virtual void setText(const UnicodeString &s); 2967 virtual int32_t next(int32_t i); 2968 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar); 2969 private: 2970 UVector *fSets; 2971 2972 UnicodeSet *fBK; 2973 UnicodeSet *fCR; 2974 UnicodeSet *fLF; 2975 UnicodeSet *fCM; 2976 UnicodeSet *fNL; 2977 UnicodeSet *fSG; 2978 UnicodeSet *fWJ; 2979 UnicodeSet *fZW; 2980 UnicodeSet *fGL; 2981 UnicodeSet *fCB; 2982 UnicodeSet *fSP; 2983 UnicodeSet *fB2; 2984 UnicodeSet *fBA; 2985 UnicodeSet *fBB; 2986 UnicodeSet *fHY; 2987 UnicodeSet *fH2; 2988 UnicodeSet *fH3; 2989 UnicodeSet *fCL; 2990 UnicodeSet *fCP; 2991 UnicodeSet *fEX; 2992 UnicodeSet *fIN; 2993 UnicodeSet *fJL; 2994 UnicodeSet *fJV; 2995 UnicodeSet *fJT; 2996 UnicodeSet *fNS; 2997 UnicodeSet *fOP; 2998 UnicodeSet *fQU; 2999 UnicodeSet *fIS; 3000 UnicodeSet *fNU; 3001 UnicodeSet *fPO; 3002 UnicodeSet *fPR; 3003 UnicodeSet *fSY; 3004 UnicodeSet *fAI; 3005 UnicodeSet *fAL; 3006 UnicodeSet *fCJ; 3007 UnicodeSet *fHL; 3008 UnicodeSet *fID; 3009 UnicodeSet *fRI; 3010 UnicodeSet *fXX; 3011 UnicodeSet *fEB; 3012 UnicodeSet *fEM; 3013 UnicodeSet *fZJ; 3014 UnicodeSet *fExtendedPict; 3015 UnicodeSet *fEmojiNRK; 3016 3017 BreakIterator *fCharBI; 3018 const UnicodeString *fText; 3019 RegexMatcher *fNumberMatcher; 3020 }; 3021 RBBILineMonkey()3022 RBBILineMonkey::RBBILineMonkey() : 3023 RBBIMonkeyKind(), 3024 fSets(NULL), 3025 3026 fCharBI(NULL), 3027 fText(NULL), 3028 fNumberMatcher(NULL) 3029 3030 { 3031 if (U_FAILURE(deferredStatus)) { 3032 return; 3033 } 3034 3035 UErrorCode status = U_ZERO_ERROR; 3036 3037 fSets = new UVector(status); 3038 3039 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status); 3040 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status); 3041 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status); 3042 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status); 3043 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status); 3044 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status); 3045 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status); 3046 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status); 3047 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status); 3048 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status); 3049 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status); 3050 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status); 3051 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status); 3052 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status); 3053 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status); 3054 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status); 3055 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status); 3056 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status); 3057 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status); 3058 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status); 3059 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status); 3060 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status); 3061 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status); 3062 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status); 3063 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status); 3064 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status); 3065 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status); 3066 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status); 3067 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status); 3068 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status); 3069 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status); 3070 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status); 3071 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status); 3072 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status); 3073 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status); 3074 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status); 3075 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status); 3076 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status); 3077 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status); 3078 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE( 3079 "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status); 3080 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status); 3081 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status); 3082 fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status); 3083 fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status); 3084 3085 if (U_FAILURE(status)) { 3086 deferredStatus = status; 3087 return; 3088 } 3089 3090 fAL->addAll(*fXX); // Default behavior for XX is identical to AL 3091 fAL->addAll(*fAI); // Default behavior for AI is identical to AL 3092 fAL->addAll(*fSG); // Default behavior for SG is identical to AL. 3093 3094 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS. 3095 fCM->addAll(*fZJ); // ZWJ behaves as a CM. 3096 3097 fSets->addElement(fBK, status); 3098 fSets->addElement(fCR, status); 3099 fSets->addElement(fLF, status); 3100 fSets->addElement(fCM, status); 3101 fSets->addElement(fNL, status); 3102 fSets->addElement(fWJ, status); 3103 fSets->addElement(fZW, status); 3104 fSets->addElement(fGL, status); 3105 fSets->addElement(fCB, status); 3106 fSets->addElement(fSP, status); 3107 fSets->addElement(fB2, status); 3108 fSets->addElement(fBA, status); 3109 fSets->addElement(fBB, status); 3110 fSets->addElement(fHY, status); 3111 fSets->addElement(fH2, status); 3112 fSets->addElement(fH3, status); 3113 fSets->addElement(fCL, status); 3114 fSets->addElement(fCP, status); 3115 fSets->addElement(fEX, status); 3116 fSets->addElement(fIN, status); 3117 fSets->addElement(fJL, status); 3118 fSets->addElement(fJT, status); 3119 fSets->addElement(fJV, status); 3120 fSets->addElement(fNS, status); 3121 fSets->addElement(fOP, status); 3122 fSets->addElement(fQU, status); 3123 fSets->addElement(fIS, status); 3124 fSets->addElement(fNU, status); 3125 fSets->addElement(fPO, status); 3126 fSets->addElement(fPR, status); 3127 fSets->addElement(fSY, status); 3128 fSets->addElement(fAI, status); 3129 fSets->addElement(fAL, status); 3130 fSets->addElement(fHL, status); 3131 fSets->addElement(fID, status); 3132 fSets->addElement(fWJ, status); 3133 fSets->addElement(fRI, status); 3134 fSets->addElement(fSG, status); 3135 fSets->addElement(fEB, status); 3136 fSets->addElement(fEM, status); 3137 fSets->addElement(fZJ, status); 3138 fSets->addElement(fExtendedPict, status); 3139 fSets->addElement(fEmojiNRK, status); 3140 3141 3142 const char *rules = 3143 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?" 3144 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?" 3145 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*" 3146 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*" 3147 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?" 3148 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"; 3149 3150 fNumberMatcher = new RegexMatcher( 3151 UnicodeString(rules, -1, US_INV), 0, status); 3152 3153 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status); 3154 3155 if (U_FAILURE(status)) { 3156 deferredStatus = status; 3157 } 3158 } 3159 3160 setText(const UnicodeString & s)3161 void RBBILineMonkey::setText(const UnicodeString &s) { 3162 fText = &s; 3163 fCharBI->setText(s); 3164 fNumberMatcher->reset(s); 3165 } 3166 3167 // 3168 // rule9Adjust 3169 // Line Break TR rules 9 and 10 implementation. 3170 // This deals with combining marks and other sequences that 3171 // that must be treated as if they were something other than what they actually are. 3172 // 3173 // This is factored out into a separate function because it must be applied twice for 3174 // each potential break, once to the chars before the position being checked, then 3175 // again to the text following the possible break. 3176 // rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3177 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) { 3178 if (pos == -1) { 3179 // Invalid initial position. Happens during the warmup iteration of the 3180 // main loop in next(). 3181 return; 3182 } 3183 3184 int32_t nPos = *nextPos; 3185 3186 // LB 9 Keep combining sequences together. 3187 // advance over any CM class chars. Note that Line Break CM is different 3188 // from the normal Grapheme Extend property. 3189 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d || 3190 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) { 3191 for (;;) { 3192 *nextChar = fText->char32At(nPos); 3193 if (!fCM->contains(*nextChar)) { 3194 break; 3195 } 3196 nPos = fText->moveIndex32(nPos, 1); 3197 } 3198 } 3199 3200 3201 // LB 9 Treat X CM* as if it were x. 3202 // No explicit action required. 3203 3204 // LB 10 Treat any remaining combining mark as AL 3205 if (fCM->contains(*posChar)) { 3206 *posChar = 0x41; // thisChar = 'A'; 3207 } 3208 3209 // Push the updated nextPos and nextChar back to our caller. 3210 // This only makes a difference if posChar got bigger by consuming a 3211 // combining sequence. 3212 *nextPos = nPos; 3213 *nextChar = fText->char32At(nPos); 3214 } 3215 3216 3217 next(int32_t startPos)3218 int32_t RBBILineMonkey::next(int32_t startPos) { 3219 UErrorCode status = U_ZERO_ERROR; 3220 int32_t pos; // Index of the char following a potential break position 3221 UChar32 thisChar; // Character at above position "pos" 3222 3223 int32_t prevPos; // Index of the char preceding a potential break position 3224 UChar32 prevChar; // Character at above position. Note that prevChar 3225 // and thisChar may not be adjacent because combining 3226 // characters between them will be ignored. 3227 3228 int32_t prevPosX2; // Second previous character. Wider context for LB21a. 3229 UChar32 prevCharX2; 3230 3231 int32_t nextPos; // Index of the next character following pos. 3232 // Usually skips over combining marks. 3233 int32_t nextCPPos; // Index of the code point following "pos." 3234 // May point to a combining mark. 3235 int32_t tPos; // temp value. 3236 UChar32 c; 3237 3238 if (U_FAILURE(deferredStatus)) { 3239 return -1; 3240 } 3241 3242 if (startPos >= fText->length()) { 3243 return -1; 3244 } 3245 3246 3247 // Initial values for loop. Loop will run the first time without finding breaks, 3248 // while the invalid values shift out and the "this" and 3249 // "prev" positions are filled in with good values. 3250 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration. 3251 thisChar = prevChar = prevCharX2 = 0; 3252 nextPos = nextCPPos = startPos; 3253 3254 3255 // Loop runs once per position in the test text, until a break position 3256 // is found. 3257 for (;;) { 3258 prevPosX2 = prevPos; 3259 prevCharX2 = prevChar; 3260 3261 prevPos = pos; 3262 prevChar = thisChar; 3263 3264 pos = nextPos; 3265 thisChar = fText->char32At(pos); 3266 3267 nextCPPos = fText->moveIndex32(pos, 1); 3268 nextPos = nextCPPos; 3269 3270 // Rule LB2 - Break at end of text. 3271 if (pos >= fText->length()) { 3272 break; 3273 } 3274 3275 // Rule LB 9 - adjust for combining sequences. 3276 // We do this one out-of-order because the adjustment does not change anything 3277 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to 3278 // be applied. 3279 rule9Adjust(prevPos, &prevChar, &pos, &thisChar); 3280 nextCPPos = nextPos = fText->moveIndex32(pos, 1); 3281 c = fText->char32At(nextPos); 3282 rule9Adjust(pos, &thisChar, &nextPos, &c); 3283 3284 // If the loop is still warming up - if we haven't shifted the initial 3285 // -1 positions out of prevPos yet - loop back to advance the 3286 // position in the input without any further looking for breaks. 3287 if (prevPos == -1) { 3288 continue; 3289 } 3290 3291 // LB 4 Always break after hard line breaks, 3292 if (fBK->contains(prevChar)) { 3293 break; 3294 } 3295 3296 // LB 5 Break after CR, LF, NL, but not inside CR LF 3297 if (prevChar == 0x0d && thisChar == 0x0a) { 3298 continue; 3299 } 3300 if (prevChar == 0x0d || 3301 prevChar == 0x0a || 3302 prevChar == 0x85) { 3303 break; 3304 } 3305 3306 // LB 6 Don't break before hard line breaks 3307 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 || 3308 fBK->contains(thisChar)) { 3309 continue; 3310 } 3311 3312 3313 // LB 7 Don't break before spaces or zero-width space. 3314 if (fSP->contains(thisChar)) { 3315 continue; 3316 } 3317 3318 if (fZW->contains(thisChar)) { 3319 continue; 3320 } 3321 3322 // LB 8 Break after zero width space 3323 if (fZW->contains(prevChar)) { 3324 break; 3325 } 3326 3327 // LB 8a ZWJ x (ID | ExtendedPict | Emoji) 3328 // The monkey test's way of ignoring combining characters doesn't work 3329 // for this rule. ZJ is also a CM. Need to get the actual character 3330 // preceding "thisChar", not ignoring combining marks, possibly ZJ. 3331 { 3332 int32_t prevIdx = fText->moveIndex32(pos, -1); 3333 UChar32 prevC = fText->char32At(prevIdx); 3334 if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) { 3335 continue; 3336 } 3337 } 3338 3339 // LB 9, 10 Already done, at top of loop. 3340 // 3341 3342 3343 // LB 11 Do not break before or after WORD JOINER and related characters. 3344 // x WJ 3345 // WJ x 3346 // 3347 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { 3348 continue; 3349 } 3350 3351 // LB 12 3352 // GL x 3353 if (fGL->contains(prevChar)) { 3354 continue; 3355 } 3356 3357 // LB 12a 3358 // [^SP BA HY] x GL 3359 if (!(fSP->contains(prevChar) || 3360 fBA->contains(prevChar) || 3361 fHY->contains(prevChar) ) && fGL->contains(thisChar)) { 3362 continue; 3363 } 3364 3365 3366 3367 // LB 13 Don't break before closings. 3368 // NU x CL, NU x CP and NU x IS are not matched here so that they will 3369 // fall into LB 17 and the more general number regular expression. 3370 // 3371 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) || 3372 (!fNU->contains(prevChar) && fCP->contains(thisChar)) || 3373 fEX->contains(thisChar) || 3374 (!fNU->contains(prevChar) && fIS->contains(thisChar)) || 3375 (!fNU->contains(prevChar) && fSY->contains(thisChar))) { 3376 continue; 3377 } 3378 3379 // LB 14 Don't break after OP SP* 3380 // Scan backwards, checking for this sequence. 3381 // The OP char could include combining marks, so we actually check for 3382 // OP CM* SP* 3383 // Another Twist: The Rule 67 fixes may have changed a SP CM 3384 // sequence into a ID char, so before scanning back through spaces, 3385 // verify that prevChar is indeed a space. The prevChar variable 3386 // may differ from fText[prevPos] 3387 tPos = prevPos; 3388 if (fSP->contains(prevChar)) { 3389 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3390 tPos=fText->moveIndex32(tPos, -1); 3391 } 3392 } 3393 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3394 tPos=fText->moveIndex32(tPos, -1); 3395 } 3396 if (fOP->contains(fText->char32At(tPos))) { 3397 continue; 3398 } 3399 3400 3401 // LB 15 QU SP* x OP 3402 if (fOP->contains(thisChar)) { 3403 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 3404 int tPos = prevPos; 3405 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3406 tPos = fText->moveIndex32(tPos, -1); 3407 } 3408 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3409 tPos = fText->moveIndex32(tPos, -1); 3410 } 3411 if (fQU->contains(fText->char32At(tPos))) { 3412 continue; 3413 } 3414 } 3415 3416 3417 3418 // LB 16 (CL | CP) SP* x NS 3419 // Scan backwards for SP* CM* (CL | CP) 3420 if (fNS->contains(thisChar)) { 3421 int tPos = prevPos; 3422 while (tPos>0 && fSP->contains(fText->char32At(tPos))) { 3423 tPos = fText->moveIndex32(tPos, -1); 3424 } 3425 while (tPos>0 && fCM->contains(fText->char32At(tPos))) { 3426 tPos = fText->moveIndex32(tPos, -1); 3427 } 3428 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) { 3429 continue; 3430 } 3431 } 3432 3433 3434 // LB 17 B2 SP* x B2 3435 if (fB2->contains(thisChar)) { 3436 // Scan backwards, checking for the B2 CM* SP* sequence. 3437 tPos = prevPos; 3438 if (fSP->contains(prevChar)) { 3439 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) { 3440 tPos=fText->moveIndex32(tPos, -1); 3441 } 3442 } 3443 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { 3444 tPos=fText->moveIndex32(tPos, -1); 3445 } 3446 if (fB2->contains(fText->char32At(tPos))) { 3447 continue; 3448 } 3449 } 3450 3451 3452 // LB 18 break after space 3453 if (fSP->contains(prevChar)) { 3454 break; 3455 } 3456 3457 // LB 19 3458 // x QU 3459 // QU x 3460 if (fQU->contains(thisChar) || fQU->contains(prevChar)) { 3461 continue; 3462 } 3463 3464 // LB 20 Break around a CB 3465 if (fCB->contains(thisChar) || fCB->contains(prevChar)) { 3466 break; 3467 } 3468 3469 // LB 21 3470 if (fBA->contains(thisChar) || 3471 fHY->contains(thisChar) || 3472 fNS->contains(thisChar) || 3473 fBB->contains(prevChar) ) { 3474 continue; 3475 } 3476 3477 // LB 21a 3478 // HL (HY | BA) x 3479 if (fHL->contains(prevCharX2) && 3480 (fHY->contains(prevChar) || fBA->contains(prevChar))) { 3481 continue; 3482 } 3483 3484 // LB 21b 3485 // SY x HL 3486 if (fSY->contains(prevChar) && fHL->contains(thisChar)) { 3487 continue; 3488 } 3489 3490 // LB 22 3491 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || 3492 (fEX->contains(prevChar) && fIN->contains(thisChar)) || 3493 (fHL->contains(prevChar) && fIN->contains(thisChar)) || 3494 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) || 3495 (fIN->contains(prevChar) && fIN->contains(thisChar)) || 3496 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) { 3497 continue; 3498 } 3499 3500 3501 // LB 23 (AL | HL) x NU 3502 // NU x (AL | HL) 3503 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) { 3504 continue; 3505 } 3506 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3507 continue; 3508 } 3509 3510 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 3511 // PR x (ID | EB | EM) 3512 // (ID | EB | EM) x PO 3513 if (fPR->contains(prevChar) && 3514 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) { 3515 continue; 3516 } 3517 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && 3518 fPO->contains(thisChar)) { 3519 continue; 3520 } 3521 3522 // LB 24 Do not break between prefix and letters or ideographs. 3523 // (PR | PO) x (AL | HL) 3524 // (AL | HL) x (PR | PO) 3525 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) && 3526 (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3527 continue; 3528 } 3529 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && 3530 (fPR->contains(thisChar) || fPO->contains(thisChar))) { 3531 continue; 3532 } 3533 3534 3535 3536 // LB 25 Numbers 3537 if (fNumberMatcher->lookingAt(prevPos, status)) { 3538 if (U_FAILURE(status)) { 3539 break; 3540 } 3541 // Matched a number. But could have been just a single digit, which would 3542 // not represent a "no break here" between prevChar and thisChar 3543 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num 3544 if (numEndIdx > pos) { 3545 // Number match includes at least our two chars being checked 3546 if (numEndIdx > nextPos) { 3547 // Number match includes additional chars. Update pos and nextPos 3548 // so that next loop iteration will continue at the end of the number, 3549 // checking for breaks between last char in number & whatever follows. 3550 pos = nextPos = numEndIdx; 3551 do { 3552 pos = fText->moveIndex32(pos, -1); 3553 thisChar = fText->char32At(pos); 3554 } while (fCM->contains(thisChar)); 3555 } 3556 continue; 3557 } 3558 } 3559 3560 3561 // LB 26 Do not break a Korean syllable. 3562 if (fJL->contains(prevChar) && (fJL->contains(thisChar) || 3563 fJV->contains(thisChar) || 3564 fH2->contains(thisChar) || 3565 fH3->contains(thisChar))) { 3566 continue; 3567 } 3568 3569 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) && 3570 (fJV->contains(thisChar) || fJT->contains(thisChar))) { 3571 continue; 3572 } 3573 3574 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) && 3575 fJT->contains(thisChar)) { 3576 continue; 3577 } 3578 3579 // LB 27 Treat a Korean Syllable Block the same as ID. 3580 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3581 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3582 fIN->contains(thisChar)) { 3583 continue; 3584 } 3585 if ((fJL->contains(prevChar) || fJV->contains(prevChar) || 3586 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) && 3587 fPO->contains(thisChar)) { 3588 continue; 3589 } 3590 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) || 3591 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) { 3592 continue; 3593 } 3594 3595 3596 3597 // LB 28 Do not break between alphabetics ("at"). 3598 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3599 continue; 3600 } 3601 3602 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). 3603 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) { 3604 continue; 3605 } 3606 3607 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 3608 // (AL | NU) x OP 3609 // CP x (AL | NU) 3610 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) { 3611 continue; 3612 } 3613 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) { 3614 continue; 3615 } 3616 3617 // LB30a RI RI <break> RI 3618 // RI x RI 3619 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) { 3620 break; 3621 } 3622 if (fRI->contains(prevChar) && fRI->contains(thisChar)) { 3623 continue; 3624 } 3625 3626 // LB30b Emoji Base x Emoji Modifier 3627 if (fEB->contains(prevChar) && fEM->contains(thisChar)) { 3628 continue; 3629 } 3630 3631 // LB 31 Break everywhere else 3632 break; 3633 3634 } 3635 3636 return pos; 3637 } 3638 3639 charClasses()3640 UVector *RBBILineMonkey::charClasses() { 3641 return fSets; 3642 } 3643 3644 ~RBBILineMonkey()3645 RBBILineMonkey::~RBBILineMonkey() { 3646 delete fSets; 3647 3648 delete fBK; 3649 delete fCR; 3650 delete fLF; 3651 delete fCM; 3652 delete fNL; 3653 delete fWJ; 3654 delete fZW; 3655 delete fGL; 3656 delete fCB; 3657 delete fSP; 3658 delete fB2; 3659 delete fBA; 3660 delete fBB; 3661 delete fHY; 3662 delete fH2; 3663 delete fH3; 3664 delete fCL; 3665 delete fCP; 3666 delete fEX; 3667 delete fIN; 3668 delete fJL; 3669 delete fJV; 3670 delete fJT; 3671 delete fNS; 3672 delete fOP; 3673 delete fQU; 3674 delete fIS; 3675 delete fNU; 3676 delete fPO; 3677 delete fPR; 3678 delete fSY; 3679 delete fAI; 3680 delete fAL; 3681 delete fCJ; 3682 delete fHL; 3683 delete fID; 3684 delete fRI; 3685 delete fSG; 3686 delete fXX; 3687 delete fEB; 3688 delete fEM; 3689 delete fZJ; 3690 delete fExtendedPict; 3691 delete fEmojiNRK; 3692 3693 delete fCharBI; 3694 delete fNumberMatcher; 3695 } 3696 3697 3698 //------------------------------------------------------------------------------------------- 3699 // 3700 // TestMonkey 3701 // 3702 // params 3703 // seed=nnnnn Random number starting seed. 3704 // Setting the seed allows errors to be reproduced. 3705 // loop=nnn Looping count. Controls running time. 3706 // -1: run forever. 3707 // 0 or greater: run length. 3708 // 3709 // type = char | word | line | sent | title 3710 // 3711 // Example: 3712 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1" 3713 // 3714 //------------------------------------------------------------------------------------------- 3715 getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3716 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) { 3717 int32_t val = defaultVal; 3718 name.append(" *= *(-?\\d+)"); 3719 UErrorCode status = U_ZERO_ERROR; 3720 RegexMatcher m(name, params, 0, status); 3721 if (m.find()) { 3722 // The param exists. Convert the string to an int. 3723 char valString[100]; 3724 int32_t paramLength = m.end(1, status) - m.start(1, status); 3725 if (paramLength >= (int32_t)(sizeof(valString)-1)) { 3726 paramLength = (int32_t)(sizeof(valString)-2); 3727 } 3728 params.extract(m.start(1, status), paramLength, valString, sizeof(valString)); 3729 val = strtol(valString, NULL, 10); 3730 3731 // Delete this parameter from the params string. 3732 m.reset(); 3733 params = m.replaceFirst("", status); 3734 } 3735 U_ASSERT(U_SUCCESS(status)); 3736 return val; 3737 } 3738 #endif 3739 3740 #if !UCONFIG_NO_REGULAR_EXPRESSIONS testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3741 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, 3742 BreakIterator *bi, 3743 int expected[], 3744 int expectedcount) 3745 { 3746 int count = 0; 3747 int i = 0; 3748 int forward[50]; 3749 bi->setText(ustr); 3750 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3751 forward[count] = i; 3752 if (count < expectedcount && expected[count] != i) { 3753 test->errln("break forward test failed: expected %d but got %d", 3754 expected[count], i); 3755 break; 3756 } 3757 count ++; 3758 } 3759 if (count != expectedcount) { 3760 printStringBreaks(ustr, expected, expectedcount); 3761 test->errln("break forward test failed: missed %d match", 3762 expectedcount - count); 3763 return; 3764 } 3765 // testing boundaries 3766 for (i = 1; i < expectedcount; i ++) { 3767 int j = expected[i - 1]; 3768 if (!bi->isBoundary(j)) { 3769 printStringBreaks(ustr, expected, expectedcount); 3770 test->errln("isBoundary() failed. Expected boundary at position %d", j); 3771 return; 3772 } 3773 for (j = expected[i - 1] + 1; j < expected[i]; j ++) { 3774 if (bi->isBoundary(j)) { 3775 printStringBreaks(ustr, expected, expectedcount); 3776 test->errln("isBoundary() failed. Not expecting boundary at position %d", j); 3777 return; 3778 } 3779 } 3780 } 3781 3782 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) { 3783 count --; 3784 if (forward[count] != i) { 3785 printStringBreaks(ustr, expected, expectedcount); 3786 test->errln("happy break test previous() failed: expected %d but got %d", 3787 forward[count], i); 3788 break; 3789 } 3790 } 3791 if (count != 0) { 3792 printStringBreaks(ustr, expected, expectedcount); 3793 test->errln("break test previous() failed: missed a match"); 3794 return; 3795 } 3796 3797 // testing preceding 3798 for (i = 0; i < expectedcount - 1; i ++) { 3799 // int j = expected[i] + 1; 3800 int j = ustr.moveIndex32(expected[i], 1); 3801 for (; j <= expected[i + 1]; j ++) { 3802 if (bi->preceding(j) != expected[i]) { 3803 printStringBreaks(ustr, expected, expectedcount); 3804 test->errln("preceding(): Not expecting boundary at position %d", j); 3805 return; 3806 } 3807 } 3808 } 3809 } 3810 #endif 3811 TestWordBreaks(void)3812 void RBBITest::TestWordBreaks(void) 3813 { 3814 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3815 3816 Locale locale("en"); 3817 UErrorCode status = U_ZERO_ERROR; 3818 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3819 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3820 // Replaced any C+J characters in a row with a random sequence of characters 3821 // of the same length to make our C+J segmentation not get in the way. 3822 static const char *strlist[] = 3823 { 3824 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", 3825 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b", 3826 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a", 3827 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", 3828 "\\uac00\\u3588\\u009c\\u0953\\u194b", 3829 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3830 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", 3831 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e", 3832 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3833 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3834 "\\u2027\\U000e0067\\u0a47\\u00b7", 3835 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3836 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3837 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3838 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a", 3839 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3840 "\\u0027\\u11af\\U000e0057\\u0602", 3841 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3842 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3843 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3844 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3845 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3846 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068", 3847 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3848 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3849 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3850 "\\u18f4\\U000e0049\\u20e7\\u2027", 3851 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3852 "\\ua183\\u102d\\u0bec\\u003a", 3853 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3854 "\\u003a\\u0e57\\u0fad\\u002e", 3855 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3856 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3857 "\\U000e005d\\u2044\\u0731\\u0650\\u0061", 3858 "\\u003a\\u0664\\u00b7\\u1fba", 3859 "\\u003b\\u0027\\u00b7\\u47a3", 3860 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b", 3861 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673", 3862 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c", 3863 }; 3864 int loop; 3865 if (U_FAILURE(status)) { 3866 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3867 return; 3868 } 3869 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3870 // printf("looping %d\n", loop); 3871 UnicodeString ustr = CharsToUnicodeString(strlist[loop]); 3872 // RBBICharMonkey monkey; 3873 RBBIWordMonkey monkey; 3874 3875 int expected[50]; 3876 int expectedcount = 0; 3877 3878 monkey.setText(ustr); 3879 int i; 3880 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 3881 expected[expectedcount ++] = i; 3882 } 3883 3884 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 3885 } 3886 delete bi; 3887 #endif 3888 } 3889 TestWordBoundary(void)3890 void RBBITest::TestWordBoundary(void) 3891 { 3892 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data> 3893 Locale locale("en"); 3894 UErrorCode status = U_ZERO_ERROR; 3895 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 3896 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 3897 UChar str[50]; 3898 static const char *strlist[] = 3899 { 3900 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", 3901 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044", 3902 "\\u003b\\u024a\\u102e\\U000e0071\\u0600", 3903 "\\u2027\\U000e0067\\u0a47\\u00b7", 3904 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0", 3905 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027", 3906 "\\u0589\\U000e006e\\u0a42\\U000104a5", 3907 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a", 3908 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7", 3909 "\\u0027\\u11af\\U000e0057\\u0602", 3910 "\\U0001d7f2\\U000e007\\u0004\\u0589", 3911 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b", 3912 "\\U0001d7f2\\U000e007d\\u0004\\u0589", 3913 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d", 3914 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959", 3915 "\\U000e0065\\u302c\\u09ee\\U000e0068", 3916 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7", 3917 "\\u0233\\U000e0020\\u0a69\\u0d6a", 3918 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019", 3919 "\\u58f4\\U000e0049\\u20e7\\u2027", 3920 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe", 3921 "\\ua183\\u102d\\u0bec\\u003a", 3922 "\\u17e8\\u06e7\\u002e\\u096d\\u003b", 3923 "\\u003a\\u0e57\\u0fad\\u002e", 3924 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de", 3925 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a", 3926 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019", 3927 "\\u003a\\u0664\\u00b7\\u1fba", 3928 "\\u003b\\u0027\\u00b7\\u47a3", 3929 }; 3930 int loop; 3931 if (U_FAILURE(status)) { 3932 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 3933 return; 3934 } 3935 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 3936 // printf("looping %d\n", loop); 3937 u_unescape(strlist[loop], str, 20); 3938 UnicodeString ustr(str); 3939 int forward[50]; 3940 int count = 0; 3941 3942 bi->setText(ustr); 3943 int prev = 0; 3944 int i; 3945 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { 3946 forward[count ++] = i; 3947 if (i > prev) { 3948 int j; 3949 for (j = prev + 1; j < i; j ++) { 3950 if (bi->isBoundary(j)) { 3951 printStringBreaks(ustr, forward, count); 3952 errln("happy boundary test failed: expected %d not a boundary", 3953 j); 3954 return; 3955 } 3956 } 3957 } 3958 if (!bi->isBoundary(i)) { 3959 printStringBreaks(ustr, forward, count); 3960 errln("happy boundary test failed: expected %d a boundary", 3961 i); 3962 return; 3963 } 3964 prev = i; 3965 } 3966 } 3967 delete bi; 3968 } 3969 TestLineBreaks(void)3970 void RBBITest::TestLineBreaks(void) 3971 { 3972 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 3973 Locale locale("en"); 3974 UErrorCode status = U_ZERO_ERROR; 3975 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 3976 const int32_t STRSIZE = 50; 3977 UChar str[STRSIZE]; 3978 static const char *strlist[] = 3979 { 3980 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc", 3981 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\" 3982 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d", 3983 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\" 3984 "u2014\\U000e0105\\u118c\\u000a\\u07f8", 3985 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f", 3986 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3987 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", 3988 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", 3989 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", 3990 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", 3991 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", 3992 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", 3993 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", 3994 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", 3995 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", 3996 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", 3997 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", 3998 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", 3999 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", 4000 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9", 4001 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025", 4002 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763", 4003 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029", 4004 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7", 4005 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a", 4006 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945", 4007 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014", 4008 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", 4009 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", 4010 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", 4011 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", 4012 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f", 4013 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010", 4014 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43", 4015 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb", 4016 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc", 4017 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060", 4018 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0", 4019 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07", 4020 }; 4021 int loop; 4022 TEST_ASSERT_SUCCESS(status); 4023 if (U_FAILURE(status)) { 4024 return; 4025 } 4026 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 4027 // printf("looping %d\n", loop); 4028 int32_t t = u_unescape(strlist[loop], str, STRSIZE); 4029 if (t >= STRSIZE) { 4030 TEST_ASSERT(FALSE); 4031 continue; 4032 } 4033 4034 4035 UnicodeString ustr(str); 4036 RBBILineMonkey monkey; 4037 if (U_FAILURE(monkey.deferredStatus)) { 4038 continue; 4039 } 4040 4041 const int EXPECTEDSIZE = 50; 4042 int expected[EXPECTEDSIZE]; 4043 int expectedcount = 0; 4044 4045 monkey.setText(ustr); 4046 int i; 4047 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4048 if (expectedcount >= EXPECTEDSIZE) { 4049 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4050 return; 4051 } 4052 expected[expectedcount ++] = i; 4053 } 4054 4055 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4056 } 4057 delete bi; 4058 #endif 4059 } 4060 TestSentBreaks(void)4061 void RBBITest::TestSentBreaks(void) 4062 { 4063 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4064 Locale locale("en"); 4065 UErrorCode status = U_ZERO_ERROR; 4066 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4067 UChar str[200]; 4068 static const char *strlist[] = 4069 { 4070 "Now\ris\nthe\r\ntime\n\rfor\r\r", 4071 "This\n", 4072 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.", 4073 "\"Sentence ending with a quote.\" Bye.", 4074 " (This is it). Testing the sentence iterator. \"This isn't it.\"", 4075 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"", 4076 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ", 4077 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ", 4078 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ", 4079 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!", 4080 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52" 4081 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a" 4082 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f" 4083 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352", 4084 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171" 4085 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030" 4086 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b" 4087 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b" 4088 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05" 4089 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4" 4090 }; 4091 int loop; 4092 if (U_FAILURE(status)) { 4093 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status)); 4094 return; 4095 } 4096 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) { 4097 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str)); 4098 UnicodeString ustr(str); 4099 4100 RBBISentMonkey monkey; 4101 if (U_FAILURE(monkey.deferredStatus)) { 4102 continue; 4103 } 4104 4105 const int EXPECTEDSIZE = 50; 4106 int expected[EXPECTEDSIZE]; 4107 int expectedcount = 0; 4108 4109 monkey.setText(ustr); 4110 int i; 4111 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) { 4112 if (expectedcount >= EXPECTEDSIZE) { 4113 TEST_ASSERT(expectedcount < EXPECTEDSIZE); 4114 return; 4115 } 4116 expected[expectedcount ++] = i; 4117 } 4118 4119 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount); 4120 } 4121 delete bi; 4122 #endif 4123 } 4124 TestMonkey()4125 void RBBITest::TestMonkey() { 4126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4127 4128 UErrorCode status = U_ZERO_ERROR; 4129 int32_t loopCount = 500; 4130 int32_t seed = 1; 4131 UnicodeString breakType = "all"; 4132 Locale locale("en"); 4133 UBool useUText = FALSE; 4134 4135 if (quick == FALSE) { 4136 loopCount = 10000; 4137 } 4138 4139 if (fTestParams) { 4140 UnicodeString p(fTestParams); 4141 loopCount = getIntParam("loop", p, loopCount); 4142 seed = getIntParam("seed", p, seed); 4143 4144 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status); 4145 if (m.find()) { 4146 breakType = m.group(1, status); 4147 m.reset(); 4148 p = m.replaceFirst("", status); 4149 } 4150 4151 RegexMatcher u(" *utext", p, 0, status); 4152 if (u.find()) { 4153 useUText = TRUE; 4154 u.reset(); 4155 p = u.replaceFirst("", status); 4156 } 4157 4158 4159 // m.reset(p); 4160 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) { 4161 // Each option is stripped out of the option string as it is processed. 4162 // All options have been checked. The option string should have been completely emptied.. 4163 char buf[100]; 4164 p.extract(buf, sizeof(buf), NULL, status); 4165 buf[sizeof(buf)-1] = 0; 4166 errln("Unrecognized or extra parameter: %s\n", buf); 4167 return; 4168 } 4169 4170 } 4171 4172 if (breakType == "char" || breakType == "all") { 4173 RBBICharMonkey m; 4174 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); 4175 if (U_SUCCESS(status)) { 4176 RunMonkey(bi, m, "char", seed, loopCount, useUText); 4177 if (breakType == "all" && useUText==FALSE) { 4178 // Also run a quick test with UText when "all" is specified 4179 RunMonkey(bi, m, "char", seed, loopCount, TRUE); 4180 } 4181 } 4182 else { 4183 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status)); 4184 } 4185 delete bi; 4186 } 4187 4188 if (breakType == "word" || breakType == "all") { 4189 logln("Word Break Monkey Test"); 4190 RBBIWordMonkey m; 4191 BreakIterator *bi = BreakIterator::createWordInstance(locale, status); 4192 if (U_SUCCESS(status)) { 4193 RunMonkey(bi, m, "word", seed, loopCount, useUText); 4194 } 4195 else { 4196 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status)); 4197 } 4198 delete bi; 4199 } 4200 4201 if (breakType == "line" || breakType == "all") { 4202 logln("Line Break Monkey Test"); 4203 RBBILineMonkey m; 4204 BreakIterator *bi = BreakIterator::createLineInstance(locale, status); 4205 if (loopCount >= 10) { 4206 loopCount = loopCount / 5; // Line break runs slower than the others. 4207 } 4208 if (U_SUCCESS(status)) { 4209 RunMonkey(bi, m, "line", seed, loopCount, useUText); 4210 } 4211 else { 4212 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4213 } 4214 delete bi; 4215 } 4216 4217 if (breakType == "sent" || breakType == "all" ) { 4218 logln("Sentence Break Monkey Test"); 4219 RBBISentMonkey m; 4220 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status); 4221 if (loopCount >= 10) { 4222 loopCount = loopCount / 10; // Sentence runs slower than the other break types 4223 } 4224 if (U_SUCCESS(status)) { 4225 RunMonkey(bi, m, "sentence", seed, loopCount, useUText); 4226 } 4227 else { 4228 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status)); 4229 } 4230 delete bi; 4231 } 4232 4233 #endif 4234 } 4235 4236 // 4237 // Run a RBBI monkey test. Common routine, for all break iterator types. 4238 // Parameters: 4239 // bi - the break iterator to use 4240 // mk - MonkeyKind, abstraction for obtaining expected results 4241 // name - Name of test (char, word, etc.) for use in error messages 4242 // seed - Seed for starting random number generator (parameter from user) 4243 // numIterations 4244 // RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4245 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed, 4246 int32_t numIterations, UBool useUText) { 4247 4248 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 4249 4250 const int32_t TESTSTRINGLEN = 500; 4251 UnicodeString testText; 4252 int32_t numCharClasses; 4253 UVector *chClasses; 4254 int expected[TESTSTRINGLEN*2 + 1]; 4255 int expectedCount = 0; 4256 char expectedBreaks[TESTSTRINGLEN*2 + 1]; 4257 char forwardBreaks[TESTSTRINGLEN*2 + 1]; 4258 char reverseBreaks[TESTSTRINGLEN*2+1]; 4259 char isBoundaryBreaks[TESTSTRINGLEN*2+1]; 4260 char followingBreaks[TESTSTRINGLEN*2+1]; 4261 char precedingBreaks[TESTSTRINGLEN*2+1]; 4262 int i; 4263 int loopCount = 0; 4264 4265 m_seed = seed; 4266 4267 numCharClasses = mk.charClasses()->size(); 4268 chClasses = mk.charClasses(); 4269 4270 // Check for errors that occured during the construction of the MonkeyKind object. 4271 // Can't report them where they occured because errln() is a method coming from intlTest, 4272 // and is not visible outside of RBBITest :-( 4273 if (U_FAILURE(mk.deferredStatus)) { 4274 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus)); 4275 return; 4276 } 4277 4278 // Verify that the character classes all have at least one member. 4279 for (i=0; i<numCharClasses; i++) { 4280 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i); 4281 if (s == NULL || s->size() == 0) { 4282 errln("Character Class #%d is null or of zero size.", i); 4283 return; 4284 } 4285 } 4286 4287 while (loopCount < numIterations || numIterations == -1) { 4288 if (numIterations == -1 && loopCount % 10 == 0) { 4289 // If test is running in an infinite loop, display a periodic tic so 4290 // we can tell that it is making progress. 4291 fprintf(stderr, "."); 4292 } 4293 // Save current random number seed, so that we can recreate the random numbers 4294 // for this loop iteration in event of an error. 4295 seed = m_seed; 4296 4297 // Populate a test string with data. 4298 testText.truncate(0); 4299 for (i=0; i<TESTSTRINGLEN; i++) { 4300 int32_t aClassNum = m_rand() % numCharClasses; 4301 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum); 4302 int32_t charIdx = m_rand() % classSet->size(); 4303 UChar32 c = classSet->charAt(charIdx); 4304 if (c < 0) { // TODO: deal with sets containing strings. 4305 errln("%s:%d c < 0", __FILE__, __LINE__); 4306 break; 4307 } 4308 // Do not assemble a supplementary character from randomly generated separate surrogates. 4309 // (It could be a dictionary character) 4310 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) { 4311 continue; 4312 } 4313 4314 testText.append(c); 4315 } 4316 4317 // Calculate the expected results for this test string. 4318 mk.setText(testText); 4319 memset(expectedBreaks, 0, sizeof(expectedBreaks)); 4320 expectedBreaks[0] = 1; 4321 int32_t breakPos = 0; 4322 expectedCount = 0; 4323 for (;;) { 4324 breakPos = mk.next(breakPos); 4325 if (breakPos == -1) { 4326 break; 4327 } 4328 if (breakPos > testText.length()) { 4329 errln("breakPos > testText.length()"); 4330 } 4331 expectedBreaks[breakPos] = 1; 4332 U_ASSERT(expectedCount<testText.length()); 4333 expected[expectedCount ++] = breakPos; 4334 (void)expected; // Set but not used warning. 4335 // TODO (andy): check it out. 4336 } 4337 4338 // Find the break positions using forward iteration 4339 memset(forwardBreaks, 0, sizeof(forwardBreaks)); 4340 if (useUText) { 4341 UErrorCode status = U_ZERO_ERROR; 4342 UText *testUText = utext_openReplaceable(NULL, &testText, &status); 4343 // testUText = utext_openUnicodeString(testUText, &testText, &status); 4344 bi->setText(testUText, status); 4345 TEST_ASSERT_SUCCESS(status); 4346 utext_close(testUText); // The break iterator does a shallow clone of the UText 4347 // This UText can be closed immediately, so long as the 4348 // testText string continues to exist. 4349 } else { 4350 bi->setText(testText); 4351 } 4352 4353 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) { 4354 if (i < 0 || i > testText.length()) { 4355 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4356 break; 4357 } 4358 forwardBreaks[i] = 1; 4359 } 4360 4361 // Find the break positions using reverse iteration 4362 memset(reverseBreaks, 0, sizeof(reverseBreaks)); 4363 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) { 4364 if (i < 0 || i > testText.length()) { 4365 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name); 4366 break; 4367 } 4368 reverseBreaks[i] = 1; 4369 } 4370 4371 // Find the break positions using isBoundary() tests. 4372 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks)); 4373 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length()); 4374 for (i=0; i<=testText.length(); i++) { 4375 isBoundaryBreaks[i] = bi->isBoundary(i); 4376 } 4377 4378 4379 // Find the break positions using the following() function. 4380 // printf("."); 4381 memset(followingBreaks, 0, sizeof(followingBreaks)); 4382 int32_t lastBreakPos = 0; 4383 followingBreaks[0] = 1; 4384 for (i=0; i<testText.length(); i++) { 4385 breakPos = bi->following(i); 4386 if (breakPos <= i || 4387 breakPos < lastBreakPos || 4388 breakPos > testText.length() || 4389 (breakPos > lastBreakPos && lastBreakPos > i)) { 4390 errln("%s break monkey test: " 4391 "Out of range value returned by BreakIterator::following().\n" 4392 "Random seed=%d index=%d; following returned %d; lastbreak=%d", 4393 name, seed, i, breakPos, lastBreakPos); 4394 break; 4395 } 4396 followingBreaks[breakPos] = 1; 4397 lastBreakPos = breakPos; 4398 } 4399 4400 // Find the break positions using the preceding() function. 4401 memset(precedingBreaks, 0, sizeof(precedingBreaks)); 4402 lastBreakPos = testText.length(); 4403 precedingBreaks[testText.length()] = 1; 4404 for (i=testText.length(); i>0; i--) { 4405 breakPos = bi->preceding(i); 4406 if (breakPos >= i || 4407 breakPos > lastBreakPos || 4408 (breakPos < 0 && testText.getChar32Start(i)>0) || 4409 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) { 4410 errln("%s break monkey test: " 4411 "Out of range value returned by BreakIterator::preceding().\n" 4412 "index=%d; prev returned %d; lastBreak=%d" , 4413 name, i, breakPos, lastBreakPos); 4414 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) { 4415 precedingBreaks[i] = 2; // Forces an error. 4416 } 4417 } else { 4418 if (breakPos >= 0) { 4419 precedingBreaks[breakPos] = 1; 4420 } 4421 lastBreakPos = breakPos; 4422 } 4423 } 4424 4425 // Compare the expected and actual results. 4426 for (i=0; i<=testText.length(); i++) { 4427 const char *errorType = NULL; 4428 if (forwardBreaks[i] != expectedBreaks[i]) { 4429 errorType = "next()"; 4430 } else if (reverseBreaks[i] != forwardBreaks[i]) { 4431 errorType = "previous()"; 4432 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 4433 errorType = "isBoundary()"; 4434 } else if (followingBreaks[i] != expectedBreaks[i]) { 4435 errorType = "following()"; 4436 } else if (precedingBreaks[i] != expectedBreaks[i]) { 4437 errorType = "preceding()"; 4438 } 4439 4440 4441 if (errorType != NULL) { 4442 // Format a range of the test text that includes the failure as 4443 // a data item that can be included in the rbbi test data file. 4444 4445 // Start of the range is the last point where expected and actual results 4446 // both agreed that there was a break position. 4447 int startContext = i; 4448 int32_t count = 0; 4449 for (;;) { 4450 if (startContext==0) { break; } 4451 startContext --; 4452 if (expectedBreaks[startContext] != 0) { 4453 if (count == 2) break; 4454 count ++; 4455 } 4456 } 4457 4458 // End of range is two expected breaks past the start position. 4459 int endContext = i + 1; 4460 int ci; 4461 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 4462 for (;;) { 4463 if (endContext >= testText.length()) {break;} 4464 if (expectedBreaks[endContext-1] != 0) { 4465 if (count == 0) break; 4466 count --; 4467 } 4468 endContext ++; 4469 } 4470 } 4471 4472 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>" 4473 UnicodeString errorText = "<data>"; 4474 /***if (strcmp(errorType, "next()") == 0) { 4475 startContext = 0; 4476 endContext = testText.length(); 4477 4478 printStringBreaks(testText, expected, expectedCount); 4479 }***/ 4480 4481 for (ci=startContext; ci<endContext;) { 4482 UnicodeString hexChars("0123456789abcdef"); 4483 UChar32 c; 4484 int bn; 4485 c = testText.char32At(ci); 4486 if (ci == i) { 4487 // This is the location of the error. 4488 errorText.append("<?>"); 4489 } else if (expectedBreaks[ci] != 0) { 4490 // This a non-error expected break position. 4491 errorText.append("\\"); 4492 } 4493 if (c < 0x10000) { 4494 errorText.append("\\u"); 4495 for (bn=12; bn>=0; bn-=4) { 4496 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4497 } 4498 } else { 4499 errorText.append("\\U"); 4500 for (bn=28; bn>=0; bn-=4) { 4501 errorText.append(hexChars.charAt((c>>bn)&0xf)); 4502 } 4503 } 4504 ci = testText.moveIndex32(ci, 1); 4505 } 4506 errorText.append("\\"); 4507 errorText.append("</data>\n"); 4508 4509 // Output the error 4510 char charErrorTxt[500]; 4511 UErrorCode status = U_ZERO_ERROR; 4512 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status); 4513 charErrorTxt[sizeof(charErrorTxt)-1] = 0; 4514 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status); 4515 4516 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s", 4517 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"), 4518 errorType, seed, i, charErrorTxt); 4519 break; 4520 } 4521 } 4522 4523 loopCount++; 4524 } 4525 #endif 4526 } 4527 4528 4529 // Bug 5532. UTF-8 based UText fails in dictionary code. 4530 // This test checks the initial patch, 4531 // which is to just keep it from crashing. Correct word boundaries 4532 // await a proper fix to the dictionary code. 4533 // TestBug5532(void)4534 void RBBITest::TestBug5532(void) { 4535 // Text includes a mixture of Thai and Latin. 4536 const unsigned char utf8Data[] = { 4537 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u, 4538 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u, 4539 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u, 4540 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 4541 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u, 4542 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u, 4543 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu, 4544 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u, 4545 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 4546 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u, 4547 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00}; 4548 4549 UErrorCode status = U_ZERO_ERROR; 4550 UText utext=UTEXT_INITIALIZER; 4551 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status); 4552 TEST_ASSERT_SUCCESS(status); 4553 4554 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status); 4555 TEST_ASSERT_SUCCESS(status); 4556 if (U_SUCCESS(status)) { 4557 bi->setText(&utext, status); 4558 TEST_ASSERT_SUCCESS(status); 4559 4560 int32_t breakCount = 0; 4561 int32_t previousBreak = -1; 4562 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) { 4563 // For now, just make sure that the break iterator doesn't hang. 4564 TEST_ASSERT(previousBreak < bi->current()); 4565 previousBreak = bi->current(); 4566 } 4567 TEST_ASSERT(breakCount > 0); 4568 } 4569 delete bi; 4570 utext_close(&utext); 4571 } 4572 4573 TestBug9983(void)4574 void RBBITest::TestBug9983(void) { 4575 UnicodeString text = UnicodeString("\\u002A" // * Other 4576 "\\uFF65" // Other 4577 "\\u309C" // Katakana 4578 "\\uFF9F" // Extend 4579 "\\uFF65" // Other 4580 "\\u0020" // Other 4581 "\\u0000").unescape(); 4582 4583 UErrorCode status = U_ZERO_ERROR; 4584 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>( 4585 BreakIterator::createWordInstance(Locale::getRoot(), status))); 4586 TEST_ASSERT_SUCCESS(status); 4587 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>( 4588 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status))); 4589 TEST_ASSERT_SUCCESS(status); 4590 if (U_FAILURE(status)) { 4591 return; 4592 } 4593 int32_t offset, rstatus, iterationCount; 4594 4595 brkiter->setText(text); 4596 brkiter->last(); 4597 iterationCount = 0; 4598 while ( (offset = brkiter->previous()) != UBRK_DONE ) { 4599 iterationCount++; 4600 rstatus = brkiter->getRuleStatus(); 4601 (void)rstatus; // Suppress set but not used warning. 4602 if (iterationCount >= 10) { 4603 break; 4604 } 4605 } 4606 TEST_ASSERT(iterationCount == 6); 4607 4608 brkiterPOSIX->setText(text); 4609 brkiterPOSIX->last(); 4610 iterationCount = 0; 4611 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) { 4612 iterationCount++; 4613 rstatus = brkiterPOSIX->getRuleStatus(); 4614 (void)rstatus; // Suppress set but not used warning. 4615 if (iterationCount >= 10) { 4616 break; 4617 } 4618 } 4619 TEST_ASSERT(iterationCount == 6); 4620 } 4621 4622 // Bug 7547 - verify that building a break itereator from empty rules produces an error. 4623 // TestBug7547()4624 void RBBITest::TestBug7547() { 4625 UnicodeString rules; 4626 UErrorCode status = U_ZERO_ERROR; 4627 UParseError parseError; 4628 RuleBasedBreakIterator breakIterator(rules, parseError, status); 4629 if (status != U_BRK_RULE_SYNTAX) { 4630 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status)); 4631 } 4632 if (parseError.line != 1 || parseError.offset != 0) { 4633 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset); 4634 } 4635 } 4636 4637 TestBug12797()4638 void RBBITest::TestBug12797() { 4639 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;"; 4640 UErrorCode status = U_ZERO_ERROR; 4641 UParseError parseError; 4642 RuleBasedBreakIterator bi(rules, parseError, status); 4643 if (U_FAILURE(status)) { 4644 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status)); 4645 return; 4646 } 4647 UnicodeString text = "abc"; 4648 bi.setText(text); 4649 bi.first(); 4650 int32_t boundary = bi.next(); 4651 if (boundary != 3) { 4652 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary); 4653 } 4654 } 4655 TestBug12918()4656 void RBBITest::TestBug12918() { 4657 // This test triggers an assertion failure in dictbe.cpp 4658 const UChar crasherString[] = { 0x3325, 0x4a16, 0 }; 4659 UErrorCode status = U_ZERO_ERROR; 4660 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status); 4661 if (U_FAILURE(status)) { 4662 errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status)); 4663 return; 4664 } 4665 ubrk_first(iter); 4666 int32_t pos = 0; 4667 int32_t lastPos = -1; 4668 while((pos = ubrk_next(iter)) != UBRK_DONE) { 4669 if (pos <= lastPos) { 4670 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos); 4671 break; 4672 } 4673 } 4674 ubrk_close(iter); 4675 } 4676 4677 // 4678 // TestDebug - A place-holder test for debugging purposes. 4679 // For putting in fragments of other tests that can be invoked 4680 // for tracing without a lot of unwanted extra stuff happening. 4681 // TestDebug(void)4682 void RBBITest::TestDebug(void) { 4683 4684 } 4685 TestProperties()4686 void RBBITest::TestProperties() { 4687 UErrorCode errorCode = U_ZERO_ERROR; 4688 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode); 4689 if (!prependSet.isEmpty()) { 4690 errln( 4691 "[:GCB=Prepend:] is not empty any more. " 4692 "Uncomment relevant lines in source/data/brkitr/char.txt and " 4693 "change this test to the opposite condition."); 4694 } 4695 } 4696 4697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 4698