1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8 /************************************************************************
9 * Date Name Description
10 * 12/15/99 Madhu Creation.
11 * 01/12/2000 Madhu Updated for changed API and added new tests
12 ************************************************************************/
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_BREAK_ITERATION
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <string.h>
20
21 #include "unicode/brkiter.h"
22 #include "unicode/localpointer.h"
23 #include "unicode/numfmt.h"
24 #include "unicode/rbbi.h"
25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26 #include "unicode/regex.h"
27 #endif
28 #include "unicode/schriter.h"
29 #include "unicode/uchar.h"
30 #include "unicode/utf16.h"
31 #include "unicode/ucnv.h"
32 #include "unicode/uniset.h"
33 #include "unicode/uscript.h"
34 #include "unicode/ustring.h"
35 #include "unicode/utext.h"
36
37 #include "charstr.h"
38 #include "cmemory.h"
39 #include "intltest.h"
40 #include "rbbitst.h"
41 #include "utypeinfo.h" // for 'typeid' to work
42 #include "uvector.h"
43 #include "uvectr32.h"
44
45 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
46 #include "unicode/filteredbrk.h"
47 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
48
// TEST_ASSERT(x)
//    Reports a failure through errln(), tagged with the current source file and
//    line number, when the expression x is false.  The expansion calls errln()
//    unqualified, so the macro can only be used where that name is in scope.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode)
//    Reports a failure through errcheckln(), including the symbolic name of the
//    error code, when errcode holds an ICU failure value.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", \
                   __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
54
55
56 //---------------------------------------------
57 // runIndexedTest
58 //---------------------------------------------
59
60
// Note:  Before adding new tests to this file, check whether the desired test data can
//        simply be added to the file testdata/rbbitest.txt.  In most cases it can,
//        it's much less work than writing a new test, diagnostic output in the event of failures
//        is good, and the test data file is shared with ICU4J, so eventually the test
//        will run there as well, without additional effort.
66
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)67 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
68 {
69 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
70 fTestParams = params;
71
72 TESTCASE_AUTO_BEGIN;
73 #if !UCONFIG_NO_FILE_IO
74 TESTCASE_AUTO(TestBug4153072);
75 #endif
76 TESTCASE_AUTO(TestStatusReturn);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(TestUnicodeFiles);
79 TESTCASE_AUTO(TestEmptyString);
80 #endif
81 TESTCASE_AUTO(TestGetAvailableLocales);
82 TESTCASE_AUTO(TestGetDisplayName);
83 #if !UCONFIG_NO_FILE_IO
84 TESTCASE_AUTO(TestEndBehaviour);
85 TESTCASE_AUTO(TestWordBreaks);
86 TESTCASE_AUTO(TestWordBoundary);
87 TESTCASE_AUTO(TestLineBreaks);
88 TESTCASE_AUTO(TestSentBreaks);
89 TESTCASE_AUTO(TestExtended);
90 #endif
91 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
92 TESTCASE_AUTO(TestMonkey);
93 #endif
94 #if !UCONFIG_NO_FILE_IO
95 TESTCASE_AUTO(TestBug3818);
96 #endif
97 TESTCASE_AUTO(TestDebug);
98 #if !UCONFIG_NO_FILE_IO
99 TESTCASE_AUTO(TestBug5775);
100 #endif
101 TESTCASE_AUTO(TestBug9983);
102 TESTCASE_AUTO(TestDictRules);
103 TESTCASE_AUTO(TestBug5532);
104 TESTCASE_AUTO(TestBug7547);
105 TESTCASE_AUTO(TestBug12797);
106 TESTCASE_AUTO(TestBug12918);
107 TESTCASE_AUTO_END;
108 }
109
110
111 //---------------------------------------------------------------------------
112 //
113 // class BITestData Holds a set of Break iterator test data and results
114 // Includes
115 // - the string data to be broken
116 // - a vector of the expected break positions.
117 // - a vector of source line numbers for the data,
//                (to help see where errors occurred.)
119 // - The expected break tag values.
120 // - Vectors of actual break positions and tag values.
121 // - Functions for comparing actual with expected and
122 // reporting errors.
123 //
124 //----------------------------------------------------------------------------
125 class BITestData {
126 public:
127 UnicodeString fDataToBreak;
128 UVector fExpectedBreakPositions;
129 UVector fExpectedTags;
130 UVector fLineNum;
131 UVector fActualBreakPositions; // Test Results.
132 UVector fActualTags;
133
134 BITestData(UErrorCode &status);
135 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
136 void checkResults(const char *heading, RBBITest *test);
137 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
138 void clearResults();
139 };
140
141 //
142 // Constructor.
143 //
BITestData(UErrorCode & status)144 BITestData::BITestData(UErrorCode &status)
145 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
146 fActualTags(status)
147 {
148 }
149
150 //
//  addDataChunk.   Add a section (non-breaking) piece of data to the test data.
152 // The macro form collects the line number, which is helpful
153 // when tracking down failures.
154 //
155 // A null data item is inserted at the start of each test's data
156 // to put the starting zero into the data list. The position saved for
157 // each non-null item is its ending position.
158 //
159 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)160 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
161 if (U_FAILURE(status)) {return;}
162 if (data != NULL) {
163 fDataToBreak.append(CharsToUnicodeString(data));
164 }
165 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
166 fExpectedTags.addElement(tag, status);
167 fLineNum.addElement(lineNum, status);
168 }
169
170
171 //
172 // checkResults. Compare the actual and expected break positions, report any differences.
173 //
checkResults(const char * heading,RBBITest * test)174 void BITestData::checkResults(const char *heading, RBBITest *test) {
175 int32_t expectedIndex = 0;
176 int32_t actualIndex = 0;
177
178 for (;;) {
179 // If we've run through both the expected and actual results vectors, we're done.
180 // break out of the loop.
181 if (expectedIndex >= fExpectedBreakPositions.size() &&
182 actualIndex >= fActualBreakPositions.size()) {
183 break;
184 }
185
186
187 if (expectedIndex >= fExpectedBreakPositions.size()) {
188 err(heading, test, expectedIndex-1, actualIndex);
189 actualIndex++;
190 continue;
191 }
192
193 if (actualIndex >= fActualBreakPositions.size()) {
194 err(heading, test, expectedIndex, actualIndex-1);
195 expectedIndex++;
196 continue;
197 }
198
199 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
200 err(heading, test, expectedIndex, actualIndex);
201 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
202 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
203 actualIndex++;
204 } else {
205 expectedIndex++;
206 }
207 continue;
208 }
209
210 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
211 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
212 heading, fLineNum.elementAt(expectedIndex),
213 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
214 }
215
216 actualIndex++;
217 expectedIndex++;
218 }
219 }
220
221 //
222 // err - An error was found. Report it, along with information about where the
223 // incorrectly broken test data appeared in the source file.
224 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)225 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
226 {
227 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
228 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
229 int32_t o = 0;
230 int32_t line = fLineNum.elementAti(expectedIdx);
231 if (expectedIdx > 0) {
232 // The line numbers are off by one because a premature break occurs somewhere
233 // within the previous item, rather than at the start of the current (expected) item.
234 // We want to report the offset of the unexpected break from the start of
235 // this previous item.
236 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
237 }
238 if (actual < expected) {
239 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
240 } else {
241 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
242 }
243 }
244
245
clearResults()246 void BITestData::clearResults() {
247 fActualBreakPositions.removeAllElements();
248 fActualTags.removeAllElements();
249 }
250
251
252 //--------------------------------------------------------------------------------------
253 //
254 // RBBITest constructor and destructor
255 //
256 //--------------------------------------------------------------------------------------
257
RBBITest()258 RBBITest::RBBITest() {
259 fTestParams = NULL;
260 }
261
262
~RBBITest()263 RBBITest::~RBBITest() {
264 }
265
266 //-----------------------------------------------------------------------------------
267 //
268 // Test for status {tag} return value from break rules.
269 // TODO: a more thorough test.
270 //
271 //-----------------------------------------------------------------------------------
TestStatusReturn()272 void RBBITest::TestStatusReturn() {
273 UnicodeString rulesString1("$Letters = [:L:];\n"
274 "$Numbers = [:N:];\n"
275 "$Letters+{1};\n"
276 "$Numbers+{2};\n"
277 "Help\\ /me\\!{4};\n"
278 "[^$Letters $Numbers];\n"
279 "!.*;\n", -1, US_INV);
280 UnicodeString testString1 = "abc123..abc Help me Help me!";
281 // 01234567890123456789012345678
282 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
283 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
284
285 UErrorCode status=U_ZERO_ERROR;
286 UParseError parseError;
287
288 LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
289 if(U_FAILURE(status)) {
290 dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__, u_errorName(status));
291 return;
292 }
293 int32_t pos;
294 int32_t i = 0;
295 bi->setText(testString1);
296 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
297 if (pos != bounds1[i]) {
298 errln("%s:%d expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
299 break;
300 }
301
302 int tag = bi->getRuleStatus();
303 if (tag != brkStatus[i]) {
304 errln("%s:%d break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
305 break;
306 }
307 i++;
308 }
309 }
310
311
printStringBreaks(UText * tstr,int expected[],int expectedCount)312 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
313 UErrorCode status = U_ZERO_ERROR;
314 char name[100];
315 printf("code alpha extend alphanum type word sent line name\n");
316 int nextExpectedIndex = 0;
317 utext_setNativeIndex(tstr, 0);
318 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
319 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
320 printf("------------------------------------------------ %d\n", j);
321 ++nextExpectedIndex;
322 }
323
324 UChar32 c = utext_next32(tstr);
325 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
326 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
327 u_isUAlphabetic(c),
328 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
329 u_isalnum(c),
330 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
331 u_charType(c),
332 U_SHORT_PROPERTY_NAME),
333 u_getPropertyValueName(UCHAR_WORD_BREAK,
334 u_getIntPropertyValue(c,
335 UCHAR_WORD_BREAK),
336 U_SHORT_PROPERTY_NAME),
337 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
338 u_getIntPropertyValue(c,
339 UCHAR_SENTENCE_BREAK),
340 U_SHORT_PROPERTY_NAME),
341 u_getPropertyValueName(UCHAR_LINE_BREAK,
342 u_getIntPropertyValue(c,
343 UCHAR_LINE_BREAK),
344 U_SHORT_PROPERTY_NAME),
345 name);
346 }
347 }
348
349
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)350 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
351 UErrorCode status = U_ZERO_ERROR;
352 UText *tstr = NULL;
353 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
354 if (U_FAILURE(status)) {
355 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
356 return;
357 }
358 printStringBreaks(tstr, expected, expectedCount);
359 utext_close(tstr);
360 }
361
362
TestBug3818()363 void RBBITest::TestBug3818() {
364 UErrorCode status = U_ZERO_ERROR;
365
366 // Four Thai words...
367 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
368 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
369 UnicodeString thaiStr(thaiWordData);
370
371 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
372 if (U_FAILURE(status) || bi == NULL) {
373 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
374 return;
375 }
376 bi->setText(thaiStr);
377
378 int32_t startOfSecondWord = bi->following(1);
379 if (startOfSecondWord != 4) {
380 errln("Fail at file %s, line %d expected start of word at 4, got %d",
381 __FILE__, __LINE__, startOfSecondWord);
382 }
383 startOfSecondWord = bi->following(0);
384 if (startOfSecondWord != 4) {
385 errln("Fail at file %s, line %d expected start of word at 4, got %d",
386 __FILE__, __LINE__, startOfSecondWord);
387 }
388 delete bi;
389 }
390
391 //----------------------------------------------------------------------------
392 //
393 // generalIteratorTest Given a break iterator and a set of test data,
394 // Run the tests and report the results.
395 //
396 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)397 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
398 {
399
400 bi.setText(td.fDataToBreak);
401
402 testFirstAndNext(bi, td);
403
404 testLastAndPrevious(bi, td);
405
406 testFollowing(bi, td);
407 testPreceding(bi, td);
408 testIsBoundary(bi, td);
409 doMultipleSelectionTest(bi, td);
410 }
411
412
413 //
414 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
415 // kind of loop.
416 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)417 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
418 {
419 UErrorCode status = U_ZERO_ERROR;
420 int32_t p;
421 int32_t lastP = -1;
422 int32_t tag;
423
424 logln("Test first and next");
425 bi.setText(td.fDataToBreak);
426 td.clearResults();
427
428 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
429 td.fActualBreakPositions.addElement(p, status); // Save result.
430 tag = bi.getRuleStatus();
431 td.fActualTags.addElement(tag, status);
432 if (p <= lastP) {
433 // If the iterator is not making forward progress, stop.
434 // No need to raise an error here, it'll be detected in the normal check of results.
435 break;
436 }
437 lastP = p;
438 }
439 td.checkResults("testFirstAndNext", this);
440 }
441
442
443 //
444 // TestLastAndPrevious. Run the iterator backwards, starting with last().
445 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)446 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
447 {
448 UErrorCode status = U_ZERO_ERROR;
449 int32_t p;
450 int32_t lastP = 0x7ffffffe;
451 int32_t tag;
452
453 logln("Test last and previous");
454 bi.setText(td.fDataToBreak);
455 td.clearResults();
456
457 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
458 // Save break position. Insert it at start of vector of results, shoving
459 // already-saved results further towards the end.
460 td.fActualBreakPositions.insertElementAt(p, 0, status);
461 // bi.previous(); // TODO: Why does this fix things up????
462 // bi.next();
463 tag = bi.getRuleStatus();
464 td.fActualTags.insertElementAt(tag, 0, status);
465 if (p >= lastP) {
466 // If the iterator is not making progress, stop.
467 // No need to raise an error here, it'll be detected in the normal check of results.
468 break;
469 }
470 lastP = p;
471 }
472 td.checkResults("testLastAndPrevious", this);
473 }
474
475
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)476 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
477 {
478 UErrorCode status = U_ZERO_ERROR;
479 int32_t p;
480 int32_t tag;
481 int32_t lastP = -2; // A value that will never be returned as a break position.
482 // cannot be -1; that is returned for DONE.
483 int i;
484
485 logln("testFollowing():");
486 bi.setText(td.fDataToBreak);
487 td.clearResults();
488
489 // Save the starting point, since we won't get that out of following.
490 p = bi.first();
491 td.fActualBreakPositions.addElement(p, status); // Save result.
492 tag = bi.getRuleStatus();
493 td.fActualTags.addElement(tag, status);
494
495 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
496 p = bi.following(i);
497 if (p != lastP) {
498 if (p == RuleBasedBreakIterator::DONE) {
499 break;
500 }
501 // We've reached a new break position. Save it.
502 td.fActualBreakPositions.addElement(p, status); // Save result.
503 tag = bi.getRuleStatus();
504 td.fActualTags.addElement(tag, status);
505 lastP = p;
506 }
507 }
508 // The loop normally exits by means of the break in the middle.
509 // Make sure that the index was at the correct position for the break iterator to have
510 // returned DONE.
511 if (i != td.fDataToBreak.length()) {
512 errln("testFollowing(): iterator returned DONE prematurely.");
513 }
514
515 // Full check of all results.
516 td.checkResults("testFollowing", this);
517 }
518
519
520
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)521 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
522 UErrorCode status = U_ZERO_ERROR;
523 int32_t p;
524 int32_t tag;
525 int32_t lastP = 0x7ffffffe;
526 int i;
527
528 logln("testPreceding():");
529 bi.setText(td.fDataToBreak);
530 td.clearResults();
531
532 p = bi.last();
533 td.fActualBreakPositions.addElement(p, status);
534 tag = bi.getRuleStatus();
535 td.fActualTags.addElement(tag, status);
536
537 for (i = td.fDataToBreak.length(); i>=-1; i--) {
538 p = bi.preceding(i);
539 if (p != lastP) {
540 if (p == RuleBasedBreakIterator::DONE) {
541 break;
542 }
543 // We've reached a new break position. Save it.
544 td.fActualBreakPositions.insertElementAt(p, 0, status);
545 lastP = p;
546 tag = bi.getRuleStatus();
547 td.fActualTags.insertElementAt(tag, 0, status);
548 }
549 }
550 // The loop normally exits by means of the break in the middle.
551 // Make sure that the index was at the correct position for the break iterator to have
552 // returned DONE.
553 if (i != 0) {
554 errln("testPreceding(): iterator returned DONE prematurely.");
555 }
556
557 // Full check of all results.
558 td.checkResults("testPreceding", this);
559 }
560
561
562
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)563 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
564 UErrorCode status = U_ZERO_ERROR;
565 int i;
566 int32_t tag;
567
568 logln("testIsBoundary():");
569 bi.setText(td.fDataToBreak);
570 td.clearResults();
571
572 for (i = 0; i <= td.fDataToBreak.length(); i++) {
573 if (bi.isBoundary(i)) {
574 td.fActualBreakPositions.addElement(i, status); // Save result.
575 tag = bi.getRuleStatus();
576 td.fActualTags.addElement(tag, status);
577 }
578 }
579 td.checkResults("testIsBoundary: ", this);
580 }
581
582
583
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)584 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
585 {
586 iterator.setText(td.fDataToBreak);
587
588 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
589 int32_t offset = iterator.first();
590 int32_t testOffset;
591 int32_t count = 0;
592
593 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
594
595 if (*testIterator != iterator)
596 errln("clone() or operator!= failed: two clones compared unequal");
597
598 do {
599 testOffset = testIterator->first();
600 testOffset = testIterator->next(count);
601 if (offset != testOffset)
602 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
603
604 if (offset != RuleBasedBreakIterator::DONE) {
605 count++;
606 offset = iterator.next();
607
608 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
609 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
610 if (count > 10000 || offset == -1) {
611 errln("operator== failed too many times. Stopping test.");
612 if (offset == -1) {
613 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
614 }
615 return;
616 }
617 }
618 }
619 } while (offset != RuleBasedBreakIterator::DONE);
620
621 // now do it backwards...
622 offset = iterator.last();
623 count = 0;
624
625 do {
626 testOffset = testIterator->last();
627 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
628 if (offset != testOffset)
629 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
630
631 if (offset != RuleBasedBreakIterator::DONE) {
632 count--;
633 offset = iterator.previous();
634 }
635 } while (offset != RuleBasedBreakIterator::DONE);
636
637 delete testIterator;
638 }
639
640
641 //---------------------------------------------
642 //
643 // other tests
644 //
645 //---------------------------------------------
TestEmptyString()646 void RBBITest::TestEmptyString()
647 {
648 UnicodeString text = "";
649 UErrorCode status = U_ZERO_ERROR;
650
651 BITestData x(status);
652 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
653 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
654 if (U_FAILURE(status))
655 {
656 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
657 return;
658 }
659 generalIteratorTest(*bi, x);
660 delete bi;
661 }
662
TestGetAvailableLocales()663 void RBBITest::TestGetAvailableLocales()
664 {
665 int32_t locCount = 0;
666 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
667
668 if (locCount == 0)
669 dataerrln("getAvailableLocales() returned an empty list!");
670 // Just make sure that it's returning good memory.
671 int32_t i;
672 for (i = 0; i < locCount; ++i) {
673 logln(locList[i].getName());
674 }
675 }
676
677 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()678 void RBBITest::TestGetDisplayName()
679 {
680 UnicodeString result;
681
682 BreakIterator::getDisplayName(Locale::getUS(), result);
683 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
684 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
685 + result);
686
687 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
688 if (result != "French (France)")
689 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
690 + result);
691 }
692 /**
693 * Test End Behaviour
694 * @bug 4068137
695 */
TestEndBehaviour()696 void RBBITest::TestEndBehaviour()
697 {
698 UErrorCode status = U_ZERO_ERROR;
699 UnicodeString testString("boo.");
700 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
701 if (U_FAILURE(status))
702 {
703 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
704 return;
705 }
706 wb->setText(testString);
707
708 if (wb->first() != 0)
709 errln("Didn't get break at beginning of string.");
710 if (wb->next() != 3)
711 errln("Didn't get break before period in \"boo.\"");
712 if (wb->current() != 4 && wb->next() != 4)
713 errln("Didn't get break at end of string.");
714 delete wb;
715 }
716 /*
717 * @bug 4153072
718 */
TestBug4153072()719 void RBBITest::TestBug4153072() {
720 UErrorCode status = U_ZERO_ERROR;
721 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
722 if (U_FAILURE(status))
723 {
724 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
725 return;
726 }
727 UnicodeString str("...Hello, World!...");
728 int32_t begin = 3;
729 int32_t end = str.length() - 3;
730 UBool onBoundary;
731
732 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
733 iter->adoptText(textIterator);
734 int index;
735 // Note: with the switch to UText, there is no way to restrict the
736 // iteration range to begin at an index other than zero.
737 // String character iterators created with a non-zero bound are
738 // treated by RBBI as being empty.
739 for (index = -1; index < begin + 1; ++index) {
740 onBoundary = iter->isBoundary(index);
741 if (index == 0? !onBoundary : onBoundary) {
742 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
743 " and begin index = " + begin);
744 }
745 }
746 delete iter;
747 }
748
749
750 //
751 // Test for problem reported by Ashok Matoria on 9 July 2007
752 // One.<kSoftHyphen><kSpace>Two.
753 //
754 // Sentence break at start (0) and then on calling next() it breaks at
755 // 'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
757 //
TestBug5775()758 void RBBITest::TestBug5775() {
759 UErrorCode status = U_ZERO_ERROR;
760 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
761 TEST_ASSERT_SUCCESS(status);
762 if (U_FAILURE(status)) {
763 return;
764 }
765 // Check for status first for better handling of no data errors.
766 TEST_ASSERT(bi != NULL);
767 if (bi == NULL) {
768 return;
769 }
770
771 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
772 // 01234 56789
773 s = s.unescape();
774 bi->setText(s);
775 int pos = bi->next();
776 TEST_ASSERT(pos == 6);
777 pos = bi->next();
778 TEST_ASSERT(pos == 10);
779 pos = bi->previous();
780 TEST_ASSERT(pos == 6);
781 delete bi;
782 }
783
784
785
786 //------------------------------------------------------------------------------
787 //
788 // RBBITest::Extended Run RBBI Tests from an external test data file
789 //
790 //------------------------------------------------------------------------------
791
792 struct TestParams {
793 BreakIterator *bi; // Break iterator is set while parsing test source.
794 // Changed out whenever test data changes break type.
795
796 UnicodeString dataToBreak; // Data that is built up while parsing the test.
797 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
798 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
799 UVector32 *srcCol;
800
801 UText *textToBreak; // UText, could be UTF8 or UTF16.
802 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
803 CharString utf8String; // UTF-8 form of text to break.
804
TestParamsTestParams805 TestParams(UErrorCode &status) : dataToBreak() {
806 bi = NULL;
807 expectedBreaks = new UVector32(status);
808 srcLine = new UVector32(status);
809 srcCol = new UVector32(status);
810 textToBreak = NULL;
811 textMap = new UVector32(status);
812 }
813
~TestParamsTestParams814 ~TestParams() {
815 delete bi;
816 delete expectedBreaks;
817 delete srcLine;
818 delete srcCol;
819 utext_close(textToBreak);
820 delete textMap;
821 }
822
823 int32_t getSrcLine(int32_t bp);
824 int32_t getExpectedBreak(int32_t bp);
825 int32_t getSrcCol(int32_t bp);
826
827 void setUTF16(UErrorCode &status);
828 void setUTF8(UErrorCode &status);
829 };
830
831 // Append a UnicodeString to a CharString with UTF-8 encoding.
832 // Substitute any invalid chars.
833 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)834 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
835 if (U_FAILURE(status)) {
836 return;
837 }
838 int32_t utf8Length;
839 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
840 src.getBuffer(), src.length(), // UTF-16 data
841 0xfffd, NULL, // Substitution char, number of subs.
842 &status);
843 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
844 return;
845 }
846 status = U_ZERO_ERROR;
847 int32_t capacity;
848 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
849 u_strToUTF8WithSub(buffer, utf8Length, NULL,
850 src.getBuffer(), src.length(),
851 0xfffd, NULL, &status);
852 dest.append(buffer, utf8Length, status);
853 }
854
855
setUTF16(UErrorCode & status)856 void TestParams::setUTF16(UErrorCode &status) {
857 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
858 textMap->removeAllElements();
859 for (int32_t i=0; i<dataToBreak.length(); i++) {
860 if (i == dataToBreak.getChar32Start(i)) {
861 textMap->addElement(i, status);
862 } else {
863 textMap->addElement(-1, status);
864 }
865 }
866 textMap->addElement(dataToBreak.length(), status);
867 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
868 }
869
870
setUTF8(UErrorCode & status)871 void TestParams::setUTF8(UErrorCode &status) {
872 if (U_FAILURE(status)) {
873 return;
874 }
875 utf8String.clear();
876 CharStringAppend(utf8String, dataToBreak, status);
877 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
878 if (U_FAILURE(status)) {
879 return;
880 }
881
882 textMap->removeAllElements();
883 int32_t utf16Index = 0;
884 for (;;) {
885 textMap->addElement(utf16Index, status);
886 UChar32 c32 = utext_current32(textToBreak);
887 if (c32 < 0) {
888 break;
889 }
890 utf16Index += U16_LENGTH(c32);
891 utext_next32(textToBreak);
892 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
893 textMap->addElement(-1, status);
894 }
895 }
896 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
897 }
898
899
getSrcLine(int32_t bp)900 int32_t TestParams::getSrcLine(int32_t bp) {
901 if (bp >= textMap->size()) {
902 bp = textMap->size() - 1;
903 }
904 int32_t i = 0;
905 for(; bp >= 0 ; --bp) {
906 // Move to a character boundary if we are not on one already.
907 i = textMap->elementAti(bp);
908 if (i >= 0) {
909 break;
910 }
911 }
912 return srcLine->elementAti(i);
913 }
914
915
getExpectedBreak(int32_t bp)916 int32_t TestParams::getExpectedBreak(int32_t bp) {
917 if (bp >= textMap->size()) {
918 return 0;
919 }
920 int32_t i = textMap->elementAti(bp);
921 int32_t retVal = 0;
922 if (i >= 0) {
923 retVal = expectedBreaks->elementAti(i);
924 }
925 return retVal;
926 }
927
928
getSrcCol(int32_t bp)929 int32_t TestParams::getSrcCol(int32_t bp) {
930 if (bp >= textMap->size()) {
931 bp = textMap->size() - 1;
932 }
933 int32_t i = 0;
934 for(; bp >= 0; --bp) {
935 // Move bp to a character boundary if we are not on one already.
936 i = textMap->elementAti(bp);
937 if (i >= 0) {
938 break;
939 }
940 }
941 return srcCol->elementAti(i);
942 }
943
944
executeTest(TestParams * t,UErrorCode & status)945 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
946 int32_t bp;
947 int32_t prevBP;
948 int32_t i;
949
950 TEST_ASSERT_SUCCESS(status);
951 if (U_FAILURE(status)) {
952 return;
953 }
954
955 if (t->bi == NULL) {
956 return;
957 }
958
959 t->bi->setText(t->textToBreak, status);
960 //
961 // Run the iterator forward
962 //
963 prevBP = -1;
964 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
965 if (prevBP == bp) {
966 // Fail for lack of forward progress.
967 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
968 bp, t->getSrcLine(bp), t->getSrcCol(bp));
969 break;
970 }
971
972 // Check that there we didn't miss an expected break between the last one
973 // and this one.
974 for (i=prevBP+1; i<bp; i++) {
975 if (t->getExpectedBreak(i) != 0) {
976 int expected[] = {0, i};
977 printStringBreaks(t->dataToBreak, expected, 2);
978 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
979 i, t->getSrcLine(i), t->getSrcCol(i));
980 }
981 }
982
983 // Check that the break we did find was expected
984 if (t->getExpectedBreak(bp) == 0) {
985 int expected[] = {0, bp};
986 printStringBreaks(t->textToBreak, expected, 2);
987 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
988 bp, t->getSrcLine(bp), t->getSrcCol(bp));
989 } else {
990 // The break was expected.
991 // Check that the {nnn} tag value is correct.
992 int32_t expectedTagVal = t->getExpectedBreak(bp);
993 if (expectedTagVal == -1) {
994 expectedTagVal = 0;
995 }
996 int32_t line = t->getSrcLine(bp);
997 int32_t rs = t->bi->getRuleStatus();
998 if (rs != expectedTagVal) {
999 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1000 " Actual, Expected status = %4d, %4d",
1001 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1002 }
1003 }
1004
1005 prevBP = bp;
1006 }
1007
1008 // Verify that there were no missed expected breaks after the last one found
1009 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1010 if (t->getExpectedBreak(i) != 0) {
1011 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1012 i, t->getSrcLine(i), t->getSrcCol(i));
1013 }
1014 }
1015
1016 //
1017 // Run the iterator backwards, verify that the same breaks are found.
1018 //
1019 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1020 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1021 if (prevBP == bp) {
1022 // Fail for lack of progress.
1023 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1024 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1025 break;
1026 }
1027
1028 // Check that we didn't miss an expected break between the last one
1029 // and this one. (UVector returns zeros for index out of bounds.)
1030 for (i=prevBP-1; i>bp; i--) {
1031 if (t->getExpectedBreak(i) != 0) {
1032 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1033 i, t->getSrcLine(i), t->getSrcCol(i));
1034 }
1035 }
1036
1037 // Check that the break we did find was expected
1038 if (t->getExpectedBreak(bp) == 0) {
1039 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1040 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1041 } else {
1042 // The break was expected.
1043 // Check that the {nnn} tag value is correct.
1044 int32_t expectedTagVal = t->getExpectedBreak(bp);
1045 if (expectedTagVal == -1) {
1046 expectedTagVal = 0;
1047 }
1048 int line = t->getSrcLine(bp);
1049 int32_t rs = t->bi->getRuleStatus();
1050 if (rs != expectedTagVal) {
1051 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1052 " Actual, Expected status = %4d, %4d",
1053 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1054 }
1055 }
1056
1057 prevBP = bp;
1058 }
1059
1060 // Verify that there were no missed breaks prior to the last one found
1061 for (i=prevBP-1; i>=0; i--) {
1062 if (t->getExpectedBreak(i) != 0) {
1063 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1064 i, t->getSrcLine(i), t->getSrcCol(i));
1065 }
1066 }
1067
1068 // Check isBoundary()
1069 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1070 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1071 UBool boundaryFound = t->bi->isBoundary(i);
1072 if (boundaryExpected != boundaryFound) {
1073 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1074 " Expected, Actual= %s, %s",
1075 i, t->getSrcLine(i), t->getSrcCol(i),
1076 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1077 }
1078 }
1079
1080 // Check following()
1081 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1082 int32_t actualBreak = t->bi->following(i);
1083 int32_t expectedBreak = BreakIterator::DONE;
1084 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1085 if (t->getExpectedBreak(j) != 0) {
1086 expectedBreak = j;
1087 break;
1088 }
1089 }
1090 if (expectedBreak != actualBreak) {
1091 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1092 " Expected, Actual= %d, %d",
1093 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1094 }
1095 }
1096
1097 // Check preceding()
1098 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1099 int32_t actualBreak = t->bi->preceding(i);
1100 int32_t expectedBreak = BreakIterator::DONE;
1101
1102 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1103 // preceding(trailing byte) will return the index of some preceding code point,
1104 // not the lead byte of the current code point, even though that has a smaller index.
1105 // Therefore, start looking at the expected break data not at i-1, but at
1106 // the start of code point index - 1.
1107 utext_setNativeIndex(t->textToBreak, i);
1108 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1109 for (; j >= 0; j--) {
1110 if (t->getExpectedBreak(j) != 0) {
1111 expectedBreak = j;
1112 break;
1113 }
1114 }
1115 if (expectedBreak != actualBreak) {
1116 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1117 " Expected, Actual= %d, %d",
1118 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1119 }
1120 }
1121 }
1122
1123
TestExtended()1124 void RBBITest::TestExtended() {
1125 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1126 UErrorCode status = U_ZERO_ERROR;
1127 Locale locale("");
1128
1129 UnicodeString rules;
1130 TestParams tp(status);
1131
1132 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1133 if (U_FAILURE(status)) {
1134 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1135 }
1136
1137
1138 //
1139 // Open and read the test data file.
1140 //
1141 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1142 char testFileName[1000];
1143 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1144 errln("Can't open test data. Path too long.");
1145 return;
1146 }
1147 strcpy(testFileName, testDataDirectory);
1148 strcat(testFileName, "rbbitst.txt");
1149
1150 int len;
1151 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1152 if (U_FAILURE(status)) {
1153 return; /* something went wrong, error already output */
1154 }
1155
1156
1157 bool skipTest = false; // Skip this test?
1158
1159 //
1160 // Put the test data into a UnicodeString
1161 //
1162 UnicodeString testString(FALSE, testFile, len);
1163
1164 enum EParseState{
1165 PARSE_COMMENT,
1166 PARSE_TAG,
1167 PARSE_DATA,
1168 PARSE_NUM
1169 }
1170 parseState = PARSE_TAG;
1171
1172 EParseState savedState = PARSE_TAG;
1173
1174 static const UChar CH_LF = 0x0a;
1175 static const UChar CH_CR = 0x0d;
1176 static const UChar CH_HASH = 0x23;
1177 /*static const UChar CH_PERIOD = 0x2e;*/
1178 static const UChar CH_LT = 0x3c;
1179 static const UChar CH_GT = 0x3e;
1180 static const UChar CH_BACKSLASH = 0x5c;
1181 static const UChar CH_BULLET = 0x2022;
1182
1183 int32_t lineNum = 1;
1184 int32_t colStart = 0;
1185 int32_t column = 0;
1186 int32_t charIdx = 0;
1187
1188 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1189
1190 for (charIdx = 0; charIdx < len; ) {
1191 status = U_ZERO_ERROR;
1192 UChar c = testString.charAt(charIdx);
1193 charIdx++;
1194 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1195 // treat CRLF as a unit
1196 c = CH_LF;
1197 charIdx++;
1198 }
1199 if (c == CH_LF || c == CH_CR) {
1200 lineNum++;
1201 colStart = charIdx;
1202 }
1203 column = charIdx - colStart + 1;
1204
1205 switch (parseState) {
1206 case PARSE_COMMENT:
1207 if (c == 0x0a || c == 0x0d) {
1208 parseState = savedState;
1209 }
1210 break;
1211
1212 case PARSE_TAG:
1213 {
1214 if (c == CH_HASH) {
1215 parseState = PARSE_COMMENT;
1216 savedState = PARSE_TAG;
1217 break;
1218 }
1219 if (u_isUWhiteSpace(c)) {
1220 break;
1221 }
1222 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1223 delete tp.bi;
1224 tp.bi = BreakIterator::createWordInstance(locale, status);
1225 skipTest = false;
1226 charIdx += 5;
1227 break;
1228 }
1229 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1230 delete tp.bi;
1231 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1232 skipTest = false;
1233 charIdx += 5;
1234 break;
1235 }
1236 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1237 delete tp.bi;
1238 tp.bi = BreakIterator::createLineInstance(locale, status);
1239 skipTest = false;
1240 charIdx += 5;
1241 break;
1242 }
1243 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1244 delete tp.bi;
1245 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1246 skipTest = false;
1247 charIdx += 5;
1248 break;
1249 }
1250 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1251 delete tp.bi;
1252 tp.bi = BreakIterator::createTitleInstance(locale, status);
1253 charIdx += 6;
1254 break;
1255 }
1256
1257 // <locale loc_name>
1258 localeMatcher.reset(testString);
1259 if (localeMatcher.lookingAt(charIdx-1, status)) {
1260 UnicodeString localeName = localeMatcher.group(1, status);
1261 char localeName8[100];
1262 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1263 locale = Locale::createFromName(localeName8);
1264 charIdx += localeMatcher.group(0, status).length() - 1;
1265 TEST_ASSERT_SUCCESS(status);
1266 break;
1267 }
1268 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1269 parseState = PARSE_DATA;
1270 charIdx += 5;
1271 tp.dataToBreak = "";
1272 tp.expectedBreaks->removeAllElements();
1273 tp.srcCol ->removeAllElements();
1274 tp.srcLine->removeAllElements();
1275 break;
1276 }
1277
1278 errln("line %d: Tag expected in test file.", lineNum);
1279 parseState = PARSE_COMMENT;
1280 savedState = PARSE_DATA;
1281 goto end_test; // Stop the test.
1282 }
1283 break;
1284
1285 case PARSE_DATA:
1286 if (c == CH_BULLET) {
1287 int32_t breakIdx = tp.dataToBreak.length();
1288 tp.expectedBreaks->setSize(breakIdx+1);
1289 tp.expectedBreaks->setElementAt(-1, breakIdx);
1290 tp.srcLine->setSize(breakIdx+1);
1291 tp.srcLine->setElementAt(lineNum, breakIdx);
1292 tp.srcCol ->setSize(breakIdx+1);
1293 tp.srcCol ->setElementAt(column, breakIdx);
1294 break;
1295 }
1296
1297 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1298 // Add final entry to mappings from break location to source file position.
1299 // Need one extra because last break position returned is after the
1300 // last char in the data, not at the last char.
1301 tp.srcLine->addElement(lineNum, status);
1302 tp.srcCol ->addElement(column, status);
1303
1304 parseState = PARSE_TAG;
1305 charIdx += 6;
1306
1307 if (!skipTest) {
1308 // RUN THE TEST!
1309 status = U_ZERO_ERROR;
1310 tp.setUTF16(status);
1311 executeTest(&tp, status);
1312 TEST_ASSERT_SUCCESS(status);
1313
1314 // Run again, this time with UTF-8 text wrapped in a UText.
1315 status = U_ZERO_ERROR;
1316 tp.setUTF8(status);
1317 TEST_ASSERT_SUCCESS(status);
1318 executeTest(&tp, status);
1319 }
1320 break;
1321 }
1322
1323 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1324 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1325 // Get the code point from the name and insert it into the test data.
1326 // (Damn, no API takes names in Unicode !!!
1327 // we've got to take it back to char *)
1328 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1329 int32_t nameLength = nameEndIdx - (charIdx+2);
1330 char charNameBuf[200];
1331 UChar32 theChar = -1;
1332 if (nameEndIdx != -1) {
1333 UErrorCode status = U_ZERO_ERROR;
1334 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1335 charNameBuf[sizeof(charNameBuf)-1] = 0;
1336 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1337 if (U_FAILURE(status)) {
1338 theChar = -1;
1339 }
1340 }
1341 if (theChar == -1) {
1342 errln("Error in named character in test file at line %d, col %d",
1343 lineNum, column);
1344 } else {
1345 // Named code point was recognized. Insert it
1346 // into the test data.
1347 tp.dataToBreak.append(theChar);
1348 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1349 tp.srcLine->addElement(lineNum, status);
1350 tp.srcCol ->addElement(column, status);
1351 }
1352 }
1353 if (nameEndIdx > charIdx) {
1354 charIdx = nameEndIdx+1;
1355
1356 }
1357 break;
1358 }
1359
1360
1361
1362
1363 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1364 charIdx++;
1365 int32_t breakIdx = tp.dataToBreak.length();
1366 tp.expectedBreaks->setSize(breakIdx+1);
1367 tp.expectedBreaks->setElementAt(-1, breakIdx);
1368 tp.srcLine->setSize(breakIdx+1);
1369 tp.srcLine->setElementAt(lineNum, breakIdx);
1370 tp.srcCol ->setSize(breakIdx+1);
1371 tp.srcCol ->setElementAt(column, breakIdx);
1372 break;
1373 }
1374
1375 if (c == CH_LT) {
1376 tagValue = 0;
1377 parseState = PARSE_NUM;
1378 break;
1379 }
1380
1381 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1382 parseState = PARSE_COMMENT;
1383 savedState = PARSE_DATA;
1384 break;
1385 }
1386
1387 if (c == CH_BACKSLASH) {
1388 // Check for \ at end of line, a line continuation.
1389 // Advance over (discard) the newline
1390 UChar32 cp = testString.char32At(charIdx);
1391 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1392 // We have a CR LF
1393 // Need an extra increment of the input ptr to move over both of them
1394 charIdx++;
1395 }
1396 if (cp == CH_LF || cp == CH_CR) {
1397 lineNum++;
1398 colStart = charIdx;
1399 charIdx++;
1400 break;
1401 }
1402
1403 // Let unescape handle the back slash.
1404 cp = testString.unescapeAt(charIdx);
1405 if (cp != -1) {
1406 // Escape sequence was recognized. Insert the char
1407 // into the test data.
1408 tp.dataToBreak.append(cp);
1409 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1410 tp.srcLine->addElement(lineNum, status);
1411 tp.srcCol ->addElement(column, status);
1412 }
1413 break;
1414 }
1415
1416
1417 // Not a recognized backslash escape sequence.
1418 // Take the next char as a literal.
1419 // TODO: Should this be an error?
1420 c = testString.charAt(charIdx);
1421 charIdx = testString.moveIndex32(charIdx, 1);
1422 }
1423
1424 // Normal, non-escaped data char.
1425 tp.dataToBreak.append(c);
1426
1427 // Save the mapping from offset in the data to line/column numbers in
1428 // the original input file. Will be used for better error messages only.
1429 // If there's an expected break before this char, the slot in the mapping
1430 // vector will already be set for this char; don't overwrite it.
1431 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1432 tp.srcLine->addElement(lineNum, status);
1433 tp.srcCol ->addElement(column, status);
1434 }
1435 break;
1436
1437
1438 case PARSE_NUM:
1439 // We are parsing an expected numeric tag value, like <1234>,
1440 // within a chunk of data.
1441 if (u_isUWhiteSpace(c)) {
1442 break;
1443 }
1444
1445 if (c == CH_GT) {
1446 // Finished the number. Add the info to the expected break data,
1447 // and switch parse state back to doing plain data.
1448 parseState = PARSE_DATA;
1449 if (tagValue == 0) {
1450 tagValue = -1;
1451 }
1452 int32_t breakIdx = tp.dataToBreak.length();
1453 tp.expectedBreaks->setSize(breakIdx+1);
1454 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1455 tp.srcLine->setSize(breakIdx+1);
1456 tp.srcLine->setElementAt(lineNum, breakIdx);
1457 tp.srcCol ->setSize(breakIdx+1);
1458 tp.srcCol ->setElementAt(column, breakIdx);
1459 break;
1460 }
1461
1462 if (u_isdigit(c)) {
1463 tagValue = tagValue*10 + u_charDigitValue(c);
1464 break;
1465 }
1466
1467 errln("Syntax Error in test file at line %d, col %d",
1468 lineNum, column);
1469 parseState = PARSE_COMMENT;
1470 goto end_test; // Stop the test
1471 break;
1472 }
1473
1474
1475 if (U_FAILURE(status)) {
1476 dataerrln("ICU Error %s while parsing test file at line %d.",
1477 u_errorName(status), lineNum);
1478 status = U_ZERO_ERROR;
1479 goto end_test; // Stop the test
1480 }
1481
1482 }
1483
1484 end_test:
1485 delete [] testFile;
1486 #endif
1487 }
1488
1489
1490 //-------------------------------------------------------------------------------
1491 //
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.   Regression test for bug #7130.  The source
//                  rules do not declare a break iterator type (word, line,
//                  sentence, etc.), and the dictionary code, without a type, would loop.
1496 //
1497 //-------------------------------------------------------------------------------
TestDictRules()1498 void RBBITest::TestDictRules() {
1499 const char *rules = "$dictionary = [a-z]; \n"
1500 "!!forward; \n"
1501 "$dictionary $dictionary; \n"
1502 "!!reverse; \n"
1503 "$dictionary $dictionary; \n";
1504 const char *text = "aa";
1505 UErrorCode status = U_ZERO_ERROR;
1506 UParseError parseError;
1507
1508 RuleBasedBreakIterator bi(rules, parseError, status);
1509 if (U_SUCCESS(status)) {
1510 UnicodeString utext = text;
1511 bi.setText(utext);
1512 int32_t position;
1513 int32_t loops;
1514 for (loops = 0; loops<10; loops++) {
1515 position = bi.next();
1516 if (position == RuleBasedBreakIterator::DONE) {
1517 break;
1518 }
1519 }
1520 TEST_ASSERT(loops == 1);
1521 } else {
1522 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1523 }
1524 }
1525
1526
1527
1528 //-------------------------------------------------------------------------------
1529 //
1530 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1531 // return the data in one big UChar * buffer, which the caller must delete.
1532 //
1533 // parameters:
//    fileName:   The path of the file to read. It is passed to fopen() as is;
//                callers prepend the test data directory themselves.
1536 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1537 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1538 // specified here. The BOM, if it exists, will be stripped from the returned data.
1539 // Pass NULL for the system default encoding.
1540 // status
1541 // returns:
1542 // The file data, converted to UChar.
1543 // The caller must delete this when done with
1544 // delete [] theBuffer;
1545 //
1546 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1547 // Move this function to some common place.
1548 //
1549 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1550 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1551 UChar *retPtr = NULL;
1552 char *fileBuf = NULL;
1553 UConverter* conv = NULL;
1554 FILE *f = NULL;
1555
1556 ulen = 0;
1557 if (U_FAILURE(status)) {
1558 return retPtr;
1559 }
1560
1561 //
1562 // Open the file.
1563 //
1564 f = fopen(fileName, "rb");
1565 if (f == 0) {
1566 dataerrln("Error opening test data file %s\n", fileName);
1567 status = U_FILE_ACCESS_ERROR;
1568 return NULL;
1569 }
1570 //
1571 // Read it in
1572 //
1573 int fileSize;
1574 int amt_read;
1575
1576 fseek( f, 0, SEEK_END);
1577 fileSize = ftell(f);
1578 fileBuf = new char[fileSize];
1579 fseek(f, 0, SEEK_SET);
1580 amt_read = fread(fileBuf, 1, fileSize, f);
1581 if (amt_read != fileSize || fileSize <= 0) {
1582 errln("Error reading test data file.");
1583 goto cleanUpAndReturn;
1584 }
1585
1586 //
1587 // Look for a Unicode Signature (BOM) on the data just read
1588 //
1589 int32_t signatureLength;
1590 const char * fileBufC;
1591 const char* bomEncoding;
1592
1593 fileBufC = fileBuf;
1594 bomEncoding = ucnv_detectUnicodeSignature(
1595 fileBuf, fileSize, &signatureLength, &status);
1596 if(bomEncoding!=NULL ){
1597 fileBufC += signatureLength;
1598 fileSize -= signatureLength;
1599 encoding = bomEncoding;
1600 }
1601
1602 //
1603 // Open a converter to take the rule file to UTF-16
1604 //
1605 conv = ucnv_open(encoding, &status);
1606 if (U_FAILURE(status)) {
1607 goto cleanUpAndReturn;
1608 }
1609
1610 //
1611 // Convert the rules to UChar.
1612 // Preflight first to determine required buffer size.
1613 //
1614 ulen = ucnv_toUChars(conv,
1615 NULL, // dest,
1616 0, // destCapacity,
1617 fileBufC,
1618 fileSize,
1619 &status);
1620 if (status == U_BUFFER_OVERFLOW_ERROR) {
1621 // Buffer Overflow is expected from the preflight operation.
1622 status = U_ZERO_ERROR;
1623
1624 retPtr = new UChar[ulen+1];
1625 ucnv_toUChars(conv,
1626 retPtr, // dest,
1627 ulen+1,
1628 fileBufC,
1629 fileSize,
1630 &status);
1631 }
1632
1633 cleanUpAndReturn:
1634 fclose(f);
1635 delete []fileBuf;
1636 ucnv_close(conv);
1637 if (U_FAILURE(status)) {
1638 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1639 delete []retPtr;
1640 retPtr = 0;
1641 ulen = 0;
1642 };
1643 return retPtr;
1644 }
1645
1646
1647
1648 //--------------------------------------------------------------------------------------------
1649 //
1650 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1651 //
1652 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1653 void RBBITest::TestUnicodeFiles() {
1654 RuleBasedBreakIterator *bi;
1655 UErrorCode status = U_ZERO_ERROR;
1656
1657 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1658 TEST_ASSERT_SUCCESS(status);
1659 if (U_SUCCESS(status)) {
1660 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1661 }
1662 delete bi;
1663
1664 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1665 TEST_ASSERT_SUCCESS(status);
1666 if (U_SUCCESS(status)) {
1667 runUnicodeTestData("WordBreakTest.txt", bi);
1668 }
1669 delete bi;
1670
1671 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1672 TEST_ASSERT_SUCCESS(status);
1673 if (U_SUCCESS(status)) {
1674 runUnicodeTestData("SentenceBreakTest.txt", bi);
1675 }
1676 delete bi;
1677
1678 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1679 TEST_ASSERT_SUCCESS(status);
1680 if (U_SUCCESS(status)) {
1681 runUnicodeTestData("LineBreakTest.txt", bi);
1682 }
1683 delete bi;
1684 }
1685
1686
1687 // Check for test cases from the Unicode test data files that are known to fail
1688 // and should be skipped because ICU is not yet able to fully implement the spec.
1689 // See ticket #7270.
1690
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1691 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1692 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1693 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1694 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1695 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1696 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1697 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1698 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
1699 };
1700 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1701 return FALSE;
1702 }
1703
1704 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1705 if (testCase == UnicodeString(badTestCases[i])) {
1706 return logKnownIssue("7270");
1707 }
1708 }
1709 return FALSE;
1710 }
1711
1712
1713 //--------------------------------------------------------------------------------------------
1714 //
1715 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1716 //
1717 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1718 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1719 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1720 UErrorCode status = U_ZERO_ERROR;
1721
1722 //
1723 // Open and read the test data file, put it into a UnicodeString.
1724 //
1725 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1726 char testFileName[1000];
1727 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1728 dataerrln("Can't open test data. Path too long.");
1729 return;
1730 }
1731 strcpy(testFileName, testDataDirectory);
1732 strcat(testFileName, fileName);
1733
1734 logln("Opening data file %s\n", fileName);
1735
1736 int len;
1737 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1738 if (status != U_FILE_ACCESS_ERROR) {
1739 TEST_ASSERT_SUCCESS(status);
1740 TEST_ASSERT(testFile != NULL);
1741 }
1742 if (U_FAILURE(status) || testFile == NULL) {
1743 return; /* something went wrong, error already output */
1744 }
1745 UnicodeString testFileAsString(TRUE, testFile, len);
1746
1747 //
1748 // Parse the test data file using a regular expression.
1749 // Each kind of token is recognized in its own capture group; what type of item was scanned
1750 // is identified by which group had a match.
1751 //
1752 // Caputure Group # 1 2 3 4 5
1753 // Parses this item: divide x hex digits comment \n unrecognized \n
1754 //
1755 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1756 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1757 UnicodeString testString;
1758 UVector32 breakPositions(status);
1759 int lineNumber = 1;
1760 TEST_ASSERT_SUCCESS(status);
1761 if (U_FAILURE(status)) {
1762 return;
1763 }
1764
1765 //
1766 // Scan through each test case, building up the string to be broken in testString,
1767 // and the positions that should be boundaries in the breakPositions vector.
1768 //
1769 int spin = 0;
1770 while (tokenMatcher.find()) {
1771 if(tokenMatcher.hitEnd()) {
1772 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1773 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1774 and caused an infinite loop here on EBCDIC systems!
1775 */
1776 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1777 // return;
1778 }
1779 if (tokenMatcher.start(1, status) >= 0) {
1780 // Scanned a divide sign, indicating a break position in the test data.
1781 if (testString.length()>0) {
1782 breakPositions.addElement(testString.length(), status);
1783 }
1784 }
1785 else if (tokenMatcher.start(2, status) >= 0) {
1786 // Scanned an 'x', meaning no break at this position in the test data
1787 // Nothing to be done here.
1788 }
1789 else if (tokenMatcher.start(3, status) >= 0) {
1790 // Scanned Hex digits. Convert them to binary, append to the character data string.
1791 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1792 int length = hexNumber.length();
1793 if (length<=8) {
1794 char buf[10];
1795 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1796 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1797 if (c<=0x10ffff) {
1798 testString.append(c);
1799 } else {
1800 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1801 fileName, lineNumber);
1802 }
1803 } else {
1804 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1805 fileName, lineNumber);
1806 }
1807 }
1808 else if (tokenMatcher.start(4, status) >= 0) {
1809 // Scanned to end of a line, possibly skipping over a comment in the process.
1810 // If the line from the file contained test data, run the test now.
1811 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1812 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1813 }
1814
1815 // Clear out this test case.
1816 // The string and breakPositions vector will be refilled as the next
1817 // test case is parsed.
1818 testString.remove();
1819 breakPositions.removeAllElements();
1820 lineNumber++;
1821 } else {
1822 // Scanner catchall. Something unrecognized appeared on the line.
1823 char token[16];
1824 UnicodeString uToken = tokenMatcher.group(0, status);
1825 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1826 token[sizeof(token)-1] = 0;
1827 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1828
1829 // Clean up, in preparation for continuing with the next line.
1830 testString.remove();
1831 breakPositions.removeAllElements();
1832 lineNumber++;
1833 }
1834 TEST_ASSERT_SUCCESS(status);
1835 if (U_FAILURE(status)) {
1836 break;
1837 }
1838 }
1839
1840 delete [] testFile;
1841 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1842 }
1843
1844 //--------------------------------------------------------------------------------------------
1845 //
1846 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1847 // test data files. Do only a simple, forward-only check -
1848 // this test is mostly to check that ICU and the Unicode
1849 // data agree with each other.
1850 //
1851 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1852 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1853 const UnicodeString &testString, // Text data to be broken
1854 UVector32 *breakPositions, // Positions where breaks should be found.
1855 RuleBasedBreakIterator *bi) {
1856 int32_t pos; // Break Position in the test string
1857 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1858 int32_t expectedPos; // Expected break position (index into test string)
1859
1860 bi->setText(testString);
1861 pos = bi->first();
1862 pos = bi->next();
1863
1864 while (pos != BreakIterator::DONE) {
1865 if (expectedI >= breakPositions->size()) {
1866 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1867 testFileName, lineNumber, pos);
1868 break;
1869 }
1870 expectedPos = breakPositions->elementAti(expectedI);
1871 if (pos < expectedPos) {
1872 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873 testFileName, lineNumber, pos);
1874 break;
1875 }
1876 if (pos > expectedPos) {
1877 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1878 testFileName, lineNumber, expectedPos);
1879 break;
1880 }
1881 pos = bi->next();
1882 expectedI++;
1883 }
1884
1885 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1886 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1887 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1888 }
1889 }
1890
1891
1892
1893 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1894 //---------------------------------------------------------------------------------------
1895 //
1896 // classs RBBIMonkeyKind
1897 //
1898 // Monkey Test for Break Iteration
1899 // Abstract interface class. Concrete derived classes independently
1900 // implement the break rules for different iterator types.
1901 //
1902 // The Monkey Test itself uses doesn't know which type of break iterator it is
1903 // testing, but works purely in terms of the interface defined here.
1904 //
1905 //---------------------------------------------------------------------------------------
1906 class RBBIMonkeyKind {
1907 public:
1908 // Return a UVector of UnicodeSets, representing the character classes used
1909 // for this type of iterator.
1910 virtual UVector *charClasses() = 0;
1911
1912 // Set the test text on which subsequent calls to next() will operate
1913 virtual void setText(const UnicodeString &s) = 0;
1914
1915 // Find the next break postion, starting from the prev break position, or from zero.
1916 // Return -1 after reaching end of string.
1917 virtual int32_t next(int32_t i) = 0;
1918
1919 virtual ~RBBIMonkeyKind();
1920 UErrorCode deferredStatus;
1921
1922
1923 protected:
1924 RBBIMonkeyKind();
1925
1926 private:
1927 };
1928
RBBIMonkeyKind()1929 RBBIMonkeyKind::RBBIMonkeyKind() {
1930 deferredStatus = U_ZERO_ERROR;
1931 }
1932
~RBBIMonkeyKind()1933 RBBIMonkeyKind::~RBBIMonkeyKind() {
1934 }
1935
1936
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  A linear congruential generator playing the role of the
//                    standard library rand() and srand().
//                    The library functions are deliberately not used, in order to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;      // Generator state; assign to it to (re)seed.

// Advance the generator one step and return a value in the range 0 .. 32767,
//   taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;    // unsigned arithmetic, wraps modulo 2^32
    return (m_seed >> 16) & 0x7FFF;
}
1952
1953
//
// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
//
// The value is one bracketed set pattern made up of code points and code point ranges.
// The backslashes are doubled in the C++ source, so the compiler stores literal \uXXXX and
// \UXXXXXXXX escape sequences in the string; they are left for whatever parses the pattern.
// In this file the string is converted with UnicodeString(gExtended_Pict, -1, US_INV) and
// handed to the UnicodeSet constructor by the character and word monkey classes.
//
static const char *gExtended_Pict = "["
    "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
    "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
    "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
    "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
    "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
    "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
    "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
    "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
    "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
    "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
    "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
    "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
    "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
    "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
    "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
    "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
    "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
    "]";
1978
1979 //------------------------------------------------------------------------------------------
1980 //
1981 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
1982 // of RBBIMonkeyKind.
1983 //
1984 //------------------------------------------------------------------------------------------
1985 class RBBICharMonkey: public RBBIMonkeyKind {
1986 public:
1987 RBBICharMonkey();
1988 virtual ~RBBICharMonkey();
1989 virtual UVector *charClasses();
1990 virtual void setText(const UnicodeString &s);
1991 virtual int32_t next(int32_t i);
1992 private:
1993 UVector *fSets;
1994
1995 UnicodeSet *fCRLFSet;
1996 UnicodeSet *fControlSet;
1997 UnicodeSet *fExtendSet;
1998 UnicodeSet *fZWJSet;
1999 UnicodeSet *fRegionalIndicatorSet;
2000 UnicodeSet *fPrependSet;
2001 UnicodeSet *fSpacingSet;
2002 UnicodeSet *fLSet;
2003 UnicodeSet *fVSet;
2004 UnicodeSet *fTSet;
2005 UnicodeSet *fLVSet;
2006 UnicodeSet *fLVTSet;
2007 UnicodeSet *fHangulSet;
2008 UnicodeSet *fEmojiBaseSet;
2009 UnicodeSet *fEmojiModifierSet;
2010 UnicodeSet *fExtendedPictSet;
2011 UnicodeSet *fEBGSet;
2012 UnicodeSet *fEmojiNRKSet;
2013 UnicodeSet *fAnySet;
2014
2015 const UnicodeString *fText;
2016 };
2017
2018
RBBICharMonkey()2019 RBBICharMonkey::RBBICharMonkey() {
2020 UErrorCode status = U_ZERO_ERROR;
2021
2022 fText = NULL;
2023
2024 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2025 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2026 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2027 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2028 fRegionalIndicatorSet =
2029 new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2030 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2031 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2032 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2033 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2034 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2035 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2036 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2037 fHangulSet = new UnicodeSet();
2038 fHangulSet->addAll(*fLSet);
2039 fHangulSet->addAll(*fVSet);
2040 fHangulSet->addAll(*fTSet);
2041 fHangulSet->addAll(*fLVSet);
2042 fHangulSet->addAll(*fLVTSet);
2043
2044 fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2045 fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2046 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2047 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2048 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2049 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2050 fAnySet = new UnicodeSet(0, 0x10ffff);
2051
2052 fSets = new UVector(status);
2053 fSets->addElement(fCRLFSet, status);
2054 fSets->addElement(fControlSet, status);
2055 fSets->addElement(fExtendSet, status);
2056 fSets->addElement(fRegionalIndicatorSet, status);
2057 if (!fPrependSet->isEmpty()) {
2058 fSets->addElement(fPrependSet, status);
2059 }
2060 fSets->addElement(fSpacingSet, status);
2061 fSets->addElement(fHangulSet, status);
2062 fSets->addElement(fAnySet, status);
2063 fSets->addElement(fEmojiBaseSet, status);
2064 fSets->addElement(fEmojiModifierSet, status);
2065 fSets->addElement(fZWJSet, status);
2066 fSets->addElement(fExtendedPictSet, status);
2067 fSets->addElement(fEBGSet, status);
2068 fSets->addElement(fEmojiNRKSet,status);
2069 if (U_FAILURE(status)) {
2070 deferredStatus = status;
2071 }
2072 }
2073
2074
setText(const UnicodeString & s)2075 void RBBICharMonkey::setText(const UnicodeString &s) {
2076 fText = &s;
2077 }
2078
2079
2080
// Find the grapheme cluster boundary that follows prevPos in the text given to setText().
//
//   prevPos   Index of the previous boundary, the position at which scanning starts.
//   returns   The index of the next boundary, or -1 if the constructor recorded a failure
//             in deferredStatus or prevPos is at or beyond the end of the text.
//
// The rules are tested in order for each candidate position; the first rule that matches
// decides between "no break here" (continue) and "break here" (break).
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x (Extend | ZWJ)
        //     While passing over the extending character, remember in cBase the most
        //     recent preceding code point that is not itself in the Extend set.
        //     The second test of rule GB10 below reads it.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
        //     First test:  the base immediately precedes the modifier.
        //     Second test: Extend characters lie between them; the base is taken from cBase.
        if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
            continue;
        }
        if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
            continue;
        }

        // Rule (GB11)   (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
        //     As coded here, the characters on either side of the ZWJ are tested
        //     against the Extended Pictographic set and the fEmojiNRKSet.
        if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
                (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }

        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB999)  Any  <break>  Any
        break;
    }

    // The boundary lies immediately before the code point at p2.
    breakPos = p2;
    return breakPos;
}
2223
2224
2225
charClasses()2226 UVector *RBBICharMonkey::charClasses() {
2227 return fSets;
2228 }
2229
2230
~RBBICharMonkey()2231 RBBICharMonkey::~RBBICharMonkey() {
2232 delete fSets;
2233 delete fCRLFSet;
2234 delete fControlSet;
2235 delete fExtendSet;
2236 delete fRegionalIndicatorSet;
2237 delete fPrependSet;
2238 delete fSpacingSet;
2239 delete fLSet;
2240 delete fVSet;
2241 delete fTSet;
2242 delete fLVSet;
2243 delete fLVTSet;
2244 delete fHangulSet;
2245 delete fAnySet;
2246 delete fEmojiBaseSet;
2247 delete fEmojiModifierSet;
2248 delete fZWJSet;
2249 delete fExtendedPictSet;
2250 delete fEBGSet;
2251 delete fEmojiNRKSet;
2252 }
2253
2254 //------------------------------------------------------------------------------------------
2255 //
2256 // class RBBIWordMonkey Word Break specific implementation
2257 // of RBBIMonkeyKind.
2258 //
2259 //------------------------------------------------------------------------------------------
2260 class RBBIWordMonkey: public RBBIMonkeyKind {
2261 public:
2262 RBBIWordMonkey();
2263 virtual ~RBBIWordMonkey();
2264 virtual UVector *charClasses();
2265 virtual void setText(const UnicodeString &s);
2266 virtual int32_t next(int32_t i);
2267 private:
2268 UVector *fSets;
2269
2270 UnicodeSet *fCRSet;
2271 UnicodeSet *fLFSet;
2272 UnicodeSet *fNewlineSet;
2273 UnicodeSet *fRegionalIndicatorSet;
2274 UnicodeSet *fKatakanaSet;
2275 UnicodeSet *fHebrew_LetterSet;
2276 UnicodeSet *fALetterSet;
2277 UnicodeSet *fSingle_QuoteSet;
2278 UnicodeSet *fDouble_QuoteSet;
2279 UnicodeSet *fMidNumLetSet;
2280 UnicodeSet *fMidLetterSet;
2281 UnicodeSet *fMidNumSet;
2282 UnicodeSet *fNumericSet;
2283 UnicodeSet *fFormatSet;
2284 UnicodeSet *fOtherSet;
2285 UnicodeSet *fExtendSet;
2286 UnicodeSet *fExtendNumLetSet;
2287 UnicodeSet *fDictionarySet;
2288 UnicodeSet *fEBaseSet;
2289 UnicodeSet *fEBGSet;
2290 UnicodeSet *fEModifierSet;
2291 UnicodeSet *fZWJSet;
2292 UnicodeSet *fExtendedPictSet;
2293 UnicodeSet *fEmojiNRKSet;
2294
2295 const UnicodeString *fText;
2296 };
2297
2298
RBBIWordMonkey()2299 RBBIWordMonkey::RBBIWordMonkey()
2300 {
2301 UErrorCode status = U_ZERO_ERROR;
2302
2303 fSets = new UVector(status);
2304
2305 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2306 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2307 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2308 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2309 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2310 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2311 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2312 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2313 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2314 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2315 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2316 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2317 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2318 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2319 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2320 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2321
2322 fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2323 "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2324 fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status);
2325 fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status);
2326 fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status);
2327 fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2328 fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
2329 "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2330
2331 fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
2332 fDictionarySet->addAll(*fKatakanaSet);
2333 fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2334
2335 fALetterSet->removeAll(*fDictionarySet);
2336
2337 fOtherSet = new UnicodeSet();
2338 if(U_FAILURE(status)) {
2339 deferredStatus = status;
2340 return;
2341 }
2342
2343 fOtherSet->complement();
2344 fOtherSet->removeAll(*fCRSet);
2345 fOtherSet->removeAll(*fLFSet);
2346 fOtherSet->removeAll(*fNewlineSet);
2347 fOtherSet->removeAll(*fKatakanaSet);
2348 fOtherSet->removeAll(*fHebrew_LetterSet);
2349 fOtherSet->removeAll(*fALetterSet);
2350 fOtherSet->removeAll(*fSingle_QuoteSet);
2351 fOtherSet->removeAll(*fDouble_QuoteSet);
2352 fOtherSet->removeAll(*fMidLetterSet);
2353 fOtherSet->removeAll(*fMidNumSet);
2354 fOtherSet->removeAll(*fNumericSet);
2355 fOtherSet->removeAll(*fExtendNumLetSet);
2356 fOtherSet->removeAll(*fFormatSet);
2357 fOtherSet->removeAll(*fExtendSet);
2358 fOtherSet->removeAll(*fRegionalIndicatorSet);
2359 fOtherSet->removeAll(*fEBaseSet);
2360 fOtherSet->removeAll(*fEBGSet);
2361 fOtherSet->removeAll(*fEModifierSet);
2362 fOtherSet->removeAll(*fZWJSet);
2363 fOtherSet->removeAll(*fExtendedPictSet);
2364 fOtherSet->removeAll(*fEmojiNRKSet);
2365
2366 // Inhibit dictionary characters from being tested at all.
2367 fOtherSet->removeAll(*fDictionarySet);
2368
2369 fSets->addElement(fCRSet, status);
2370 fSets->addElement(fLFSet, status);
2371 fSets->addElement(fNewlineSet, status);
2372 fSets->addElement(fRegionalIndicatorSet, status);
2373 fSets->addElement(fHebrew_LetterSet, status);
2374 fSets->addElement(fALetterSet, status);
2375 fSets->addElement(fSingle_QuoteSet, status);
2376 fSets->addElement(fDouble_QuoteSet, status);
2377 //fSets->addElement(fKatakanaSet, status); // Omit Katakana from fSets, which omits Katakana characters
2378 // from the test data. They are all in the dictionary set,
2379 // which this (old, to be retired) monkey test cannot handle.
2380 fSets->addElement(fMidLetterSet, status);
2381 fSets->addElement(fMidNumLetSet, status);
2382 fSets->addElement(fMidNumSet, status);
2383 fSets->addElement(fNumericSet, status);
2384 fSets->addElement(fFormatSet, status);
2385 fSets->addElement(fExtendSet, status);
2386 fSets->addElement(fOtherSet, status);
2387 fSets->addElement(fExtendNumLetSet, status);
2388
2389 fSets->addElement(fEBaseSet, status);
2390 fSets->addElement(fEBGSet, status);
2391 fSets->addElement(fEModifierSet, status);
2392 fSets->addElement(fZWJSet, status);
2393 fSets->addElement(fExtendedPictSet, status);
2394 fSets->addElement(fEmojiNRKSet, status);
2395
2396 if (U_FAILURE(status)) {
2397 deferredStatus = status;
2398 }
2399 }
2400
setText(const UnicodeString & s)2401 void RBBIWordMonkey::setText(const UnicodeString &s) {
2402 fText = &s;
2403 }
2404
2405
// Find the word boundary that follows prevPos in the text given to setText().
//
//   prevPos   Index of the previous boundary, the position at which scanning starts.
//   returns   The index of the next boundary, or -1 if the constructor recorded a failure
//             in deferredStatus or prevPos is at or beyond the end of the text.
//
// The rules are tested in order for each candidate position; the first rule that matches
// decides between "no break here" (continue) and "break here" (break).
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;       // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format | ZWJ)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        //    When c2 is a CR, LF or Newline, p3 therefore moves forward by exactly one code point.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
                break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     There is no explicit test here that p2 immediately follows p1.  The advance
        //     loop above does not skip anything after a CR, so when a CR has reached c1,
        //     c2 is the code point directly after it.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
        //              Not ignoring extend chars, so peek into input text to
        //              get the potential ZWJ, the character immediately preceding c2.
        //              Sloppy UChar32 indexing: p2-1 may reference trail half
        //              but char32At will get the full code point.
        if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
            continue;
        }

        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
        //
        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
            continue;
        }

        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
            continue;
        }

        // Rule (7a)     Hebrew_Letter x Single_Quote
        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
            continue;
        }

        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
            continue;
        }

        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
        if (fNumericSet->contains(c1) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
        //                  all Katakana are handled by the dictionary breaker.
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
            continue;
        }

        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
            continue;
        }

        // WB 14  (E_Base | EBG) x E_Modifier
        if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
            continue;
        }

        // Rule 15 - 17   Group pairs of Regional Indicators.
        //     A break is forced when the two preceding significant characters are both
        //     Regional Indicators; otherwise two adjacent Regional Indicators are kept together.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule 999.  Break found here.
        break;
    }

    // The boundary lies immediately before the code point at p2.
    breakPos = p2;
    return breakPos;
}
2592
2593
charClasses()2594 UVector *RBBIWordMonkey::charClasses() {
2595 return fSets;
2596 }
2597
2598
~RBBIWordMonkey()2599 RBBIWordMonkey::~RBBIWordMonkey() {
2600 delete fSets;
2601 delete fCRSet;
2602 delete fLFSet;
2603 delete fNewlineSet;
2604 delete fKatakanaSet;
2605 delete fHebrew_LetterSet;
2606 delete fALetterSet;
2607 delete fSingle_QuoteSet;
2608 delete fDouble_QuoteSet;
2609 delete fMidNumLetSet;
2610 delete fMidLetterSet;
2611 delete fMidNumSet;
2612 delete fNumericSet;
2613 delete fFormatSet;
2614 delete fExtendSet;
2615 delete fExtendNumLetSet;
2616 delete fRegionalIndicatorSet;
2617 delete fDictionarySet;
2618 delete fOtherSet;
2619 delete fEBaseSet;
2620 delete fEBGSet;
2621 delete fEModifierSet;
2622 delete fZWJSet;
2623 delete fExtendedPictSet;
2624 delete fEmojiNRKSet;
2625 }
2626
2627
2628
2629
2630 //------------------------------------------------------------------------------------------
2631 //
2632 // class RBBISentMonkey Sentence Break specific implementation
2633 // of RBBIMonkeyKind.
2634 //
2635 //------------------------------------------------------------------------------------------
2636 class RBBISentMonkey: public RBBIMonkeyKind {
2637 public:
2638 RBBISentMonkey();
2639 virtual ~RBBISentMonkey();
2640 virtual UVector *charClasses();
2641 virtual void setText(const UnicodeString &s);
2642 virtual int32_t next(int32_t i);
2643 private:
2644 int moveBack(int posFrom);
2645 int moveForward(int posFrom);
2646 UChar32 cAt(int pos);
2647
2648 UVector *fSets;
2649
2650 UnicodeSet *fSepSet;
2651 UnicodeSet *fFormatSet;
2652 UnicodeSet *fSpSet;
2653 UnicodeSet *fLowerSet;
2654 UnicodeSet *fUpperSet;
2655 UnicodeSet *fOLetterSet;
2656 UnicodeSet *fNumericSet;
2657 UnicodeSet *fATermSet;
2658 UnicodeSet *fSContinueSet;
2659 UnicodeSet *fSTermSet;
2660 UnicodeSet *fCloseSet;
2661 UnicodeSet *fOtherSet;
2662 UnicodeSet *fExtendSet;
2663
2664 const UnicodeString *fText;
2665
2666 };
2667
RBBISentMonkey()2668 RBBISentMonkey::RBBISentMonkey()
2669 {
2670 UErrorCode status = U_ZERO_ERROR;
2671
2672 fSets = new UVector(status);
2673
2674 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2675 // set and made into character classes of their own. For the monkey impl,
2676 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2677 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2678 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2679 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2680 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2681 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2682 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2683 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2684 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2685 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2686 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2687 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2688 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2689 fOtherSet = new UnicodeSet();
2690
2691 if(U_FAILURE(status)) {
2692 deferredStatus = status;
2693 return;
2694 }
2695
2696 fOtherSet->complement();
2697 fOtherSet->removeAll(*fSepSet);
2698 fOtherSet->removeAll(*fFormatSet);
2699 fOtherSet->removeAll(*fSpSet);
2700 fOtherSet->removeAll(*fLowerSet);
2701 fOtherSet->removeAll(*fUpperSet);
2702 fOtherSet->removeAll(*fOLetterSet);
2703 fOtherSet->removeAll(*fNumericSet);
2704 fOtherSet->removeAll(*fATermSet);
2705 fOtherSet->removeAll(*fSContinueSet);
2706 fOtherSet->removeAll(*fSTermSet);
2707 fOtherSet->removeAll(*fCloseSet);
2708 fOtherSet->removeAll(*fExtendSet);
2709
2710 fSets->addElement(fSepSet, status);
2711 fSets->addElement(fFormatSet, status);
2712 fSets->addElement(fSpSet, status);
2713 fSets->addElement(fLowerSet, status);
2714 fSets->addElement(fUpperSet, status);
2715 fSets->addElement(fOLetterSet, status);
2716 fSets->addElement(fNumericSet, status);
2717 fSets->addElement(fATermSet, status);
2718 fSets->addElement(fSContinueSet, status);
2719 fSets->addElement(fSTermSet, status);
2720 fSets->addElement(fCloseSet, status);
2721 fSets->addElement(fOtherSet, status);
2722 fSets->addElement(fExtendSet, status);
2723
2724 if (U_FAILURE(status)) {
2725 deferredStatus = status;
2726 }
2727 }
2728
2729
2730
setText(const UnicodeString & s)2731 void RBBISentMonkey::setText(const UnicodeString &s) {
2732 fText = &s;
2733 }
2734
charClasses()2735 UVector *RBBISentMonkey::charClasses() {
2736 return fSets;
2737 }
2738
2739
2740 // moveBack() Find the "significant" code point preceding the index i.
2741 // Skips over ($Extend | $Format)* .
2742 //
moveBack(int i)2743 int RBBISentMonkey::moveBack(int i) {
2744 if (i <= 0) {
2745 return -1;
2746 }
2747 UChar32 c;
2748 int32_t j = i;
2749 do {
2750 j = fText->moveIndex32(j, -1);
2751 c = fText->char32At(j);
2752 }
2753 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2754 return j;
2755
2756 }
2757
2758
moveForward(int i)2759 int RBBISentMonkey::moveForward(int i) {
2760 if (i>=fText->length()) {
2761 return fText->length();
2762 }
2763 UChar32 c;
2764 int32_t j = i;
2765 do {
2766 j = fText->moveIndex32(j, 1);
2767 c = cAt(j);
2768 }
2769 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2770 return j;
2771 }
2772
cAt(int pos)2773 UChar32 RBBISentMonkey::cAt(int pos) {
2774 if (pos<0 || pos>=fText->length()) {
2775 return -1;
2776 } else {
2777 return fText->char32At(pos);
2778 }
2779 }
2780
next(int32_t prevPos)2781 int32_t RBBISentMonkey::next(int32_t prevPos) {
2782 int p0, p1, p2, p3; // Indices of the significant code points around the
2783 // break position being tested. The candidate break
2784 // location is before p2.
2785
2786 int breakPos = -1;
2787
2788 UChar32 c0, c1, c2, c3; // The code points at p0, p1, p2 & p3.
2789 UChar32 c;
2790
2791 if (U_FAILURE(deferredStatus)) {
2792 return -1;
2793 }
2794
2795 // Prev break at end of string. return DONE.
2796 if (prevPos >= fText->length()) {
2797 return -1;
2798 }
2799 p0 = p1 = p2 = p3 = prevPos;
2800 c3 = fText->char32At(prevPos);
2801 c0 = c1 = c2 = 0;
2802 (void)p0; // Suppress set but not used warning.
2803
2804 // Loop runs once per "significant" character position in the input text.
2805 for (;;) {
2806 // Move all of the positions forward in the input string.
2807 p0 = p1; c0 = c1;
2808 p1 = p2; c1 = c2;
2809 p2 = p3; c2 = c3;
2810
2811 // Advancd p3 by X(Extend | Format)* Rule 4
2812 p3 = moveForward(p3);
2813 c3 = cAt(p3);
2814
2815 // Rule (3) CR x LF
2816 if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2817 continue;
2818 }
2819
2820 // Rule (4). Sep <break>
2821 if (fSepSet->contains(c1)) {
2822 p2 = p1+1; // Separators don't combine with Extend or Format.
2823 break;
2824 }
2825
2826 if (p2 >= fText->length()) {
2827 // Reached end of string. Always a break position.
2828 break;
2829 }
2830
2831 if (p2 == prevPos) {
2832 // Still warming up the loop. (won't work with zero length strings, but we don't care)
2833 continue;
2834 }
2835
2836 // Rule (6). ATerm x Numeric
2837 if (fATermSet->contains(c1) && fNumericSet->contains(c2)) {
2838 continue;
2839 }
2840
2841 // Rule (7). (Upper | Lower) ATerm x Uppper
2842 if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2843 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2844 continue;
2845 }
2846
2847 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2848 // Note: STerm | ATerm are added to the negated part of the expression by a
2849 // note to the Unicode 5.0 documents.
2850 int p8 = p1;
2851 while (fSpSet->contains(cAt(p8))) {
2852 p8 = moveBack(p8);
2853 }
2854 while (fCloseSet->contains(cAt(p8))) {
2855 p8 = moveBack(p8);
2856 }
2857 if (fATermSet->contains(cAt(p8))) {
2858 p8=p2;
2859 for (;;) {
2860 c = cAt(p8);
2861 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2862 fLowerSet->contains(c) || fSepSet->contains(c) ||
2863 fATermSet->contains(c) || fSTermSet->contains(c)) {
2864 break;
2865 }
2866 p8 = moveForward(p8);
2867 }
2868 if (fLowerSet->contains(cAt(p8))) {
2869 continue;
2870 }
2871 }
2872
2873 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2874 if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2875 p8 = p1;
2876 while (fSpSet->contains(cAt(p8))) {
2877 p8 = moveBack(p8);
2878 }
2879 while (fCloseSet->contains(cAt(p8))) {
2880 p8 = moveBack(p8);
2881 }
2882 c = cAt(p8);
2883 if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2884 continue;
2885 }
2886 }
2887
2888 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF)
2889 int p9 = p1;
2890 while (fCloseSet->contains(cAt(p9))) {
2891 p9 = moveBack(p9);
2892 }
2893 c = cAt(p9);
2894 if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2895 if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2896 continue;
2897 }
2898 }
2899
2900 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF)
2901 int p10 = p1;
2902 while (fSpSet->contains(cAt(p10))) {
2903 p10 = moveBack(p10);
2904 }
2905 while (fCloseSet->contains(cAt(p10))) {
2906 p10 = moveBack(p10);
2907 }
2908 if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2909 if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2910 continue;
2911 }
2912 }
2913
2914 // Rule (11) (STerm | ATerm) Close* Sp* (Sep | CR | LF)? <break>
2915 int p11 = p1;
2916 if (fSepSet->contains(cAt(p11))) {
2917 p11 = moveBack(p11);
2918 }
2919 while (fSpSet->contains(cAt(p11))) {
2920 p11 = moveBack(p11);
2921 }
2922 while (fCloseSet->contains(cAt(p11))) {
2923 p11 = moveBack(p11);
2924 }
2925 if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2926 break;
2927 }
2928
2929 // Rule (12) Any x Any
2930 continue;
2931 }
2932 breakPos = p2;
2933 return breakPos;
2934 }
2935
~RBBISentMonkey()2936 RBBISentMonkey::~RBBISentMonkey() {
2937 delete fSets;
2938 delete fSepSet;
2939 delete fFormatSet;
2940 delete fSpSet;
2941 delete fLowerSet;
2942 delete fUpperSet;
2943 delete fOLetterSet;
2944 delete fNumericSet;
2945 delete fATermSet;
2946 delete fSContinueSet;
2947 delete fSTermSet;
2948 delete fCloseSet;
2949 delete fOtherSet;
2950 delete fExtendSet;
2951 }
2952
2953
2954
2955 //-------------------------------------------------------------------------------------------
2956 //
2957 // RBBILineMonkey
2958 //
2959 //-------------------------------------------------------------------------------------------
2960
2961 class RBBILineMonkey: public RBBIMonkeyKind {
2962 public:
2963 RBBILineMonkey();
2964 virtual ~RBBILineMonkey();
2965 virtual UVector *charClasses();
2966 virtual void setText(const UnicodeString &s);
2967 virtual int32_t next(int32_t i);
2968 virtual void rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2969 private:
2970 UVector *fSets;
2971
2972 UnicodeSet *fBK;
2973 UnicodeSet *fCR;
2974 UnicodeSet *fLF;
2975 UnicodeSet *fCM;
2976 UnicodeSet *fNL;
2977 UnicodeSet *fSG;
2978 UnicodeSet *fWJ;
2979 UnicodeSet *fZW;
2980 UnicodeSet *fGL;
2981 UnicodeSet *fCB;
2982 UnicodeSet *fSP;
2983 UnicodeSet *fB2;
2984 UnicodeSet *fBA;
2985 UnicodeSet *fBB;
2986 UnicodeSet *fHY;
2987 UnicodeSet *fH2;
2988 UnicodeSet *fH3;
2989 UnicodeSet *fCL;
2990 UnicodeSet *fCP;
2991 UnicodeSet *fEX;
2992 UnicodeSet *fIN;
2993 UnicodeSet *fJL;
2994 UnicodeSet *fJV;
2995 UnicodeSet *fJT;
2996 UnicodeSet *fNS;
2997 UnicodeSet *fOP;
2998 UnicodeSet *fQU;
2999 UnicodeSet *fIS;
3000 UnicodeSet *fNU;
3001 UnicodeSet *fPO;
3002 UnicodeSet *fPR;
3003 UnicodeSet *fSY;
3004 UnicodeSet *fAI;
3005 UnicodeSet *fAL;
3006 UnicodeSet *fCJ;
3007 UnicodeSet *fHL;
3008 UnicodeSet *fID;
3009 UnicodeSet *fRI;
3010 UnicodeSet *fXX;
3011 UnicodeSet *fEB;
3012 UnicodeSet *fEM;
3013 UnicodeSet *fZJ;
3014 UnicodeSet *fExtendedPict;
3015 UnicodeSet *fEmojiNRK;
3016
3017 BreakIterator *fCharBI;
3018 const UnicodeString *fText;
3019 RegexMatcher *fNumberMatcher;
3020 };
3021
RBBILineMonkey()3022 RBBILineMonkey::RBBILineMonkey() :
3023 RBBIMonkeyKind(),
3024 fSets(NULL),
3025
3026 fCharBI(NULL),
3027 fText(NULL),
3028 fNumberMatcher(NULL)
3029
3030 {
3031 if (U_FAILURE(deferredStatus)) {
3032 return;
3033 }
3034
3035 UErrorCode status = U_ZERO_ERROR;
3036
3037 fSets = new UVector(status);
3038
3039 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3040 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3041 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3042 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3043 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3044 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3045 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3046 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3047 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3048 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3049 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3050 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3051 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3052 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3053 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3054 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3055 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3056 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3057 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3058 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3059 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3060 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3061 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3062 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3063 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3064 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3065 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3066 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3067 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3068 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3069 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3070 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3071 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3072 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3073 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3074 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3075 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3076 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3077 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3078 fEB = new UnicodeSet(UNICODE_STRING_SIMPLE(
3079 "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
3080 fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3081 fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3082 fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3083 fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
3084
3085 if (U_FAILURE(status)) {
3086 deferredStatus = status;
3087 return;
3088 }
3089
3090 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3091 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3092 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3093
3094 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3095 fCM->addAll(*fZJ); // ZWJ behaves as a CM.
3096
3097 fSets->addElement(fBK, status);
3098 fSets->addElement(fCR, status);
3099 fSets->addElement(fLF, status);
3100 fSets->addElement(fCM, status);
3101 fSets->addElement(fNL, status);
3102 fSets->addElement(fWJ, status);
3103 fSets->addElement(fZW, status);
3104 fSets->addElement(fGL, status);
3105 fSets->addElement(fCB, status);
3106 fSets->addElement(fSP, status);
3107 fSets->addElement(fB2, status);
3108 fSets->addElement(fBA, status);
3109 fSets->addElement(fBB, status);
3110 fSets->addElement(fHY, status);
3111 fSets->addElement(fH2, status);
3112 fSets->addElement(fH3, status);
3113 fSets->addElement(fCL, status);
3114 fSets->addElement(fCP, status);
3115 fSets->addElement(fEX, status);
3116 fSets->addElement(fIN, status);
3117 fSets->addElement(fJL, status);
3118 fSets->addElement(fJT, status);
3119 fSets->addElement(fJV, status);
3120 fSets->addElement(fNS, status);
3121 fSets->addElement(fOP, status);
3122 fSets->addElement(fQU, status);
3123 fSets->addElement(fIS, status);
3124 fSets->addElement(fNU, status);
3125 fSets->addElement(fPO, status);
3126 fSets->addElement(fPR, status);
3127 fSets->addElement(fSY, status);
3128 fSets->addElement(fAI, status);
3129 fSets->addElement(fAL, status);
3130 fSets->addElement(fHL, status);
3131 fSets->addElement(fID, status);
3132 fSets->addElement(fWJ, status);
3133 fSets->addElement(fRI, status);
3134 fSets->addElement(fSG, status);
3135 fSets->addElement(fEB, status);
3136 fSets->addElement(fEM, status);
3137 fSets->addElement(fZJ, status);
3138 fSets->addElement(fExtendedPict, status);
3139 fSets->addElement(fEmojiNRK, status);
3140
3141
3142 const char *rules =
3143 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3144 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3145 "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3146 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3147 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3148 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3149
3150 fNumberMatcher = new RegexMatcher(
3151 UnicodeString(rules, -1, US_INV), 0, status);
3152
3153 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3154
3155 if (U_FAILURE(status)) {
3156 deferredStatus = status;
3157 }
3158 }
3159
3160
setText(const UnicodeString & s)3161 void RBBILineMonkey::setText(const UnicodeString &s) {
3162 fText = &s;
3163 fCharBI->setText(s);
3164 fNumberMatcher->reset(s);
3165 }
3166
3167 //
3168 // rule9Adjust
3169 // Line Break TR rules 9 and 10 implementation.
3170 // This deals with combining marks and other sequences that
3171 // that must be treated as if they were something other than what they actually are.
3172 //
3173 // This is factored out into a separate function because it must be applied twice for
3174 // each potential break, once to the chars before the position being checked, then
3175 // again to the text following the possible break.
3176 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3177 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3178 if (pos == -1) {
3179 // Invalid initial position. Happens during the warmup iteration of the
3180 // main loop in next().
3181 return;
3182 }
3183
3184 int32_t nPos = *nextPos;
3185
3186 // LB 9 Keep combining sequences together.
3187 // advance over any CM class chars. Note that Line Break CM is different
3188 // from the normal Grapheme Extend property.
3189 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3190 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3191 for (;;) {
3192 *nextChar = fText->char32At(nPos);
3193 if (!fCM->contains(*nextChar)) {
3194 break;
3195 }
3196 nPos = fText->moveIndex32(nPos, 1);
3197 }
3198 }
3199
3200
3201 // LB 9 Treat X CM* as if it were x.
3202 // No explicit action required.
3203
3204 // LB 10 Treat any remaining combining mark as AL
3205 if (fCM->contains(*posChar)) {
3206 *posChar = 0x41; // thisChar = 'A';
3207 }
3208
3209 // Push the updated nextPos and nextChar back to our caller.
3210 // This only makes a difference if posChar got bigger by consuming a
3211 // combining sequence.
3212 *nextPos = nPos;
3213 *nextChar = fText->char32At(nPos);
3214 }
3215
3216
3217
next(int32_t startPos)3218 int32_t RBBILineMonkey::next(int32_t startPos) {
3219 UErrorCode status = U_ZERO_ERROR;
3220 int32_t pos; // Index of the char following a potential break position
3221 UChar32 thisChar; // Character at above position "pos"
3222
3223 int32_t prevPos; // Index of the char preceding a potential break position
3224 UChar32 prevChar; // Character at above position. Note that prevChar
3225 // and thisChar may not be adjacent because combining
3226 // characters between them will be ignored.
3227
3228 int32_t prevPosX2; // Second previous character. Wider context for LB21a.
3229 UChar32 prevCharX2;
3230
3231 int32_t nextPos; // Index of the next character following pos.
3232 // Usually skips over combining marks.
3233 int32_t nextCPPos; // Index of the code point following "pos."
3234 // May point to a combining mark.
3235 int32_t tPos; // temp value.
3236 UChar32 c;
3237
3238 if (U_FAILURE(deferredStatus)) {
3239 return -1;
3240 }
3241
3242 if (startPos >= fText->length()) {
3243 return -1;
3244 }
3245
3246
3247 // Initial values for loop. Loop will run the first time without finding breaks,
3248 // while the invalid values shift out and the "this" and
3249 // "prev" positions are filled in with good values.
3250 pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
3251 thisChar = prevChar = prevCharX2 = 0;
3252 nextPos = nextCPPos = startPos;
3253
3254
3255 // Loop runs once per position in the test text, until a break position
3256 // is found.
3257 for (;;) {
3258 prevPosX2 = prevPos;
3259 prevCharX2 = prevChar;
3260
3261 prevPos = pos;
3262 prevChar = thisChar;
3263
3264 pos = nextPos;
3265 thisChar = fText->char32At(pos);
3266
3267 nextCPPos = fText->moveIndex32(pos, 1);
3268 nextPos = nextCPPos;
3269
3270 // Rule LB2 - Break at end of text.
3271 if (pos >= fText->length()) {
3272 break;
3273 }
3274
3275 // Rule LB 9 - adjust for combining sequences.
3276 // We do this one out-of-order because the adjustment does not change anything
3277 // that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3278 // be applied.
3279 rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
3280 nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3281 c = fText->char32At(nextPos);
3282 rule9Adjust(pos, &thisChar, &nextPos, &c);
3283
3284 // If the loop is still warming up - if we haven't shifted the initial
3285 // -1 positions out of prevPos yet - loop back to advance the
3286 // position in the input without any further looking for breaks.
3287 if (prevPos == -1) {
3288 continue;
3289 }
3290
3291 // LB 4 Always break after hard line breaks,
3292 if (fBK->contains(prevChar)) {
3293 break;
3294 }
3295
3296 // LB 5 Break after CR, LF, NL, but not inside CR LF
3297 if (prevChar == 0x0d && thisChar == 0x0a) {
3298 continue;
3299 }
3300 if (prevChar == 0x0d ||
3301 prevChar == 0x0a ||
3302 prevChar == 0x85) {
3303 break;
3304 }
3305
3306 // LB 6 Don't break before hard line breaks
3307 if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3308 fBK->contains(thisChar)) {
3309 continue;
3310 }
3311
3312
3313 // LB 7 Don't break before spaces or zero-width space.
3314 if (fSP->contains(thisChar)) {
3315 continue;
3316 }
3317
3318 if (fZW->contains(thisChar)) {
3319 continue;
3320 }
3321
3322 // LB 8 Break after zero width space
3323 if (fZW->contains(prevChar)) {
3324 break;
3325 }
3326
3327 // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3328 // The monkey test's way of ignoring combining characters doesn't work
3329 // for this rule. ZJ is also a CM. Need to get the actual character
3330 // preceding "thisChar", not ignoring combining marks, possibly ZJ.
3331 {
3332 int32_t prevIdx = fText->moveIndex32(pos, -1);
3333 UChar32 prevC = fText->char32At(prevIdx);
3334 if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3335 continue;
3336 }
3337 }
3338
3339 // LB 9, 10 Already done, at top of loop.
3340 //
3341
3342
3343 // LB 11 Do not break before or after WORD JOINER and related characters.
3344 // x WJ
3345 // WJ x
3346 //
3347 if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3348 continue;
3349 }
3350
3351 // LB 12
3352 // GL x
3353 if (fGL->contains(prevChar)) {
3354 continue;
3355 }
3356
3357 // LB 12a
3358 // [^SP BA HY] x GL
3359 if (!(fSP->contains(prevChar) ||
3360 fBA->contains(prevChar) ||
3361 fHY->contains(prevChar) ) && fGL->contains(thisChar)) {
3362 continue;
3363 }
3364
3365
3366
3367 // LB 13 Don't break before closings.
3368 // NU x CL, NU x CP and NU x IS are not matched here so that they will
3369 // fall into LB 17 and the more general number regular expression.
3370 //
3371 if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3372 (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3373 fEX->contains(thisChar) ||
3374 (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3375 (!fNU->contains(prevChar) && fSY->contains(thisChar))) {
3376 continue;
3377 }
3378
3379 // LB 14 Don't break after OP SP*
3380 // Scan backwards, checking for this sequence.
3381 // The OP char could include combining marks, so we actually check for
3382 // OP CM* SP*
3383 // Another Twist: The Rule 67 fixes may have changed a SP CM
3384 // sequence into a ID char, so before scanning back through spaces,
3385 // verify that prevChar is indeed a space. The prevChar variable
3386 // may differ from fText[prevPos]
3387 tPos = prevPos;
3388 if (fSP->contains(prevChar)) {
3389 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3390 tPos=fText->moveIndex32(tPos, -1);
3391 }
3392 }
3393 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3394 tPos=fText->moveIndex32(tPos, -1);
3395 }
3396 if (fOP->contains(fText->char32At(tPos))) {
3397 continue;
3398 }
3399
3400
3401 // LB 15 QU SP* x OP
3402 if (fOP->contains(thisChar)) {
3403 // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3404 int tPos = prevPos;
3405 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3406 tPos = fText->moveIndex32(tPos, -1);
3407 }
3408 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3409 tPos = fText->moveIndex32(tPos, -1);
3410 }
3411 if (fQU->contains(fText->char32At(tPos))) {
3412 continue;
3413 }
3414 }
3415
3416
3417
3418 // LB 16 (CL | CP) SP* x NS
3419 // Scan backwards for SP* CM* (CL | CP)
3420 if (fNS->contains(thisChar)) {
3421 int tPos = prevPos;
3422 while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3423 tPos = fText->moveIndex32(tPos, -1);
3424 }
3425 while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3426 tPos = fText->moveIndex32(tPos, -1);
3427 }
3428 if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3429 continue;
3430 }
3431 }
3432
3433
3434 // LB 17 B2 SP* x B2
3435 if (fB2->contains(thisChar)) {
3436 // Scan backwards, checking for the B2 CM* SP* sequence.
3437 tPos = prevPos;
3438 if (fSP->contains(prevChar)) {
3439 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3440 tPos=fText->moveIndex32(tPos, -1);
3441 }
3442 }
3443 while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3444 tPos=fText->moveIndex32(tPos, -1);
3445 }
3446 if (fB2->contains(fText->char32At(tPos))) {
3447 continue;
3448 }
3449 }
3450
3451
3452 // LB 18 break after space
3453 if (fSP->contains(prevChar)) {
3454 break;
3455 }
3456
3457 // LB 19
3458 // x QU
3459 // QU x
3460 if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3461 continue;
3462 }
3463
3464 // LB 20 Break around a CB
3465 if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3466 break;
3467 }
3468
3469 // LB 21
3470 if (fBA->contains(thisChar) ||
3471 fHY->contains(thisChar) ||
3472 fNS->contains(thisChar) ||
3473 fBB->contains(prevChar) ) {
3474 continue;
3475 }
3476
3477 // LB 21a
3478 // HL (HY | BA) x
3479 if (fHL->contains(prevCharX2) &&
3480 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3481 continue;
3482 }
3483
3484 // LB 21b
3485 // SY x HL
3486 if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3487 continue;
3488 }
3489
3490 // LB 22
3491 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3492 (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3493 (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3494 ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3495 (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3496 (fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
3497 continue;
3498 }
3499
3500
3501 // LB 23 (AL | HL) x NU
3502 // NU x (AL | HL)
3503 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3504 continue;
3505 }
3506 if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3507 continue;
3508 }
3509
3510 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3511 // PR x (ID | EB | EM)
3512 // (ID | EB | EM) x PO
3513 if (fPR->contains(prevChar) &&
3514 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar))) {
3515 continue;
3516 }
3517 if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3518 fPO->contains(thisChar)) {
3519 continue;
3520 }
3521
3522 // LB 24 Do not break between prefix and letters or ideographs.
3523 // (PR | PO) x (AL | HL)
3524 // (AL | HL) x (PR | PO)
3525 if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3526 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3527 continue;
3528 }
3529 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3530 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3531 continue;
3532 }
3533
3534
3535
3536 // LB 25 Numbers
3537 if (fNumberMatcher->lookingAt(prevPos, status)) {
3538 if (U_FAILURE(status)) {
3539 break;
3540 }
3541 // Matched a number. But could have been just a single digit, which would
3542 // not represent a "no break here" between prevChar and thisChar
3543 int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
3544 if (numEndIdx > pos) {
3545 // Number match includes at least our two chars being checked
3546 if (numEndIdx > nextPos) {
3547 // Number match includes additional chars. Update pos and nextPos
3548 // so that next loop iteration will continue at the end of the number,
3549 // checking for breaks between last char in number & whatever follows.
3550 pos = nextPos = numEndIdx;
3551 do {
3552 pos = fText->moveIndex32(pos, -1);
3553 thisChar = fText->char32At(pos);
3554 } while (fCM->contains(thisChar));
3555 }
3556 continue;
3557 }
3558 }
3559
3560
3561 // LB 26 Do not break a Korean syllable.
3562 if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3563 fJV->contains(thisChar) ||
3564 fH2->contains(thisChar) ||
3565 fH3->contains(thisChar))) {
3566 continue;
3567 }
3568
3569 if ((fJV->contains(prevChar) || fH2->contains(prevChar)) &&
3570 (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3571 continue;
3572 }
3573
3574 if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3575 fJT->contains(thisChar)) {
3576 continue;
3577 }
3578
3579 // LB 27 Treat a Korean Syllable Block the same as ID.
3580 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3581 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3582 fIN->contains(thisChar)) {
3583 continue;
3584 }
3585 if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3586 fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3587 fPO->contains(thisChar)) {
3588 continue;
3589 }
3590 if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3591 fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3592 continue;
3593 }
3594
3595
3596
3597 // LB 28 Do not break between alphabetics ("at").
3598 if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3599 continue;
3600 }
3601
3602 // LB 29 Do not break between numeric punctuation and alphabetics ("e.g.").
3603 if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3604 continue;
3605 }
3606
3607 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3608 // (AL | NU) x OP
3609 // CP x (AL | NU)
3610 if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3611 continue;
3612 }
3613 if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3614 continue;
3615 }
3616
3617 // LB30a RI RI <break> RI
3618 // RI x RI
3619 if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3620 break;
3621 }
3622 if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3623 continue;
3624 }
3625
3626 // LB30b Emoji Base x Emoji Modifier
3627 if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3628 continue;
3629 }
3630
3631 // LB 31 Break everywhere else
3632 break;
3633
3634 }
3635
3636 return pos;
3637 }
3638
3639
charClasses()3640 UVector *RBBILineMonkey::charClasses() {
3641 return fSets;
3642 }
3643
3644
~RBBILineMonkey()3645 RBBILineMonkey::~RBBILineMonkey() {
3646 delete fSets;
3647
3648 delete fBK;
3649 delete fCR;
3650 delete fLF;
3651 delete fCM;
3652 delete fNL;
3653 delete fWJ;
3654 delete fZW;
3655 delete fGL;
3656 delete fCB;
3657 delete fSP;
3658 delete fB2;
3659 delete fBA;
3660 delete fBB;
3661 delete fHY;
3662 delete fH2;
3663 delete fH3;
3664 delete fCL;
3665 delete fCP;
3666 delete fEX;
3667 delete fIN;
3668 delete fJL;
3669 delete fJV;
3670 delete fJT;
3671 delete fNS;
3672 delete fOP;
3673 delete fQU;
3674 delete fIS;
3675 delete fNU;
3676 delete fPO;
3677 delete fPR;
3678 delete fSY;
3679 delete fAI;
3680 delete fAL;
3681 delete fCJ;
3682 delete fHL;
3683 delete fID;
3684 delete fRI;
3685 delete fSG;
3686 delete fXX;
3687 delete fEB;
3688 delete fEM;
3689 delete fZJ;
3690 delete fExtendedPict;
3691 delete fEmojiNRK;
3692
3693 delete fCharBI;
3694 delete fNumberMatcher;
3695 }
3696
3697
3698 //-------------------------------------------------------------------------------------------
3699 //
3700 // TestMonkey
3701 //
3702 // params
3703 // seed=nnnnn Random number starting seed.
3704 // Setting the seed allows errors to be reproduced.
3705 // loop=nnn Looping count. Controls running time.
3706 // -1: run forever.
3707 // 0 or greater: run length.
3708 //
3709 // type = char | word | line | sent | title
3710 //
3711 // Example:
3712 // intltest rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3713 //
3714 //-------------------------------------------------------------------------------------------
3715
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3716 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3717 int32_t val = defaultVal;
3718 name.append(" *= *(-?\\d+)");
3719 UErrorCode status = U_ZERO_ERROR;
3720 RegexMatcher m(name, params, 0, status);
3721 if (m.find()) {
3722 // The param exists. Convert the string to an int.
3723 char valString[100];
3724 int32_t paramLength = m.end(1, status) - m.start(1, status);
3725 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3726 paramLength = (int32_t)(sizeof(valString)-2);
3727 }
3728 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3729 val = strtol(valString, NULL, 10);
3730
3731 // Delete this parameter from the params string.
3732 m.reset();
3733 params = m.replaceFirst("", status);
3734 }
3735 U_ASSERT(U_SUCCESS(status));
3736 return val;
3737 }
3738 #endif
3739
3740 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3741 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3742 BreakIterator *bi,
3743 int expected[],
3744 int expectedcount)
3745 {
3746 int count = 0;
3747 int i = 0;
3748 int forward[50];
3749 bi->setText(ustr);
3750 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3751 forward[count] = i;
3752 if (count < expectedcount && expected[count] != i) {
3753 test->errln("break forward test failed: expected %d but got %d",
3754 expected[count], i);
3755 break;
3756 }
3757 count ++;
3758 }
3759 if (count != expectedcount) {
3760 printStringBreaks(ustr, expected, expectedcount);
3761 test->errln("break forward test failed: missed %d match",
3762 expectedcount - count);
3763 return;
3764 }
3765 // testing boundaries
3766 for (i = 1; i < expectedcount; i ++) {
3767 int j = expected[i - 1];
3768 if (!bi->isBoundary(j)) {
3769 printStringBreaks(ustr, expected, expectedcount);
3770 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3771 return;
3772 }
3773 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3774 if (bi->isBoundary(j)) {
3775 printStringBreaks(ustr, expected, expectedcount);
3776 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3777 return;
3778 }
3779 }
3780 }
3781
3782 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3783 count --;
3784 if (forward[count] != i) {
3785 printStringBreaks(ustr, expected, expectedcount);
3786 test->errln("happy break test previous() failed: expected %d but got %d",
3787 forward[count], i);
3788 break;
3789 }
3790 }
3791 if (count != 0) {
3792 printStringBreaks(ustr, expected, expectedcount);
3793 test->errln("break test previous() failed: missed a match");
3794 return;
3795 }
3796
3797 // testing preceding
3798 for (i = 0; i < expectedcount - 1; i ++) {
3799 // int j = expected[i] + 1;
3800 int j = ustr.moveIndex32(expected[i], 1);
3801 for (; j <= expected[i + 1]; j ++) {
3802 if (bi->preceding(j) != expected[i]) {
3803 printStringBreaks(ustr, expected, expectedcount);
3804 test->errln("preceding(): Not expecting boundary at position %d", j);
3805 return;
3806 }
3807 }
3808 }
3809 }
3810 #endif
3811
TestWordBreaks(void)3812 void RBBITest::TestWordBreaks(void)
3813 {
3814 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3815
3816 Locale locale("en");
3817 UErrorCode status = U_ZERO_ERROR;
3818 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3819 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3820 // Replaced any C+J characters in a row with a random sequence of characters
3821 // of the same length to make our C+J segmentation not get in the way.
3822 static const char *strlist[] =
3823 {
3824 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3825 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3826 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3827 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3828 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3829 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3830 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3831 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3832 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3833 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3834 "\\u2027\\U000e0067\\u0a47\\u00b7",
3835 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3836 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3837 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3838 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3839 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3840 "\\u0027\\u11af\\U000e0057\\u0602",
3841 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3842 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3843 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3844 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3845 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3846 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3847 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3848 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3849 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3850 "\\u18f4\\U000e0049\\u20e7\\u2027",
3851 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3852 "\\ua183\\u102d\\u0bec\\u003a",
3853 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3854 "\\u003a\\u0e57\\u0fad\\u002e",
3855 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3856 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3857 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3858 "\\u003a\\u0664\\u00b7\\u1fba",
3859 "\\u003b\\u0027\\u00b7\\u47a3",
3860 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3861 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3862 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3863 };
3864 int loop;
3865 if (U_FAILURE(status)) {
3866 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3867 return;
3868 }
3869 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3870 // printf("looping %d\n", loop);
3871 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3872 // RBBICharMonkey monkey;
3873 RBBIWordMonkey monkey;
3874
3875 int expected[50];
3876 int expectedcount = 0;
3877
3878 monkey.setText(ustr);
3879 int i;
3880 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3881 expected[expectedcount ++] = i;
3882 }
3883
3884 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3885 }
3886 delete bi;
3887 #endif
3888 }
3889
TestWordBoundary(void)3890 void RBBITest::TestWordBoundary(void)
3891 {
3892 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3893 Locale locale("en");
3894 UErrorCode status = U_ZERO_ERROR;
3895 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3896 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3897 UChar str[50];
3898 static const char *strlist[] =
3899 {
3900 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3901 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3902 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3903 "\\u2027\\U000e0067\\u0a47\\u00b7",
3904 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3905 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3906 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3907 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3908 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3909 "\\u0027\\u11af\\U000e0057\\u0602",
3910 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3911 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3912 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3913 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3914 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3915 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3916 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3917 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3918 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3919 "\\u58f4\\U000e0049\\u20e7\\u2027",
3920 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3921 "\\ua183\\u102d\\u0bec\\u003a",
3922 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3923 "\\u003a\\u0e57\\u0fad\\u002e",
3924 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3925 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3926 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3927 "\\u003a\\u0664\\u00b7\\u1fba",
3928 "\\u003b\\u0027\\u00b7\\u47a3",
3929 };
3930 int loop;
3931 if (U_FAILURE(status)) {
3932 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3933 return;
3934 }
3935 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3936 // printf("looping %d\n", loop);
3937 u_unescape(strlist[loop], str, 20);
3938 UnicodeString ustr(str);
3939 int forward[50];
3940 int count = 0;
3941
3942 bi->setText(ustr);
3943 int prev = 0;
3944 int i;
3945 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3946 forward[count ++] = i;
3947 if (i > prev) {
3948 int j;
3949 for (j = prev + 1; j < i; j ++) {
3950 if (bi->isBoundary(j)) {
3951 printStringBreaks(ustr, forward, count);
3952 errln("happy boundary test failed: expected %d not a boundary",
3953 j);
3954 return;
3955 }
3956 }
3957 }
3958 if (!bi->isBoundary(i)) {
3959 printStringBreaks(ustr, forward, count);
3960 errln("happy boundary test failed: expected %d a boundary",
3961 i);
3962 return;
3963 }
3964 prev = i;
3965 }
3966 }
3967 delete bi;
3968 }
3969
TestLineBreaks(void)3970 void RBBITest::TestLineBreaks(void)
3971 {
3972 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3973 Locale locale("en");
3974 UErrorCode status = U_ZERO_ERROR;
3975 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3976 const int32_t STRSIZE = 50;
3977 UChar str[STRSIZE];
3978 static const char *strlist[] =
3979 {
3980 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3981 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3982 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3983 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3984 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3985 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3986 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3987 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3988 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3989 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3990 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3991 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3992 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3993 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3994 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3995 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3996 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3997 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3998 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3999 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4000 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4001 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4002 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4003 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4004 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4005 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4006 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4007 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4008 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4009 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4010 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4011 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4012 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4013 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4014 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4015 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4016 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4017 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4018 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4019 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4020 };
4021 int loop;
4022 TEST_ASSERT_SUCCESS(status);
4023 if (U_FAILURE(status)) {
4024 return;
4025 }
4026 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4027 // printf("looping %d\n", loop);
4028 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4029 if (t >= STRSIZE) {
4030 TEST_ASSERT(FALSE);
4031 continue;
4032 }
4033
4034
4035 UnicodeString ustr(str);
4036 RBBILineMonkey monkey;
4037 if (U_FAILURE(monkey.deferredStatus)) {
4038 continue;
4039 }
4040
4041 const int EXPECTEDSIZE = 50;
4042 int expected[EXPECTEDSIZE];
4043 int expectedcount = 0;
4044
4045 monkey.setText(ustr);
4046 int i;
4047 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4048 if (expectedcount >= EXPECTEDSIZE) {
4049 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4050 return;
4051 }
4052 expected[expectedcount ++] = i;
4053 }
4054
4055 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4056 }
4057 delete bi;
4058 #endif
4059 }
4060
TestSentBreaks(void)4061 void RBBITest::TestSentBreaks(void)
4062 {
4063 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4064 Locale locale("en");
4065 UErrorCode status = U_ZERO_ERROR;
4066 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4067 UChar str[200];
4068 static const char *strlist[] =
4069 {
4070 "Now\ris\nthe\r\ntime\n\rfor\r\r",
4071 "This\n",
4072 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4073 "\"Sentence ending with a quote.\" Bye.",
4074 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
4075 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4076 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4077 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4078 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4079 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4080 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4081 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4082 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4083 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4084 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4085 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4086 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4087 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4088 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4089 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4090 };
4091 int loop;
4092 if (U_FAILURE(status)) {
4093 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4094 return;
4095 }
4096 for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4097 u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4098 UnicodeString ustr(str);
4099
4100 RBBISentMonkey monkey;
4101 if (U_FAILURE(monkey.deferredStatus)) {
4102 continue;
4103 }
4104
4105 const int EXPECTEDSIZE = 50;
4106 int expected[EXPECTEDSIZE];
4107 int expectedcount = 0;
4108
4109 monkey.setText(ustr);
4110 int i;
4111 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4112 if (expectedcount >= EXPECTEDSIZE) {
4113 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4114 return;
4115 }
4116 expected[expectedcount ++] = i;
4117 }
4118
4119 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4120 }
4121 delete bi;
4122 #endif
4123 }
4124
TestMonkey()4125 void RBBITest::TestMonkey() {
4126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4127
4128 UErrorCode status = U_ZERO_ERROR;
4129 int32_t loopCount = 500;
4130 int32_t seed = 1;
4131 UnicodeString breakType = "all";
4132 Locale locale("en");
4133 UBool useUText = FALSE;
4134
4135 if (quick == FALSE) {
4136 loopCount = 10000;
4137 }
4138
4139 if (fTestParams) {
4140 UnicodeString p(fTestParams);
4141 loopCount = getIntParam("loop", p, loopCount);
4142 seed = getIntParam("seed", p, seed);
4143
4144 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4145 if (m.find()) {
4146 breakType = m.group(1, status);
4147 m.reset();
4148 p = m.replaceFirst("", status);
4149 }
4150
4151 RegexMatcher u(" *utext", p, 0, status);
4152 if (u.find()) {
4153 useUText = TRUE;
4154 u.reset();
4155 p = u.replaceFirst("", status);
4156 }
4157
4158
4159 // m.reset(p);
4160 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4161 // Each option is stripped out of the option string as it is processed.
4162 // All options have been checked. The option string should have been completely emptied..
4163 char buf[100];
4164 p.extract(buf, sizeof(buf), NULL, status);
4165 buf[sizeof(buf)-1] = 0;
4166 errln("Unrecognized or extra parameter: %s\n", buf);
4167 return;
4168 }
4169
4170 }
4171
4172 if (breakType == "char" || breakType == "all") {
4173 RBBICharMonkey m;
4174 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4175 if (U_SUCCESS(status)) {
4176 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4177 if (breakType == "all" && useUText==FALSE) {
4178 // Also run a quick test with UText when "all" is specified
4179 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4180 }
4181 }
4182 else {
4183 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4184 }
4185 delete bi;
4186 }
4187
4188 if (breakType == "word" || breakType == "all") {
4189 logln("Word Break Monkey Test");
4190 RBBIWordMonkey m;
4191 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4192 if (U_SUCCESS(status)) {
4193 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4194 }
4195 else {
4196 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4197 }
4198 delete bi;
4199 }
4200
4201 if (breakType == "line" || breakType == "all") {
4202 logln("Line Break Monkey Test");
4203 RBBILineMonkey m;
4204 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4205 if (loopCount >= 10) {
4206 loopCount = loopCount / 5; // Line break runs slower than the others.
4207 }
4208 if (U_SUCCESS(status)) {
4209 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4210 }
4211 else {
4212 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4213 }
4214 delete bi;
4215 }
4216
4217 if (breakType == "sent" || breakType == "all" ) {
4218 logln("Sentence Break Monkey Test");
4219 RBBISentMonkey m;
4220 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4221 if (loopCount >= 10) {
4222 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4223 }
4224 if (U_SUCCESS(status)) {
4225 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4226 }
4227 else {
4228 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4229 }
4230 delete bi;
4231 }
4232
4233 #endif
4234 }
4235
4236 //
4237 // Run a RBBI monkey test. Common routine, for all break iterator types.
4238 // Parameters:
4239 // bi - the break iterator to use
4240 // mk - MonkeyKind, abstraction for obtaining expected results
4241 // name - Name of test (char, word, etc.) for use in error messages
4242 // seed - Seed for starting random number generator (parameter from user)
4243 // numIterations
4244 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4245 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4246 int32_t numIterations, UBool useUText) {
4247
4248 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4249
4250 const int32_t TESTSTRINGLEN = 500;
4251 UnicodeString testText;
4252 int32_t numCharClasses;
4253 UVector *chClasses;
4254 int expected[TESTSTRINGLEN*2 + 1];
4255 int expectedCount = 0;
4256 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4257 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4258 char reverseBreaks[TESTSTRINGLEN*2+1];
4259 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4260 char followingBreaks[TESTSTRINGLEN*2+1];
4261 char precedingBreaks[TESTSTRINGLEN*2+1];
4262 int i;
4263 int loopCount = 0;
4264
4265 m_seed = seed;
4266
4267 numCharClasses = mk.charClasses()->size();
4268 chClasses = mk.charClasses();
4269
4270 // Check for errors that occured during the construction of the MonkeyKind object.
4271 // Can't report them where they occured because errln() is a method coming from intlTest,
4272 // and is not visible outside of RBBITest :-(
4273 if (U_FAILURE(mk.deferredStatus)) {
4274 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4275 return;
4276 }
4277
4278 // Verify that the character classes all have at least one member.
4279 for (i=0; i<numCharClasses; i++) {
4280 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4281 if (s == NULL || s->size() == 0) {
4282 errln("Character Class #%d is null or of zero size.", i);
4283 return;
4284 }
4285 }
4286
4287 while (loopCount < numIterations || numIterations == -1) {
4288 if (numIterations == -1 && loopCount % 10 == 0) {
4289 // If test is running in an infinite loop, display a periodic tic so
4290 // we can tell that it is making progress.
4291 fprintf(stderr, ".");
4292 }
4293 // Save current random number seed, so that we can recreate the random numbers
4294 // for this loop iteration in event of an error.
4295 seed = m_seed;
4296
4297 // Populate a test string with data.
4298 testText.truncate(0);
4299 for (i=0; i<TESTSTRINGLEN; i++) {
4300 int32_t aClassNum = m_rand() % numCharClasses;
4301 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4302 int32_t charIdx = m_rand() % classSet->size();
4303 UChar32 c = classSet->charAt(charIdx);
4304 if (c < 0) { // TODO: deal with sets containing strings.
4305 errln("%s:%d c < 0", __FILE__, __LINE__);
4306 break;
4307 }
4308 // Do not assemble a supplementary character from randomly generated separate surrogates.
4309 // (It could be a dictionary character)
4310 if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4311 continue;
4312 }
4313
4314 testText.append(c);
4315 }
4316
4317 // Calculate the expected results for this test string.
4318 mk.setText(testText);
4319 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4320 expectedBreaks[0] = 1;
4321 int32_t breakPos = 0;
4322 expectedCount = 0;
4323 for (;;) {
4324 breakPos = mk.next(breakPos);
4325 if (breakPos == -1) {
4326 break;
4327 }
4328 if (breakPos > testText.length()) {
4329 errln("breakPos > testText.length()");
4330 }
4331 expectedBreaks[breakPos] = 1;
4332 U_ASSERT(expectedCount<testText.length());
4333 expected[expectedCount ++] = breakPos;
4334 (void)expected; // Set but not used warning.
4335 // TODO (andy): check it out.
4336 }
4337
4338 // Find the break positions using forward iteration
4339 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4340 if (useUText) {
4341 UErrorCode status = U_ZERO_ERROR;
4342 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4343 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4344 bi->setText(testUText, status);
4345 TEST_ASSERT_SUCCESS(status);
4346 utext_close(testUText); // The break iterator does a shallow clone of the UText
4347 // This UText can be closed immediately, so long as the
4348 // testText string continues to exist.
4349 } else {
4350 bi->setText(testText);
4351 }
4352
4353 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4354 if (i < 0 || i > testText.length()) {
4355 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4356 break;
4357 }
4358 forwardBreaks[i] = 1;
4359 }
4360
4361 // Find the break positions using reverse iteration
4362 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4363 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4364 if (i < 0 || i > testText.length()) {
4365 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4366 break;
4367 }
4368 reverseBreaks[i] = 1;
4369 }
4370
4371 // Find the break positions using isBoundary() tests.
4372 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4373 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4374 for (i=0; i<=testText.length(); i++) {
4375 isBoundaryBreaks[i] = bi->isBoundary(i);
4376 }
4377
4378
4379 // Find the break positions using the following() function.
4380 // printf(".");
4381 memset(followingBreaks, 0, sizeof(followingBreaks));
4382 int32_t lastBreakPos = 0;
4383 followingBreaks[0] = 1;
4384 for (i=0; i<testText.length(); i++) {
4385 breakPos = bi->following(i);
4386 if (breakPos <= i ||
4387 breakPos < lastBreakPos ||
4388 breakPos > testText.length() ||
4389 (breakPos > lastBreakPos && lastBreakPos > i)) {
4390 errln("%s break monkey test: "
4391 "Out of range value returned by BreakIterator::following().\n"
4392 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4393 name, seed, i, breakPos, lastBreakPos);
4394 break;
4395 }
4396 followingBreaks[breakPos] = 1;
4397 lastBreakPos = breakPos;
4398 }
4399
4400 // Find the break positions using the preceding() function.
4401 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4402 lastBreakPos = testText.length();
4403 precedingBreaks[testText.length()] = 1;
4404 for (i=testText.length(); i>0; i--) {
4405 breakPos = bi->preceding(i);
4406 if (breakPos >= i ||
4407 breakPos > lastBreakPos ||
4408 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4409 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4410 errln("%s break monkey test: "
4411 "Out of range value returned by BreakIterator::preceding().\n"
4412 "index=%d; prev returned %d; lastBreak=%d" ,
4413 name, i, breakPos, lastBreakPos);
4414 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4415 precedingBreaks[i] = 2; // Forces an error.
4416 }
4417 } else {
4418 if (breakPos >= 0) {
4419 precedingBreaks[breakPos] = 1;
4420 }
4421 lastBreakPos = breakPos;
4422 }
4423 }
4424
4425 // Compare the expected and actual results.
4426 for (i=0; i<=testText.length(); i++) {
4427 const char *errorType = NULL;
4428 if (forwardBreaks[i] != expectedBreaks[i]) {
4429 errorType = "next()";
4430 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4431 errorType = "previous()";
4432 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4433 errorType = "isBoundary()";
4434 } else if (followingBreaks[i] != expectedBreaks[i]) {
4435 errorType = "following()";
4436 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4437 errorType = "preceding()";
4438 }
4439
4440
4441 if (errorType != NULL) {
4442 // Format a range of the test text that includes the failure as
4443 // a data item that can be included in the rbbi test data file.
4444
4445 // Start of the range is the last point where expected and actual results
4446 // both agreed that there was a break position.
4447 int startContext = i;
4448 int32_t count = 0;
4449 for (;;) {
4450 if (startContext==0) { break; }
4451 startContext --;
4452 if (expectedBreaks[startContext] != 0) {
4453 if (count == 2) break;
4454 count ++;
4455 }
4456 }
4457
4458 // End of range is two expected breaks past the start position.
4459 int endContext = i + 1;
4460 int ci;
4461 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4462 for (;;) {
4463 if (endContext >= testText.length()) {break;}
4464 if (expectedBreaks[endContext-1] != 0) {
4465 if (count == 0) break;
4466 count --;
4467 }
4468 endContext ++;
4469 }
4470 }
4471
4472 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4473 UnicodeString errorText = "<data>";
4474 /***if (strcmp(errorType, "next()") == 0) {
4475 startContext = 0;
4476 endContext = testText.length();
4477
4478 printStringBreaks(testText, expected, expectedCount);
4479 }***/
4480
4481 for (ci=startContext; ci<endContext;) {
4482 UnicodeString hexChars("0123456789abcdef");
4483 UChar32 c;
4484 int bn;
4485 c = testText.char32At(ci);
4486 if (ci == i) {
4487 // This is the location of the error.
4488 errorText.append("<?>");
4489 } else if (expectedBreaks[ci] != 0) {
4490 // This a non-error expected break position.
4491 errorText.append("\\");
4492 }
4493 if (c < 0x10000) {
4494 errorText.append("\\u");
4495 for (bn=12; bn>=0; bn-=4) {
4496 errorText.append(hexChars.charAt((c>>bn)&0xf));
4497 }
4498 } else {
4499 errorText.append("\\U");
4500 for (bn=28; bn>=0; bn-=4) {
4501 errorText.append(hexChars.charAt((c>>bn)&0xf));
4502 }
4503 }
4504 ci = testText.moveIndex32(ci, 1);
4505 }
4506 errorText.append("\\");
4507 errorText.append("</data>\n");
4508
4509 // Output the error
4510 char charErrorTxt[500];
4511 UErrorCode status = U_ZERO_ERROR;
4512 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4513 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4514 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4515
4516 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4517 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4518 errorType, seed, i, charErrorTxt);
4519 break;
4520 }
4521 }
4522
4523 loopCount++;
4524 }
4525 #endif
4526 }
4527
4528
4529 // Bug 5532. UTF-8 based UText fails in dictionary code.
4530 // This test checks the initial patch,
4531 // which is to just keep it from crashing. Correct word boundaries
4532 // await a proper fix to the dictionary code.
4533 //
TestBug5532(void)4534 void RBBITest::TestBug5532(void) {
4535 // Text includes a mixture of Thai and Latin.
4536 const unsigned char utf8Data[] = {
4537 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4538 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4539 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4540 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4541 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4542 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4543 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4544 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4545 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4546 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4547 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4548
4549 UErrorCode status = U_ZERO_ERROR;
4550 UText utext=UTEXT_INITIALIZER;
4551 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4552 TEST_ASSERT_SUCCESS(status);
4553
4554 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4555 TEST_ASSERT_SUCCESS(status);
4556 if (U_SUCCESS(status)) {
4557 bi->setText(&utext, status);
4558 TEST_ASSERT_SUCCESS(status);
4559
4560 int32_t breakCount = 0;
4561 int32_t previousBreak = -1;
4562 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4563 // For now, just make sure that the break iterator doesn't hang.
4564 TEST_ASSERT(previousBreak < bi->current());
4565 previousBreak = bi->current();
4566 }
4567 TEST_ASSERT(breakCount > 0);
4568 }
4569 delete bi;
4570 utext_close(&utext);
4571 }
4572
4573
TestBug9983(void)4574 void RBBITest::TestBug9983(void) {
4575 UnicodeString text = UnicodeString("\\u002A" // * Other
4576 "\\uFF65" // Other
4577 "\\u309C" // Katakana
4578 "\\uFF9F" // Extend
4579 "\\uFF65" // Other
4580 "\\u0020" // Other
4581 "\\u0000").unescape();
4582
4583 UErrorCode status = U_ZERO_ERROR;
4584 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4585 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4586 TEST_ASSERT_SUCCESS(status);
4587 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4588 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4589 TEST_ASSERT_SUCCESS(status);
4590 if (U_FAILURE(status)) {
4591 return;
4592 }
4593 int32_t offset, rstatus, iterationCount;
4594
4595 brkiter->setText(text);
4596 brkiter->last();
4597 iterationCount = 0;
4598 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4599 iterationCount++;
4600 rstatus = brkiter->getRuleStatus();
4601 (void)rstatus; // Suppress set but not used warning.
4602 if (iterationCount >= 10) {
4603 break;
4604 }
4605 }
4606 TEST_ASSERT(iterationCount == 6);
4607
4608 brkiterPOSIX->setText(text);
4609 brkiterPOSIX->last();
4610 iterationCount = 0;
4611 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4612 iterationCount++;
4613 rstatus = brkiterPOSIX->getRuleStatus();
4614 (void)rstatus; // Suppress set but not used warning.
4615 if (iterationCount >= 10) {
4616 break;
4617 }
4618 }
4619 TEST_ASSERT(iterationCount == 6);
4620 }
4621
4622 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4623 //
TestBug7547()4624 void RBBITest::TestBug7547() {
4625 UnicodeString rules;
4626 UErrorCode status = U_ZERO_ERROR;
4627 UParseError parseError;
4628 RuleBasedBreakIterator breakIterator(rules, parseError, status);
4629 if (status != U_BRK_RULE_SYNTAX) {
4630 errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4631 }
4632 if (parseError.line != 1 || parseError.offset != 0) {
4633 errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4634 }
4635 }
4636
4637
TestBug12797()4638 void RBBITest::TestBug12797() {
4639 UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4640 UErrorCode status = U_ZERO_ERROR;
4641 UParseError parseError;
4642 RuleBasedBreakIterator bi(rules, parseError, status);
4643 if (U_FAILURE(status)) {
4644 errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4645 return;
4646 }
4647 UnicodeString text = "abc";
4648 bi.setText(text);
4649 bi.first();
4650 int32_t boundary = bi.next();
4651 if (boundary != 3) {
4652 errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4653 }
4654 }
4655
TestBug12918()4656 void RBBITest::TestBug12918() {
4657 // This test triggers an assertion failure in dictbe.cpp
4658 const UChar crasherString[] = { 0x3325, 0x4a16, 0 };
4659 UErrorCode status = U_ZERO_ERROR;
4660 UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4661 if (U_FAILURE(status)) {
4662 errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4663 return;
4664 }
4665 ubrk_first(iter);
4666 int32_t pos = 0;
4667 int32_t lastPos = -1;
4668 while((pos = ubrk_next(iter)) != UBRK_DONE) {
4669 if (pos <= lastPos) {
4670 errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4671 break;
4672 }
4673 }
4674 ubrk_close(iter);
4675 }
4676
//
//  TestDebug -  A placeholder test for debugging purposes.
//               Fragments of other tests can be put here so that they can be
//               invoked for tracing without a lot of unwanted extra stuff
//               happening. The body is intentionally empty.
//
void RBBITest::TestDebug(void) {

}
4685
TestProperties()4686 void RBBITest::TestProperties() {
4687 UErrorCode errorCode = U_ZERO_ERROR;
4688 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4689 if (!prependSet.isEmpty()) {
4690 errln(
4691 "[:GCB=Prepend:] is not empty any more. "
4692 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4693 "change this test to the opposite condition.");
4694 }
4695 }
4696
4697 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4698