1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6 /************************************************************************
7 * Date Name Description
8 * 12/15/99 Madhu Creation.
9 * 01/12/2000 Madhu Updated for changed API and added new tests
10 ************************************************************************/
11
12 #include "utypeinfo.h" // for 'typeid' to work
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
28 #endif
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
31 #include "intltest.h"
32 #include "rbbitst.h"
33 #include <string.h>
34 #include "charstr.h"
35 #include "uvector.h"
36 #include "uvectr32.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
41 #include "cmemory.h"
42
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
46
// TEST_ASSERT(x): if the condition x is false, report a test failure through errln(),
//                 including the source file and line of the macro invocation.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): if errcode is a failing UErrorCode, report it through
//                 errcheckln(), including the source location and the error's name.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
52
53
54 //---------------------------------------------
55 // runIndexedTest
56 //---------------------------------------------
57
58
59 // Note: Before adding new tests to this file, check whether the desired test data can
60 // simply be added to the file testdata/rbbitest.txt. In most cases it can,
61 // it's much less work than writing a new test, diagnostic output in the event of failures
//       is good, and the test data file is shared with ICU4J, so eventually the test
63 // will run there as well, without additional effort.
64
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
66 {
67 if (exec) logln("TestSuite RuleBasedBreakIterator: ");
68
69 switch (index) {
70 #if !UCONFIG_NO_FILE_IO
71 case 0: name = "TestBug4153072";
72 if(exec) TestBug4153072(); break;
73 #else
74 case 0: name = "skip";
75 break;
76 #endif
77
78 case 1: name = "skip";
79 break;
80 case 2: name = "TestStatusReturn";
81 if(exec) TestStatusReturn(); break;
82
83 #if !UCONFIG_NO_FILE_IO
84 case 3: name = "TestUnicodeFiles";
85 if(exec) TestUnicodeFiles(); break;
86 case 4: name = "TestEmptyString";
87 if(exec) TestEmptyString(); break;
88 #else
89 case 3: case 4: name = "skip";
90 break;
91 #endif
92
93 case 5: name = "TestGetAvailableLocales";
94 if(exec) TestGetAvailableLocales(); break;
95
96 case 6: name = "TestGetDisplayName";
97 if(exec) TestGetDisplayName(); break;
98
99 #if !UCONFIG_NO_FILE_IO
100 case 7: name = "TestEndBehaviour";
101 if(exec) TestEndBehaviour(); break;
102 case 8: case 9: case 10: name = "skip";
103 break;
104 case 11: name = "TestWordBreaks";
105 if(exec) TestWordBreaks(); break;
106 case 12: name = "TestWordBoundary";
107 if(exec) TestWordBoundary(); break;
108 case 13: name = "TestLineBreaks";
109 if(exec) TestLineBreaks(); break;
110 case 14: name = "TestSentBreaks";
111 if(exec) TestSentBreaks(); break;
112 case 15: name = "TestExtended";
113 if(exec) TestExtended(); break;
114 #else
115 case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
116 break;
117 #endif
118
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
120 case 16:
121 name = "TestMonkey"; if(exec) TestMonkey(params); break;
122 #else
123 case 16:
124 name = "skip"; break;
125 #endif
126
127 #if !UCONFIG_NO_FILE_IO
128 case 17: name = "TestBug3818";
129 if(exec) TestBug3818(); break;
130 #else
131 case 17: name = "skip";
132 break;
133 #endif
134
135 case 18: name = "skip";
136 break;
137 case 19: name = "TestDebug";
138 if(exec) TestDebug(); break;
139 case 20: name = "skip";
140 break;
141
142 #if !UCONFIG_NO_FILE_IO
143 case 21: name = "TestBug5775";
144 if (exec) TestBug5775(); break;
145 #else
146 case 21: name = "skip";
147 break;
148 #endif
149
150 case 22: name = "TestBug9983";
151 if (exec) TestBug9983(); break;
152 case 23: name = "TestDictRules";
153 if (exec) TestDictRules(); break;
154 case 24: name = "TestBug5532";
155 if (exec) TestBug5532(); break;
156 default: name = ""; break; //needed to end loop
157 }
158 }
159
160
161 //---------------------------------------------------------------------------
162 //
163 // class BITestData Holds a set of Break iterator test data and results
164 // Includes
165 // - the string data to be broken
166 // - a vector of the expected break positions.
167 // - a vector of source line numbers for the data,
//                                     (to help see where errors occurred.)
169 // - The expected break tag values.
170 // - Vectors of actual break positions and tag values.
171 // - Functions for comparing actual with expected and
172 // reporting errors.
173 //
174 //----------------------------------------------------------------------------
175 class BITestData {
176 public:
177 UnicodeString fDataToBreak;
178 UVector fExpectedBreakPositions;
179 UVector fExpectedTags;
180 UVector fLineNum;
181 UVector fActualBreakPositions; // Test Results.
182 UVector fActualTags;
183
184 BITestData(UErrorCode &status);
185 void addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
186 void checkResults(const char *heading, RBBITest *test);
187 void err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
188 void clearResults();
189 };
190
191 //
192 // Constructor.
193 //
BITestData(UErrorCode & status)194 BITestData::BITestData(UErrorCode &status)
195 : fExpectedBreakPositions(status), fExpectedTags(status), fLineNum(status), fActualBreakPositions(status),
196 fActualTags(status)
197 {
198 }
199
200 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
202 // The macro form collects the line number, which is helpful
203 // when tracking down failures.
204 //
205 // A null data item is inserted at the start of each test's data
206 // to put the starting zero into the data list. The position saved for
207 // each non-null item is its ending position.
208 //
209 #define ADD_DATACHUNK(td, data, tag, status) td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
211 if (U_FAILURE(status)) {return;}
212 if (data != NULL) {
213 fDataToBreak.append(CharsToUnicodeString(data));
214 }
215 fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
216 fExpectedTags.addElement(tag, status);
217 fLineNum.addElement(lineNum, status);
218 }
219
220
221 //
222 // checkResults. Compare the actual and expected break positions, report any differences.
223 //
checkResults(const char * heading,RBBITest * test)224 void BITestData::checkResults(const char *heading, RBBITest *test) {
225 int32_t expectedIndex = 0;
226 int32_t actualIndex = 0;
227
228 for (;;) {
229 // If we've run through both the expected and actual results vectors, we're done.
230 // break out of the loop.
231 if (expectedIndex >= fExpectedBreakPositions.size() &&
232 actualIndex >= fActualBreakPositions.size()) {
233 break;
234 }
235
236
237 if (expectedIndex >= fExpectedBreakPositions.size()) {
238 err(heading, test, expectedIndex-1, actualIndex);
239 actualIndex++;
240 continue;
241 }
242
243 if (actualIndex >= fActualBreakPositions.size()) {
244 err(heading, test, expectedIndex, actualIndex-1);
245 expectedIndex++;
246 continue;
247 }
248
249 if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
250 err(heading, test, expectedIndex, actualIndex);
251 // Try to resync the positions of the indices, to avoid a rash of spurious erros.
252 if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
253 actualIndex++;
254 } else {
255 expectedIndex++;
256 }
257 continue;
258 }
259
260 if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
261 test->errln("%s, tag mismatch. Test Line = %d, expected tag=%d, got %d",
262 heading, fLineNum.elementAt(expectedIndex),
263 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
264 }
265
266 actualIndex++;
267 expectedIndex++;
268 }
269 }
270
271 //
272 // err - An error was found. Report it, along with information about where the
273 // incorrectly broken test data appeared in the source file.
274 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)275 void BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
276 {
277 int32_t expected = fExpectedBreakPositions.elementAti(expectedIdx);
278 int32_t actual = fActualBreakPositions.elementAti(actualIdx);
279 int32_t o = 0;
280 int32_t line = fLineNum.elementAti(expectedIdx);
281 if (expectedIdx > 0) {
282 // The line numbers are off by one because a premature break occurs somewhere
283 // within the previous item, rather than at the start of the current (expected) item.
284 // We want to report the offset of the unexpected break from the start of
285 // this previous item.
286 o = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
287 }
288 if (actual < expected) {
289 test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d expected break: %d", heading, o, line, actual, expected);
290 } else {
291 test->errln("%s Failed to find break at end of item from line %d. actual break: %d expected break: %d", heading, line, actual, expected);
292 }
293 }
294
295
clearResults()296 void BITestData::clearResults() {
297 fActualBreakPositions.removeAllElements();
298 fActualTags.removeAllElements();
299 }
300
301
302 //--------------------------------------------------------------------------------------
303 //
304 // RBBITest constructor and destructor
305 //
306 //--------------------------------------------------------------------------------------
307
RBBITest()308 RBBITest::RBBITest() {
309 }
310
311
~RBBITest()312 RBBITest::~RBBITest() {
313 }
314
315 //-----------------------------------------------------------------------------------
316 //
317 // Test for status {tag} return value from break rules.
318 // TODO: a more thorough test.
319 //
320 //-----------------------------------------------------------------------------------
TestStatusReturn()321 void RBBITest::TestStatusReturn() {
322 UnicodeString rulesString1("$Letters = [:L:];\n"
323 "$Numbers = [:N:];\n"
324 "$Letters+{1};\n"
325 "$Numbers+{2};\n"
326 "Help\\ {4}/me\\!;\n"
327 "[^$Letters $Numbers];\n"
328 "!.*;\n", -1, US_INV);
329 UnicodeString testString1 = "abc123..abc Help me Help me!";
330 // 01234567890123456789012345678
331 int32_t bounds1[] = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332 int32_t brkStatus[] = {0, 1, 2, 0, 0, 1, 0, 1, 0, 1, 0, 4, 1, 0, -1};
333
334 UErrorCode status=U_ZERO_ERROR;
335 UParseError parseError;
336
337 BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
338 if(U_FAILURE(status)) {
339 dataerrln("FAIL : in construction - %s", u_errorName(status));
340 } else {
341 int32_t pos;
342 int32_t i = 0;
343 bi->setText(testString1);
344 for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
345 if (pos != bounds1[i]) {
346 errln("FAIL: expected break at %d, got %d\n", bounds1[i], pos);
347 break;
348 }
349
350 int tag = bi->getRuleStatus();
351 if (tag != brkStatus[i]) {
352 errln("FAIL: break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
353 break;
354 }
355 i++;
356 }
357 }
358 delete bi;
359 }
360
361
printStringBreaks(UText * tstr,int expected[],int expectedCount)362 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
363 UErrorCode status = U_ZERO_ERROR;
364 char name[100];
365 printf("code alpha extend alphanum type word sent line name\n");
366 int nextExpectedIndex = 0;
367 utext_setNativeIndex(tstr, 0);
368 for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
369 if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
370 printf("------------------------------------------------ %d\n", j);
371 ++nextExpectedIndex;
372 }
373
374 UChar32 c = utext_next32(tstr);
375 u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
376 printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
377 u_isUAlphabetic(c),
378 u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
379 u_isalnum(c),
380 u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
381 u_charType(c),
382 U_SHORT_PROPERTY_NAME),
383 u_getPropertyValueName(UCHAR_WORD_BREAK,
384 u_getIntPropertyValue(c,
385 UCHAR_WORD_BREAK),
386 U_SHORT_PROPERTY_NAME),
387 u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
388 u_getIntPropertyValue(c,
389 UCHAR_SENTENCE_BREAK),
390 U_SHORT_PROPERTY_NAME),
391 u_getPropertyValueName(UCHAR_LINE_BREAK,
392 u_getIntPropertyValue(c,
393 UCHAR_LINE_BREAK),
394 U_SHORT_PROPERTY_NAME),
395 name);
396 }
397 }
398
399
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)400 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
401 UErrorCode status = U_ZERO_ERROR;
402 UText *tstr = NULL;
403 tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
404 if (U_FAILURE(status)) {
405 printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
406 return;
407 }
408 printStringBreaks(tstr, expected, expectedCount);
409 utext_close(tstr);
410 }
411
412
TestBug3818()413 void RBBITest::TestBug3818() {
414 UErrorCode status = U_ZERO_ERROR;
415
416 // Four Thai words...
417 static const UChar thaiWordData[] = { 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
418 0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
419 UnicodeString thaiStr(thaiWordData);
420
421 BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
422 if (U_FAILURE(status) || bi == NULL) {
423 errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
424 return;
425 }
426 bi->setText(thaiStr);
427
428 int32_t startOfSecondWord = bi->following(1);
429 if (startOfSecondWord != 4) {
430 errln("Fail at file %s, line %d expected start of word at 4, got %d",
431 __FILE__, __LINE__, startOfSecondWord);
432 }
433 startOfSecondWord = bi->following(0);
434 if (startOfSecondWord != 4) {
435 errln("Fail at file %s, line %d expected start of word at 4, got %d",
436 __FILE__, __LINE__, startOfSecondWord);
437 }
438 delete bi;
439 }
440
441 //----------------------------------------------------------------------------
442 //
443 // generalIteratorTest Given a break iterator and a set of test data,
444 // Run the tests and report the results.
445 //
446 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)447 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
448 {
449
450 bi.setText(td.fDataToBreak);
451
452 testFirstAndNext(bi, td);
453
454 testLastAndPrevious(bi, td);
455
456 testFollowing(bi, td);
457 testPreceding(bi, td);
458 testIsBoundary(bi, td);
459 doMultipleSelectionTest(bi, td);
460 }
461
462
463 //
464 // testFirstAndNext. Run the iterator forwards in the obvious first(), next()
465 // kind of loop.
466 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)467 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
468 {
469 UErrorCode status = U_ZERO_ERROR;
470 int32_t p;
471 int32_t lastP = -1;
472 int32_t tag;
473
474 logln("Test first and next");
475 bi.setText(td.fDataToBreak);
476 td.clearResults();
477
478 for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
479 td.fActualBreakPositions.addElement(p, status); // Save result.
480 tag = bi.getRuleStatus();
481 td.fActualTags.addElement(tag, status);
482 if (p <= lastP) {
483 // If the iterator is not making forward progress, stop.
484 // No need to raise an error here, it'll be detected in the normal check of results.
485 break;
486 }
487 lastP = p;
488 }
489 td.checkResults("testFirstAndNext", this);
490 }
491
492
493 //
494 // TestLastAndPrevious. Run the iterator backwards, starting with last().
495 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)496 void RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, BITestData &td)
497 {
498 UErrorCode status = U_ZERO_ERROR;
499 int32_t p;
500 int32_t lastP = 0x7ffffffe;
501 int32_t tag;
502
503 logln("Test last and previous");
504 bi.setText(td.fDataToBreak);
505 td.clearResults();
506
507 for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
508 // Save break position. Insert it at start of vector of results, shoving
509 // already-saved results further towards the end.
510 td.fActualBreakPositions.insertElementAt(p, 0, status);
511 // bi.previous(); // TODO: Why does this fix things up????
512 // bi.next();
513 tag = bi.getRuleStatus();
514 td.fActualTags.insertElementAt(tag, 0, status);
515 if (p >= lastP) {
516 // If the iterator is not making progress, stop.
517 // No need to raise an error here, it'll be detected in the normal check of results.
518 break;
519 }
520 lastP = p;
521 }
522 td.checkResults("testLastAndPrevious", this);
523 }
524
525
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)526 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
527 {
528 UErrorCode status = U_ZERO_ERROR;
529 int32_t p;
530 int32_t tag;
531 int32_t lastP = -2; // A value that will never be returned as a break position.
532 // cannot be -1; that is returned for DONE.
533 int i;
534
535 logln("testFollowing():");
536 bi.setText(td.fDataToBreak);
537 td.clearResults();
538
539 // Save the starting point, since we won't get that out of following.
540 p = bi.first();
541 td.fActualBreakPositions.addElement(p, status); // Save result.
542 tag = bi.getRuleStatus();
543 td.fActualTags.addElement(tag, status);
544
545 for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
546 p = bi.following(i);
547 if (p != lastP) {
548 if (p == RuleBasedBreakIterator::DONE) {
549 break;
550 }
551 // We've reached a new break position. Save it.
552 td.fActualBreakPositions.addElement(p, status); // Save result.
553 tag = bi.getRuleStatus();
554 td.fActualTags.addElement(tag, status);
555 lastP = p;
556 }
557 }
558 // The loop normally exits by means of the break in the middle.
559 // Make sure that the index was at the correct position for the break iterator to have
560 // returned DONE.
561 if (i != td.fDataToBreak.length()) {
562 errln("testFollowing(): iterator returned DONE prematurely.");
563 }
564
565 // Full check of all results.
566 td.checkResults("testFollowing", this);
567 }
568
569
570
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)571 void RBBITest::testPreceding(RuleBasedBreakIterator& bi, BITestData &td) {
572 UErrorCode status = U_ZERO_ERROR;
573 int32_t p;
574 int32_t tag;
575 int32_t lastP = 0x7ffffffe;
576 int i;
577
578 logln("testPreceding():");
579 bi.setText(td.fDataToBreak);
580 td.clearResults();
581
582 p = bi.last();
583 td.fActualBreakPositions.addElement(p, status);
584 tag = bi.getRuleStatus();
585 td.fActualTags.addElement(tag, status);
586
587 for (i = td.fDataToBreak.length(); i>=-1; i--) {
588 p = bi.preceding(i);
589 if (p != lastP) {
590 if (p == RuleBasedBreakIterator::DONE) {
591 break;
592 }
593 // We've reached a new break position. Save it.
594 td.fActualBreakPositions.insertElementAt(p, 0, status);
595 lastP = p;
596 tag = bi.getRuleStatus();
597 td.fActualTags.insertElementAt(tag, 0, status);
598 }
599 }
600 // The loop normally exits by means of the break in the middle.
601 // Make sure that the index was at the correct position for the break iterator to have
602 // returned DONE.
603 if (i != 0) {
604 errln("testPreceding(): iterator returned DONE prematurely.");
605 }
606
607 // Full check of all results.
608 td.checkResults("testPreceding", this);
609 }
610
611
612
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)613 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi, BITestData &td) {
614 UErrorCode status = U_ZERO_ERROR;
615 int i;
616 int32_t tag;
617
618 logln("testIsBoundary():");
619 bi.setText(td.fDataToBreak);
620 td.clearResults();
621
622 for (i = 0; i <= td.fDataToBreak.length(); i++) {
623 if (bi.isBoundary(i)) {
624 td.fActualBreakPositions.addElement(i, status); // Save result.
625 tag = bi.getRuleStatus();
626 td.fActualTags.addElement(tag, status);
627 }
628 }
629 td.checkResults("testIsBoundary: ", this);
630 }
631
632
633
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)634 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
635 {
636 iterator.setText(td.fDataToBreak);
637
638 RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
639 int32_t offset = iterator.first();
640 int32_t testOffset;
641 int32_t count = 0;
642
643 logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
644
645 if (*testIterator != iterator)
646 errln("clone() or operator!= failed: two clones compared unequal");
647
648 do {
649 testOffset = testIterator->first();
650 testOffset = testIterator->next(count);
651 if (offset != testOffset)
652 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
653
654 if (offset != RuleBasedBreakIterator::DONE) {
655 count++;
656 offset = iterator.next();
657
658 if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
659 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
660 if (count > 10000 || offset == -1) {
661 errln("operator== failed too many times. Stopping test.");
662 if (offset == -1) {
663 errln("Does (RuleBasedBreakIterator::DONE == -1)?");
664 }
665 return;
666 }
667 }
668 }
669 } while (offset != RuleBasedBreakIterator::DONE);
670
671 // now do it backwards...
672 offset = iterator.last();
673 count = 0;
674
675 do {
676 testOffset = testIterator->last();
677 testOffset = testIterator->next(count); // next() with a negative arg is same as previous
678 if (offset != testOffset)
679 errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
680
681 if (offset != RuleBasedBreakIterator::DONE) {
682 count--;
683 offset = iterator.previous();
684 }
685 } while (offset != RuleBasedBreakIterator::DONE);
686
687 delete testIterator;
688 }
689
690
691 //---------------------------------------------
692 //
693 // other tests
694 //
695 //---------------------------------------------
TestEmptyString()696 void RBBITest::TestEmptyString()
697 {
698 UnicodeString text = "";
699 UErrorCode status = U_ZERO_ERROR;
700
701 BITestData x(status);
702 ADD_DATACHUNK(x, "", 0, status); // Break at start of data
703 RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
704 if (U_FAILURE(status))
705 {
706 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
707 return;
708 }
709 generalIteratorTest(*bi, x);
710 delete bi;
711 }
712
TestGetAvailableLocales()713 void RBBITest::TestGetAvailableLocales()
714 {
715 int32_t locCount = 0;
716 const Locale* locList = BreakIterator::getAvailableLocales(locCount);
717
718 if (locCount == 0)
719 dataerrln("getAvailableLocales() returned an empty list!");
720 // Just make sure that it's returning good memory.
721 int32_t i;
722 for (i = 0; i < locCount; ++i) {
723 logln(locList[i].getName());
724 }
725 }
726
727 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()728 void RBBITest::TestGetDisplayName()
729 {
730 UnicodeString result;
731
732 BreakIterator::getDisplayName(Locale::getUS(), result);
733 if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
734 dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
735 + result);
736
737 BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
738 if (result != "French (France)")
739 dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
740 + result);
741 }
742 /**
743 * Test End Behaviour
744 * @bug 4068137
745 */
TestEndBehaviour()746 void RBBITest::TestEndBehaviour()
747 {
748 UErrorCode status = U_ZERO_ERROR;
749 UnicodeString testString("boo.");
750 BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
751 if (U_FAILURE(status))
752 {
753 errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
754 return;
755 }
756 wb->setText(testString);
757
758 if (wb->first() != 0)
759 errln("Didn't get break at beginning of string.");
760 if (wb->next() != 3)
761 errln("Didn't get break before period in \"boo.\"");
762 if (wb->current() != 4 && wb->next() != 4)
763 errln("Didn't get break at end of string.");
764 delete wb;
765 }
766 /*
767 * @bug 4153072
768 */
TestBug4153072()769 void RBBITest::TestBug4153072() {
770 UErrorCode status = U_ZERO_ERROR;
771 BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
772 if (U_FAILURE(status))
773 {
774 errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
775 return;
776 }
777 UnicodeString str("...Hello, World!...");
778 int32_t begin = 3;
779 int32_t end = str.length() - 3;
780 UBool onBoundary;
781
782 StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
783 iter->adoptText(textIterator);
784 int index;
785 // Note: with the switch to UText, there is no way to restrict the
786 // iteration range to begin at an index other than zero.
787 // String character iterators created with a non-zero bound are
788 // treated by RBBI as being empty.
789 for (index = -1; index < begin + 1; ++index) {
790 onBoundary = iter->isBoundary(index);
791 if (index == 0? !onBoundary : onBoundary) {
792 errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
793 " and begin index = " + begin);
794 }
795 }
796 delete iter;
797 }
798
799
800 //
801 // Test for problem reported by Ashok Matoria on 9 July 2007
802 // One.<kSoftHyphen><kSpace>Two.
803 //
804 // Sentence break at start (0) and then on calling next() it breaks at
805 // 'T' of "Two". Now, at this point if I do next() and
806 // then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
807 //
TestBug5775()808 void RBBITest::TestBug5775() {
809 UErrorCode status = U_ZERO_ERROR;
810 BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
811 TEST_ASSERT_SUCCESS(status);
812 if (U_FAILURE(status)) {
813 return;
814 }
815 // Check for status first for better handling of no data errors.
816 TEST_ASSERT(bi != NULL);
817 if (bi == NULL) {
818 return;
819 }
820
821 UnicodeString s("One.\\u00ad Two.", -1, US_INV);
822 // 01234 56789
823 s = s.unescape();
824 bi->setText(s);
825 int pos = bi->next();
826 TEST_ASSERT(pos == 6);
827 pos = bi->next();
828 TEST_ASSERT(pos == 10);
829 pos = bi->previous();
830 TEST_ASSERT(pos == 6);
831 delete bi;
832 }
833
834
835
836 //------------------------------------------------------------------------------
837 //
838 // RBBITest::Extended Run RBBI Tests from an external test data file
839 //
840 //------------------------------------------------------------------------------
841
842 struct TestParams {
843 BreakIterator *bi; // Break iterator is set while parsing test source.
844 // Changed out whenever test data changes break type.
845
846 UnicodeString dataToBreak; // Data that is built up while parsing the test.
847 UVector32 *expectedBreaks; // Expected break positions, matches dataToBreak UnicodeString.
848 UVector32 *srcLine; // Positions in source file, indexed same as dataToBreak.
849 UVector32 *srcCol;
850
851 UText *textToBreak; // UText, could be UTF8 or UTF16.
852 UVector32 *textMap; // Map from UTF-16 dataToBreak offsets to UText offsets.
853 CharString utf8String; // UTF-8 form of text to break.
854
TestParamsTestParams855 TestParams(UErrorCode &status) : dataToBreak() {
856 bi = NULL;
857 expectedBreaks = new UVector32(status);
858 srcLine = new UVector32(status);
859 srcCol = new UVector32(status);
860 textToBreak = NULL;
861 textMap = new UVector32(status);
862 }
863
~TestParamsTestParams864 ~TestParams() {
865 delete bi;
866 delete expectedBreaks;
867 delete srcLine;
868 delete srcCol;
869 utext_close(textToBreak);
870 delete textMap;
871 }
872
873 int32_t getSrcLine(int32_t bp);
874 int32_t getExpectedBreak(int32_t bp);
875 int32_t getSrcCol(int32_t bp);
876
877 void setUTF16(UErrorCode &status);
878 void setUTF8(UErrorCode &status);
879 };
880
881 // Append a UnicodeString to a CharString with UTF-8 encoding.
882 // Substitute any invalid chars.
883 // Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)884 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
885 if (U_FAILURE(status)) {
886 return;
887 }
888 int32_t utf8Length;
889 u_strToUTF8WithSub(NULL, 0, &utf8Length, // Output Buffer, NULL for preflight.
890 src.getBuffer(), src.length(), // UTF-16 data
891 0xfffd, NULL, // Substitution char, number of subs.
892 &status);
893 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
894 return;
895 }
896 status = U_ZERO_ERROR;
897 int32_t capacity;
898 char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
899 u_strToUTF8WithSub(buffer, utf8Length, NULL,
900 src.getBuffer(), src.length(),
901 0xfffd, NULL, &status);
902 dest.append(buffer, utf8Length, status);
903 }
904
905
setUTF16(UErrorCode & status)906 void TestParams::setUTF16(UErrorCode &status) {
907 textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
908 textMap->removeAllElements();
909 for (int32_t i=0; i<dataToBreak.length(); i++) {
910 if (i == dataToBreak.getChar32Start(i)) {
911 textMap->addElement(i, status);
912 } else {
913 textMap->addElement(-1, status);
914 }
915 }
916 textMap->addElement(dataToBreak.length(), status);
917 U_ASSERT(dataToBreak.length() + 1 == textMap->size());
918 }
919
920
setUTF8(UErrorCode & status)921 void TestParams::setUTF8(UErrorCode &status) {
922 if (U_FAILURE(status)) {
923 return;
924 }
925 utf8String.clear();
926 CharStringAppend(utf8String, dataToBreak, status);
927 textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
928 if (U_FAILURE(status)) {
929 return;
930 }
931
932 textMap->removeAllElements();
933 int32_t utf16Index = 0;
934 for (;;) {
935 textMap->addElement(utf16Index, status);
936 UChar32 c32 = utext_current32(textToBreak);
937 if (c32 < 0) {
938 break;
939 }
940 utf16Index += U16_LENGTH(c32);
941 utext_next32(textToBreak);
942 while (textMap->size() < utext_getNativeIndex(textToBreak)) {
943 textMap->addElement(-1, status);
944 }
945 }
946 U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
947 }
948
949
getSrcLine(int bp)950 int32_t TestParams::getSrcLine(int bp) {
951 if (bp >= textMap->size()) {
952 bp = textMap->size() - 1;
953 }
954 int32_t i = 0;
955 for(; bp >= 0 ; --bp) {
956 // Move to a character boundary if we are not on one already.
957 i = textMap->elementAti(bp);
958 if (i >= 0) {
959 break;
960 }
961 }
962 return srcLine->elementAti(i);
963 }
964
965
getExpectedBreak(int bp)966 int32_t TestParams::getExpectedBreak(int bp) {
967 if (bp >= textMap->size()) {
968 return 0;
969 }
970 int32_t i = textMap->elementAti(bp);
971 int32_t retVal = 0;
972 if (i >= 0) {
973 retVal = expectedBreaks->elementAti(i);
974 }
975 return retVal;
976 }
977
978
getSrcCol(int bp)979 int32_t TestParams::getSrcCol(int bp) {
980 if (bp >= textMap->size()) {
981 bp = textMap->size() - 1;
982 }
983 int32_t i = 0;
984 for(; bp >= 0; --bp) {
985 // Move bp to a character boundary if we are not on one already.
986 i = textMap->elementAti(bp);
987 if (i >= 0) {
988 break;
989 }
990 }
991 return srcCol->elementAti(i);
992 }
993
994
executeTest(TestParams * t,UErrorCode & status)995 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
996 int32_t bp;
997 int32_t prevBP;
998 int32_t i;
999
1000 TEST_ASSERT_SUCCESS(status);
1001 if (U_FAILURE(status)) {
1002 return;
1003 }
1004
1005 if (t->bi == NULL) {
1006 return;
1007 }
1008
1009 t->bi->setText(t->textToBreak, status);
1010 //
1011 // Run the iterator forward
1012 //
1013 prevBP = -1;
1014 for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1015 if (prevBP == bp) {
1016 // Fail for lack of forward progress.
1017 errln("Forward Iteration, no forward progress. Break Pos=%4d File line,col=%4d,%4d",
1018 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1019 break;
1020 }
1021
1022 // Check that there we didn't miss an expected break between the last one
1023 // and this one.
1024 for (i=prevBP+1; i<bp; i++) {
1025 if (t->getExpectedBreak(i) != 0) {
1026 int expected[] = {0, i};
1027 printStringBreaks(t->dataToBreak, expected, 2);
1028 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1029 i, t->getSrcLine(i), t->getSrcCol(i));
1030 }
1031 }
1032
1033 // Check that the break we did find was expected
1034 if (t->getExpectedBreak(bp) == 0) {
1035 int expected[] = {0, bp};
1036 printStringBreaks(t->textToBreak, expected, 2);
1037 errln("Forward Iteration, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1038 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1039 } else {
1040 // The break was expected.
1041 // Check that the {nnn} tag value is correct.
1042 int32_t expectedTagVal = t->getExpectedBreak(bp);
1043 if (expectedTagVal == -1) {
1044 expectedTagVal = 0;
1045 }
1046 int32_t line = t->getSrcLine(bp);
1047 int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1048 if (rs != expectedTagVal) {
1049 errln("Incorrect status for forward break. Pos=%4d File line,col= %4d,%4d.\n"
1050 " Actual, Expected status = %4d, %4d",
1051 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1052 }
1053 }
1054
1055 prevBP = bp;
1056 }
1057
1058 // Verify that there were no missed expected breaks after the last one found
1059 for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1060 if (t->getExpectedBreak(i) != 0) {
1061 errln("Forward Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1062 i, t->getSrcLine(i), t->getSrcCol(i));
1063 }
1064 }
1065
1066 //
1067 // Run the iterator backwards, verify that the same breaks are found.
1068 //
1069 prevBP = utext_nativeLength(t->textToBreak)+2; // start with a phony value for the last break pos seen.
1070 for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1071 if (prevBP == bp) {
1072 // Fail for lack of progress.
1073 errln("Reverse Iteration, no progress. Break Pos=%4d File line,col=%4d,%4d",
1074 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1075 break;
1076 }
1077
1078 // Check that we didn't miss an expected break between the last one
1079 // and this one. (UVector returns zeros for index out of bounds.)
1080 for (i=prevBP-1; i>bp; i--) {
1081 if (t->getExpectedBreak(i) != 0) {
1082 errln("Reverse Iteration, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1083 i, t->getSrcLine(i), t->getSrcCol(i));
1084 }
1085 }
1086
1087 // Check that the break we did find was expected
1088 if (t->getExpectedBreak(bp) == 0) {
1089 errln("Reverse Itertion, break found, but not expected. Pos=%4d File line,col= %4d,%4d",
1090 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1091 } else {
1092 // The break was expected.
1093 // Check that the {nnn} tag value is correct.
1094 int32_t expectedTagVal = t->getExpectedBreak(bp);
1095 if (expectedTagVal == -1) {
1096 expectedTagVal = 0;
1097 }
1098 int line = t->getSrcLine(bp);
1099 int32_t rs = t->bi->getRuleStatus();
1100 if (rs != expectedTagVal) {
1101 errln("Incorrect status for reverse break. Pos=%4d File line,col= %4d,%4d.\n"
1102 " Actual, Expected status = %4d, %4d",
1103 bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1104 }
1105 }
1106
1107 prevBP = bp;
1108 }
1109
1110 // Verify that there were no missed breaks prior to the last one found
1111 for (i=prevBP-1; i>=0; i--) {
1112 if (t->getExpectedBreak(i) != 0) {
1113 errln("Forward Itertion, break expected, but not found. Pos=%4d File line,col= %4d,%4d",
1114 i, t->getSrcLine(i), t->getSrcCol(i));
1115 }
1116 }
1117
1118 // Check isBoundary()
1119 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1120 UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1121 UBool boundaryFound = t->bi->isBoundary(i);
1122 if (boundaryExpected != boundaryFound) {
1123 errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1124 " Expected, Actual= %s, %s",
1125 i, t->getSrcLine(i), t->getSrcCol(i),
1126 boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1127 }
1128 }
1129
1130 // Check following()
1131 for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1132 int32_t actualBreak = t->bi->following(i);
1133 int32_t expectedBreak = BreakIterator::DONE;
1134 for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1135 if (t->getExpectedBreak(j) != 0) {
1136 expectedBreak = j;
1137 break;
1138 }
1139 }
1140 if (expectedBreak != actualBreak) {
1141 errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1142 " Expected, Actual= %d, %d",
1143 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1144 }
1145 }
1146
1147 // Check preceding()
1148 for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1149 int32_t actualBreak = t->bi->preceding(i);
1150 int32_t expectedBreak = BreakIterator::DONE;
1151
1152 // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1153 // preceding(trailing byte) will return the index of some preceding code point,
1154 // not the lead byte of the current code point, even though that has a smaller index.
1155 // Therefore, start looking at the expected break data not at i-1, but at
1156 // the start of code point index - 1.
1157 utext_setNativeIndex(t->textToBreak, i);
1158 int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1159 for (; j >= 0; j--) {
1160 if (t->getExpectedBreak(j) != 0) {
1161 expectedBreak = j;
1162 break;
1163 }
1164 }
1165 if (expectedBreak != actualBreak) {
1166 errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1167 " Expected, Actual= %d, %d",
1168 i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1169 }
1170 }
1171 }
1172
1173
TestExtended()1174 void RBBITest::TestExtended() {
1175 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1176 UErrorCode status = U_ZERO_ERROR;
1177 Locale locale("");
1178
1179 UnicodeString rules;
1180 TestParams tp(status);
1181
1182 RegexMatcher localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1183 if (U_FAILURE(status)) {
1184 dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1185 }
1186
1187
1188 //
1189 // Open and read the test data file.
1190 //
1191 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1192 char testFileName[1000];
1193 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1194 errln("Can't open test data. Path too long.");
1195 return;
1196 }
1197 strcpy(testFileName, testDataDirectory);
1198 strcat(testFileName, "rbbitst.txt");
1199
1200 int len;
1201 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1202 if (U_FAILURE(status)) {
1203 return; /* something went wrong, error already output */
1204 }
1205
1206
1207 bool skipTest = false; // Skip this test?
1208
1209 //
1210 // Put the test data into a UnicodeString
1211 //
1212 UnicodeString testString(FALSE, testFile, len);
1213
1214 enum EParseState{
1215 PARSE_COMMENT,
1216 PARSE_TAG,
1217 PARSE_DATA,
1218 PARSE_NUM
1219 }
1220 parseState = PARSE_TAG;
1221
1222 EParseState savedState = PARSE_TAG;
1223
1224 static const UChar CH_LF = 0x0a;
1225 static const UChar CH_CR = 0x0d;
1226 static const UChar CH_HASH = 0x23;
1227 /*static const UChar CH_PERIOD = 0x2e;*/
1228 static const UChar CH_LT = 0x3c;
1229 static const UChar CH_GT = 0x3e;
1230 static const UChar CH_BACKSLASH = 0x5c;
1231 static const UChar CH_BULLET = 0x2022;
1232
1233 int32_t lineNum = 1;
1234 int32_t colStart = 0;
1235 int32_t column = 0;
1236 int32_t charIdx = 0;
1237
1238 int32_t tagValue = 0; // The numeric value of a <nnn> tag.
1239
1240 for (charIdx = 0; charIdx < len; ) {
1241 status = U_ZERO_ERROR;
1242 UChar c = testString.charAt(charIdx);
1243 charIdx++;
1244 if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1245 // treat CRLF as a unit
1246 c = CH_LF;
1247 charIdx++;
1248 }
1249 if (c == CH_LF || c == CH_CR) {
1250 lineNum++;
1251 colStart = charIdx;
1252 }
1253 column = charIdx - colStart + 1;
1254
1255 switch (parseState) {
1256 case PARSE_COMMENT:
1257 if (c == 0x0a || c == 0x0d) {
1258 parseState = savedState;
1259 }
1260 break;
1261
1262 case PARSE_TAG:
1263 {
1264 if (c == CH_HASH) {
1265 parseState = PARSE_COMMENT;
1266 savedState = PARSE_TAG;
1267 break;
1268 }
1269 if (u_isUWhiteSpace(c)) {
1270 break;
1271 }
1272 if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1273 delete tp.bi;
1274 tp.bi = BreakIterator::createWordInstance(locale, status);
1275 skipTest = false;
1276 charIdx += 5;
1277 break;
1278 }
1279 if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1280 delete tp.bi;
1281 tp.bi = BreakIterator::createCharacterInstance(locale, status);
1282 skipTest = false;
1283 charIdx += 5;
1284 break;
1285 }
1286 if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1287 delete tp.bi;
1288 tp.bi = BreakIterator::createLineInstance(locale, status);
1289 skipTest = false;
1290 charIdx += 5;
1291 break;
1292 }
1293 if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1294 delete tp.bi;
1295 tp.bi = BreakIterator::createSentenceInstance(locale, status);
1296 skipTest = false;
1297 charIdx += 5;
1298 break;
1299 }
1300 if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1301 delete tp.bi;
1302 tp.bi = BreakIterator::createTitleInstance(locale, status);
1303 charIdx += 6;
1304 break;
1305 }
1306
1307 // <locale loc_name>
1308 localeMatcher.reset(testString);
1309 if (localeMatcher.lookingAt(charIdx-1, status)) {
1310 UnicodeString localeName = localeMatcher.group(1, status);
1311 char localeName8[100];
1312 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1313 locale = Locale::createFromName(localeName8);
1314 charIdx += localeMatcher.group(0, status).length() - 1;
1315 TEST_ASSERT_SUCCESS(status);
1316 break;
1317 }
1318 if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1319 parseState = PARSE_DATA;
1320 charIdx += 5;
1321 tp.dataToBreak = "";
1322 tp.expectedBreaks->removeAllElements();
1323 tp.srcCol ->removeAllElements();
1324 tp.srcLine->removeAllElements();
1325 break;
1326 }
1327
1328 errln("line %d: Tag expected in test file.", lineNum);
1329 parseState = PARSE_COMMENT;
1330 savedState = PARSE_DATA;
1331 goto end_test; // Stop the test.
1332 }
1333 break;
1334
1335 case PARSE_DATA:
1336 if (c == CH_BULLET) {
1337 int32_t breakIdx = tp.dataToBreak.length();
1338 tp.expectedBreaks->setSize(breakIdx+1);
1339 tp.expectedBreaks->setElementAt(-1, breakIdx);
1340 tp.srcLine->setSize(breakIdx+1);
1341 tp.srcLine->setElementAt(lineNum, breakIdx);
1342 tp.srcCol ->setSize(breakIdx+1);
1343 tp.srcCol ->setElementAt(column, breakIdx);
1344 break;
1345 }
1346
1347 if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1348 // Add final entry to mappings from break location to source file position.
1349 // Need one extra because last break position returned is after the
1350 // last char in the data, not at the last char.
1351 tp.srcLine->addElement(lineNum, status);
1352 tp.srcCol ->addElement(column, status);
1353
1354 parseState = PARSE_TAG;
1355 charIdx += 6;
1356
1357 if (!skipTest) {
1358 // RUN THE TEST!
1359 status = U_ZERO_ERROR;
1360 tp.setUTF16(status);
1361 executeTest(&tp, status);
1362 TEST_ASSERT_SUCCESS(status);
1363
1364 // Run again, this time with UTF-8 text wrapped in a UText.
1365 status = U_ZERO_ERROR;
1366 tp.setUTF8(status);
1367 TEST_ASSERT_SUCCESS(status);
1368 executeTest(&tp, status);
1369 }
1370 break;
1371 }
1372
1373 if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1374 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1375 // Get the code point from the name and insert it into the test data.
1376 // (Damn, no API takes names in Unicode !!!
1377 // we've got to take it back to char *)
1378 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1379 int32_t nameLength = nameEndIdx - (charIdx+2);
1380 char charNameBuf[200];
1381 UChar32 theChar = -1;
1382 if (nameEndIdx != -1) {
1383 UErrorCode status = U_ZERO_ERROR;
1384 testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1385 charNameBuf[sizeof(charNameBuf)-1] = 0;
1386 theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1387 if (U_FAILURE(status)) {
1388 theChar = -1;
1389 }
1390 }
1391 if (theChar == -1) {
1392 errln("Error in named character in test file at line %d, col %d",
1393 lineNum, column);
1394 } else {
1395 // Named code point was recognized. Insert it
1396 // into the test data.
1397 tp.dataToBreak.append(theChar);
1398 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1399 tp.srcLine->addElement(lineNum, status);
1400 tp.srcCol ->addElement(column, status);
1401 }
1402 }
1403 if (nameEndIdx > charIdx) {
1404 charIdx = nameEndIdx+1;
1405
1406 }
1407 break;
1408 }
1409
1410
1411
1412
1413 if (testString.compare(charIdx-1, 2, "<>") == 0) {
1414 charIdx++;
1415 int32_t breakIdx = tp.dataToBreak.length();
1416 tp.expectedBreaks->setSize(breakIdx+1);
1417 tp.expectedBreaks->setElementAt(-1, breakIdx);
1418 tp.srcLine->setSize(breakIdx+1);
1419 tp.srcLine->setElementAt(lineNum, breakIdx);
1420 tp.srcCol ->setSize(breakIdx+1);
1421 tp.srcCol ->setElementAt(column, breakIdx);
1422 break;
1423 }
1424
1425 if (c == CH_LT) {
1426 tagValue = 0;
1427 parseState = PARSE_NUM;
1428 break;
1429 }
1430
1431 if (c == CH_HASH && column==3) { // TODO: why is column off so far?
1432 parseState = PARSE_COMMENT;
1433 savedState = PARSE_DATA;
1434 break;
1435 }
1436
1437 if (c == CH_BACKSLASH) {
1438 // Check for \ at end of line, a line continuation.
1439 // Advance over (discard) the newline
1440 UChar32 cp = testString.char32At(charIdx);
1441 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1442 // We have a CR LF
1443 // Need an extra increment of the input ptr to move over both of them
1444 charIdx++;
1445 }
1446 if (cp == CH_LF || cp == CH_CR) {
1447 lineNum++;
1448 colStart = charIdx;
1449 charIdx++;
1450 break;
1451 }
1452
1453 // Let unescape handle the back slash.
1454 cp = testString.unescapeAt(charIdx);
1455 if (cp != -1) {
1456 // Escape sequence was recognized. Insert the char
1457 // into the test data.
1458 tp.dataToBreak.append(cp);
1459 while (tp.dataToBreak.length() > tp.srcLine->size()) {
1460 tp.srcLine->addElement(lineNum, status);
1461 tp.srcCol ->addElement(column, status);
1462 }
1463 break;
1464 }
1465
1466
1467 // Not a recognized backslash escape sequence.
1468 // Take the next char as a literal.
1469 // TODO: Should this be an error?
1470 c = testString.charAt(charIdx);
1471 charIdx = testString.moveIndex32(charIdx, 1);
1472 }
1473
1474 // Normal, non-escaped data char.
1475 tp.dataToBreak.append(c);
1476
1477 // Save the mapping from offset in the data to line/column numbers in
1478 // the original input file. Will be used for better error messages only.
1479 // If there's an expected break before this char, the slot in the mapping
1480 // vector will already be set for this char; don't overwrite it.
1481 if (tp.dataToBreak.length() > tp.srcLine->size()) {
1482 tp.srcLine->addElement(lineNum, status);
1483 tp.srcCol ->addElement(column, status);
1484 }
1485 break;
1486
1487
1488 case PARSE_NUM:
1489 // We are parsing an expected numeric tag value, like <1234>,
1490 // within a chunk of data.
1491 if (u_isUWhiteSpace(c)) {
1492 break;
1493 }
1494
1495 if (c == CH_GT) {
1496 // Finished the number. Add the info to the expected break data,
1497 // and switch parse state back to doing plain data.
1498 parseState = PARSE_DATA;
1499 if (tagValue == 0) {
1500 tagValue = -1;
1501 }
1502 int32_t breakIdx = tp.dataToBreak.length();
1503 tp.expectedBreaks->setSize(breakIdx+1);
1504 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1505 tp.srcLine->setSize(breakIdx+1);
1506 tp.srcLine->setElementAt(lineNum, breakIdx);
1507 tp.srcCol ->setSize(breakIdx+1);
1508 tp.srcCol ->setElementAt(column, breakIdx);
1509 break;
1510 }
1511
1512 if (u_isdigit(c)) {
1513 tagValue = tagValue*10 + u_charDigitValue(c);
1514 break;
1515 }
1516
1517 errln("Syntax Error in test file at line %d, col %d",
1518 lineNum, column);
1519 parseState = PARSE_COMMENT;
1520 goto end_test; // Stop the test
1521 break;
1522 }
1523
1524
1525 if (U_FAILURE(status)) {
1526 dataerrln("ICU Error %s while parsing test file at line %d.",
1527 u_errorName(status), lineNum);
1528 status = U_ZERO_ERROR;
1529 goto end_test; // Stop the test
1530 }
1531
1532 }
1533
1534 end_test:
1535 delete [] testFile;
1536 #endif
1537 }
1538
1539
1540 //-------------------------------------------------------------------------------
1541 //
1542 // TestDictRules create a break iterator from source rules that includes a
1543 // dictionary range. Regression for bug #7130. Source rules
1544 // do not declare a break iterator type (word, line, sentence, etc.
1545 // but the dictionary code, without a type, would loop.
1546 //
1547 //-------------------------------------------------------------------------------
TestDictRules()1548 void RBBITest::TestDictRules() {
1549 const char *rules = "$dictionary = [a-z]; \n"
1550 "!!forward; \n"
1551 "$dictionary $dictionary; \n"
1552 "!!reverse; \n"
1553 "$dictionary $dictionary; \n";
1554 const char *text = "aa";
1555 UErrorCode status = U_ZERO_ERROR;
1556 UParseError parseError;
1557
1558 RuleBasedBreakIterator bi(rules, parseError, status);
1559 if (U_SUCCESS(status)) {
1560 UnicodeString utext = text;
1561 bi.setText(utext);
1562 int32_t position;
1563 int32_t loops;
1564 for (loops = 0; loops<10; loops++) {
1565 position = bi.next();
1566 if (position == RuleBasedBreakIterator::DONE) {
1567 break;
1568 }
1569 }
1570 TEST_ASSERT(loops == 1);
1571 } else {
1572 dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1573 }
1574 }
1575
1576
1577
1578 //-------------------------------------------------------------------------------
1579 //
1580 // ReadAndConvertFile Read a text data file, convert it to UChars, and
1581 // return the datain one big UChar * buffer, which the caller must delete.
1582 //
1583 // parameters:
1584 // fileName: the name of the file, with no directory part. The test data directory
1585 // is assumed.
1586 // ulen an out parameter, receives the actual length (in UChars) of the file data.
1587 // encoding The file encoding. If the file contains a BOM, that will override the encoding
1588 // specified here. The BOM, if it exists, will be stripped from the returned data.
1589 // Pass NULL for the system default encoding.
1590 // status
1591 // returns:
1592 // The file data, converted to UChar.
1593 // The caller must delete this when done with
1594 // delete [] theBuffer;
1595 //
1596 // TODO: This is a clone of RegexTest::ReadAndConvertFile.
1597 // Move this function to some common place.
1598 //
1599 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1600 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1601 UChar *retPtr = NULL;
1602 char *fileBuf = NULL;
1603 UConverter* conv = NULL;
1604 FILE *f = NULL;
1605
1606 ulen = 0;
1607 if (U_FAILURE(status)) {
1608 return retPtr;
1609 }
1610
1611 //
1612 // Open the file.
1613 //
1614 f = fopen(fileName, "rb");
1615 if (f == 0) {
1616 dataerrln("Error opening test data file %s\n", fileName);
1617 status = U_FILE_ACCESS_ERROR;
1618 return NULL;
1619 }
1620 //
1621 // Read it in
1622 //
1623 int fileSize;
1624 int amt_read;
1625
1626 fseek( f, 0, SEEK_END);
1627 fileSize = ftell(f);
1628 fileBuf = new char[fileSize];
1629 fseek(f, 0, SEEK_SET);
1630 amt_read = fread(fileBuf, 1, fileSize, f);
1631 if (amt_read != fileSize || fileSize <= 0) {
1632 errln("Error reading test data file.");
1633 goto cleanUpAndReturn;
1634 }
1635
1636 //
1637 // Look for a Unicode Signature (BOM) on the data just read
1638 //
1639 int32_t signatureLength;
1640 const char * fileBufC;
1641 const char* bomEncoding;
1642
1643 fileBufC = fileBuf;
1644 bomEncoding = ucnv_detectUnicodeSignature(
1645 fileBuf, fileSize, &signatureLength, &status);
1646 if(bomEncoding!=NULL ){
1647 fileBufC += signatureLength;
1648 fileSize -= signatureLength;
1649 encoding = bomEncoding;
1650 }
1651
1652 //
1653 // Open a converter to take the rule file to UTF-16
1654 //
1655 conv = ucnv_open(encoding, &status);
1656 if (U_FAILURE(status)) {
1657 goto cleanUpAndReturn;
1658 }
1659
1660 //
1661 // Convert the rules to UChar.
1662 // Preflight first to determine required buffer size.
1663 //
1664 ulen = ucnv_toUChars(conv,
1665 NULL, // dest,
1666 0, // destCapacity,
1667 fileBufC,
1668 fileSize,
1669 &status);
1670 if (status == U_BUFFER_OVERFLOW_ERROR) {
1671 // Buffer Overflow is expected from the preflight operation.
1672 status = U_ZERO_ERROR;
1673
1674 retPtr = new UChar[ulen+1];
1675 ucnv_toUChars(conv,
1676 retPtr, // dest,
1677 ulen+1,
1678 fileBufC,
1679 fileSize,
1680 &status);
1681 }
1682
1683 cleanUpAndReturn:
1684 fclose(f);
1685 delete []fileBuf;
1686 ucnv_close(conv);
1687 if (U_FAILURE(status)) {
1688 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1689 delete []retPtr;
1690 retPtr = 0;
1691 ulen = 0;
1692 };
1693 return retPtr;
1694 }
1695
1696
1697
1698 //--------------------------------------------------------------------------------------------
1699 //
1700 // Run tests from each of the boundary test data files distributed by the Unicode Consortium
1701 //
1702 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1703 void RBBITest::TestUnicodeFiles() {
1704 RuleBasedBreakIterator *bi;
1705 UErrorCode status = U_ZERO_ERROR;
1706
1707 bi = (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1708 TEST_ASSERT_SUCCESS(status);
1709 if (U_SUCCESS(status)) {
1710 runUnicodeTestData("GraphemeBreakTest.txt", bi);
1711 }
1712 delete bi;
1713
1714 bi = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1715 TEST_ASSERT_SUCCESS(status);
1716 if (U_SUCCESS(status)) {
1717 runUnicodeTestData("WordBreakTest.txt", bi);
1718 }
1719 delete bi;
1720
1721 bi = (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1722 TEST_ASSERT_SUCCESS(status);
1723 if (U_SUCCESS(status)) {
1724 runUnicodeTestData("SentenceBreakTest.txt", bi);
1725 }
1726 delete bi;
1727
1728 bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1729 TEST_ASSERT_SUCCESS(status);
1730 if (U_SUCCESS(status)) {
1731 runUnicodeTestData("LineBreakTest.txt", bi);
1732 }
1733 delete bi;
1734 }
1735
1736
1737 // Check for test cases from the Unicode test data files that are known to fail
1738 // and should be skipped because ICU is not yet able to fully implement the spec.
1739 // See ticket #7270.
1740
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1741 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1742 static const UChar badTestCases[][4] = { // Line Numbers from Unicode 7.0.0 file.
1743 {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000}, // Line 5198
1744 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000}, // Line 5202
1745 {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000}, // Line 5214
1746 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000}, // Line 5246
1747 {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000}, // Line 5298
1748 {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000} // Line 5302
1749 };
1750 if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1751 return FALSE;
1752 }
1753
1754 for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1755 if (testCase == UnicodeString(badTestCases[i])) {
1756 return logKnownIssue("7270");
1757 }
1758 }
1759 return FALSE;
1760 }
1761
1762
1763 //--------------------------------------------------------------------------------------------
1764 //
1765 // Run tests from one of the boundary test data files distributed by the Unicode Consortium
1766 //
1767 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1768 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1769 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1770 UErrorCode status = U_ZERO_ERROR;
1771
1772 //
1773 // Open and read the test data file, put it into a UnicodeString.
1774 //
1775 const char *testDataDirectory = IntlTest::getSourceTestData(status);
1776 char testFileName[1000];
1777 if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1778 dataerrln("Can't open test data. Path too long.");
1779 return;
1780 }
1781 strcpy(testFileName, testDataDirectory);
1782 strcat(testFileName, fileName);
1783
1784 logln("Opening data file %s\n", fileName);
1785
1786 int len;
1787 UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1788 if (status != U_FILE_ACCESS_ERROR) {
1789 TEST_ASSERT_SUCCESS(status);
1790 TEST_ASSERT(testFile != NULL);
1791 }
1792 if (U_FAILURE(status) || testFile == NULL) {
1793 return; /* something went wrong, error already output */
1794 }
1795 UnicodeString testFileAsString(TRUE, testFile, len);
1796
1797 //
1798 // Parse the test data file using a regular expression.
1799 // Each kind of token is recognized in its own capture group; what type of item was scanned
1800 // is identified by which group had a match.
1801 //
1802 // Caputure Group # 1 2 3 4 5
1803 // Parses this item: divide x hex digits comment \n unrecognized \n
1804 //
1805 UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1806 RegexMatcher tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1807 UnicodeString testString;
1808 UVector32 breakPositions(status);
1809 int lineNumber = 1;
1810 TEST_ASSERT_SUCCESS(status);
1811 if (U_FAILURE(status)) {
1812 return;
1813 }
1814
1815 //
1816 // Scan through each test case, building up the string to be broken in testString,
1817 // and the positions that should be boundaries in the breakPositions vector.
1818 //
1819 int spin = 0;
1820 while (tokenMatcher.find()) {
1821 if(tokenMatcher.hitEnd()) {
1822 /* Shouldnt Happen(TM). This means we didn't find the symbols we were looking for.
1823 This occurred when the text file was corrupt (wasn't marked as UTF-8)
1824 and caused an infinite loop here on EBCDIC systems!
1825 */
1826 fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1827 // return;
1828 }
1829 if (tokenMatcher.start(1, status) >= 0) {
1830 // Scanned a divide sign, indicating a break position in the test data.
1831 if (testString.length()>0) {
1832 breakPositions.addElement(testString.length(), status);
1833 }
1834 }
1835 else if (tokenMatcher.start(2, status) >= 0) {
1836 // Scanned an 'x', meaning no break at this position in the test data
1837 // Nothing to be done here.
1838 }
1839 else if (tokenMatcher.start(3, status) >= 0) {
1840 // Scanned Hex digits. Convert them to binary, append to the character data string.
1841 const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1842 int length = hexNumber.length();
1843 if (length<=8) {
1844 char buf[10];
1845 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1846 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1847 if (c<=0x10ffff) {
1848 testString.append(c);
1849 } else {
1850 errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1851 fileName, lineNumber);
1852 }
1853 } else {
1854 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1855 fileName, lineNumber);
1856 }
1857 }
1858 else if (tokenMatcher.start(4, status) >= 0) {
1859 // Scanned to end of a line, possibly skipping over a comment in the process.
1860 // If the line from the file contained test data, run the test now.
1861 if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1862 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1863 }
1864
1865 // Clear out this test case.
1866 // The string and breakPositions vector will be refilled as the next
1867 // test case is parsed.
1868 testString.remove();
1869 breakPositions.removeAllElements();
1870 lineNumber++;
1871 } else {
1872 // Scanner catchall. Something unrecognized appeared on the line.
1873 char token[16];
1874 UnicodeString uToken = tokenMatcher.group(0, status);
1875 uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1876 token[sizeof(token)-1] = 0;
1877 errln("Syntax error in test data file \'%s\', line %d. Scanning \"%s\"\n", fileName, lineNumber, token);
1878
1879 // Clean up, in preparation for continuing with the next line.
1880 testString.remove();
1881 breakPositions.removeAllElements();
1882 lineNumber++;
1883 }
1884 TEST_ASSERT_SUCCESS(status);
1885 if (U_FAILURE(status)) {
1886 break;
1887 }
1888 }
1889
1890 delete [] testFile;
1891 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
1892 }
1893
1894 //--------------------------------------------------------------------------------------------
1895 //
1896 // checkUnicodeTestCase() Run one test case from one of the Unicode Consortium
1897 // test data files. Do only a simple, forward-only check -
1898 // this test is mostly to check that ICU and the Unicode
1899 // data agree with each other.
1900 //
1901 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1902 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1903 const UnicodeString &testString, // Text data to be broken
1904 UVector32 *breakPositions, // Positions where breaks should be found.
1905 RuleBasedBreakIterator *bi) {
1906 int32_t pos; // Break Position in the test string
1907 int32_t expectedI = 0; // Index of expected break position in the vector of expected results.
1908 int32_t expectedPos; // Expected break position (index into test string)
1909
1910 bi->setText(testString);
1911 pos = bi->first();
1912 pos = bi->next();
1913
1914 while (pos != BreakIterator::DONE) {
1915 if (expectedI >= breakPositions->size()) {
1916 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1917 testFileName, lineNumber, pos);
1918 break;
1919 }
1920 expectedPos = breakPositions->elementAti(expectedI);
1921 if (pos < expectedPos) {
1922 errln("Test file \"%s\", line %d, unexpected break found at position %d",
1923 testFileName, lineNumber, pos);
1924 break;
1925 }
1926 if (pos > expectedPos) {
1927 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928 testFileName, lineNumber, expectedPos);
1929 break;
1930 }
1931 pos = bi->next();
1932 expectedI++;
1933 }
1934
1935 if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1936 errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1937 testFileName, lineNumber, breakPositions->elementAti(expectedI));
1938 }
1939 }
1940
1941
1942
1943 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1944 //---------------------------------------------------------------------------------------
1945 //
//     class RBBIMonkeyKind
1947 //
1948 // Monkey Test for Break Iteration
1949 // Abstract interface class. Concrete derived classes independently
1950 // implement the break rules for different iterator types.
1951 //
//       The Monkey Test itself doesn't know which type of break iterator it is
//       testing, but works purely in terms of the interface defined here.
1954 //
1955 //---------------------------------------------------------------------------------------
1956 class RBBIMonkeyKind {
1957 public:
1958 // Return a UVector of UnicodeSets, representing the character classes used
1959 // for this type of iterator.
1960 virtual UVector *charClasses() = 0;
1961
1962 // Set the test text on which subsequent calls to next() will operate
1963 virtual void setText(const UnicodeString &s) = 0;
1964
1965 // Find the next break postion, starting from the prev break position, or from zero.
1966 // Return -1 after reaching end of string.
1967 virtual int32_t next(int32_t i) = 0;
1968
1969 virtual ~RBBIMonkeyKind();
1970 UErrorCode deferredStatus;
1971
1972
1973 protected:
1974 RBBIMonkeyKind();
1975
1976 private:
1977 };
1978
RBBIMonkeyKind()1979 RBBIMonkeyKind::RBBIMonkeyKind() {
1980 deferredStatus = U_ZERO_ERROR;
1981 }
1982
~RBBIMonkeyKind()1983 RBBIMonkeyKind::~RBBIMonkeyKind() {
1984 }
1985
1986
1987 //----------------------------------------------------------------------------------------
1988 //
1989 // Random Numbers. Similar to standard lib rand() and srand()
1990 // Not using library to
1991 // 1. Get same results on all platforms.
1992 // 2. Get access to current seed, to more easily reproduce failures.
1993 //
1994 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to (re)seed the sequence.
static uint32_t m_seed = 1;

// Advances the linear congruential generator one step and returns a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7fff;
}
2002
2003
2004 //------------------------------------------------------------------------------------------
2005 //
2006 // class RBBICharMonkey Character (Grapheme Cluster) specific implementation
2007 // of RBBIMonkeyKind.
2008 //
2009 //------------------------------------------------------------------------------------------
2010 class RBBICharMonkey: public RBBIMonkeyKind {
2011 public:
2012 RBBICharMonkey();
2013 virtual ~RBBICharMonkey();
2014 virtual UVector *charClasses();
2015 virtual void setText(const UnicodeString &s);
2016 virtual int32_t next(int32_t i);
2017 private:
2018 UVector *fSets;
2019
2020 UnicodeSet *fCRLFSet;
2021 UnicodeSet *fControlSet;
2022 UnicodeSet *fExtendSet;
2023 UnicodeSet *fRegionalIndicatorSet;
2024 UnicodeSet *fPrependSet;
2025 UnicodeSet *fSpacingSet;
2026 UnicodeSet *fLSet;
2027 UnicodeSet *fVSet;
2028 UnicodeSet *fTSet;
2029 UnicodeSet *fLVSet;
2030 UnicodeSet *fLVTSet;
2031 UnicodeSet *fHangulSet;
2032 UnicodeSet *fAnySet;
2033
2034 const UnicodeString *fText;
2035 };
2036
2037
RBBICharMonkey()2038 RBBICharMonkey::RBBICharMonkey() {
2039 UErrorCode status = U_ZERO_ERROR;
2040
2041 fText = NULL;
2042
2043 fCRLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2044 fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2045 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2046 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2047 fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2048 fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2049 fLSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2050 fVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2051 fTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2052 fLVSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2053 fLVTSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2054 fHangulSet = new UnicodeSet();
2055 fHangulSet->addAll(*fLSet);
2056 fHangulSet->addAll(*fVSet);
2057 fHangulSet->addAll(*fTSet);
2058 fHangulSet->addAll(*fLVSet);
2059 fHangulSet->addAll(*fLVTSet);
2060 fAnySet = new UnicodeSet(0, 0x10ffff);
2061
2062 fSets = new UVector(status);
2063 fSets->addElement(fCRLFSet, status);
2064 fSets->addElement(fControlSet, status);
2065 fSets->addElement(fExtendSet, status);
2066 fSets->addElement(fRegionalIndicatorSet, status);
2067 if (!fPrependSet->isEmpty()) {
2068 fSets->addElement(fPrependSet, status);
2069 }
2070 fSets->addElement(fSpacingSet, status);
2071 fSets->addElement(fHangulSet, status);
2072 fSets->addElement(fAnySet, status);
2073 if (U_FAILURE(status)) {
2074 deferredStatus = status;
2075 }
2076 }
2077
2078
setText(const UnicodeString & s)2079 void RBBICharMonkey::setText(const UnicodeString &s) {
2080 fText = &s;
2081 }
2082
2083
2084
// Reference (rule-by-rule) computation of the next grapheme cluster boundary.
//
//   prevPos  - index of the previous boundary in the text given to setText().
//   returns  - index of the next boundary, or -1 when deferredStatus holds a
//              failure or prevPos is already at or beyond the end of the text.
//
// The rules are tried in the order written below; the first one that matches
// decides whether the candidate position (just before p2) is a boundary
// ("break") or not ("continue").
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one code point
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB8a)   Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB9)    x Extend
        if (fExtendSet->contains(c2))  {
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)  Any  <break>  Any
        break;
    }

    // The boundary is the position just before the code point at p2.
    breakPos = p2;
    return breakPos;
}
2200
2201
2202
charClasses()2203 UVector *RBBICharMonkey::charClasses() {
2204 return fSets;
2205 }
2206
2207
~RBBICharMonkey()2208 RBBICharMonkey::~RBBICharMonkey() {
2209 delete fSets;
2210 delete fCRLFSet;
2211 delete fControlSet;
2212 delete fExtendSet;
2213 delete fRegionalIndicatorSet;
2214 delete fPrependSet;
2215 delete fSpacingSet;
2216 delete fLSet;
2217 delete fVSet;
2218 delete fTSet;
2219 delete fLVSet;
2220 delete fLVTSet;
2221 delete fHangulSet;
2222 delete fAnySet;
2223 }
2224
2225 //------------------------------------------------------------------------------------------
2226 //
2227 // class RBBIWordMonkey Word Break specific implementation
2228 // of RBBIMonkeyKind.
2229 //
2230 //------------------------------------------------------------------------------------------
2231 class RBBIWordMonkey: public RBBIMonkeyKind {
2232 public:
2233 RBBIWordMonkey();
2234 virtual ~RBBIWordMonkey();
2235 virtual UVector *charClasses();
2236 virtual void setText(const UnicodeString &s);
2237 virtual int32_t next(int32_t i);
2238 private:
2239 UVector *fSets;
2240
2241 UnicodeSet *fCRSet;
2242 UnicodeSet *fLFSet;
2243 UnicodeSet *fNewlineSet;
2244 UnicodeSet *fRegionalIndicatorSet;
2245 UnicodeSet *fKatakanaSet;
2246 UnicodeSet *fHebrew_LetterSet;
2247 UnicodeSet *fALetterSet;
2248 // TODO(jungshik): Do we still need this change?
2249 // UnicodeSet *fALetterSet; // matches ALetterPlus in word.txt
2250 UnicodeSet *fSingle_QuoteSet;
2251 UnicodeSet *fDouble_QuoteSet;
2252 UnicodeSet *fMidNumLetSet;
2253 UnicodeSet *fMidLetterSet;
2254 UnicodeSet *fMidNumSet;
2255 UnicodeSet *fNumericSet;
2256 UnicodeSet *fFormatSet;
2257 UnicodeSet *fOtherSet;
2258 UnicodeSet *fExtendSet;
2259 UnicodeSet *fExtendNumLetSet;
2260 UnicodeSet *fDictionaryCjkSet;
2261
2262 const UnicodeString *fText;
2263 };
2264
2265
RBBIWordMonkey()2266 RBBIWordMonkey::RBBIWordMonkey()
2267 {
2268 UErrorCode status = U_ZERO_ERROR;
2269
2270 fSets = new UVector(status);
2271
2272 fCRSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"), status);
2273 fLFSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"), status);
2274 fNewlineSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"), status);
2275 fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2276 // Exclude Hangul syllables from ALetterSet during testing.
2277 // Leave CJK dictionary characters out from the monkey tests!
2278 #if 0
2279 fALetterSet = new UnicodeSet("[\\p{Word_Break = ALetter}"
2280 "[\\p{Line_Break = Complex_Context}"
2281 "-\\p{Grapheme_Cluster_Break = Extend}"
2282 "-\\p{Grapheme_Cluster_Break = Control}"
2283 "]]",
2284 status);
2285 #endif
2286 fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2287 fKatakanaSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"), status);
2288 fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2289 fALetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2290 fALetterSet->removeAll(*fDictionaryCjkSet);
2291 fSingle_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"), status);
2292 fDouble_QuoteSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"), status);
2293 fMidNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"), status);
2294 fMidLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"), status);
2295 fMidNumSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"), status);
2296 // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2297 // we should figure out why
2298 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"), status);
2299 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"), status);
2300 fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2301 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
2302
2303 fOtherSet = new UnicodeSet();
2304 if(U_FAILURE(status)) {
2305 deferredStatus = status;
2306 return;
2307 }
2308
2309 fOtherSet->complement();
2310 fOtherSet->removeAll(*fCRSet);
2311 fOtherSet->removeAll(*fLFSet);
2312 fOtherSet->removeAll(*fNewlineSet);
2313 fOtherSet->removeAll(*fKatakanaSet);
2314 fOtherSet->removeAll(*fHebrew_LetterSet);
2315 fOtherSet->removeAll(*fALetterSet);
2316 fOtherSet->removeAll(*fSingle_QuoteSet);
2317 fOtherSet->removeAll(*fDouble_QuoteSet);
2318 fOtherSet->removeAll(*fMidLetterSet);
2319 fOtherSet->removeAll(*fMidNumSet);
2320 fOtherSet->removeAll(*fNumericSet);
2321 fOtherSet->removeAll(*fExtendNumLetSet);
2322 fOtherSet->removeAll(*fFormatSet);
2323 fOtherSet->removeAll(*fExtendSet);
2324 fOtherSet->removeAll(*fRegionalIndicatorSet);
2325 // Inhibit dictionary characters from being tested at all.
2326 fOtherSet->removeAll(*fDictionaryCjkSet);
2327 fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2328
2329 fSets->addElement(fCRSet, status);
2330 fSets->addElement(fLFSet, status);
2331 fSets->addElement(fNewlineSet, status);
2332 fSets->addElement(fRegionalIndicatorSet, status);
2333 fSets->addElement(fHebrew_LetterSet, status);
2334 fSets->addElement(fALetterSet, status);
2335 fSets->addElement(fSingle_QuoteSet, status);
2336 fSets->addElement(fDouble_QuoteSet, status);
2337 //fSets->addElement(fKatakanaSet, status); //TODO: work out how to test katakana
2338 fSets->addElement(fMidLetterSet, status);
2339 fSets->addElement(fMidNumLetSet, status);
2340 fSets->addElement(fMidNumSet, status);
2341 fSets->addElement(fNumericSet, status);
2342 fSets->addElement(fFormatSet, status);
2343 fSets->addElement(fExtendSet, status);
2344 fSets->addElement(fOtherSet, status);
2345 fSets->addElement(fExtendNumLetSet, status);
2346
2347 if (U_FAILURE(status)) {
2348 deferredStatus = status;
2349 }
2350 }
2351
setText(const UnicodeString & s)2352 void RBBIWordMonkey::setText(const UnicodeString &s) {
2353 fText = &s;
2354 }
2355
2356
// Reference (rule-by-rule) computation of the next word boundary.
//
//   prevPos  - index of the previous boundary in the text given to setText().
//   returns  - index of the next boundary, or -1 when deferredStatus holds a
//              failure or prevPos is already at or beyond the end of the text.
//
// The rules are tried in the order written below; the first one that matches
// decides whether the candidate position (just before p2) is a boundary
// ("break") or not ("continue").
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;       // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
                break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     No explicit adjacency check is made here: the scan above advances
        //     exactly one code point past a CR, LF or Newline, so nothing is
        //     skipped between a CR at p1 and the code point at p2.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
        //
        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
            continue;
        }

        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
            continue;
        }

        // Rule (7a)     Hebrew_Letter x Single_Quote
        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
            continue;
        }

        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
            continue;
        }

        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
        if (fNumericSet->contains(c1) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
        }

        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
                continue;
        }

        // Rule 13c   Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule 14.  Break found here.
        break;
    }

    // The boundary is the position just before the code point at p2.
    breakPos = p2;
    return breakPos;
}
2524
2525
charClasses()2526 UVector *RBBIWordMonkey::charClasses() {
2527 return fSets;
2528 }
2529
2530
~RBBIWordMonkey()2531 RBBIWordMonkey::~RBBIWordMonkey() {
2532 delete fSets;
2533 delete fCRSet;
2534 delete fLFSet;
2535 delete fNewlineSet;
2536 delete fKatakanaSet;
2537 delete fHebrew_LetterSet;
2538 delete fALetterSet;
2539 delete fSingle_QuoteSet;
2540 delete fDouble_QuoteSet;
2541 delete fMidNumLetSet;
2542 delete fMidLetterSet;
2543 delete fMidNumSet;
2544 delete fNumericSet;
2545 delete fFormatSet;
2546 delete fExtendSet;
2547 delete fExtendNumLetSet;
2548 delete fRegionalIndicatorSet;
2549 delete fDictionaryCjkSet;
2550 delete fOtherSet;
2551 }
2552
2553
2554
2555
2556 //------------------------------------------------------------------------------------------
2557 //
2558 // class RBBISentMonkey Sentence Break specific implementation
2559 // of RBBIMonkeyKind.
2560 //
2561 //------------------------------------------------------------------------------------------
2562 class RBBISentMonkey: public RBBIMonkeyKind {
2563 public:
2564 RBBISentMonkey();
2565 virtual ~RBBISentMonkey();
2566 virtual UVector *charClasses();
2567 virtual void setText(const UnicodeString &s);
2568 virtual int32_t next(int32_t i);
2569 private:
2570 int moveBack(int posFrom);
2571 int moveForward(int posFrom);
2572 UChar32 cAt(int pos);
2573
2574 UVector *fSets;
2575
2576 UnicodeSet *fSepSet;
2577 UnicodeSet *fFormatSet;
2578 UnicodeSet *fSpSet;
2579 UnicodeSet *fLowerSet;
2580 UnicodeSet *fUpperSet;
2581 UnicodeSet *fOLetterSet;
2582 UnicodeSet *fNumericSet;
2583 UnicodeSet *fATermSet;
2584 UnicodeSet *fSContinueSet;
2585 UnicodeSet *fSTermSet;
2586 UnicodeSet *fCloseSet;
2587 UnicodeSet *fOtherSet;
2588 UnicodeSet *fExtendSet;
2589
2590 const UnicodeString *fText;
2591
2592 };
2593
RBBISentMonkey()2594 RBBISentMonkey::RBBISentMonkey()
2595 {
2596 UErrorCode status = U_ZERO_ERROR;
2597
2598 fSets = new UVector(status);
2599
2600 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator
2601 // set and made into character classes of their own. For the monkey impl,
2602 // they remain in SEP, since Sep always appears with CR and LF in the rules.
2603 fSepSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), status);
2604 fFormatSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"), status);
2605 fSpSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"), status);
2606 fLowerSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"), status);
2607 fUpperSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"), status);
2608 fOLetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"), status);
2609 fNumericSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"), status);
2610 fATermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"), status);
2611 fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2612 fSTermSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"), status);
2613 fCloseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"), status);
2614 fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"), status);
2615 fOtherSet = new UnicodeSet();
2616
2617 if(U_FAILURE(status)) {
2618 deferredStatus = status;
2619 return;
2620 }
2621
2622 fOtherSet->complement();
2623 fOtherSet->removeAll(*fSepSet);
2624 fOtherSet->removeAll(*fFormatSet);
2625 fOtherSet->removeAll(*fSpSet);
2626 fOtherSet->removeAll(*fLowerSet);
2627 fOtherSet->removeAll(*fUpperSet);
2628 fOtherSet->removeAll(*fOLetterSet);
2629 fOtherSet->removeAll(*fNumericSet);
2630 fOtherSet->removeAll(*fATermSet);
2631 fOtherSet->removeAll(*fSContinueSet);
2632 fOtherSet->removeAll(*fSTermSet);
2633 fOtherSet->removeAll(*fCloseSet);
2634 fOtherSet->removeAll(*fExtendSet);
2635
2636 fSets->addElement(fSepSet, status);
2637 fSets->addElement(fFormatSet, status);
2638 fSets->addElement(fSpSet, status);
2639 fSets->addElement(fLowerSet, status);
2640 fSets->addElement(fUpperSet, status);
2641 fSets->addElement(fOLetterSet, status);
2642 fSets->addElement(fNumericSet, status);
2643 fSets->addElement(fATermSet, status);
2644 fSets->addElement(fSContinueSet, status);
2645 fSets->addElement(fSTermSet, status);
2646 fSets->addElement(fCloseSet, status);
2647 fSets->addElement(fOtherSet, status);
2648 fSets->addElement(fExtendSet, status);
2649
2650 if (U_FAILURE(status)) {
2651 deferredStatus = status;
2652 }
2653 }
2654
2655
2656
setText(const UnicodeString & s)2657 void RBBISentMonkey::setText(const UnicodeString &s) {
2658 fText = &s;
2659 }
2660
charClasses()2661 UVector *RBBISentMonkey::charClasses() {
2662 return fSets;
2663 }
2664
2665
2666 // moveBack() Find the "significant" code point preceding the index i.
2667 // Skips over ($Extend | $Format)* .
2668 //
moveBack(int i)2669 int RBBISentMonkey::moveBack(int i) {
2670 if (i <= 0) {
2671 return -1;
2672 }
2673 UChar32 c;
2674 int32_t j = i;
2675 do {
2676 j = fText->moveIndex32(j, -1);
2677 c = fText->char32At(j);
2678 }
2679 while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2680 return j;
2681
2682 }
2683
2684
moveForward(int i)2685 int RBBISentMonkey::moveForward(int i) {
2686 if (i>=fText->length()) {
2687 return fText->length();
2688 }
2689 UChar32 c;
2690 int32_t j = i;
2691 do {
2692 j = fText->moveIndex32(j, 1);
2693 c = cAt(j);
2694 }
2695 while (fFormatSet->contains(c) || fExtendSet->contains(c));
2696 return j;
2697 }
2698
cAt(int pos)2699 UChar32 RBBISentMonkey::cAt(int pos) {
2700 if (pos<0 || pos>=fText->length()) {
2701 return -1;
2702 } else {
2703 return fText->char32At(pos);
2704 }
2705 }
2706
// Reference (rule-by-rule) computation of the next sentence boundary.
//
//   prevPos  - index of the previous boundary in the text given to setText().
//   returns  - index of the next boundary, or -1 when deferredStatus holds a
//              failure or prevPos is already at or beyond the end of the text.
//
// The rules are tried in the order written below.  A rule that matches with
// "continue" rejects the candidate position (just before p2); a "break"
// accepts it.  Rules 8 - 11 look backwards from p1 with moveBack(), and
// rule 8 also looks forwards from p2 with moveForward().
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;                // Scratch code point used by the look-behind / look-ahead scans.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  (Upper | Lower) ATerm  x  Upper
        if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
                fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            // Look ahead from p2 for the first character that ends the negated run.
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // The boundary is at p2 (which rule 4 may have adjusted to p1+1).
    breakPos = p2;
    return breakPos;
}
2861
~RBBISentMonkey()2862 RBBISentMonkey::~RBBISentMonkey() {
2863 delete fSets;
2864 delete fSepSet;
2865 delete fFormatSet;
2866 delete fSpSet;
2867 delete fLowerSet;
2868 delete fUpperSet;
2869 delete fOLetterSet;
2870 delete fNumericSet;
2871 delete fATermSet;
2872 delete fSContinueSet;
2873 delete fSTermSet;
2874 delete fCloseSet;
2875 delete fOtherSet;
2876 delete fExtendSet;
2877 }
2878
2879
2880
2881 //-------------------------------------------------------------------------------------------
2882 //
2883 // RBBILineMonkey
2884 //
2885 //-------------------------------------------------------------------------------------------
2886
// Monkey-test reference implementation for line breaking.
// Holds one UnicodeSet per Line_Break property value (built in the constructor)
// and applies the LB rules directly to the text in next().
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    // Returns the vector of character-class sets (fSets).
    virtual  UVector *charClasses();
    // Remembers the text (by address) and hands it to the helper iterator / matcher.
    virtual  void     setText(const UnicodeString &s);
    // Returns the break position found by scanning forward from i, or -1.
    virtual  int32_t  next(int32_t i);
    // LB 9 / LB 10 handling of combining marks; used by next().
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;      // Character classes returned by charClasses().

    // One set per Line_Break property value; the member name is the value's
    // short name (fBK is \p{Line_Break=BK}, and so on).
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;         // Built from the range [\ud800-\udfff], not from a property.
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;         // The constructor also folds fCJ into this set.
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;
    UnicodeSet  *fAL;         // The constructor also folds fXX, fAI, fSA and fSG into this set.
    UnicodeSet  *fCJ;
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fSA;
    UnicodeSet  *fXX;

    BreakIterator        *fCharBI;         // Character break iterator; only given the text in setText().
    const UnicodeString  *fText;           // Text under test; points at the caller's string, not a copy.
    RegexMatcher         *fNumberMatcher;  // Matches the number pattern used by rule LB 25 in next().
};
2943
2944
RBBILineMonkey()2945 RBBILineMonkey::RBBILineMonkey()
2946 {
2947 UErrorCode status = U_ZERO_ERROR;
2948
2949 fSets = new UVector(status);
2950
2951 fBK = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2952 fCR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2953 fLF = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2954 fCM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2955 fNL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2956 fWJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2957 fZW = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2958 fGL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2959 fCB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2960 fSP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2961 fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2962 fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2963 fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2964 fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2965 fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2966 fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2967 fCL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2968 fCP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2969 fEX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2970 fIN = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2971 fJL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2972 fJV = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2973 fJT = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2974 fNS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2975 fOP = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2976 fQU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2977 fIS = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2978 fNU = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2979 fPO = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2980 fPR = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2981 fSY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2982 fAI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2983 fAL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2984 fCJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2985 fHL = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2986 fID = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2987 fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2988 fSA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2989 fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2990 fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2991
2992 if (U_FAILURE(status)) {
2993 deferredStatus = status;
2994 fCharBI = NULL;
2995 fNumberMatcher = NULL;
2996 return;
2997 }
2998
2999 fAL->addAll(*fXX); // Default behavior for XX is identical to AL
3000 fAL->addAll(*fAI); // Default behavior for AI is identical to AL
3001 fAL->addAll(*fSA); // Default behavior for SA is XX, which defaults to AL
3002 fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
3003
3004 fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
3005
3006 fSets->addElement(fBK, status);
3007 fSets->addElement(fCR, status);
3008 fSets->addElement(fLF, status);
3009 fSets->addElement(fCM, status);
3010 fSets->addElement(fNL, status);
3011 fSets->addElement(fWJ, status);
3012 fSets->addElement(fZW, status);
3013 fSets->addElement(fGL, status);
3014 fSets->addElement(fCB, status);
3015 fSets->addElement(fSP, status);
3016 fSets->addElement(fB2, status);
3017 fSets->addElement(fBA, status);
3018 fSets->addElement(fBB, status);
3019 fSets->addElement(fHY, status);
3020 fSets->addElement(fH2, status);
3021 fSets->addElement(fH3, status);
3022 fSets->addElement(fCL, status);
3023 fSets->addElement(fCP, status);
3024 fSets->addElement(fEX, status);
3025 fSets->addElement(fIN, status);
3026 fSets->addElement(fJL, status);
3027 fSets->addElement(fJT, status);
3028 fSets->addElement(fJV, status);
3029 fSets->addElement(fNS, status);
3030 fSets->addElement(fOP, status);
3031 fSets->addElement(fQU, status);
3032 fSets->addElement(fIS, status);
3033 fSets->addElement(fNU, status);
3034 fSets->addElement(fPO, status);
3035 fSets->addElement(fPR, status);
3036 fSets->addElement(fSY, status);
3037 fSets->addElement(fAI, status);
3038 fSets->addElement(fAL, status);
3039 fSets->addElement(fHL, status);
3040 fSets->addElement(fID, status);
3041 fSets->addElement(fWJ, status);
3042 fSets->addElement(fRI, status);
3043 fSets->addElement(fSA, status);
3044 fSets->addElement(fSG, status);
3045
3046 const char *rules =
3047 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3048 "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3049 "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3050 "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3051 "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3052 "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3053
3054 fNumberMatcher = new RegexMatcher(
3055 UnicodeString(rules, -1, US_INV), 0, status);
3056
3057 fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3058
3059 if (U_FAILURE(status)) {
3060 deferredStatus = status;
3061 }
3062 }
3063
3064
setText(const UnicodeString & s)3065 void RBBILineMonkey::setText(const UnicodeString &s) {
3066 fText = &s;
3067 fCharBI->setText(s);
3068 fNumberMatcher->reset(s);
3069 }
3070
3071 //
3072 // rule9Adjust
3073 // Line Break TR rules 9 and 10 implementation.
3074 // This deals with combining marks and other sequences that
3075 // that must be treated as if they were something other than what they actually are.
3076 //
3077 // This is factored out into a separate function because it must be applied twice for
3078 // each potential break, once to the chars before the position being checked, then
3079 // again to the text following the possible break.
3080 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3081 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3082 if (pos == -1) {
3083 // Invalid initial position. Happens during the warmup iteration of the
3084 // main loop in next().
3085 return;
3086 }
3087
3088 int32_t nPos = *nextPos;
3089
3090 // LB 9 Keep combining sequences together.
3091 // advance over any CM class chars. Note that Line Break CM is different
3092 // from the normal Grapheme Extend property.
3093 if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3094 *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3095 for (;;) {
3096 *nextChar = fText->char32At(nPos);
3097 if (!fCM->contains(*nextChar)) {
3098 break;
3099 }
3100 nPos = fText->moveIndex32(nPos, 1);
3101 }
3102 }
3103
3104
3105 // LB 9 Treat X CM* as if it were x.
3106 // No explicit action required.
3107
3108 // LB 10 Treat any remaining combining mark as AL
3109 if (fCM->contains(*posChar)) {
3110 *posChar = 0x41; // thisChar = 'A';
3111 }
3112
3113 // Push the updated nextPos and nextChar back to our caller.
3114 // This only makes a difference if posChar got bigger by consuming a
3115 // combining sequence.
3116 *nextPos = nPos;
3117 *nextChar = fText->char32At(nPos);
3118 }
3119
3120
3121
//
//  next
//     Find the next line break position after startPos by applying the LB rules,
//     in order, to each successive pair of characters (prevChar, thisChar).
//     Inside the main loop "break" means a break position was found and
//     "continue" means no break here, go on to the following position.
//
//     Returns the index of the break, or -1 when the constructor recorded a
//     failure in deferredStatus or when startPos is at or past the end of the text.
//
int32_t RBBILineMonkey::next(int32_t startPos) {
    UErrorCode status = U_ZERO_ERROR;
    int32_t    pos;       //  Index of the char following a potential break position
    UChar32    thisChar;  //  Character at above position "pos"

    int32_t    prevPos;   //  Index of the char preceding a potential break position
    UChar32    prevChar;  //  Character at above position.  Note that prevChar
                          //   and thisChar may not be adjacent because combining
                          //   characters between them will be ignored.

    int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
                          //   (Only prevCharX2 is actually read below.)
    UChar32    prevCharX2;

    int32_t    nextPos;   //  Index of the next character following pos.
                          //     Usually skips over combining marks.
    int32_t    nextCPPos; //  Index of the code point following "pos."
                          //     May point to a combining mark.
    int32_t    tPos;      //  temp value.
    UChar32    c;

    // Construction failed; the sets / matcher cannot be relied on.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    if (startPos >= fText->length()) {
        return -1;
    }


    // Initial values for loop.  Loop will run the first time without finding breaks,
    //                           while the invalid values shift out and the "this" and
    //                           "prev" positions are filled in with good values.
    pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
    thisChar = prevChar  = prevCharX2 = 0;
    nextPos  = nextCPPos = startPos;


    // Loop runs once per position in the test text, until a break position
    //  is found.
    for (;;) {
        // Shift the window: prev -> prevX2, this -> prev, next -> this.
        prevPosX2 = prevPos;
        prevCharX2 = prevChar;

        prevPos   = pos;
        prevChar  = thisChar;

        pos       = nextPos;
        thisChar  = fText->char32At(pos);

        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;

        // Rule LB2 - Break at end of text.
        if (pos >= fText->length()) {
            break;
        }

        // Rule LB 9 - adjust for combining sequences.
        //             We do this one out-of-order because the adjustment does not change anything
        //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
        //             be applied.
        //             First call: extend prevChar over any following marks (may move pos / thisChar).
        //             Second call: do the same for thisChar (may move nextPos).
        rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
        nextCPPos = nextPos = fText->moveIndex32(pos, 1);
        c = fText->char32At(nextPos);
        rule9Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
        //   -1 positions out of prevPos yet - loop back to advance the
        //    position in the input without any further looking for breaks.
        if (prevPos == -1) {
            continue;
        }

        // LB 4  Always break after hard line breaks,
        if (fBK->contains(prevChar)) {
            break;
        }

        // LB 5  Break after CR, LF, NL, but not inside CR LF
        if (prevChar == 0x0d && thisChar == 0x0a) {
            continue;
        }
        if (prevChar == 0x0d ||
            prevChar == 0x0a ||
            prevChar == 0x85)  {
            break;
        }

        // LB 6  Don't break before hard line breaks
        if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
            fBK->contains(thisChar)) {
                continue;
        }


        // LB 7  Don't break before spaces or zero-width space.
        if (fSP->contains(thisChar)) {
            continue;
        }

        if (fZW->contains(thisChar)) {
            continue;
        }

        // LB 8  Break after zero width space
        if (fZW->contains(prevChar)) {
            break;
        }

        // LB 9, 10  Already done, at top of loop.
        //


        // LB 11  Do not break before or after WORD JOINER and related characters.
        //    x  WJ
        //    WJ  x
        //
        if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
            continue;
        }

        // LB 12
        //    GL  x
        if (fGL->contains(prevChar)) {
            continue;
        }

        // LB 12a
        //    [^SP BA HY] x GL
        if (!(fSP->contains(prevChar) ||
              fBA->contains(prevChar) ||
              fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
            continue;
        }



        // LB 13  Don't break before closings.
        //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
        //        fall through to LB 25 and the more general number regular expression.
        //        (The same exclusion is applied to NU x SY below.)
        //
        if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
                                         fEX->contains(thisChar)  ||
            (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
            (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
            continue;
        }

        // LB 14 Don't break after OP SP*
        //       Scan backwards, checking for this sequence.
        //       The OP char could include combining marks, so we actually check for
        //           OP CM* SP*
        //       Another Twist: The Rule 67 fixes may have changed a SP CM
        //       sequence into a ID char, so before scanning back through spaces,
        //       verify that prevChar is indeed a space.  The prevChar variable
        //       may differ from fText[prevPos]
        //       NOTE(review): "Rule 67" presumably refers to what is now the LB 9 / LB 10
        //       adjustment in rule9Adjust(), which rewrites a left-over CM to 'A' - confirm.
        tPos = prevPos;
        if (fSP->contains(prevChar)) {
            while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
        }
        while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
            tPos=fText->moveIndex32(tPos, -1);
        }
        if (fOP->contains(fText->char32At(tPos))) {
            continue;
        }


        // LB 15    QU SP* x OP
        if (fOP->contains(thisChar)) {
            // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
            // (this local tPos shadows the function-level tPos declared above)
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fQU->contains(fText->char32At(tPos))) {
                continue;
            }
        }



        // LB 16   (CL | CP) SP* x NS
        //    Scan backwards for SP* CM* (CL | CP)
        if (fNS->contains(thisChar)) {
            // (this local tPos also shadows the function-level tPos)
            int tPos = prevPos;
            while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
                tPos = fText->moveIndex32(tPos, -1);
            }
            if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 17        B2 SP* x B2
        if (fB2->contains(thisChar)) {
            //  Scan backwards, checking for the B2 CM* SP* sequence.
            tPos = prevPos;
            if (fSP->contains(prevChar)) {
                while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
                    tPos=fText->moveIndex32(tPos, -1);
                }
            }
            while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
                tPos=fText->moveIndex32(tPos, -1);
            }
            if (fB2->contains(fText->char32At(tPos))) {
                continue;
            }
        }


        // LB 18    break after space
        if (fSP->contains(prevChar)) {
            break;
        }

        // LB 19
        //    x   QU
        //    QU  x
        if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
            continue;
        }

        // LB 20  Break around a CB
        if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
            break;
        }

        // LB 21   Do not break before BA, HY or NS, nor after BB.
        if (fBA->contains(thisChar) ||
            fHY->contains(thisChar) ||
            fNS->contains(thisChar) ||
            fBB->contains(prevChar) )   {
            continue;
        }

        // LB 21a
        //   HL (HY | BA) x
        if (fHL->contains(prevCharX2) &&
                (fHY->contains(prevChar) || fBA->contains(prevChar))) {
            continue;
        }

        // LB 21b
        //   SY x HL
        if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
            continue;
        }

        // LB 22   (AL | EX | HL | ID | IN | NU) x IN
        if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
            (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
            (fID->contains(prevChar) && fIN->contains(thisChar)) ||
            (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
            (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
            continue;
        }


        // LB 23    ID x PO
        //          AL x NU
        //          HL x NU
        //          NU x AL
        //          NU x HL
        if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
            (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
            (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
            (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
            continue;
        }

        // LB 24  Do not break between prefix and letters or ideographs.
        //        PR x ID
        //        PR x (AL | HL)
        //        PO x (AL | HL)
        if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
            (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
            (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
            continue;
        }



        // LB 25    Numbers
        //          The number pattern is matched starting at prevPos.
        if (fNumberMatcher->lookingAt(prevPos, status)) {
            // This check is only reached when lookingAt() reported a match.
            if (U_FAILURE(status)) {
                break;
            }
            // Matched a number.  But could have been just a single digit, which would
            //    not represent a "no break here" between prevChar and thisChar
            int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
            if (numEndIdx > pos) {
                // Number match includes at least our two chars being checked
                if (numEndIdx > nextPos) {
                    // Number match includes additional chars.  Update pos and nextPos
                    //   so that next loop iteration will continue at the end of the number,
                    //   checking for breaks between last char in number & whatever follows.
                    pos = nextPos = numEndIdx;
                    // Back up to the last non-CM character of the number.
                    do {
                        pos = fText->moveIndex32(pos, -1);
                        thisChar = fText->char32At(pos);
                    } while (fCM->contains(thisChar));
                }
                continue;
            }
        }


        // LB 26 Do not break a Korean syllable.
        //       JL x (JL | JV | H2 | H3)
        if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
                                        fJV->contains(thisChar) ||
                                        fH2->contains(thisChar) ||
                                        fH3->contains(thisChar))) {
                                            continue;
                                        }

        //       (JV | H2) x (JV | JT)
        if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
            (fJV->contains(thisChar) || fJT->contains(thisChar))) {
                continue;
        }

        //       (JT | H3) x JT
        if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
            fJT->contains(thisChar)) {
                continue;
        }

        // LB 27 Treat a Korean Syllable Block the same as ID.
        //       (JL | JV | JT | H2 | H3) x IN
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fIN->contains(thisChar)) {
                continue;
            }
        //       (JL | JV | JT | H2 | H3) x PO
        if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
            fPO->contains(thisChar)) {
                continue;
            }
        //       PR x (JL | JV | JT | H2 | H3)
        if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
            fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
                continue;
            }



        // LB 28  Do not break between alphabetics ("at").
        if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
        if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
            continue;
        }

        // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
        //          (AL | HL | NU) x OP
        //          CP x (AL | HL | NU)
        if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
            continue;
        }
        if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
            continue;
        }

        // LB30a  Do not break between regional indicators.
        //        RI x RI
        if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
            continue;
        }

        // LB 31    Break everywhere else
        break;

    }

    // pos is the index of the character that follows the break.
    return pos;
}
3511
3512
charClasses()3513 UVector *RBBILineMonkey::charClasses() {
3514 return fSets;
3515 }
3516
3517
~RBBILineMonkey()3518 RBBILineMonkey::~RBBILineMonkey() {
3519 delete fSets;
3520
3521 delete fBK;
3522 delete fCR;
3523 delete fLF;
3524 delete fCM;
3525 delete fNL;
3526 delete fWJ;
3527 delete fZW;
3528 delete fGL;
3529 delete fCB;
3530 delete fSP;
3531 delete fB2;
3532 delete fBA;
3533 delete fBB;
3534 delete fHY;
3535 delete fH2;
3536 delete fH3;
3537 delete fCL;
3538 delete fCP;
3539 delete fEX;
3540 delete fIN;
3541 delete fJL;
3542 delete fJV;
3543 delete fJT;
3544 delete fNS;
3545 delete fOP;
3546 delete fQU;
3547 delete fIS;
3548 delete fNU;
3549 delete fPO;
3550 delete fPR;
3551 delete fSY;
3552 delete fAI;
3553 delete fAL;
3554 delete fCJ;
3555 delete fHL;
3556 delete fID;
3557 delete fRI;
3558 delete fSA;
3559 delete fSG;
3560 delete fXX;
3561
3562 delete fCharBI;
3563 delete fNumberMatcher;
3564 }
3565
3566
3567 //-------------------------------------------------------------------------------------------
3568 //
3569 // TestMonkey
3570 //
3571 // params
3572 // seed=nnnnn Random number starting seed.
3573 // Setting the seed allows errors to be reproduced.
3574 // loop=nnn Looping count. Controls running time.
3575 // -1: run forever.
3576 // 0 or greater: run length.
3577 //
3578 // type = char | word | line | sent | title
3579 //
3580 //-------------------------------------------------------------------------------------------
3581
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3582 static int32_t getIntParam(UnicodeString name, UnicodeString ¶ms, int32_t defaultVal) {
3583 int32_t val = defaultVal;
3584 name.append(" *= *(-?\\d+)");
3585 UErrorCode status = U_ZERO_ERROR;
3586 RegexMatcher m(name, params, 0, status);
3587 if (m.find()) {
3588 // The param exists. Convert the string to an int.
3589 char valString[100];
3590 int32_t paramLength = m.end(1, status) - m.start(1, status);
3591 if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3592 paramLength = (int32_t)(sizeof(valString)-2);
3593 }
3594 params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3595 val = strtol(valString, NULL, 10);
3596
3597 // Delete this parameter from the params string.
3598 m.reset();
3599 params = m.replaceFirst("", status);
3600 }
3601 U_ASSERT(U_SUCCESS(status));
3602 return val;
3603 }
3604 #endif
3605
3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3607 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3608 BreakIterator *bi,
3609 int expected[],
3610 int expectedcount)
3611 {
3612 int count = 0;
3613 int i = 0;
3614 int forward[50];
3615 bi->setText(ustr);
3616 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3617 forward[count] = i;
3618 if (count < expectedcount && expected[count] != i) {
3619 test->errln("break forward test failed: expected %d but got %d",
3620 expected[count], i);
3621 break;
3622 }
3623 count ++;
3624 }
3625 if (count != expectedcount) {
3626 printStringBreaks(ustr, expected, expectedcount);
3627 test->errln("break forward test failed: missed %d match",
3628 expectedcount - count);
3629 return;
3630 }
3631 // testing boundaries
3632 for (i = 1; i < expectedcount; i ++) {
3633 int j = expected[i - 1];
3634 if (!bi->isBoundary(j)) {
3635 printStringBreaks(ustr, expected, expectedcount);
3636 test->errln("isBoundary() failed. Expected boundary at position %d", j);
3637 return;
3638 }
3639 for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3640 if (bi->isBoundary(j)) {
3641 printStringBreaks(ustr, expected, expectedcount);
3642 test->errln("isBoundary() failed. Not expecting boundary at position %d", j);
3643 return;
3644 }
3645 }
3646 }
3647
3648 for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3649 count --;
3650 if (forward[count] != i) {
3651 printStringBreaks(ustr, expected, expectedcount);
3652 test->errln("happy break test previous() failed: expected %d but got %d",
3653 forward[count], i);
3654 break;
3655 }
3656 }
3657 if (count != 0) {
3658 printStringBreaks(ustr, expected, expectedcount);
3659 test->errln("break test previous() failed: missed a match");
3660 return;
3661 }
3662
3663 // testing preceding
3664 for (i = 0; i < expectedcount - 1; i ++) {
3665 // int j = expected[i] + 1;
3666 int j = ustr.moveIndex32(expected[i], 1);
3667 for (; j <= expected[i + 1]; j ++) {
3668 if (bi->preceding(j) != expected[i]) {
3669 printStringBreaks(ustr, expected, expectedcount);
3670 test->errln("preceding(): Not expecting boundary at position %d", j);
3671 return;
3672 }
3673 }
3674 }
3675 }
3676 #endif
3677
TestWordBreaks(void)3678 void RBBITest::TestWordBreaks(void)
3679 {
3680 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3681
3682 Locale locale("en");
3683 UErrorCode status = U_ZERO_ERROR;
3684 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3685 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3686 // Replaced any C+J characters in a row with a random sequence of characters
3687 // of the same length to make our C+J segmentation not get in the way.
3688 static const char *strlist[] =
3689 {
3690 "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3691 "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3692 "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3693 "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3694 "\\uac00\\u3588\\u009c\\u0953\\u194b",
3695 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3696 "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3697 "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3698 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3699 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3700 "\\u2027\\U000e0067\\u0a47\\u00b7",
3701 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3702 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3703 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3704 "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3705 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3706 "\\u0027\\u11af\\U000e0057\\u0602",
3707 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3708 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3709 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3710 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3711 "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3712 "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3713 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3714 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3715 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3716 "\\u18f4\\U000e0049\\u20e7\\u2027",
3717 "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3718 "\\ua183\\u102d\\u0bec\\u003a",
3719 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3720 "\\u003a\\u0e57\\u0fad\\u002e",
3721 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3722 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3723 "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3724 "\\u003a\\u0664\\u00b7\\u1fba",
3725 "\\u003b\\u0027\\u00b7\\u47a3",
3726 "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3727 "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3728 "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3729 };
3730 int loop;
3731 if (U_FAILURE(status)) {
3732 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3733 return;
3734 }
3735 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3736 // printf("looping %d\n", loop);
3737 UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3738 // RBBICharMonkey monkey;
3739 RBBIWordMonkey monkey;
3740
3741 int expected[50];
3742 int expectedcount = 0;
3743
3744 monkey.setText(ustr);
3745 int i;
3746 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3747 expected[expectedcount ++] = i;
3748 }
3749
3750 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3751 }
3752 delete bi;
3753 #endif
3754 }
3755
TestWordBoundary(void)3756 void RBBITest::TestWordBoundary(void)
3757 {
3758 // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3759 Locale locale("en");
3760 UErrorCode status = U_ZERO_ERROR;
3761 // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
3762 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3763 UChar str[50];
3764 static const char *strlist[] =
3765 {
3766 "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3767 "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3768 "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3769 "\\u2027\\U000e0067\\u0a47\\u00b7",
3770 "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3771 "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3772 "\\u0589\\U000e006e\\u0a42\\U000104a5",
3773 "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3774 "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3775 "\\u0027\\u11af\\U000e0057\\u0602",
3776 "\\U0001d7f2\\U000e007\\u0004\\u0589",
3777 "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3778 "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3779 "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3780 "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3781 "\\U000e0065\\u302c\\u09ee\\U000e0068",
3782 "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3783 "\\u0233\\U000e0020\\u0a69\\u0d6a",
3784 "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3785 "\\u58f4\\U000e0049\\u20e7\\u2027",
3786 "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3787 "\\ua183\\u102d\\u0bec\\u003a",
3788 "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3789 "\\u003a\\u0e57\\u0fad\\u002e",
3790 "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3791 "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3792 "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3793 "\\u003a\\u0664\\u00b7\\u1fba",
3794 "\\u003b\\u0027\\u00b7\\u47a3",
3795 };
3796 int loop;
3797 if (U_FAILURE(status)) {
3798 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3799 return;
3800 }
3801 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3802 // printf("looping %d\n", loop);
3803 u_unescape(strlist[loop], str, 20);
3804 UnicodeString ustr(str);
3805 int forward[50];
3806 int count = 0;
3807
3808 bi->setText(ustr);
3809 int prev = 0;
3810 int i;
3811 for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3812 forward[count ++] = i;
3813 if (i > prev) {
3814 int j;
3815 for (j = prev + 1; j < i; j ++) {
3816 if (bi->isBoundary(j)) {
3817 printStringBreaks(ustr, forward, count);
3818 errln("happy boundary test failed: expected %d not a boundary",
3819 j);
3820 return;
3821 }
3822 }
3823 }
3824 if (!bi->isBoundary(i)) {
3825 printStringBreaks(ustr, forward, count);
3826 errln("happy boundary test failed: expected %d a boundary",
3827 i);
3828 return;
3829 }
3830 prev = i;
3831 }
3832 }
3833 delete bi;
3834 }
3835
TestLineBreaks(void)3836 void RBBITest::TestLineBreaks(void)
3837 {
3838 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3839 Locale locale("en");
3840 UErrorCode status = U_ZERO_ERROR;
3841 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3842 const int32_t STRSIZE = 50;
3843 UChar str[STRSIZE];
3844 static const char *strlist[] =
3845 {
3846 "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3847 "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3848 "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3849 "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3850 "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3851 "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3852 "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3853 "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3854 "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3855 "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3856 "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3857 "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3858 "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3859 "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3860 "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3861 "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3862 "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3863 "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3864 "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3865 "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3866 "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3867 "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3868 "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3869 "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3870 "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3871 "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3872 "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3873 "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3874 "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3875 "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3876 "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3877 "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3878 "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3879 "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3880 "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3881 "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3882 "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3883 "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3884 "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3885 "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3886 "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3887 "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3888 "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3889 "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3890 "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3891 "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3892 "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3893 };
3894 int loop;
3895 TEST_ASSERT_SUCCESS(status);
3896 if (U_FAILURE(status)) {
3897 return;
3898 }
3899 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3900 // printf("looping %d\n", loop);
3901 int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3902 if (t >= STRSIZE) {
3903 TEST_ASSERT(FALSE);
3904 continue;
3905 }
3906
3907
3908 UnicodeString ustr(str);
3909 RBBILineMonkey monkey;
3910 if (U_FAILURE(monkey.deferredStatus)) {
3911 continue;
3912 }
3913
3914 const int EXPECTEDSIZE = 50;
3915 int expected[EXPECTEDSIZE];
3916 int expectedcount = 0;
3917
3918 monkey.setText(ustr);
3919 int i;
3920 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3921 if (expectedcount >= EXPECTEDSIZE) {
3922 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3923 return;
3924 }
3925 expected[expectedcount ++] = i;
3926 }
3927
3928 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3929 }
3930 delete bi;
3931 #endif
3932 }
3933
TestSentBreaks(void)3934 void RBBITest::TestSentBreaks(void)
3935 {
3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3937 Locale locale("en");
3938 UErrorCode status = U_ZERO_ERROR;
3939 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3940 UChar str[200];
3941 static const char *strlist[] =
3942 {
3943 "Now\ris\nthe\r\ntime\n\rfor\r\r",
3944 "This\n",
3945 "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3946 "\"Sentence ending with a quote.\" Bye.",
3947 " (This is it). Testing the sentence iterator. \"This isn't it.\"",
3948 "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3949 "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3950 "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3951 "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3952 "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3953 "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3954 "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3955 "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3956 "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3957 "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3958 "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3959 "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3960 "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3961 "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3962 "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3963 };
3964 int loop;
3965 if (U_FAILURE(status)) {
3966 errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3967 return;
3968 }
3969 for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3970 u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3971 UnicodeString ustr(str);
3972
3973 RBBISentMonkey monkey;
3974 if (U_FAILURE(monkey.deferredStatus)) {
3975 continue;
3976 }
3977
3978 const int EXPECTEDSIZE = 50;
3979 int expected[EXPECTEDSIZE];
3980 int expectedcount = 0;
3981
3982 monkey.setText(ustr);
3983 int i;
3984 for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3985 if (expectedcount >= EXPECTEDSIZE) {
3986 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3987 return;
3988 }
3989 expected[expectedcount ++] = i;
3990 }
3991
3992 testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3993 }
3994 delete bi;
3995 #endif
3996 }
3997
TestMonkey(char * params)3998 void RBBITest::TestMonkey(char *params) {
3999 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4000
4001 UErrorCode status = U_ZERO_ERROR;
4002 int32_t loopCount = 500;
4003 int32_t seed = 1;
4004 UnicodeString breakType = "all";
4005 Locale locale("en");
4006 UBool useUText = FALSE;
4007
4008 if (quick == FALSE) {
4009 loopCount = 10000;
4010 }
4011
4012 if (params) {
4013 UnicodeString p(params);
4014 loopCount = getIntParam("loop", p, loopCount);
4015 seed = getIntParam("seed", p, seed);
4016
4017 RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4018 if (m.find()) {
4019 breakType = m.group(1, status);
4020 m.reset();
4021 p = m.replaceFirst("", status);
4022 }
4023
4024 RegexMatcher u(" *utext", p, 0, status);
4025 if (u.find()) {
4026 useUText = TRUE;
4027 u.reset();
4028 p = u.replaceFirst("", status);
4029 }
4030
4031
4032 // m.reset(p);
4033 if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4034 // Each option is stripped out of the option string as it is processed.
4035 // All options have been checked. The option string should have been completely emptied..
4036 char buf[100];
4037 p.extract(buf, sizeof(buf), NULL, status);
4038 buf[sizeof(buf)-1] = 0;
4039 errln("Unrecognized or extra parameter: %s\n", buf);
4040 return;
4041 }
4042
4043 }
4044
4045 if (breakType == "char" || breakType == "all") {
4046 RBBICharMonkey m;
4047 BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status);
4048 if (U_SUCCESS(status)) {
4049 RunMonkey(bi, m, "char", seed, loopCount, useUText);
4050 if (breakType == "all" && useUText==FALSE) {
4051 // Also run a quick test with UText when "all" is specified
4052 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4053 }
4054 }
4055 else {
4056 errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4057 }
4058 delete bi;
4059 }
4060
4061 if (breakType == "word" || breakType == "all") {
4062 logln("Word Break Monkey Test");
4063 RBBIWordMonkey m;
4064 BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4065 if (U_SUCCESS(status)) {
4066 RunMonkey(bi, m, "word", seed, loopCount, useUText);
4067 }
4068 else {
4069 errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4070 }
4071 delete bi;
4072 }
4073
4074 if (breakType == "line" || breakType == "all") {
4075 logln("Line Break Monkey Test");
4076 RBBILineMonkey m;
4077 BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4078 if (loopCount >= 10) {
4079 loopCount = loopCount / 5; // Line break runs slower than the others.
4080 }
4081 if (U_SUCCESS(status)) {
4082 RunMonkey(bi, m, "line", seed, loopCount, useUText);
4083 }
4084 else {
4085 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4086 }
4087 delete bi;
4088 }
4089
4090 if (breakType == "sent" || breakType == "all" ) {
4091 logln("Sentence Break Monkey Test");
4092 RBBISentMonkey m;
4093 BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4094 if (loopCount >= 10) {
4095 loopCount = loopCount / 10; // Sentence runs slower than the other break types
4096 }
4097 if (U_SUCCESS(status)) {
4098 RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4099 }
4100 else {
4101 errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4102 }
4103 delete bi;
4104 }
4105
4106 #endif
4107 }
4108
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi            - the break iterator to use
//       mk            - MonkeyKind, abstraction for obtaining expected results
//       name          - Name of test (char, word, etc.) for use in error messages
//       seed          - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to check; -1 means loop without end
//       useUText      - If true, pass the test text to the break iterator through a UText
//                       instead of directly as a UnicodeString
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4118 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t seed,
4119 int32_t numIterations, UBool useUText) {
4120
4121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4122
4123 const int32_t TESTSTRINGLEN = 500;
4124 UnicodeString testText;
4125 int32_t numCharClasses;
4126 UVector *chClasses;
4127 int expected[TESTSTRINGLEN*2 + 1];
4128 int expectedCount = 0;
4129 char expectedBreaks[TESTSTRINGLEN*2 + 1];
4130 char forwardBreaks[TESTSTRINGLEN*2 + 1];
4131 char reverseBreaks[TESTSTRINGLEN*2+1];
4132 char isBoundaryBreaks[TESTSTRINGLEN*2+1];
4133 char followingBreaks[TESTSTRINGLEN*2+1];
4134 char precedingBreaks[TESTSTRINGLEN*2+1];
4135 int i;
4136 int loopCount = 0;
4137
4138 m_seed = seed;
4139
4140 numCharClasses = mk.charClasses()->size();
4141 chClasses = mk.charClasses();
4142
4143 // Check for errors that occured during the construction of the MonkeyKind object.
4144 // Can't report them where they occured because errln() is a method coming from intlTest,
4145 // and is not visible outside of RBBITest :-(
4146 if (U_FAILURE(mk.deferredStatus)) {
4147 errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4148 return;
4149 }
4150
4151 // Verify that the character classes all have at least one member.
4152 for (i=0; i<numCharClasses; i++) {
4153 UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4154 if (s == NULL || s->size() == 0) {
4155 errln("Character Class #%d is null or of zero size.", i);
4156 return;
4157 }
4158 }
4159
4160 while (loopCount < numIterations || numIterations == -1) {
4161 if (numIterations == -1 && loopCount % 10 == 0) {
4162 // If test is running in an infinite loop, display a periodic tic so
4163 // we can tell that it is making progress.
4164 fprintf(stderr, ".");
4165 }
4166 // Save current random number seed, so that we can recreate the random numbers
4167 // for this loop iteration in event of an error.
4168 seed = m_seed;
4169
4170 // Populate a test string with data.
4171 testText.truncate(0);
4172 for (i=0; i<TESTSTRINGLEN; i++) {
4173 int32_t aClassNum = m_rand() % numCharClasses;
4174 UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4175 int32_t charIdx = m_rand() % classSet->size();
4176 UChar32 c = classSet->charAt(charIdx);
4177 if (c < 0) { // TODO: deal with sets containing strings.
4178 errln("c < 0");
4179 break;
4180 }
4181 testText.append(c);
4182 }
4183
4184 // Calculate the expected results for this test string.
4185 mk.setText(testText);
4186 memset(expectedBreaks, 0, sizeof(expectedBreaks));
4187 expectedBreaks[0] = 1;
4188 int32_t breakPos = 0;
4189 expectedCount = 0;
4190 for (;;) {
4191 breakPos = mk.next(breakPos);
4192 if (breakPos == -1) {
4193 break;
4194 }
4195 if (breakPos > testText.length()) {
4196 errln("breakPos > testText.length()");
4197 }
4198 expectedBreaks[breakPos] = 1;
4199 U_ASSERT(expectedCount<testText.length());
4200 expected[expectedCount ++] = breakPos;
4201 (void)expected; // Set but not used warning.
4202 // TODO (andy): check it out.
4203 }
4204
4205 // Find the break positions using forward iteration
4206 memset(forwardBreaks, 0, sizeof(forwardBreaks));
4207 if (useUText) {
4208 UErrorCode status = U_ZERO_ERROR;
4209 UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4210 // testUText = utext_openUnicodeString(testUText, &testText, &status);
4211 bi->setText(testUText, status);
4212 TEST_ASSERT_SUCCESS(status);
4213 utext_close(testUText); // The break iterator does a shallow clone of the UText
4214 // This UText can be closed immediately, so long as the
4215 // testText string continues to exist.
4216 } else {
4217 bi->setText(testText);
4218 }
4219
4220 for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4221 if (i < 0 || i > testText.length()) {
4222 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4223 break;
4224 }
4225 forwardBreaks[i] = 1;
4226 }
4227
4228 // Find the break positions using reverse iteration
4229 memset(reverseBreaks, 0, sizeof(reverseBreaks));
4230 for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4231 if (i < 0 || i > testText.length()) {
4232 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4233 break;
4234 }
4235 reverseBreaks[i] = 1;
4236 }
4237
4238 // Find the break positions using isBoundary() tests.
4239 memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4240 U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4241 for (i=0; i<=testText.length(); i++) {
4242 isBoundaryBreaks[i] = bi->isBoundary(i);
4243 }
4244
4245
4246 // Find the break positions using the following() function.
4247 // printf(".");
4248 memset(followingBreaks, 0, sizeof(followingBreaks));
4249 int32_t lastBreakPos = 0;
4250 followingBreaks[0] = 1;
4251 for (i=0; i<testText.length(); i++) {
4252 breakPos = bi->following(i);
4253 if (breakPos <= i ||
4254 breakPos < lastBreakPos ||
4255 breakPos > testText.length() ||
4256 (breakPos > lastBreakPos && lastBreakPos > i)) {
4257 errln("%s break monkey test: "
4258 "Out of range value returned by BreakIterator::following().\n"
4259 "Random seed=%d index=%d; following returned %d; lastbreak=%d",
4260 name, seed, i, breakPos, lastBreakPos);
4261 break;
4262 }
4263 followingBreaks[breakPos] = 1;
4264 lastBreakPos = breakPos;
4265 }
4266
4267 // Find the break positions using the preceding() function.
4268 memset(precedingBreaks, 0, sizeof(precedingBreaks));
4269 lastBreakPos = testText.length();
4270 precedingBreaks[testText.length()] = 1;
4271 for (i=testText.length(); i>0; i--) {
4272 breakPos = bi->preceding(i);
4273 if (breakPos >= i ||
4274 breakPos > lastBreakPos ||
4275 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4276 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4277 errln("%s break monkey test: "
4278 "Out of range value returned by BreakIterator::preceding().\n"
4279 "index=%d; prev returned %d; lastBreak=%d" ,
4280 name, i, breakPos, lastBreakPos);
4281 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4282 precedingBreaks[i] = 2; // Forces an error.
4283 }
4284 } else {
4285 if (breakPos >= 0) {
4286 precedingBreaks[breakPos] = 1;
4287 }
4288 lastBreakPos = breakPos;
4289 }
4290 }
4291
4292 // Compare the expected and actual results.
4293 for (i=0; i<=testText.length(); i++) {
4294 const char *errorType = NULL;
4295 if (forwardBreaks[i] != expectedBreaks[i]) {
4296 errorType = "next()";
4297 } else if (reverseBreaks[i] != forwardBreaks[i]) {
4298 errorType = "previous()";
4299 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4300 errorType = "isBoundary()";
4301 } else if (followingBreaks[i] != expectedBreaks[i]) {
4302 errorType = "following()";
4303 } else if (precedingBreaks[i] != expectedBreaks[i]) {
4304 errorType = "preceding()";
4305 }
4306
4307
4308 if (errorType != NULL) {
4309 // Format a range of the test text that includes the failure as
4310 // a data item that can be included in the rbbi test data file.
4311
4312 // Start of the range is the last point where expected and actual results
4313 // both agreed that there was a break position.
4314 int startContext = i;
4315 int32_t count = 0;
4316 for (;;) {
4317 if (startContext==0) { break; }
4318 startContext --;
4319 if (expectedBreaks[startContext] != 0) {
4320 if (count == 2) break;
4321 count ++;
4322 }
4323 }
4324
4325 // End of range is two expected breaks past the start position.
4326 int endContext = i + 1;
4327 int ci;
4328 for (ci=0; ci<2; ci++) { // Number of items to include in error text.
4329 for (;;) {
4330 if (endContext >= testText.length()) {break;}
4331 if (expectedBreaks[endContext-1] != 0) {
4332 if (count == 0) break;
4333 count --;
4334 }
4335 endContext ++;
4336 }
4337 }
4338
4339 // Format looks like "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4340 UnicodeString errorText = "<data>";
4341 /***if (strcmp(errorType, "next()") == 0) {
4342 startContext = 0;
4343 endContext = testText.length();
4344
4345 printStringBreaks(testText, expected, expectedCount);
4346 }***/
4347
4348 for (ci=startContext; ci<endContext;) {
4349 UnicodeString hexChars("0123456789abcdef");
4350 UChar32 c;
4351 int bn;
4352 c = testText.char32At(ci);
4353 if (ci == i) {
4354 // This is the location of the error.
4355 errorText.append("<?>");
4356 } else if (expectedBreaks[ci] != 0) {
4357 // This a non-error expected break position.
4358 errorText.append("\\");
4359 }
4360 if (c < 0x10000) {
4361 errorText.append("\\u");
4362 for (bn=12; bn>=0; bn-=4) {
4363 errorText.append(hexChars.charAt((c>>bn)&0xf));
4364 }
4365 } else {
4366 errorText.append("\\U");
4367 for (bn=28; bn>=0; bn-=4) {
4368 errorText.append(hexChars.charAt((c>>bn)&0xf));
4369 }
4370 }
4371 ci = testText.moveIndex32(ci, 1);
4372 }
4373 errorText.append("\\");
4374 errorText.append("</data>\n");
4375
4376 // Output the error
4377 char charErrorTxt[500];
4378 UErrorCode status = U_ZERO_ERROR;
4379 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4380 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4381 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4382
4383 errln("%s break monkey test error [%s]. %s. Operation = %s; Random seed = %d; buf Idx = %d\n%s",
4384 name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4385 errorType, seed, i, charErrorTxt);
4386 break;
4387 }
4388 }
4389
4390 loopCount++;
4391 }
4392 #endif
4393 }
4394
4395
4396 // Bug 5532. UTF-8 based UText fails in dictionary code.
4397 // This test checks the initial patch,
4398 // which is to just keep it from crashing. Correct word boundaries
4399 // await a proper fix to the dictionary code.
4400 //
TestBug5532(void)4401 void RBBITest::TestBug5532(void) {
4402 // Text includes a mixture of Thai and Latin.
4403 const unsigned char utf8Data[] = {
4404 0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4405 0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4406 0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4407 0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4408 0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4409 0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4410 0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4411 0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4412 0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4413 0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4414 0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4415
4416 UErrorCode status = U_ZERO_ERROR;
4417 UText utext=UTEXT_INITIALIZER;
4418 utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4419 TEST_ASSERT_SUCCESS(status);
4420
4421 BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4422 TEST_ASSERT_SUCCESS(status);
4423 if (U_SUCCESS(status)) {
4424 bi->setText(&utext, status);
4425 TEST_ASSERT_SUCCESS(status);
4426
4427 int32_t breakCount = 0;
4428 int32_t previousBreak = -1;
4429 for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4430 // For now, just make sure that the break iterator doesn't hang.
4431 TEST_ASSERT(previousBreak < bi->current());
4432 previousBreak = bi->current();
4433 }
4434 TEST_ASSERT(breakCount > 0);
4435 }
4436 delete bi;
4437 utext_close(&utext);
4438 }
4439
4440
TestBug9983(void)4441 void RBBITest::TestBug9983(void) {
4442 UnicodeString text = UnicodeString("\\u002A" // * Other
4443 "\\uFF65" // Other
4444 "\\u309C" // Katakana
4445 "\\uFF9F" // Extend
4446 "\\uFF65" // Other
4447 "\\u0020" // Other
4448 "\\u0000").unescape();
4449
4450 UErrorCode status = U_ZERO_ERROR;
4451 LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4452 BreakIterator::createWordInstance(Locale::getRoot(), status)));
4453 TEST_ASSERT_SUCCESS(status);
4454 LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4455 BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4456 TEST_ASSERT_SUCCESS(status);
4457 if (U_FAILURE(status)) {
4458 return;
4459 }
4460 int32_t offset, rstatus, iterationCount;
4461
4462 brkiter->setText(text);
4463 brkiter->last();
4464 iterationCount = 0;
4465 while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4466 iterationCount++;
4467 rstatus = brkiter->getRuleStatus();
4468 (void)rstatus; // Suppress set but not used warning.
4469 if (iterationCount >= 10) {
4470 break;
4471 }
4472 }
4473 TEST_ASSERT(iterationCount == 6);
4474
4475 brkiterPOSIX->setText(text);
4476 brkiterPOSIX->last();
4477 iterationCount = 0;
4478 while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4479 iterationCount++;
4480 rstatus = brkiterPOSIX->getRuleStatus();
4481 (void)rstatus; // Suppress set but not used warning.
4482 if (iterationCount >= 10) {
4483 break;
4484 }
4485 }
4486 TEST_ASSERT(iterationCount == 6);
4487 }
4488
4489
4490 //
4491 // TestDebug - A place-holder test for debugging purposes.
4492 // For putting in fragments of other tests that can be invoked
4493 // for tracing without a lot of unwanted extra stuff happening.
4494 //
TestDebug(void)4495 void RBBITest::TestDebug(void) {
4496 #if 0
4497 UErrorCode status = U_ZERO_ERROR;
4498 int pos = 0;
4499 int ruleStatus = 0;
4500
4501 RuleBasedBreakIterator* bi =
4502 // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4503 // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4504 (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4505 UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4506 // UnicodeString s("Aaa. Bcd");
4507 s = s.unescape();
4508 bi->setText(s);
4509 UBool r = bi->isBoundary(8);
4510 printf("%s", r?"true":"false");
4511 return;
4512 pos = bi->last();
4513 do {
4514 // ruleStatus = bi->getRuleStatus();
4515 printf("%d\t%d\n", pos, ruleStatus);
4516 pos = bi->previous();
4517 } while (pos != BreakIterator::DONE);
4518 #endif
4519 }
4520
TestProperties()4521 void RBBITest::TestProperties() {
4522 UErrorCode errorCode = U_ZERO_ERROR;
4523 UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4524 if (!prependSet.isEmpty()) {
4525 errln(
4526 "[:GCB=Prepend:] is not empty any more. "
4527 "Uncomment relevant lines in source/data/brkitr/char.txt and "
4528 "change this test to the opposite condition.");
4529 }
4530 }
4531
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4533