1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 1999-2015, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 /************************************************************************
7 *   Date        Name        Description
8 *   12/15/99    Madhu        Creation.
9 *   01/12/2000  Madhu        Updated for changed API and added new tests
10 ************************************************************************/
11 
12 #include "utypeinfo.h"  // for 'typeid' to work
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/utypes.h"
19 #include "unicode/brkiter.h"
20 #include "unicode/rbbi.h"
21 #include "unicode/uchar.h"
22 #include "unicode/utf16.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/schriter.h"
25 #include "unicode/uniset.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 #include "unicode/regex.h"
28 #endif
29 #include "unicode/ustring.h"
30 #include "unicode/utext.h"
31 #include "intltest.h"
32 #include "rbbitst.h"
33 #include <string.h>
34 #include "charstr.h"
35 #include "uvector.h"
36 #include "uvectr32.h"
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include "unicode/numfmt.h"
40 #include "unicode/uscript.h"
41 #include "cmemory.h"
42 
43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
44 #include "unicode/filteredbrk.h"
45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
46 
// TEST_ASSERT(x): when the condition x is false, report a failure through errln(),
//                 tagged with the source file and line of the macro use.
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, report it through
//                 errcheckln(), tagged with the source file, line, and the error's name.
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
52 
53 
54 //---------------------------------------------
55 // runIndexedTest
56 //---------------------------------------------
57 
58 
59 //  Note:  Before adding new tests to this file, check whether the desired test data can
60 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
61 //         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
63 //         will run there as well, without additional effort.
64 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
66 {
67     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
68 
69     switch (index) {
70 #if !UCONFIG_NO_FILE_IO
71         case 0: name = "TestBug4153072";
72             if(exec) TestBug4153072();                         break;
73 #else
74         case 0: name = "skip";
75             break;
76 #endif
77 
78         case 1: name = "skip";
79             break;
80         case 2: name = "TestStatusReturn";
81             if(exec) TestStatusReturn();                       break;
82 
83 #if !UCONFIG_NO_FILE_IO
84         case 3: name = "TestUnicodeFiles";
85             if(exec) TestUnicodeFiles();                       break;
86         case 4: name = "TestEmptyString";
87             if(exec) TestEmptyString();                        break;
88 #else
89         case 3: case 4: name = "skip";
90             break;
91 #endif
92 
93         case 5: name = "TestGetAvailableLocales";
94             if(exec) TestGetAvailableLocales();                break;
95 
96         case 6: name = "TestGetDisplayName";
97             if(exec) TestGetDisplayName();                     break;
98 
99 #if !UCONFIG_NO_FILE_IO
100         case 7: name = "TestEndBehaviour";
101             if(exec) TestEndBehaviour();                       break;
102         case 8: case 9: case 10: name = "skip";
103              break;
104         case 11: name = "TestWordBreaks";
105              if(exec) TestWordBreaks();                        break;
106         case 12: name = "TestWordBoundary";
107              if(exec) TestWordBoundary();                      break;
108         case 13: name = "TestLineBreaks";
109              if(exec) TestLineBreaks();                        break;
110         case 14: name = "TestSentBreaks";
111              if(exec) TestSentBreaks();                        break;
112         case 15: name = "TestExtended";
113              if(exec) TestExtended();                          break;
114 #else
115         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
116              break;
117 #endif
118 
119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
120         case 16:
121             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
122 #else
123         case 16:
124              name = "skip";                                    break;
125 #endif
126 
127 #if !UCONFIG_NO_FILE_IO
128         case 17: name = "TestBug3818";
129             if(exec) TestBug3818();                            break;
130 #else
131         case 17: name = "skip";
132             break;
133 #endif
134 
135         case 18: name = "skip";
136             break;
137         case 19: name = "TestDebug";
138             if(exec) TestDebug();                              break;
139         case 20: name = "skip";
140             break;
141 
142 #if !UCONFIG_NO_FILE_IO
143         case 21: name = "TestBug5775";
144             if (exec) TestBug5775();                           break;
145 #else
146         case 21: name = "skip";
147             break;
148 #endif
149 
150         case 22: name = "TestBug9983";
151             if (exec) TestBug9983();                           break;
152         case 23: name = "TestDictRules";
153             if (exec) TestDictRules();                         break;
154         case 24: name = "TestBug5532";
155             if (exec) TestBug5532();                           break;
156         default: name = ""; break; //needed to end loop
157     }
158 }
159 
160 
161 //---------------------------------------------------------------------------
162 //
163 //   class BITestData   Holds a set of Break iterator test data and results
164 //                      Includes
165 //                         - the string data to be broken
166 //                         - a vector of the expected break positions.
167 //                         - a vector of source line numbers for the data,
168 //                               (to help see where errors occured.)
169 //                         - The expected break tag values.
170 //                         - Vectors of actual break positions and tag values.
171 //                         - Functions for comparing actual with expected and
172 //                            reporting errors.
173 //
174 //----------------------------------------------------------------------------
175 class BITestData {
176 public:
177     UnicodeString    fDataToBreak;
178     UVector          fExpectedBreakPositions;
179     UVector          fExpectedTags;
180     UVector          fLineNum;
181     UVector          fActualBreakPositions;   // Test Results.
182     UVector          fActualTags;
183 
184     BITestData(UErrorCode &status);
185     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
186     void             checkResults(const char *heading, RBBITest *test);
187     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
188     void             clearResults();
189 };
190 
191 //
192 // Constructor.
193 //
BITestData(UErrorCode & status)194 BITestData::BITestData(UErrorCode &status)
195 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
196   fActualTags(status)
197 {
198 }
199 
200 //
201 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
202 //                 The macro form collects the line number, which is helpful
203 //                 when tracking down failures.
204 //
205 //                 A null data item is inserted at the start of each test's data
206 //                  to put the starting zero into the data list.  The position saved for
207 //                  each non-null item is its ending position.
208 //
209 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
211     if (U_FAILURE(status)) {return;}
212     if (data != NULL) {
213         fDataToBreak.append(CharsToUnicodeString(data));
214     }
215     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
216     fExpectedTags.addElement(tag, status);
217     fLineNum.addElement(lineNum, status);
218 }
219 
220 
221 //
222 //  checkResults.   Compare the actual and expected break positions, report any differences.
223 //
checkResults(const char * heading,RBBITest * test)224 void BITestData::checkResults(const char *heading, RBBITest *test) {
225     int32_t   expectedIndex = 0;
226     int32_t   actualIndex = 0;
227 
228     for (;;) {
229         // If we've run through both the expected and actual results vectors, we're done.
230         //   break out of the loop.
231         if (expectedIndex >= fExpectedBreakPositions.size() &&
232             actualIndex   >= fActualBreakPositions.size()) {
233             break;
234         }
235 
236 
237         if (expectedIndex >= fExpectedBreakPositions.size()) {
238             err(heading, test, expectedIndex-1, actualIndex);
239             actualIndex++;
240             continue;
241         }
242 
243         if (actualIndex >= fActualBreakPositions.size()) {
244             err(heading, test, expectedIndex, actualIndex-1);
245             expectedIndex++;
246             continue;
247         }
248 
249         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
250             err(heading, test, expectedIndex, actualIndex);
251             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
252             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
253                 actualIndex++;
254             } else {
255                 expectedIndex++;
256             }
257             continue;
258         }
259 
260         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
261             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
262                 heading, fLineNum.elementAt(expectedIndex),
263                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
264         }
265 
266         actualIndex++;
267         expectedIndex++;
268     }
269 }
270 
271 //
272 //  err   -  An error was found.  Report it, along with information about where the
273 //                                incorrectly broken test data appeared in the source file.
274 //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)275 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
276 {
277     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
278     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
279     int32_t   o        = 0;
280     int32_t   line     = fLineNum.elementAti(expectedIdx);
281     if (expectedIdx > 0) {
282         // The line numbers are off by one because a premature break occurs somewhere
283         //    within the previous item, rather than at the start of the current (expected) item.
284         //    We want to report the offset of the unexpected break from the start of
285         //      this previous item.
286         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
287     }
288     if (actual < expected) {
289         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
290     } else {
291         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
292     }
293 }
294 
295 
clearResults()296 void BITestData::clearResults() {
297     fActualBreakPositions.removeAllElements();
298     fActualTags.removeAllElements();
299 }
300 
301 
302 //--------------------------------------------------------------------------------------
303 //
304 //    RBBITest    constructor and destructor
305 //
306 //--------------------------------------------------------------------------------------
307 
RBBITest()308 RBBITest::RBBITest() {
309 }
310 
311 
~RBBITest()312 RBBITest::~RBBITest() {
313 }
314 
315 //-----------------------------------------------------------------------------------
316 //
317 //   Test for status {tag} return value from break rules.
318 //        TODO:  a more thorough test.
319 //
320 //-----------------------------------------------------------------------------------
TestStatusReturn()321 void RBBITest::TestStatusReturn() {
322      UnicodeString rulesString1("$Letters = [:L:];\n"
323                                   "$Numbers = [:N:];\n"
324                                   "$Letters+{1};\n"
325                                   "$Numbers+{2};\n"
326                                   "Help\\ {4}/me\\!;\n"
327                                   "[^$Letters $Numbers];\n"
328                                   "!.*;\n", -1, US_INV);
329      UnicodeString testString1  = "abc123..abc Help me Help me!";
330                                 // 01234567890123456789012345678
331      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
332      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
333 
334      UErrorCode status=U_ZERO_ERROR;
335      UParseError    parseError;
336 
337      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
338      if(U_FAILURE(status)) {
339          dataerrln("FAIL : in construction - %s", u_errorName(status));
340      } else {
341          int32_t  pos;
342          int32_t  i = 0;
343          bi->setText(testString1);
344          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
345              if (pos != bounds1[i]) {
346                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
347                  break;
348              }
349 
350              int tag = bi->getRuleStatus();
351              if (tag != brkStatus[i]) {
352                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
353                  break;
354              }
355              i++;
356          }
357      }
358      delete bi;
359 }
360 
361 
printStringBreaks(UText * tstr,int expected[],int expectedCount)362 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
363     UErrorCode status = U_ZERO_ERROR;
364     char name[100];
365     printf("code    alpha extend alphanum type word sent line name\n");
366     int nextExpectedIndex = 0;
367     utext_setNativeIndex(tstr, 0);
368     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
369         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
370             printf("------------------------------------------------ %d\n", j);
371             ++nextExpectedIndex;
372         }
373 
374         UChar32 c = utext_next32(tstr);
375         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
376         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
377                            u_isUAlphabetic(c),
378                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
379                            u_isalnum(c),
380                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
381                                                   u_charType(c),
382                                                   U_SHORT_PROPERTY_NAME),
383                            u_getPropertyValueName(UCHAR_WORD_BREAK,
384                                                   u_getIntPropertyValue(c,
385                                                           UCHAR_WORD_BREAK),
386                                                   U_SHORT_PROPERTY_NAME),
387                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
388                                    u_getIntPropertyValue(c,
389                                            UCHAR_SENTENCE_BREAK),
390                                    U_SHORT_PROPERTY_NAME),
391                            u_getPropertyValueName(UCHAR_LINE_BREAK,
392                                    u_getIntPropertyValue(c,
393                                            UCHAR_LINE_BREAK),
394                                    U_SHORT_PROPERTY_NAME),
395                            name);
396     }
397 }
398 
399 
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)400 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
401    UErrorCode status = U_ZERO_ERROR;
402    UText *tstr = NULL;
403    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
404    if (U_FAILURE(status)) {
405        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
406        return;
407     }
408    printStringBreaks(tstr, expected, expectedCount);
409    utext_close(tstr);
410 }
411 
412 
TestBug3818()413 void RBBITest::TestBug3818() {
414     UErrorCode  status = U_ZERO_ERROR;
415 
416     // Four Thai words...
417     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
418                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
419     UnicodeString  thaiStr(thaiWordData);
420 
421     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
422     if (U_FAILURE(status) || bi == NULL) {
423         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
424         return;
425     }
426     bi->setText(thaiStr);
427 
428     int32_t  startOfSecondWord = bi->following(1);
429     if (startOfSecondWord != 4) {
430         errln("Fail at file %s, line %d expected start of word at 4, got %d",
431             __FILE__, __LINE__, startOfSecondWord);
432     }
433     startOfSecondWord = bi->following(0);
434     if (startOfSecondWord != 4) {
435         errln("Fail at file %s, line %d expected start of word at 4, got %d",
436             __FILE__, __LINE__, startOfSecondWord);
437     }
438     delete bi;
439 }
440 
441 //----------------------------------------------------------------------------
442 //
443 // generalIteratorTest      Given a break iterator and a set of test data,
444 //                          Run the tests and report the results.
445 //
446 //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)447 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
448 {
449 
450     bi.setText(td.fDataToBreak);
451 
452     testFirstAndNext(bi, td);
453 
454     testLastAndPrevious(bi, td);
455 
456     testFollowing(bi, td);
457     testPreceding(bi, td);
458     testIsBoundary(bi, td);
459     doMultipleSelectionTest(bi, td);
460 }
461 
462 
463 //
464 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
465 //                       kind of loop.
466 //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)467 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
468 {
469     UErrorCode  status = U_ZERO_ERROR;
470     int32_t     p;
471     int32_t     lastP = -1;
472     int32_t     tag;
473 
474     logln("Test first and next");
475     bi.setText(td.fDataToBreak);
476     td.clearResults();
477 
478     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
479         td.fActualBreakPositions.addElement(p, status);  // Save result.
480         tag = bi.getRuleStatus();
481         td.fActualTags.addElement(tag, status);
482         if (p <= lastP) {
483             // If the iterator is not making forward progress, stop.
484             //  No need to raise an error here, it'll be detected in the normal check of results.
485             break;
486         }
487         lastP = p;
488     }
489     td.checkResults("testFirstAndNext", this);
490 }
491 
492 
493 //
494 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
495 //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)496 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
497 {
498     UErrorCode  status = U_ZERO_ERROR;
499     int32_t     p;
500     int32_t     lastP  = 0x7ffffffe;
501     int32_t     tag;
502 
503     logln("Test last and previous");
504     bi.setText(td.fDataToBreak);
505     td.clearResults();
506 
507     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
508         // Save break position.  Insert it at start of vector of results, shoving
509         //    already-saved results further towards the end.
510         td.fActualBreakPositions.insertElementAt(p, 0, status);
511         // bi.previous();   // TODO:  Why does this fix things up????
512         // bi.next();
513         tag = bi.getRuleStatus();
514         td.fActualTags.insertElementAt(tag, 0, status);
515         if (p >= lastP) {
516             // If the iterator is not making progress, stop.
517             //  No need to raise an error here, it'll be detected in the normal check of results.
518             break;
519         }
520         lastP = p;
521     }
522     td.checkResults("testLastAndPrevious", this);
523 }
524 
525 
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)526 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
527 {
528     UErrorCode  status = U_ZERO_ERROR;
529     int32_t     p;
530     int32_t     tag;
531     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
532                                  //   cannot be -1; that is returned for DONE.
533     int         i;
534 
535     logln("testFollowing():");
536     bi.setText(td.fDataToBreak);
537     td.clearResults();
538 
539     // Save the starting point, since we won't get that out of following.
540     p = bi.first();
541     td.fActualBreakPositions.addElement(p, status);  // Save result.
542     tag = bi.getRuleStatus();
543     td.fActualTags.addElement(tag, status);
544 
545     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
546         p = bi.following(i);
547         if (p != lastP) {
548             if (p == RuleBasedBreakIterator::DONE) {
549                 break;
550             }
551             // We've reached a new break position.  Save it.
552             td.fActualBreakPositions.addElement(p, status);  // Save result.
553             tag = bi.getRuleStatus();
554             td.fActualTags.addElement(tag, status);
555             lastP = p;
556         }
557     }
558     // The loop normally exits by means of the break in the middle.
559     // Make sure that the index was at the correct position for the break iterator to have
560     //   returned DONE.
561     if (i != td.fDataToBreak.length()) {
562         errln("testFollowing():  iterator returned DONE prematurely.");
563     }
564 
565     // Full check of all results.
566     td.checkResults("testFollowing", this);
567 }
568 
569 
570 
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)571 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
572     UErrorCode  status = U_ZERO_ERROR;
573     int32_t     p;
574     int32_t     tag;
575     int32_t     lastP  = 0x7ffffffe;
576     int         i;
577 
578     logln("testPreceding():");
579     bi.setText(td.fDataToBreak);
580     td.clearResults();
581 
582     p = bi.last();
583     td.fActualBreakPositions.addElement(p, status);
584     tag = bi.getRuleStatus();
585     td.fActualTags.addElement(tag, status);
586 
587     for (i = td.fDataToBreak.length(); i>=-1; i--) {
588         p = bi.preceding(i);
589         if (p != lastP) {
590             if (p == RuleBasedBreakIterator::DONE) {
591                 break;
592             }
593             // We've reached a new break position.  Save it.
594             td.fActualBreakPositions.insertElementAt(p, 0, status);
595             lastP = p;
596             tag = bi.getRuleStatus();
597             td.fActualTags.insertElementAt(tag, 0, status);
598         }
599     }
600     // The loop normally exits by means of the break in the middle.
601     // Make sure that the index was at the correct position for the break iterator to have
602     //   returned DONE.
603     if (i != 0) {
604         errln("testPreceding():  iterator returned DONE prematurely.");
605     }
606 
607     // Full check of all results.
608     td.checkResults("testPreceding", this);
609 }
610 
611 
612 
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)613 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
614     UErrorCode  status = U_ZERO_ERROR;
615     int         i;
616     int32_t     tag;
617 
618     logln("testIsBoundary():");
619     bi.setText(td.fDataToBreak);
620     td.clearResults();
621 
622     for (i = 0; i <= td.fDataToBreak.length(); i++) {
623         if (bi.isBoundary(i)) {
624             td.fActualBreakPositions.addElement(i, status);  // Save result.
625             tag = bi.getRuleStatus();
626             td.fActualTags.addElement(tag, status);
627         }
628     }
629     td.checkResults("testIsBoundary: ", this);
630 }
631 
632 
633 
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)634 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
635 {
636     iterator.setText(td.fDataToBreak);
637 
638     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
639     int32_t offset = iterator.first();
640     int32_t testOffset;
641     int32_t count = 0;
642 
643     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
644 
645     if (*testIterator != iterator)
646         errln("clone() or operator!= failed: two clones compared unequal");
647 
648     do {
649         testOffset = testIterator->first();
650         testOffset = testIterator->next(count);
651         if (offset != testOffset)
652             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
653 
654         if (offset != RuleBasedBreakIterator::DONE) {
655             count++;
656             offset = iterator.next();
657 
658             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
659                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
660                 if (count > 10000 || offset == -1) {
661                     errln("operator== failed too many times. Stopping test.");
662                     if (offset == -1) {
663                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
664                     }
665                     return;
666                 }
667             }
668         }
669     } while (offset != RuleBasedBreakIterator::DONE);
670 
671     // now do it backwards...
672     offset = iterator.last();
673     count = 0;
674 
675     do {
676         testOffset = testIterator->last();
677         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
678         if (offset != testOffset)
679             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
680 
681         if (offset != RuleBasedBreakIterator::DONE) {
682             count--;
683             offset = iterator.previous();
684         }
685     } while (offset != RuleBasedBreakIterator::DONE);
686 
687     delete testIterator;
688 }
689 
690 
691 //---------------------------------------------
692 //
693 //     other tests
694 //
695 //---------------------------------------------
TestEmptyString()696 void RBBITest::TestEmptyString()
697 {
698     UnicodeString text = "";
699     UErrorCode status = U_ZERO_ERROR;
700 
701     BITestData x(status);
702     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
703     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
704     if (U_FAILURE(status))
705     {
706         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
707         return;
708     }
709     generalIteratorTest(*bi, x);
710     delete bi;
711 }
712 
TestGetAvailableLocales()713 void RBBITest::TestGetAvailableLocales()
714 {
715     int32_t locCount = 0;
716     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
717 
718     if (locCount == 0)
719         dataerrln("getAvailableLocales() returned an empty list!");
720     // Just make sure that it's returning good memory.
721     int32_t i;
722     for (i = 0; i < locCount; ++i) {
723         logln(locList[i].getName());
724     }
725 }
726 
727 //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()728 void RBBITest::TestGetDisplayName()
729 {
730     UnicodeString   result;
731 
732     BreakIterator::getDisplayName(Locale::getUS(), result);
733     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
734         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
735                 + result);
736 
737     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
738     if (result != "French (France)")
739         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
740                 + result);
741 }
742 /**
743  * Test End Behaviour
744  * @bug 4068137
745  */
TestEndBehaviour()746 void RBBITest::TestEndBehaviour()
747 {
748     UErrorCode status = U_ZERO_ERROR;
749     UnicodeString testString("boo.");
750     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
751     if (U_FAILURE(status))
752     {
753         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
754         return;
755     }
756     wb->setText(testString);
757 
758     if (wb->first() != 0)
759         errln("Didn't get break at beginning of string.");
760     if (wb->next() != 3)
761         errln("Didn't get break before period in \"boo.\"");
762     if (wb->current() != 4 && wb->next() != 4)
763         errln("Didn't get break at end of string.");
764     delete wb;
765 }
766 /*
767  * @bug 4153072
768  */
TestBug4153072()769 void RBBITest::TestBug4153072() {
770     UErrorCode status = U_ZERO_ERROR;
771     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
772     if (U_FAILURE(status))
773     {
774         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
775         return;
776     }
777     UnicodeString str("...Hello, World!...");
778     int32_t begin = 3;
779     int32_t end = str.length() - 3;
780     UBool onBoundary;
781 
782     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
783     iter->adoptText(textIterator);
784     int index;
785     // Note: with the switch to UText, there is no way to restrict the
786     //       iteration range to begin at an index other than zero.
787     //       String character iterators created with a non-zero bound are
788     //         treated by RBBI as being empty.
789     for (index = -1; index < begin + 1; ++index) {
790         onBoundary = iter->isBoundary(index);
791         if (index == 0?  !onBoundary : onBoundary) {
792             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
793                             " and begin index = " + begin);
794         }
795     }
796     delete iter;
797 }
798 
799 
800 //
801 // Test for problem reported by Ashok Matoria on 9 July 2007
802 //    One.<kSoftHyphen><kSpace>Two.
803 //
804 //    Sentence break at start (0) and then on calling next() it breaks at
805 //   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
807 //
TestBug5775()808 void RBBITest::TestBug5775() {
809     UErrorCode status = U_ZERO_ERROR;
810     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
811     TEST_ASSERT_SUCCESS(status);
812     if (U_FAILURE(status)) {
813         return;
814     }
815 // Check for status first for better handling of no data errors.
816     TEST_ASSERT(bi != NULL);
817     if (bi == NULL) {
818         return;
819     }
820 
821     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
822     //               01234      56789
823     s = s.unescape();
824     bi->setText(s);
825     int pos = bi->next();
826     TEST_ASSERT(pos == 6);
827     pos = bi->next();
828     TEST_ASSERT(pos == 10);
829     pos = bi->previous();
830     TEST_ASSERT(pos == 6);
831     delete bi;
832 }
833 
834 
835 
836 //------------------------------------------------------------------------------
837 //
838 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
839 //
840 //------------------------------------------------------------------------------
841 
842 struct TestParams {
843     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
844                                            //   Changed out whenever test data changes break type.
845 
846     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
847     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
848     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
849     UVector32       *srcCol;
850 
851     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
852     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
853     CharString       utf8String;           // UTF-8 form of text to break.
854 
TestParamsTestParams855     TestParams(UErrorCode &status) : dataToBreak() {
856         bi               = NULL;
857         expectedBreaks   = new UVector32(status);
858         srcLine          = new UVector32(status);
859         srcCol           = new UVector32(status);
860         textToBreak      = NULL;
861         textMap          = new UVector32(status);
862     }
863 
~TestParamsTestParams864     ~TestParams() {
865         delete bi;
866         delete expectedBreaks;
867         delete srcLine;
868         delete srcCol;
869         utext_close(textToBreak);
870         delete textMap;
871     }
872 
873     int32_t getSrcLine(int32_t bp);
874     int32_t getExpectedBreak(int32_t bp);
875     int32_t getSrcCol(int32_t bp);
876 
877     void setUTF16(UErrorCode &status);
878     void setUTF8(UErrorCode &status);
879 };
880 
881 // Append a UnicodeString to a CharString with UTF-8 encoding.
882 // Substitute any invalid chars.
883 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)884 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
885     if (U_FAILURE(status)) {
886         return;
887     }
888     int32_t utf8Length;
889     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
890                        src.getBuffer(), src.length(),   // UTF-16 data
891                        0xfffd, NULL,                    // Substitution char, number of subs.
892                        &status);
893     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
894         return;
895     }
896     status = U_ZERO_ERROR;
897     int32_t capacity;
898     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
899     u_strToUTF8WithSub(buffer, utf8Length, NULL,
900                        src.getBuffer(), src.length(),
901                        0xfffd, NULL, &status);
902     dest.append(buffer, utf8Length, status);
903 }
904 
905 
setUTF16(UErrorCode & status)906 void TestParams::setUTF16(UErrorCode &status) {
907     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
908     textMap->removeAllElements();
909     for (int32_t i=0; i<dataToBreak.length(); i++) {
910         if (i == dataToBreak.getChar32Start(i)) {
911             textMap->addElement(i, status);
912         } else {
913             textMap->addElement(-1, status);
914         }
915     }
916     textMap->addElement(dataToBreak.length(), status);
917     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
918 }
919 
920 
setUTF8(UErrorCode & status)921 void TestParams::setUTF8(UErrorCode &status) {
922     if (U_FAILURE(status)) {
923         return;
924     }
925     utf8String.clear();
926     CharStringAppend(utf8String, dataToBreak, status);
927     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
928     if (U_FAILURE(status)) {
929         return;
930     }
931 
932     textMap->removeAllElements();
933     int32_t utf16Index = 0;
934     for (;;) {
935         textMap->addElement(utf16Index, status);
936         UChar32 c32 = utext_current32(textToBreak);
937         if (c32 < 0) {
938             break;
939         }
940         utf16Index += U16_LENGTH(c32);
941         utext_next32(textToBreak);
942         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
943             textMap->addElement(-1, status);
944         }
945     }
946     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
947 }
948 
949 
getSrcLine(int bp)950 int32_t TestParams::getSrcLine(int bp) {
951     if (bp >= textMap->size()) {
952         bp = textMap->size() - 1;
953     }
954     int32_t i = 0;
955     for(; bp >= 0 ; --bp) {
956         // Move to a character boundary if we are not on one already.
957         i = textMap->elementAti(bp);
958         if (i >= 0) {
959             break;
960         }
961     }
962     return srcLine->elementAti(i);
963 }
964 
965 
getExpectedBreak(int bp)966 int32_t TestParams::getExpectedBreak(int bp) {
967     if (bp >= textMap->size()) {
968         return 0;
969     }
970     int32_t i = textMap->elementAti(bp);
971     int32_t retVal = 0;
972     if (i >= 0) {
973         retVal = expectedBreaks->elementAti(i);
974     }
975     return retVal;
976 }
977 
978 
getSrcCol(int bp)979 int32_t TestParams::getSrcCol(int bp) {
980     if (bp >= textMap->size()) {
981         bp = textMap->size() - 1;
982     }
983     int32_t i = 0;
984     for(; bp >= 0; --bp) {
985         // Move bp to a character boundary if we are not on one already.
986         i = textMap->elementAti(bp);
987         if (i >= 0) {
988             break;
989         }
990     }
991     return srcCol->elementAti(i);
992 }
993 
994 
executeTest(TestParams * t,UErrorCode & status)995 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
996     int32_t    bp;
997     int32_t    prevBP;
998     int32_t    i;
999 
1000     TEST_ASSERT_SUCCESS(status);
1001     if (U_FAILURE(status)) {
1002         return;
1003     }
1004 
1005     if (t->bi == NULL) {
1006         return;
1007     }
1008 
1009     t->bi->setText(t->textToBreak, status);
1010     //
1011     //  Run the iterator forward
1012     //
1013     prevBP = -1;
1014     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1015         if (prevBP ==  bp) {
1016             // Fail for lack of forward progress.
1017             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1018                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1019             break;
1020         }
1021 
1022         // Check that there we didn't miss an expected break between the last one
1023         //  and this one.
1024         for (i=prevBP+1; i<bp; i++) {
1025             if (t->getExpectedBreak(i) != 0) {
1026                 int expected[] = {0, i};
1027                 printStringBreaks(t->dataToBreak, expected, 2);
1028                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1029                       i, t->getSrcLine(i), t->getSrcCol(i));
1030             }
1031         }
1032 
1033         // Check that the break we did find was expected
1034         if (t->getExpectedBreak(bp) == 0) {
1035             int expected[] = {0, bp};
1036             printStringBreaks(t->textToBreak, expected, 2);
1037             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1038                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1039         } else {
1040             // The break was expected.
1041             //   Check that the {nnn} tag value is correct.
1042             int32_t expectedTagVal = t->getExpectedBreak(bp);
1043             if (expectedTagVal == -1) {
1044                 expectedTagVal = 0;
1045             }
1046             int32_t line = t->getSrcLine(bp);
1047             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1048             if (rs != expectedTagVal) {
1049                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1050                       "          Actual, Expected status = %4d, %4d",
1051                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1052             }
1053         }
1054 
1055         prevBP = bp;
1056     }
1057 
1058     // Verify that there were no missed expected breaks after the last one found
1059     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1060         if (t->getExpectedBreak(i) != 0) {
1061             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1062                       i, t->getSrcLine(i), t->getSrcCol(i));
1063         }
1064     }
1065 
1066     //
1067     //  Run the iterator backwards, verify that the same breaks are found.
1068     //
1069     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1070     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1071         if (prevBP ==  bp) {
1072             // Fail for lack of progress.
1073             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1074                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1075             break;
1076         }
1077 
1078         // Check that we didn't miss an expected break between the last one
1079         //  and this one.  (UVector returns zeros for index out of bounds.)
1080         for (i=prevBP-1; i>bp; i--) {
1081             if (t->getExpectedBreak(i) != 0) {
1082                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1083                       i, t->getSrcLine(i), t->getSrcCol(i));
1084             }
1085         }
1086 
1087         // Check that the break we did find was expected
1088         if (t->getExpectedBreak(bp) == 0) {
1089             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1090                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1091         } else {
1092             // The break was expected.
1093             //   Check that the {nnn} tag value is correct.
1094             int32_t expectedTagVal = t->getExpectedBreak(bp);
1095             if (expectedTagVal == -1) {
1096                 expectedTagVal = 0;
1097             }
1098             int line = t->getSrcLine(bp);
1099             int32_t rs = t->bi->getRuleStatus();
1100             if (rs != expectedTagVal) {
1101                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1102                       "          Actual, Expected status = %4d, %4d",
1103                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1104             }
1105         }
1106 
1107         prevBP = bp;
1108     }
1109 
1110     // Verify that there were no missed breaks prior to the last one found
1111     for (i=prevBP-1; i>=0; i--) {
1112         if (t->getExpectedBreak(i) != 0) {
1113             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1114                       i, t->getSrcLine(i), t->getSrcCol(i));
1115         }
1116     }
1117 
1118     // Check isBoundary()
1119     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1120         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1121         UBool boundaryFound    = t->bi->isBoundary(i);
1122         if (boundaryExpected != boundaryFound) {
1123             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1124                   "        Expected, Actual= %s, %s",
1125                   i, t->getSrcLine(i), t->getSrcCol(i),
1126                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1127         }
1128     }
1129 
1130     // Check following()
1131     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1132         int32_t actualBreak = t->bi->following(i);
1133         int32_t expectedBreak = BreakIterator::DONE;
1134         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1135             if (t->getExpectedBreak(j) != 0) {
1136                 expectedBreak = j;
1137                 break;
1138             }
1139         }
1140         if (expectedBreak != actualBreak) {
1141             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1142                   "        Expected, Actual= %d, %d",
1143                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1144         }
1145     }
1146 
1147     // Check preceding()
1148     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1149         int32_t actualBreak = t->bi->preceding(i);
1150         int32_t expectedBreak = BreakIterator::DONE;
1151 
1152         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1153         // preceding(trailing byte) will return the index of some preceding code point,
1154         // not the lead byte of the current code point, even though that has a smaller index.
1155         // Therefore, start looking at the expected break data not at i-1, but at
1156         // the start of code point index - 1.
1157         utext_setNativeIndex(t->textToBreak, i);
1158         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1159         for (; j >= 0; j--) {
1160             if (t->getExpectedBreak(j) != 0) {
1161                 expectedBreak = j;
1162                 break;
1163             }
1164         }
1165         if (expectedBreak != actualBreak) {
1166             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1167                   "        Expected, Actual= %d, %d",
1168                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1169         }
1170     }
1171 }
1172 
1173 
TestExtended()1174 void RBBITest::TestExtended() {
1175 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1176     UErrorCode      status  = U_ZERO_ERROR;
1177     Locale          locale("");
1178 
1179     UnicodeString       rules;
1180     TestParams          tp(status);
1181 
1182     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1183     if (U_FAILURE(status)) {
1184         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1185     }
1186 
1187 
1188     //
1189     //  Open and read the test data file.
1190     //
1191     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1192     char testFileName[1000];
1193     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1194         errln("Can't open test data.  Path too long.");
1195         return;
1196     }
1197     strcpy(testFileName, testDataDirectory);
1198     strcat(testFileName, "rbbitst.txt");
1199 
1200     int    len;
1201     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1202     if (U_FAILURE(status)) {
1203         return; /* something went wrong, error already output */
1204     }
1205 
1206 
1207     bool skipTest = false; // Skip this test?
1208 
1209     //
1210     //  Put the test data into a UnicodeString
1211     //
1212     UnicodeString testString(FALSE, testFile, len);
1213 
1214     enum EParseState{
1215         PARSE_COMMENT,
1216         PARSE_TAG,
1217         PARSE_DATA,
1218         PARSE_NUM
1219     }
1220     parseState = PARSE_TAG;
1221 
1222     EParseState savedState = PARSE_TAG;
1223 
1224     static const UChar CH_LF        = 0x0a;
1225     static const UChar CH_CR        = 0x0d;
1226     static const UChar CH_HASH      = 0x23;
1227     /*static const UChar CH_PERIOD    = 0x2e;*/
1228     static const UChar CH_LT        = 0x3c;
1229     static const UChar CH_GT        = 0x3e;
1230     static const UChar CH_BACKSLASH = 0x5c;
1231     static const UChar CH_BULLET    = 0x2022;
1232 
1233     int32_t    lineNum  = 1;
1234     int32_t    colStart = 0;
1235     int32_t    column   = 0;
1236     int32_t    charIdx  = 0;
1237 
1238     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1239 
1240     for (charIdx = 0; charIdx < len; ) {
1241         status = U_ZERO_ERROR;
1242         UChar  c = testString.charAt(charIdx);
1243         charIdx++;
1244         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1245             // treat CRLF as a unit
1246             c = CH_LF;
1247             charIdx++;
1248         }
1249         if (c == CH_LF || c == CH_CR) {
1250             lineNum++;
1251             colStart = charIdx;
1252         }
1253         column = charIdx - colStart + 1;
1254 
1255         switch (parseState) {
1256         case PARSE_COMMENT:
1257             if (c == 0x0a || c == 0x0d) {
1258                 parseState = savedState;
1259             }
1260             break;
1261 
1262         case PARSE_TAG:
1263             {
1264             if (c == CH_HASH) {
1265                 parseState = PARSE_COMMENT;
1266                 savedState = PARSE_TAG;
1267                 break;
1268             }
1269             if (u_isUWhiteSpace(c)) {
1270                 break;
1271             }
1272             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1273                 delete tp.bi;
1274                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1275                 skipTest = false;
1276                 charIdx += 5;
1277                 break;
1278             }
1279             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1280                 delete tp.bi;
1281                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1282                 skipTest = false;
1283                 charIdx += 5;
1284                 break;
1285             }
1286             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1287                 delete tp.bi;
1288                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1289                 skipTest = false;
1290                 charIdx += 5;
1291                 break;
1292             }
1293             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1294                 delete tp.bi;
1295                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1296                 skipTest = false;
1297                 charIdx += 5;
1298                 break;
1299             }
1300             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1301                 delete tp.bi;
1302                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1303                 charIdx += 6;
1304                 break;
1305             }
1306 
1307             // <locale  loc_name>
1308             localeMatcher.reset(testString);
1309             if (localeMatcher.lookingAt(charIdx-1, status)) {
1310                 UnicodeString localeName = localeMatcher.group(1, status);
1311                 char localeName8[100];
1312                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1313                 locale = Locale::createFromName(localeName8);
1314                 charIdx += localeMatcher.group(0, status).length() - 1;
1315                 TEST_ASSERT_SUCCESS(status);
1316                 break;
1317             }
1318             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1319                 parseState = PARSE_DATA;
1320                 charIdx += 5;
1321                 tp.dataToBreak = "";
1322                 tp.expectedBreaks->removeAllElements();
1323                 tp.srcCol ->removeAllElements();
1324                 tp.srcLine->removeAllElements();
1325                 break;
1326             }
1327 
1328             errln("line %d: Tag expected in test file.", lineNum);
1329             parseState = PARSE_COMMENT;
1330             savedState = PARSE_DATA;
1331             goto end_test; // Stop the test.
1332             }
1333             break;
1334 
1335         case PARSE_DATA:
1336             if (c == CH_BULLET) {
1337                 int32_t  breakIdx = tp.dataToBreak.length();
1338                 tp.expectedBreaks->setSize(breakIdx+1);
1339                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1340                 tp.srcLine->setSize(breakIdx+1);
1341                 tp.srcLine->setElementAt(lineNum, breakIdx);
1342                 tp.srcCol ->setSize(breakIdx+1);
1343                 tp.srcCol ->setElementAt(column, breakIdx);
1344                 break;
1345             }
1346 
1347             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1348                 // Add final entry to mappings from break location to source file position.
1349                 //  Need one extra because last break position returned is after the
1350                 //    last char in the data, not at the last char.
1351                 tp.srcLine->addElement(lineNum, status);
1352                 tp.srcCol ->addElement(column, status);
1353 
1354                 parseState = PARSE_TAG;
1355                 charIdx += 6;
1356 
1357                 if (!skipTest) {
1358                     // RUN THE TEST!
1359                     status = U_ZERO_ERROR;
1360                     tp.setUTF16(status);
1361                     executeTest(&tp, status);
1362                     TEST_ASSERT_SUCCESS(status);
1363 
1364                     // Run again, this time with UTF-8 text wrapped in a UText.
1365                     status = U_ZERO_ERROR;
1366                     tp.setUTF8(status);
1367                     TEST_ASSERT_SUCCESS(status);
1368                     executeTest(&tp, status);
1369                 }
1370                 break;
1371             }
1372 
1373             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1374                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1375                 // Get the code point from the name and insert it into the test data.
1376                 //   (Damn, no API takes names in Unicode  !!!
1377                 //    we've got to take it back to char *)
1378                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1379                 int32_t nameLength = nameEndIdx - (charIdx+2);
1380                 char charNameBuf[200];
1381                 UChar32 theChar = -1;
1382                 if (nameEndIdx != -1) {
1383                     UErrorCode status = U_ZERO_ERROR;
1384                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1385                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1386                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1387                     if (U_FAILURE(status)) {
1388                         theChar = -1;
1389                     }
1390                 }
1391                 if (theChar == -1) {
1392                     errln("Error in named character in test file at line %d, col %d",
1393                         lineNum, column);
1394                 } else {
1395                     // Named code point was recognized.  Insert it
1396                     //   into the test data.
1397                     tp.dataToBreak.append(theChar);
1398                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1399                         tp.srcLine->addElement(lineNum, status);
1400                         tp.srcCol ->addElement(column, status);
1401                     }
1402                 }
1403                 if (nameEndIdx > charIdx) {
1404                     charIdx = nameEndIdx+1;
1405 
1406                 }
1407                 break;
1408             }
1409 
1410 
1411 
1412 
1413             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1414                 charIdx++;
1415                 int32_t  breakIdx = tp.dataToBreak.length();
1416                 tp.expectedBreaks->setSize(breakIdx+1);
1417                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1418                 tp.srcLine->setSize(breakIdx+1);
1419                 tp.srcLine->setElementAt(lineNum, breakIdx);
1420                 tp.srcCol ->setSize(breakIdx+1);
1421                 tp.srcCol ->setElementAt(column, breakIdx);
1422                 break;
1423             }
1424 
1425             if (c == CH_LT) {
1426                 tagValue   = 0;
1427                 parseState = PARSE_NUM;
1428                 break;
1429             }
1430 
1431             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1432                 parseState = PARSE_COMMENT;
1433                 savedState = PARSE_DATA;
1434                 break;
1435             }
1436 
1437             if (c == CH_BACKSLASH) {
1438                 // Check for \ at end of line, a line continuation.
1439                 //     Advance over (discard) the newline
1440                 UChar32 cp = testString.char32At(charIdx);
1441                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1442                     // We have a CR LF
1443                     //  Need an extra increment of the input ptr to move over both of them
1444                     charIdx++;
1445                 }
1446                 if (cp == CH_LF || cp == CH_CR) {
1447                     lineNum++;
1448                     colStart = charIdx;
1449                     charIdx++;
1450                     break;
1451                 }
1452 
1453                 // Let unescape handle the back slash.
1454                 cp = testString.unescapeAt(charIdx);
1455                 if (cp != -1) {
1456                     // Escape sequence was recognized.  Insert the char
1457                     //   into the test data.
1458                     tp.dataToBreak.append(cp);
1459                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1460                         tp.srcLine->addElement(lineNum, status);
1461                         tp.srcCol ->addElement(column, status);
1462                     }
1463                     break;
1464                 }
1465 
1466 
1467                 // Not a recognized backslash escape sequence.
1468                 // Take the next char as a literal.
1469                 //  TODO:  Should this be an error?
1470                 c = testString.charAt(charIdx);
1471                 charIdx = testString.moveIndex32(charIdx, 1);
1472             }
1473 
1474             // Normal, non-escaped data char.
1475             tp.dataToBreak.append(c);
1476 
1477             // Save the mapping from offset in the data to line/column numbers in
1478             //   the original input file.  Will be used for better error messages only.
1479             //   If there's an expected break before this char, the slot in the mapping
1480             //     vector will already be set for this char; don't overwrite it.
1481             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1482                 tp.srcLine->addElement(lineNum, status);
1483                 tp.srcCol ->addElement(column, status);
1484             }
1485             break;
1486 
1487 
1488         case PARSE_NUM:
1489             // We are parsing an expected numeric tag value, like <1234>,
1490             //   within a chunk of data.
1491             if (u_isUWhiteSpace(c)) {
1492                 break;
1493             }
1494 
1495             if (c == CH_GT) {
1496                 // Finished the number.  Add the info to the expected break data,
1497                 //   and switch parse state back to doing plain data.
1498                 parseState = PARSE_DATA;
1499                 if (tagValue == 0) {
1500                     tagValue = -1;
1501                 }
1502                 int32_t  breakIdx = tp.dataToBreak.length();
1503                 tp.expectedBreaks->setSize(breakIdx+1);
1504                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1505                 tp.srcLine->setSize(breakIdx+1);
1506                 tp.srcLine->setElementAt(lineNum, breakIdx);
1507                 tp.srcCol ->setSize(breakIdx+1);
1508                 tp.srcCol ->setElementAt(column, breakIdx);
1509                 break;
1510             }
1511 
1512             if (u_isdigit(c)) {
1513                 tagValue = tagValue*10 + u_charDigitValue(c);
1514                 break;
1515             }
1516 
1517             errln("Syntax Error in test file at line %d, col %d",
1518                 lineNum, column);
1519             parseState = PARSE_COMMENT;
1520             goto end_test; // Stop the test
1521             break;
1522         }
1523 
1524 
1525         if (U_FAILURE(status)) {
1526             dataerrln("ICU Error %s while parsing test file at line %d.",
1527                 u_errorName(status), lineNum);
1528             status = U_ZERO_ERROR;
1529             goto end_test; // Stop the test
1530         }
1531 
1532     }
1533 
1534 end_test:
1535     delete [] testFile;
1536 #endif
1537 }
1538 
1539 
1540 //-------------------------------------------------------------------------------
1541 //
1542 //  TestDictRules   create a break iterator from source rules that includes a
1543 //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
1545 //                  but the dictionary code, without a type, would loop.
1546 //
1547 //-------------------------------------------------------------------------------
TestDictRules()1548 void RBBITest::TestDictRules() {
1549     const char *rules =  "$dictionary = [a-z]; \n"
1550                          "!!forward; \n"
1551                          "$dictionary $dictionary; \n"
1552                          "!!reverse; \n"
1553                          "$dictionary $dictionary; \n";
1554     const char *text = "aa";
1555     UErrorCode status = U_ZERO_ERROR;
1556     UParseError parseError;
1557 
1558     RuleBasedBreakIterator bi(rules, parseError, status);
1559     if (U_SUCCESS(status)) {
1560         UnicodeString utext = text;
1561         bi.setText(utext);
1562         int32_t position;
1563         int32_t loops;
1564         for (loops = 0; loops<10; loops++) {
1565             position = bi.next();
1566             if (position == RuleBasedBreakIterator::DONE) {
1567                 break;
1568             }
1569         }
1570         TEST_ASSERT(loops == 1);
1571     } else {
1572         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1573     }
1574 }
1575 
1576 
1577 
1578 //-------------------------------------------------------------------------------
1579 //
1580 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1581 //    return the data in one big UChar * buffer, which the caller must delete.
1582 //
1583 //    parameters:
1584 //          fileName:   the name of the file, with no directory part.  The test data directory
1585 //                      is assumed.
1586 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1587 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1588 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1589 //                      Pass NULL for the system default encoding.
1590 //          status
1591 //    returns:
1592 //                      The file data, converted to UChar.
1593 //                      The caller must delete this when done with
1594 //                           delete [] theBuffer;
1595 //
1596 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1597 //           Move this function to some common place.
1598 //
1599 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1600 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1601     UChar       *retPtr  = NULL;
1602     char        *fileBuf = NULL;
1603     UConverter* conv     = NULL;
1604     FILE        *f       = NULL;
1605 
1606     ulen = 0;
1607     if (U_FAILURE(status)) {
1608         return retPtr;
1609     }
1610 
1611     //
1612     //  Open the file.
1613     //
1614     f = fopen(fileName, "rb");
1615     if (f == 0) {
1616         dataerrln("Error opening test data file %s\n", fileName);
1617         status = U_FILE_ACCESS_ERROR;
1618         return NULL;
1619     }
1620     //
1621     //  Read it in
1622     //
1623     int   fileSize;
1624     int   amt_read;
1625 
1626     fseek( f, 0, SEEK_END);
1627     fileSize = ftell(f);
1628     fileBuf = new char[fileSize];
1629     fseek(f, 0, SEEK_SET);
1630     amt_read = fread(fileBuf, 1, fileSize, f);
1631     if (amt_read != fileSize || fileSize <= 0) {
1632         errln("Error reading test data file.");
1633         goto cleanUpAndReturn;
1634     }
1635 
1636     //
1637     // Look for a Unicode Signature (BOM) on the data just read
1638     //
1639     int32_t        signatureLength;
1640     const char *   fileBufC;
1641     const char*    bomEncoding;
1642 
1643     fileBufC = fileBuf;
1644     bomEncoding = ucnv_detectUnicodeSignature(
1645         fileBuf, fileSize, &signatureLength, &status);
1646     if(bomEncoding!=NULL ){
1647         fileBufC  += signatureLength;
1648         fileSize  -= signatureLength;
1649         encoding = bomEncoding;
1650     }
1651 
1652     //
1653     // Open a converter to take the rule file to UTF-16
1654     //
1655     conv = ucnv_open(encoding, &status);
1656     if (U_FAILURE(status)) {
1657         goto cleanUpAndReturn;
1658     }
1659 
1660     //
1661     // Convert the rules to UChar.
1662     //  Preflight first to determine required buffer size.
1663     //
1664     ulen = ucnv_toUChars(conv,
1665         NULL,           //  dest,
1666         0,              //  destCapacity,
1667         fileBufC,
1668         fileSize,
1669         &status);
1670     if (status == U_BUFFER_OVERFLOW_ERROR) {
1671         // Buffer Overflow is expected from the preflight operation.
1672         status = U_ZERO_ERROR;
1673 
1674         retPtr = new UChar[ulen+1];
1675         ucnv_toUChars(conv,
1676             retPtr,       //  dest,
1677             ulen+1,
1678             fileBufC,
1679             fileSize,
1680             &status);
1681     }
1682 
1683 cleanUpAndReturn:
1684     fclose(f);
1685     delete []fileBuf;
1686     ucnv_close(conv);
1687     if (U_FAILURE(status)) {
1688         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1689         delete []retPtr;
1690         retPtr = 0;
1691         ulen   = 0;
1692     };
1693     return retPtr;
1694 }
1695 
1696 
1697 
1698 //--------------------------------------------------------------------------------------------
1699 //
1700 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1701 //
1702 //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1703 void RBBITest::TestUnicodeFiles() {
1704     RuleBasedBreakIterator  *bi;
1705     UErrorCode               status = U_ZERO_ERROR;
1706 
1707     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1708     TEST_ASSERT_SUCCESS(status);
1709     if (U_SUCCESS(status)) {
1710         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1711     }
1712     delete bi;
1713 
1714     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1715     TEST_ASSERT_SUCCESS(status);
1716     if (U_SUCCESS(status)) {
1717         runUnicodeTestData("WordBreakTest.txt", bi);
1718     }
1719     delete bi;
1720 
1721     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1722     TEST_ASSERT_SUCCESS(status);
1723     if (U_SUCCESS(status)) {
1724         runUnicodeTestData("SentenceBreakTest.txt", bi);
1725     }
1726     delete bi;
1727 
1728     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1729     TEST_ASSERT_SUCCESS(status);
1730     if (U_SUCCESS(status)) {
1731         runUnicodeTestData("LineBreakTest.txt", bi);
1732     }
1733     delete bi;
1734 }
1735 
1736 
1737 // Check for test cases from the Unicode test data files that are known to fail
1738 // and should be skipped because ICU is not yet able to fully implement the spec.
1739 // See ticket #7270.
1740 
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1741 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1742     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1743         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1744         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1745         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1746         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1747         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1748         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1749     };
1750     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1751         return FALSE;
1752     }
1753 
1754     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1755         if (testCase == UnicodeString(badTestCases[i])) {
1756             return logKnownIssue("7270");
1757         }
1758     }
1759     return FALSE;
1760 }
1761 
1762 
1763 //--------------------------------------------------------------------------------------------
1764 //
1765 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1766 //
1767 //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1768 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1769 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1770     UErrorCode  status = U_ZERO_ERROR;
1771 
1772     //
1773     //  Open and read the test data file, put it into a UnicodeString.
1774     //
1775     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1776     char testFileName[1000];
1777     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1778         dataerrln("Can't open test data.  Path too long.");
1779         return;
1780     }
1781     strcpy(testFileName, testDataDirectory);
1782     strcat(testFileName, fileName);
1783 
1784     logln("Opening data file %s\n", fileName);
1785 
1786     int    len;
1787     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1788     if (status != U_FILE_ACCESS_ERROR) {
1789         TEST_ASSERT_SUCCESS(status);
1790         TEST_ASSERT(testFile != NULL);
1791     }
1792     if (U_FAILURE(status) || testFile == NULL) {
1793         return; /* something went wrong, error already output */
1794     }
1795     UnicodeString testFileAsString(TRUE, testFile, len);
1796 
1797     //
1798     //  Parse the test data file using a regular expression.
1799     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1800     //     is identified by which group had a match.
1801     //
1802     //    Caputure Group #                  1          2            3            4           5
1803     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1804     //
1805     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1806     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1807     UnicodeString   testString;
1808     UVector32       breakPositions(status);
1809     int             lineNumber = 1;
1810     TEST_ASSERT_SUCCESS(status);
1811     if (U_FAILURE(status)) {
1812         return;
1813     }
1814 
1815     //
1816     //  Scan through each test case, building up the string to be broken in testString,
1817     //   and the positions that should be boundaries in the breakPositions vector.
1818     //
1819     int spin = 0;
1820     while (tokenMatcher.find()) {
1821       	if(tokenMatcher.hitEnd()) {
1822           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1823              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1824              and caused an infinite loop here on EBCDIC systems!
1825           */
1826           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1827           //	   return;
1828       	}
1829         if (tokenMatcher.start(1, status) >= 0) {
1830             // Scanned a divide sign, indicating a break position in the test data.
1831             if (testString.length()>0) {
1832                 breakPositions.addElement(testString.length(), status);
1833             }
1834         }
1835         else if (tokenMatcher.start(2, status) >= 0) {
1836             // Scanned an 'x', meaning no break at this position in the test data
1837             //   Nothing to be done here.
1838             }
1839         else if (tokenMatcher.start(3, status) >= 0) {
1840             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1841             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1842             int length = hexNumber.length();
1843             if (length<=8) {
1844                 char buf[10];
1845                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1846                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1847                 if (c<=0x10ffff) {
1848                     testString.append(c);
1849                 } else {
1850                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1851                        fileName, lineNumber);
1852                 }
1853             } else {
1854                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1855                        fileName, lineNumber);
1856              }
1857         }
1858         else if (tokenMatcher.start(4, status) >= 0) {
1859             // Scanned to end of a line, possibly skipping over a comment in the process.
1860             //   If the line from the file contained test data, run the test now.
1861             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1862                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1863             }
1864 
1865             // Clear out this test case.
1866             //    The string and breakPositions vector will be refilled as the next
1867             //       test case is parsed.
1868             testString.remove();
1869             breakPositions.removeAllElements();
1870             lineNumber++;
1871         } else {
1872             // Scanner catchall.  Something unrecognized appeared on the line.
1873             char token[16];
1874             UnicodeString uToken = tokenMatcher.group(0, status);
1875             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1876             token[sizeof(token)-1] = 0;
1877             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1878 
1879             // Clean up, in preparation for continuing with the next line.
1880             testString.remove();
1881             breakPositions.removeAllElements();
1882             lineNumber++;
1883         }
1884         TEST_ASSERT_SUCCESS(status);
1885         if (U_FAILURE(status)) {
1886             break;
1887         }
1888     }
1889 
1890     delete [] testFile;
1891  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1892 }
1893 
1894 //--------------------------------------------------------------------------------------------
1895 //
1896 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1897 //                            test data files.  Do only a simple, forward-only check -
1898 //                            this test is mostly to check that ICU and the Unicode
1899 //                            data agree with each other.
1900 //
1901 //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1902 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1903                          const UnicodeString &testString,   // Text data to be broken
1904                          UVector32 *breakPositions,         // Positions where breaks should be found.
1905                          RuleBasedBreakIterator *bi) {
1906     int32_t pos;                 // Break Position in the test string
1907     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1908     int32_t expectedPos;         // Expected break position (index into test string)
1909 
1910     bi->setText(testString);
1911     pos = bi->first();
1912     pos = bi->next();
1913 
1914     while (pos != BreakIterator::DONE) {
1915         if (expectedI >= breakPositions->size()) {
1916             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1917                 testFileName, lineNumber, pos);
1918             break;
1919         }
1920         expectedPos = breakPositions->elementAti(expectedI);
1921         if (pos < expectedPos) {
1922             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1923                 testFileName, lineNumber, pos);
1924             break;
1925         }
1926         if (pos > expectedPos) {
1927             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928                 testFileName, lineNumber, expectedPos);
1929             break;
1930         }
1931         pos = bi->next();
1932         expectedI++;
1933     }
1934 
1935     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1936         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1937             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1938     }
1939 }
1940 
1941 
1942 
1943 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1944 //---------------------------------------------------------------------------------------
1945 //
1946 //   class RBBIMonkeyKind
1947 //
1948 //      Monkey Test for Break Iteration
1949 //      Abstract interface class.   Concrete derived classes independently
1950 //      implement the break rules for different iterator types.
1951 //
1952 //      The Monkey Test itself doesn't know which type of break iterator it is
1953 //      testing, but works purely in terms of the interface defined here.
1954 //
1955 //---------------------------------------------------------------------------------------
1956 class RBBIMonkeyKind {
1957 public:
1958     // Return a UVector of UnicodeSets, representing the character classes used
1959     //   for this type of iterator.
1960     virtual  UVector  *charClasses() = 0;
1961 
1962     // Set the test text on which subsequent calls to next() will operate
1963     virtual  void      setText(const UnicodeString &s) = 0;
1964 
1965     // Find the next break postion, starting from the prev break position, or from zero.
1966     // Return -1 after reaching end of string.
1967     virtual  int32_t   next(int32_t i) = 0;
1968 
1969     virtual ~RBBIMonkeyKind();
1970     UErrorCode       deferredStatus;
1971 
1972 
1973 protected:
1974     RBBIMonkeyKind();
1975 
1976 private:
1977 };
1978 
RBBIMonkeyKind()1979 RBBIMonkeyKind::RBBIMonkeyKind() {
1980     deferredStatus = U_ZERO_ERROR;
1981 }
1982 
~RBBIMonkeyKind()1983 RBBIMonkeyKind::~RBBIMonkeyKind() {
1984 }
1985 
1986 
1987 //----------------------------------------------------------------------------------------
1988 //
1989 //   Random Numbers.  Similar to standard lib rand() and srand()
1990 //                    Not using library to
1991 //                      1.  Get same results on all platforms.
1992 //                      2.  Get access to current seed, to more easily reproduce failures.
1993 //
1994 //---------------------------------------------------------------------------------------
// Current generator state.  Kept at file scope so that it can be read or set directly.
static uint32_t m_seed = 1;

// Advances the linear congruential generator one step and returns a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    // uint32_t arithmetic wraps modulo 2^32, which is what the generator relies on.
    m_seed = m_seed * multiplier + increment;
    return (m_seed >> 16) & 0x7fff;
}
2002 
2003 
2004 //------------------------------------------------------------------------------------------
2005 //
2006 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2007 //                             of RBBIMonkeyKind.
2008 //
2009 //------------------------------------------------------------------------------------------
2010 class RBBICharMonkey: public RBBIMonkeyKind {
2011 public:
2012     RBBICharMonkey();
2013     virtual          ~RBBICharMonkey();
2014     virtual  UVector *charClasses();
2015     virtual  void     setText(const UnicodeString &s);
2016     virtual  int32_t  next(int32_t i);
2017 private:
2018     UVector   *fSets;
2019 
2020     UnicodeSet  *fCRLFSet;
2021     UnicodeSet  *fControlSet;
2022     UnicodeSet  *fExtendSet;
2023     UnicodeSet  *fRegionalIndicatorSet;
2024     UnicodeSet  *fPrependSet;
2025     UnicodeSet  *fSpacingSet;
2026     UnicodeSet  *fLSet;
2027     UnicodeSet  *fVSet;
2028     UnicodeSet  *fTSet;
2029     UnicodeSet  *fLVSet;
2030     UnicodeSet  *fLVTSet;
2031     UnicodeSet  *fHangulSet;
2032     UnicodeSet  *fAnySet;
2033 
2034     const UnicodeString *fText;
2035 };
2036 
2037 
RBBICharMonkey()2038 RBBICharMonkey::RBBICharMonkey() {
2039     UErrorCode  status = U_ZERO_ERROR;
2040 
2041     fText = NULL;
2042 
2043     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2044     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2045     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2046     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2047     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2048     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2049     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2050     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2051     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2052     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2053     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2054     fHangulSet  = new UnicodeSet();
2055     fHangulSet->addAll(*fLSet);
2056     fHangulSet->addAll(*fVSet);
2057     fHangulSet->addAll(*fTSet);
2058     fHangulSet->addAll(*fLVSet);
2059     fHangulSet->addAll(*fLVTSet);
2060     fAnySet     = new UnicodeSet(0, 0x10ffff);
2061 
2062     fSets       = new UVector(status);
2063     fSets->addElement(fCRLFSet,    status);
2064     fSets->addElement(fControlSet, status);
2065     fSets->addElement(fExtendSet,  status);
2066     fSets->addElement(fRegionalIndicatorSet, status);
2067     if (!fPrependSet->isEmpty()) {
2068         fSets->addElement(fPrependSet, status);
2069     }
2070     fSets->addElement(fSpacingSet, status);
2071     fSets->addElement(fHangulSet,  status);
2072     fSets->addElement(fAnySet,     status);
2073     if (U_FAILURE(status)) {
2074         deferredStatus = status;
2075     }
2076 }
2077 
2078 
setText(const UnicodeString & s)2079 void RBBICharMonkey::setText(const UnicodeString &s) {
2080     fText = &s;
2081 }
2082 
2083 
2084 
next(int32_t prevPos)2085 int32_t RBBICharMonkey::next(int32_t prevPos) {
2086     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2087                               //   break position being tested.  The candidate break
2088                               //   location is before p2.
2089 
2090     int     breakPos = -1;
2091 
2092     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2093 
2094     if (U_FAILURE(deferredStatus)) {
2095         return -1;
2096     }
2097 
2098     // Previous break at end of string.  return DONE.
2099     if (prevPos >= fText->length()) {
2100         return -1;
2101     }
2102     p0 = p1 = p2 = p3 = prevPos;
2103     c3 =  fText->char32At(prevPos);
2104     c0 = c1 = c2 = 0;
2105     (void)p0;   // suppress set but not used warning.
2106     (void)c0;
2107 
2108     // Loop runs once per "significant" character position in the input text.
2109     for (;;) {
2110         // Move all of the positions forward in the input string.
2111         p0 = p1;  c0 = c1;
2112         p1 = p2;  c1 = c2;
2113         p2 = p3;  c2 = c3;
2114 
2115         // Advancd p3 by one codepoint
2116         p3 = fText->moveIndex32(p3, 1);
2117         c3 = fText->char32At(p3);
2118 
2119         if (p1 == p2) {
2120             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2121             continue;
2122         }
2123         if (p2 == fText->length()) {
2124             // Reached end of string.  Always a break position.
2125             break;
2126         }
2127 
2128         // Rule  GB3   CR x LF
2129         //     No Extend or Format characters may appear between the CR and LF,
2130         //     which requires the additional check for p2 immediately following p1.
2131         //
2132         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2133             continue;
2134         }
2135 
2136         // Rule (GB4).   ( Control | CR | LF ) <break>
2137         if (fControlSet->contains(c1) ||
2138             c1 == 0x0D ||
2139             c1 == 0x0A)  {
2140             break;
2141         }
2142 
2143         // Rule (GB5)    <break>  ( Control | CR | LF )
2144         //
2145         if (fControlSet->contains(c2) ||
2146             c2 == 0x0D ||
2147             c2 == 0x0A)  {
2148             break;
2149         }
2150 
2151 
2152         // Rule (GB6)  L x ( L | V | LV | LVT )
2153         if (fLSet->contains(c1) &&
2154                (fLSet->contains(c2)  ||
2155                 fVSet->contains(c2)  ||
2156                 fLVSet->contains(c2) ||
2157                 fLVTSet->contains(c2))) {
2158             continue;
2159         }
2160 
2161         // Rule (GB7)    ( LV | V )  x  ( V | T )
2162         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2163             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2164             continue;
2165         }
2166 
2167         // Rule (GB8)    ( LVT | T)  x T
2168         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2169             fTSet->contains(c2))  {
2170             continue;
2171         }
2172 
2173         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
2174         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2175             continue;
2176         }
2177 
2178         // Rule (GB9)    Numeric x ALetter
2179         if (fExtendSet->contains(c2))  {
2180             continue;
2181         }
2182 
2183         // Rule (GB9a)   x  SpacingMark
2184         if (fSpacingSet->contains(c2)) {
2185             continue;
2186         }
2187 
2188         // Rule (GB9b)   Prepend x
2189         if (fPrependSet->contains(c1)) {
2190             continue;
2191         }
2192 
2193         // Rule (GB10)  Any  <break>  Any
2194         break;
2195     }
2196 
2197     breakPos = p2;
2198     return breakPos;
2199 }
2200 
2201 
2202 
charClasses()2203 UVector  *RBBICharMonkey::charClasses() {
2204     return fSets;
2205 }
2206 
2207 
~RBBICharMonkey()2208 RBBICharMonkey::~RBBICharMonkey() {
2209     delete fSets;
2210     delete fCRLFSet;
2211     delete fControlSet;
2212     delete fExtendSet;
2213     delete fRegionalIndicatorSet;
2214     delete fPrependSet;
2215     delete fSpacingSet;
2216     delete fLSet;
2217     delete fVSet;
2218     delete fTSet;
2219     delete fLVSet;
2220     delete fLVTSet;
2221     delete fHangulSet;
2222     delete fAnySet;
2223 }
2224 
2225 //------------------------------------------------------------------------------------------
2226 //
2227 //   class RBBIWordMonkey      Word Break specific implementation
2228 //                             of RBBIMonkeyKind.
2229 //
2230 //------------------------------------------------------------------------------------------
2231 class RBBIWordMonkey: public RBBIMonkeyKind {
2232 public:
2233     RBBIWordMonkey();
2234     virtual          ~RBBIWordMonkey();
2235     virtual  UVector *charClasses();
2236     virtual  void     setText(const UnicodeString &s);
2237     virtual int32_t   next(int32_t i);
2238 private:
2239     UVector      *fSets;
2240 
2241     UnicodeSet  *fCRSet;
2242     UnicodeSet  *fLFSet;
2243     UnicodeSet  *fNewlineSet;
2244     UnicodeSet  *fRegionalIndicatorSet;
2245     UnicodeSet  *fKatakanaSet;
2246     UnicodeSet  *fHebrew_LetterSet;
2247     UnicodeSet  *fALetterSet;
2248     // TODO(jungshik): Do we still need this change?
2249     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2250     UnicodeSet  *fSingle_QuoteSet;
2251     UnicodeSet  *fDouble_QuoteSet;
2252     UnicodeSet  *fMidNumLetSet;
2253     UnicodeSet  *fMidLetterSet;
2254     UnicodeSet  *fMidNumSet;
2255     UnicodeSet  *fNumericSet;
2256     UnicodeSet  *fFormatSet;
2257     UnicodeSet  *fOtherSet;
2258     UnicodeSet  *fExtendSet;
2259     UnicodeSet  *fExtendNumLetSet;
2260     UnicodeSet  *fDictionaryCjkSet;
2261 
2262     const UnicodeString  *fText;
2263 };
2264 
2265 
RBBIWordMonkey()2266 RBBIWordMonkey::RBBIWordMonkey()
2267 {
2268     UErrorCode  status = U_ZERO_ERROR;
2269 
2270     fSets            = new UVector(status);
2271 
2272     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2273     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2274     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2275     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2276     // Exclude Hangul syllables from ALetterSet during testing.
2277     // Leave CJK dictionary characters out from the monkey tests!
2278 #if 0
2279     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2280                                       "[\\p{Line_Break = Complex_Context}"
2281                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2282                                       "-\\p{Grapheme_Cluster_Break = Control}"
2283                                       "]]",
2284                                       status);
2285 #endif
2286     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2287     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2288     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2289     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2290     fALetterSet->removeAll(*fDictionaryCjkSet);
2291     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2292     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2293     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2294     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2295     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2296     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2297     // we should figure out why
2298     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2299     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2300     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2301     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2302 
2303     fOtherSet        = new UnicodeSet();
2304     if(U_FAILURE(status)) {
2305       deferredStatus = status;
2306       return;
2307     }
2308 
2309     fOtherSet->complement();
2310     fOtherSet->removeAll(*fCRSet);
2311     fOtherSet->removeAll(*fLFSet);
2312     fOtherSet->removeAll(*fNewlineSet);
2313     fOtherSet->removeAll(*fKatakanaSet);
2314     fOtherSet->removeAll(*fHebrew_LetterSet);
2315     fOtherSet->removeAll(*fALetterSet);
2316     fOtherSet->removeAll(*fSingle_QuoteSet);
2317     fOtherSet->removeAll(*fDouble_QuoteSet);
2318     fOtherSet->removeAll(*fMidLetterSet);
2319     fOtherSet->removeAll(*fMidNumSet);
2320     fOtherSet->removeAll(*fNumericSet);
2321     fOtherSet->removeAll(*fExtendNumLetSet);
2322     fOtherSet->removeAll(*fFormatSet);
2323     fOtherSet->removeAll(*fExtendSet);
2324     fOtherSet->removeAll(*fRegionalIndicatorSet);
2325     // Inhibit dictionary characters from being tested at all.
2326     fOtherSet->removeAll(*fDictionaryCjkSet);
2327     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2328 
2329     fSets->addElement(fCRSet,                status);
2330     fSets->addElement(fLFSet,                status);
2331     fSets->addElement(fNewlineSet,           status);
2332     fSets->addElement(fRegionalIndicatorSet, status);
2333     fSets->addElement(fHebrew_LetterSet,     status);
2334     fSets->addElement(fALetterSet,           status);
2335     fSets->addElement(fSingle_QuoteSet,      status);
2336     fSets->addElement(fDouble_QuoteSet,      status);
2337     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2338     fSets->addElement(fMidLetterSet,         status);
2339     fSets->addElement(fMidNumLetSet,         status);
2340     fSets->addElement(fMidNumSet,            status);
2341     fSets->addElement(fNumericSet,           status);
2342     fSets->addElement(fFormatSet,            status);
2343     fSets->addElement(fExtendSet,            status);
2344     fSets->addElement(fOtherSet,             status);
2345     fSets->addElement(fExtendNumLetSet,      status);
2346 
2347     if (U_FAILURE(status)) {
2348         deferredStatus = status;
2349     }
2350 }
2351 
setText(const UnicodeString & s)2352 void RBBIWordMonkey::setText(const UnicodeString &s) {
2353     fText       = &s;
2354 }
2355 
2356 
// Reference ("monkey") implementation of word-boundary detection.
//
//   prevPos   Index in fText of the previous boundary; scanning starts here.
//   returns   Index of the next boundary after prevPos, or -1 if set-up failed
//             (deferredStatus) or prevPos is already at or past the end of the text.
//
// The function slides a four-position window (p0..p3 / c0..c3) over the
// "significant" characters of the text and tests the word break rules, in order,
// for the candidate boundary located just before p2.  The first rule that matches
// decides:  "continue" means no boundary here, "break" means boundary found.
int32_t RBBIWordMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

    // The constructor records any set-construction failure in deferredStatus;
    // without valid sets no rule can be evaluated.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // All four positions start at prevPos; the first pass through the loop
    // ("warm-up") shifts them so that p1 is at prevPos and p2 is the first candidate.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;       // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
        //    When c2 is a CR, LF or Newline the loop exits after a single step, so the
        //    character that follows a line ending is always the immediately adjacent one.
        do {
            p3 = fText->moveIndex32(p3, 1);
            c3 = fText->char32At(p3);
            if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
               break;
            };
        }
        while (fFormatSet->contains(c3) || fExtendSet->contains(c3));


        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  (3)   CR x LF
        //     No Extend or Format characters may appear between the CR and LF.
        //     There is no explicit "p2 immediately follows p1" test here; adjacency is
        //     already guaranteed by the advance loop above, which never skips
        //     Extend/Format characters after a CR.
        //
        if (c1==0x0D && c2==0x0A) {
            continue;
        }

        // Rule (3a)  Break before and after newlines (including CR and LF)
        //
        if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
            break;
        };
        if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
            break;
        };

        // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
        //           Uses the look-ahead character c3.
        //
        if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
             (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
             (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
            continue;
        }

        // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
        //           Uses the look-behind character c0.
        if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
            (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
            continue;
        }

        // Rule (7a)     Hebrew_Letter x Single_Quote
        if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
            continue;
        }

        // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
        if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
            continue;
        }

        // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
        if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
            continue;
        }

        // Rule (8)    Numeric x Numeric
        if (fNumericSet->contains(c1) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
            fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
        if (fNumericSet->contains(c1) &&
            (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
            continue;
        }

        // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
        if (fNumericSet->contains(c0) &&
            (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
            fNumericSet->contains(c2)) {
            continue;
        }

        // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
        if (fNumericSet->contains(c1) &&
            (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
            fNumericSet->contains(c3)) {
            continue;
        }

        // Rule (13)  Katakana x Katakana
        if (fKatakanaSet->contains(c1) &&
            fKatakanaSet->contains(c2))  {
            continue;
        }

        // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
        if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
             fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
             fExtendNumLetSet->contains(c2)) {
                continue;
        }

        // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
        if (fExtendNumLetSet->contains(c1) &&
                (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
                 fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
            continue;
        }

        // Rule 13c   Regional_Indicator x Regional_Indicator
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule 14.  Break found here.
        break;
    }

    // The boundary is immediately before the character at p2.
    breakPos = p2;
    return breakPos;
}
2524 
2525 
charClasses()2526 UVector  *RBBIWordMonkey::charClasses() {
2527     return fSets;
2528 }
2529 
2530 
~RBBIWordMonkey()2531 RBBIWordMonkey::~RBBIWordMonkey() {
2532     delete fSets;
2533     delete fCRSet;
2534     delete fLFSet;
2535     delete fNewlineSet;
2536     delete fKatakanaSet;
2537     delete fHebrew_LetterSet;
2538     delete fALetterSet;
2539     delete fSingle_QuoteSet;
2540     delete fDouble_QuoteSet;
2541     delete fMidNumLetSet;
2542     delete fMidLetterSet;
2543     delete fMidNumSet;
2544     delete fNumericSet;
2545     delete fFormatSet;
2546     delete fExtendSet;
2547     delete fExtendNumLetSet;
2548     delete fRegionalIndicatorSet;
2549     delete fDictionaryCjkSet;
2550     delete fOtherSet;
2551 }
2552 
2553 
2554 
2555 
2556 //------------------------------------------------------------------------------------------
2557 //
2558 //   class RBBISentMonkey      Sentence Break specific implementation
2559 //                             of RBBIMonkeyKind.
2560 //
2561 //------------------------------------------------------------------------------------------
// Reference implementation of sentence-boundary detection, written directly
// from the sentence break rules and independent of the rule-based break
// iterator that it is compared against.
class RBBISentMonkey: public RBBIMonkeyKind {
public:
    // Builds all of the character-class sets below; any failure is recorded
    // in deferredStatus rather than reported immediately.
    RBBISentMonkey();
    virtual          ~RBBISentMonkey();
    // Returns fSets, the list of character-class sets.
    virtual  UVector *charClasses();
    // Records the address of s as the text to scan.  The string is not copied.
    virtual  void     setText(const UnicodeString &s);
    // Returns the index of the next sentence boundary following index i,
    // or -1 if there is none or construction failed.
    virtual int32_t   next(int32_t i);
private:
    // Index of the nearest preceding code point, skipping Extend and Format
    // characters; -1 if posFrom is at or before the start of the text.
    int               moveBack(int posFrom);
    // Index of the next following code point, skipping Extend and Format
    // characters; the text length if posFrom is at or past the end.
    int               moveForward(int posFrom);
    // Code point at pos, or -1 when pos is outside the text.
    UChar32           cAt(int pos);

    UVector      *fSets;          // List of the class sets below, filled in by the constructor.

    // One set per Sentence_Break property value (see the constructor for the
    // exact patterns).
    UnicodeSet  *fSepSet;         // Sep, plus LF (U+000A) and CR (U+000D).
    UnicodeSet  *fFormatSet;
    UnicodeSet  *fSpSet;
    UnicodeSet  *fLowerSet;
    UnicodeSet  *fUpperSet;
    UnicodeSet  *fOLetterSet;
    UnicodeSet  *fNumericSet;
    UnicodeSet  *fATermSet;
    UnicodeSet  *fSContinueSet;
    UnicodeSet  *fSTermSet;
    UnicodeSet  *fCloseSet;
    UnicodeSet  *fOtherSet;       // Everything that is in none of the other sets.
    UnicodeSet  *fExtendSet;

    const UnicodeString  *fText;  // Text being scanned; assigned by setText(), not deleted by the destructor.

};
2593 
RBBISentMonkey()2594 RBBISentMonkey::RBBISentMonkey()
2595 {
2596     UErrorCode  status = U_ZERO_ERROR;
2597 
2598     fSets            = new UVector(status);
2599 
2600     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2601     //                       set and made into character classes of their own.  For the monkey impl,
2602     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2603     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2604     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2605     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2606     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2607     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2608     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2609     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2610     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2611     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2612     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2613     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2614     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2615     fOtherSet        = new UnicodeSet();
2616 
2617     if(U_FAILURE(status)) {
2618       deferredStatus = status;
2619       return;
2620     }
2621 
2622     fOtherSet->complement();
2623     fOtherSet->removeAll(*fSepSet);
2624     fOtherSet->removeAll(*fFormatSet);
2625     fOtherSet->removeAll(*fSpSet);
2626     fOtherSet->removeAll(*fLowerSet);
2627     fOtherSet->removeAll(*fUpperSet);
2628     fOtherSet->removeAll(*fOLetterSet);
2629     fOtherSet->removeAll(*fNumericSet);
2630     fOtherSet->removeAll(*fATermSet);
2631     fOtherSet->removeAll(*fSContinueSet);
2632     fOtherSet->removeAll(*fSTermSet);
2633     fOtherSet->removeAll(*fCloseSet);
2634     fOtherSet->removeAll(*fExtendSet);
2635 
2636     fSets->addElement(fSepSet,       status);
2637     fSets->addElement(fFormatSet,    status);
2638     fSets->addElement(fSpSet,        status);
2639     fSets->addElement(fLowerSet,     status);
2640     fSets->addElement(fUpperSet,     status);
2641     fSets->addElement(fOLetterSet,   status);
2642     fSets->addElement(fNumericSet,   status);
2643     fSets->addElement(fATermSet,     status);
2644     fSets->addElement(fSContinueSet, status);
2645     fSets->addElement(fSTermSet,     status);
2646     fSets->addElement(fCloseSet,     status);
2647     fSets->addElement(fOtherSet,     status);
2648     fSets->addElement(fExtendSet,    status);
2649 
2650     if (U_FAILURE(status)) {
2651         deferredStatus = status;
2652     }
2653 }
2654 
2655 
2656 
setText(const UnicodeString & s)2657 void RBBISentMonkey::setText(const UnicodeString &s) {
2658     fText       = &s;
2659 }
2660 
charClasses()2661 UVector  *RBBISentMonkey::charClasses() {
2662     return fSets;
2663 }
2664 
2665 
2666 //  moveBack()   Find the "significant" code point preceding the index i.
2667 //               Skips over ($Extend | $Format)* .
2668 //
moveBack(int i)2669 int RBBISentMonkey::moveBack(int i) {
2670     if (i <= 0) {
2671         return -1;
2672     }
2673     UChar32   c;
2674     int32_t   j = i;
2675     do {
2676         j = fText->moveIndex32(j, -1);
2677         c = fText->char32At(j);
2678     }
2679     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2680     return j;
2681 
2682  }
2683 
2684 
moveForward(int i)2685 int RBBISentMonkey::moveForward(int i) {
2686     if (i>=fText->length()) {
2687         return fText->length();
2688     }
2689     UChar32   c;
2690     int32_t   j = i;
2691     do {
2692         j = fText->moveIndex32(j, 1);
2693         c = cAt(j);
2694     }
2695     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2696     return j;
2697 }
2698 
cAt(int pos)2699 UChar32 RBBISentMonkey::cAt(int pos) {
2700     if (pos<0 || pos>=fText->length()) {
2701         return -1;
2702     } else {
2703         return fText->char32At(pos);
2704     }
2705 }
2706 
// Reference ("monkey") implementation of sentence-boundary detection.
//
//   prevPos   Index in fText of the previous boundary; scanning starts here.
//   returns   Index of the next boundary after prevPos, or -1 if set-up failed
//             (deferredStatus) or prevPos is already at or past the end of the text.
//
// A four-position window (p0..p3 / c0..c3) is moved over the "significant"
// characters of the text; the candidate boundary is just before p2.  Rules are
// tested in order and the first match decides:  "continue" means no boundary
// here, "break" means boundary found.  Several rules also scan backwards from
// p1 (via moveBack) and may therefore look at text preceding prevPos.
int32_t RBBISentMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 c;                // Scratch character used by the context scans below.

    // The constructor records any set-construction failure in deferredStatus;
    // without valid sets no rule can be evaluated.
    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Prev break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    // All four positions start at prevPos; the first pass through the loop
    // ("warm-up") only shifts the window into place.
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = 0;
    (void)p0;     // Suppress set but not used warning.

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by    X(Extend | Format)*   Rule 4
        p3 = moveForward(p3);
        c3 = cAt(p3);

        // Rule (3)  CR x LF
        //           Only when the LF directly follows the CR, with nothing skipped between them.
        if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
            continue;
        }

        // Rule (4).   Sep  <break>
        if (fSepSet->contains(c1)) {
            p2 = p1+1;   // Separators don't combine with Extend or Format.
            break;
        }

        if (p2 >= fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        if (p2 == prevPos) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            continue;
        }

        // Rule (6).   ATerm x Numeric
        if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
            continue;
        }

        // Rule (7).  (Upper | Lower) ATerm  x  Upper
        if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
                fATermSet->contains(c1) && fUpperSet->contains(c2)) {
            continue;
        }

        // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
        //           Note:  STerm | ATerm are added to the negated part of the expression by a
        //                  note to the Unicode 5.0 documents.
        //           First walk backwards from p1 over Sp* and then Close* to find the ATerm ...
        int p8 = p1;
        while (fSpSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        while (fCloseSet->contains(cAt(p8))) {
            p8 = moveBack(p8);
        }
        if (fATermSet->contains(cAt(p8))) {
            //       ... then walk forwards from p2 to the first character that ends the
            //       negated run (or the end of the text, where cAt() returns -1).
            p8=p2;
            for (;;) {
                c = cAt(p8);
                if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
                    fLowerSet->contains(c) || fSepSet->contains(c) ||
                    fATermSet->contains(c) || fSTermSet->contains(c))  {
                    break;
                }
                p8 = moveForward(p8);
            }
            // No break only if the run is terminated by a Lower.
            if (fLowerSet->contains(cAt(p8))) {
                continue;
            }
        }

        // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
        if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
            p8 = p1;
            while (fSpSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            while (fCloseSet->contains(cAt(p8))) {
                p8 = moveBack(p8);
            }
            c = cAt(p8);
            if (fSTermSet->contains(c) || fATermSet->contains(c)) {
                continue;
            }
        }

        // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
        //           CR and LF are members of fSepSet in this implementation.
        int p9 = p1;
        while (fCloseSet->contains(cAt(p9))) {
            p9 = moveBack(p9);
        }
        c = cAt(p9);
        if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
            if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
        int p10 = p1;
        while (fSpSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        while (fCloseSet->contains(cAt(p10))) {
            p10 = moveBack(p10);
        }
        if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
            if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
                continue;
            }
        }

        // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
        //            NOTE(review): the optional-separator step below looks unreachable in
        //            practice, because Rule (4) above has already broken out whenever the
        //            character at p1 is a separator -- confirm before relying on it.
        int p11 = p1;
        if (fSepSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fSpSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        while (fCloseSet->contains(cAt(p11))) {
            p11 = moveBack(p11);
        }
        if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
            break;
        }

        //  Rule (12)  Any x Any
        continue;
    }
    // The boundary is immediately before the character at p2.
    breakPos = p2;
    return breakPos;
}
2861 
~RBBISentMonkey()2862 RBBISentMonkey::~RBBISentMonkey() {
2863     delete fSets;
2864     delete fSepSet;
2865     delete fFormatSet;
2866     delete fSpSet;
2867     delete fLowerSet;
2868     delete fUpperSet;
2869     delete fOLetterSet;
2870     delete fNumericSet;
2871     delete fATermSet;
2872     delete fSContinueSet;
2873     delete fSTermSet;
2874     delete fCloseSet;
2875     delete fOtherSet;
2876     delete fExtendSet;
2877 }
2878 
2879 
2880 
2881 //-------------------------------------------------------------------------------------------
2882 //
2883 //  RBBILineMonkey
2884 //
2885 //-------------------------------------------------------------------------------------------
2886 
// Reference implementation of line-break detection, written directly from the
// line breaking rules and independent of the rule-based break iterator that it
// is compared against.
class RBBILineMonkey: public RBBIMonkeyKind {
public:
    // Builds the character-class sets, the number regular expression and the
    // character break iterator; any failure is recorded in deferredStatus.
    RBBILineMonkey();
    virtual          ~RBBILineMonkey();
    // Returns fSets, the list of character-class sets.
    virtual  UVector *charClasses();
    // Records the address of s as the text to scan and hands the text to
    // fCharBI and fNumberMatcher.  The string is not copied into fText.
    virtual  void     setText(const UnicodeString &s);
    // Returns the index of the next line-break opportunity following index i.
    virtual  int32_t  next(int32_t i);
    // Applies line break rules 9 and 10 (combining-mark handling) around a
    // candidate position.  posChar, nextPos and nextChar are in/out parameters.
    virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
private:
    UVector      *fSets;   // List of class sets, filled in by the constructor.

    // One set per Line_Break property value; the member name is the
    // two-letter property value alias (see the constructor for the patterns).
    UnicodeSet  *fBK;
    UnicodeSet  *fCR;
    UnicodeSet  *fLF;
    UnicodeSet  *fCM;
    UnicodeSet  *fNL;
    UnicodeSet  *fSG;      // Built from the surrogate range U+D800..U+DFFF; folded into fAL.
    UnicodeSet  *fWJ;
    UnicodeSet  *fZW;
    UnicodeSet  *fGL;
    UnicodeSet  *fCB;
    UnicodeSet  *fSP;
    UnicodeSet  *fB2;
    UnicodeSet  *fBA;
    UnicodeSet  *fBB;
    UnicodeSet  *fHY;
    UnicodeSet  *fH2;
    UnicodeSet  *fH3;
    UnicodeSet  *fCL;
    UnicodeSet  *fCP;
    UnicodeSet  *fEX;
    UnicodeSet  *fIN;
    UnicodeSet  *fJL;
    UnicodeSet  *fJV;
    UnicodeSet  *fJT;
    UnicodeSet  *fNS;      // Also receives the contents of fCJ.
    UnicodeSet  *fOP;
    UnicodeSet  *fQU;
    UnicodeSet  *fIS;
    UnicodeSet  *fNU;
    UnicodeSet  *fPO;
    UnicodeSet  *fPR;
    UnicodeSet  *fSY;
    UnicodeSet  *fAI;      // Folded into fAL.
    UnicodeSet  *fAL;      // Also receives the contents of fXX, fAI, fSA and fSG.
    UnicodeSet  *fCJ;      // Folded into fNS.
    UnicodeSet  *fHL;
    UnicodeSet  *fID;
    UnicodeSet  *fRI;
    UnicodeSet  *fSA;      // Folded into fAL.
    UnicodeSet  *fXX;      // Folded into fAL.

    BreakIterator        *fCharBI;         // Character break iterator (English locale); NULL if construction failed.
    const UnicodeString  *fText;           // Text being scanned; assigned by setText().
    RegexMatcher         *fNumberMatcher;  // Matcher for the number pattern built in the constructor; NULL if construction failed.
};
2943 
2944 
RBBILineMonkey()2945 RBBILineMonkey::RBBILineMonkey()
2946 {
2947     UErrorCode  status = U_ZERO_ERROR;
2948 
2949     fSets  = new UVector(status);
2950 
2951     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2952     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2953     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2954     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2955     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2956     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2957     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2958     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2959     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2960     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2961     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2962     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2963     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2964     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2965     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2966     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2967     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2968     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2969     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2970     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2971     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2972     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2973     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2974     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2975     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2976     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2977     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2978     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2979     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2980     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2981     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2982     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2983     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2984     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2985     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2986     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2987     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2988     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2989     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2990     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2991 
2992     if (U_FAILURE(status)) {
2993         deferredStatus = status;
2994         fCharBI = NULL;
2995         fNumberMatcher = NULL;
2996         return;
2997     }
2998 
2999     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3000     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3001     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3002     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3003 
3004     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3005 
3006     fSets->addElement(fBK, status);
3007     fSets->addElement(fCR, status);
3008     fSets->addElement(fLF, status);
3009     fSets->addElement(fCM, status);
3010     fSets->addElement(fNL, status);
3011     fSets->addElement(fWJ, status);
3012     fSets->addElement(fZW, status);
3013     fSets->addElement(fGL, status);
3014     fSets->addElement(fCB, status);
3015     fSets->addElement(fSP, status);
3016     fSets->addElement(fB2, status);
3017     fSets->addElement(fBA, status);
3018     fSets->addElement(fBB, status);
3019     fSets->addElement(fHY, status);
3020     fSets->addElement(fH2, status);
3021     fSets->addElement(fH3, status);
3022     fSets->addElement(fCL, status);
3023     fSets->addElement(fCP, status);
3024     fSets->addElement(fEX, status);
3025     fSets->addElement(fIN, status);
3026     fSets->addElement(fJL, status);
3027     fSets->addElement(fJT, status);
3028     fSets->addElement(fJV, status);
3029     fSets->addElement(fNS, status);
3030     fSets->addElement(fOP, status);
3031     fSets->addElement(fQU, status);
3032     fSets->addElement(fIS, status);
3033     fSets->addElement(fNU, status);
3034     fSets->addElement(fPO, status);
3035     fSets->addElement(fPR, status);
3036     fSets->addElement(fSY, status);
3037     fSets->addElement(fAI, status);
3038     fSets->addElement(fAL, status);
3039     fSets->addElement(fHL, status);
3040     fSets->addElement(fID, status);
3041     fSets->addElement(fWJ, status);
3042     fSets->addElement(fRI, status);
3043     fSets->addElement(fSA, status);
3044     fSets->addElement(fSG, status);
3045 
3046     const char *rules =
3047             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3048             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3049             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3050             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3051             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3052             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3053 
3054     fNumberMatcher = new RegexMatcher(
3055         UnicodeString(rules, -1, US_INV), 0, status);
3056 
3057     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3058 
3059     if (U_FAILURE(status)) {
3060         deferredStatus = status;
3061     }
3062 }
3063 
3064 
setText(const UnicodeString & s)3065 void RBBILineMonkey::setText(const UnicodeString &s) {
3066     fText       = &s;
3067     fCharBI->setText(s);
3068     fNumberMatcher->reset(s);
3069 }
3070 
3071 //
3072 //  rule9Adjust
3073 //     Line Break TR rules 9 and 10 implementation.
3074 //     This deals with combining marks and other sequences that
3075 //     that must be treated as if they were something other than what they actually are.
3076 //
3077 //     This is factored out into a separate function because it must be applied twice for
3078 //     each potential break, once to the chars before the position being checked, then
3079 //     again to the text following the possible break.
3080 //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3081 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3082     if (pos == -1) {
3083         // Invalid initial position.  Happens during the warmup iteration of the
3084         //   main loop in next().
3085         return;
3086     }
3087 
3088     int32_t  nPos = *nextPos;
3089 
3090     // LB 9  Keep combining sequences together.
3091     //  advance over any CM class chars.  Note that Line Break CM is different
3092     //  from the normal Grapheme Extend property.
3093     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3094           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3095         for (;;) {
3096             *nextChar = fText->char32At(nPos);
3097             if (!fCM->contains(*nextChar)) {
3098                 break;
3099             }
3100             nPos = fText->moveIndex32(nPos, 1);
3101         }
3102     }
3103 
3104 
3105     // LB 9 Treat X CM* as if it were x.
3106     //       No explicit action required.
3107 
3108     // LB 10  Treat any remaining combining mark as AL
3109     if (fCM->contains(*posChar)) {
3110         *posChar = 0x41;   // thisChar = 'A';
3111     }
3112 
3113     // Push the updated nextPos and nextChar back to our caller.
3114     // This only makes a difference if posChar got bigger by consuming a
3115     // combining sequence.
3116     *nextPos  = nPos;
3117     *nextChar = fText->char32At(nPos);
3118 }
3119 
3120 
3121 
next(int32_t startPos)3122 int32_t RBBILineMonkey::next(int32_t startPos) {
3123     UErrorCode status = U_ZERO_ERROR;
3124     int32_t    pos;       //  Index of the char following a potential break position
3125     UChar32    thisChar;  //  Character at above position "pos"
3126 
3127     int32_t    prevPos;   //  Index of the char preceding a potential break position
3128     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3129                           //   and thisChar may not be adjacent because combining
3130                           //   characters between them will be ignored.
3131 
3132     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3133     UChar32    prevCharX2;
3134 
3135     int32_t    nextPos;   //  Index of the next character following pos.
3136                           //     Usually skips over combining marks.
3137     int32_t    nextCPPos; //  Index of the code point following "pos."
3138                           //     May point to a combining mark.
3139     int32_t    tPos;      //  temp value.
3140     UChar32    c;
3141 
3142     if (U_FAILURE(deferredStatus)) {
3143         return -1;
3144     }
3145 
3146     if (startPos >= fText->length()) {
3147         return -1;
3148     }
3149 
3150 
3151     // Initial values for loop.  Loop will run the first time without finding breaks,
3152     //                           while the invalid values shift out and the "this" and
3153     //                           "prev" positions are filled in with good values.
3154     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3155     thisChar = prevChar  = prevCharX2 = 0;
3156     nextPos  = nextCPPos = startPos;
3157 
3158 
3159     // Loop runs once per position in the test text, until a break position
3160     //  is found.
3161     for (;;) {
3162         prevPosX2 = prevPos;
3163         prevCharX2 = prevChar;
3164 
3165         prevPos   = pos;
3166         prevChar  = thisChar;
3167 
3168         pos       = nextPos;
3169         thisChar  = fText->char32At(pos);
3170 
3171         nextCPPos = fText->moveIndex32(pos, 1);
3172         nextPos   = nextCPPos;
3173 
3174         // Rule LB2 - Break at end of text.
3175         if (pos >= fText->length()) {
3176             break;
3177         }
3178 
3179         // Rule LB 9 - adjust for combining sequences.
3180         //             We do this one out-of-order because the adjustment does not change anything
3181         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3182         //             be applied.
3183         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3184         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3185         c = fText->char32At(nextPos);
3186         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3187 
3188         // If the loop is still warming up - if we haven't shifted the initial
3189         //   -1 positions out of prevPos yet - loop back to advance the
3190         //    position in the input without any further looking for breaks.
3191         if (prevPos == -1) {
3192             continue;
3193         }
3194 
3195         // LB 4  Always break after hard line breaks,
3196         if (fBK->contains(prevChar)) {
3197             break;
3198         }
3199 
3200         // LB 5  Break after CR, LF, NL, but not inside CR LF
3201         if (prevChar == 0x0d && thisChar == 0x0a) {
3202             continue;
3203         }
3204         if (prevChar == 0x0d ||
3205             prevChar == 0x0a ||
3206             prevChar == 0x85)  {
3207             break;
3208         }
3209 
3210         // LB 6  Don't break before hard line breaks
3211         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3212             fBK->contains(thisChar)) {
3213                 continue;
3214         }
3215 
3216 
3217         // LB 7  Don't break before spaces or zero-width space.
3218         if (fSP->contains(thisChar)) {
3219             continue;
3220         }
3221 
3222         if (fZW->contains(thisChar)) {
3223             continue;
3224         }
3225 
3226         // LB 8  Break after zero width space
3227         if (fZW->contains(prevChar)) {
3228             break;
3229         }
3230 
3231         // LB 9, 10  Already done, at top of loop.
3232         //
3233 
3234 
3235         // LB 11  Do not break before or after WORD JOINER and related characters.
3236         //    x  WJ
3237         //    WJ  x
3238         //
3239         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3240             continue;
3241         }
3242 
3243         // LB 12
3244         //    GL  x
3245         if (fGL->contains(prevChar)) {
3246             continue;
3247         }
3248 
3249         // LB 12a
3250         //    [^SP BA HY] x GL
3251         if (!(fSP->contains(prevChar) ||
3252               fBA->contains(prevChar) ||
3253               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3254             continue;
3255         }
3256 
3257 
3258 
3259         // LB 13  Don't break before closings.
3260         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3261         //        fall into LB 17 and the more general number regular expression.
3262         //
3263         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3264             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3265                                          fEX->contains(thisChar)  ||
3266             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3267             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3268             continue;
3269         }
3270 
3271         // LB 14 Don't break after OP SP*
3272         //       Scan backwards, checking for this sequence.
3273         //       The OP char could include combining marks, so we actually check for
3274         //           OP CM* SP*
3275         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3276         //       sequence into a ID char, so before scanning back through spaces,
3277         //       verify that prevChar is indeed a space.  The prevChar variable
3278         //       may differ from fText[prevPos]
3279         tPos = prevPos;
3280         if (fSP->contains(prevChar)) {
3281             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3282                 tPos=fText->moveIndex32(tPos, -1);
3283             }
3284         }
3285         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3286             tPos=fText->moveIndex32(tPos, -1);
3287         }
3288         if (fOP->contains(fText->char32At(tPos))) {
3289             continue;
3290         }
3291 
3292 
3293         // LB 15    QU SP* x OP
3294         if (fOP->contains(thisChar)) {
3295             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3296             int tPos = prevPos;
3297             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3298                 tPos = fText->moveIndex32(tPos, -1);
3299             }
3300             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3301                 tPos = fText->moveIndex32(tPos, -1);
3302             }
3303             if (fQU->contains(fText->char32At(tPos))) {
3304                 continue;
3305             }
3306         }
3307 
3308 
3309 
3310         // LB 16   (CL | CP) SP* x NS
3311         //    Scan backwards for SP* CM* (CL | CP)
3312         if (fNS->contains(thisChar)) {
3313             int tPos = prevPos;
3314             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3315                 tPos = fText->moveIndex32(tPos, -1);
3316             }
3317             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3318                 tPos = fText->moveIndex32(tPos, -1);
3319             }
3320             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3321                 continue;
3322             }
3323         }
3324 
3325 
3326         // LB 17        B2 SP* x B2
3327         if (fB2->contains(thisChar)) {
3328             //  Scan backwards, checking for the B2 CM* SP* sequence.
3329             tPos = prevPos;
3330             if (fSP->contains(prevChar)) {
3331                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3332                     tPos=fText->moveIndex32(tPos, -1);
3333                 }
3334             }
3335             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3336                 tPos=fText->moveIndex32(tPos, -1);
3337             }
3338             if (fB2->contains(fText->char32At(tPos))) {
3339                 continue;
3340             }
3341         }
3342 
3343 
3344         // LB 18    break after space
3345         if (fSP->contains(prevChar)) {
3346             break;
3347         }
3348 
3349         // LB 19
3350         //    x   QU
3351         //    QU  x
3352         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3353             continue;
3354         }
3355 
3356         // LB 20  Break around a CB
3357         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3358             break;
3359         }
3360 
3361         // LB 21
3362         if (fBA->contains(thisChar) ||
3363             fHY->contains(thisChar) ||
3364             fNS->contains(thisChar) ||
3365             fBB->contains(prevChar) )   {
3366             continue;
3367         }
3368 
3369         // LB 21a
3370         //   HL (HY | BA) x
3371         if (fHL->contains(prevCharX2) &&
3372                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3373             continue;
3374         }
3375 
3376         // LB 21b
3377         //   SY x HL
3378         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3379             continue;
3380         }
3381 
3382         // LB 22
3383         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3384             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3385             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3386             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3387             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3388             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3389             continue;
3390         }
3391 
3392 
3393         // LB 23    ID x PO
3394         //          AL x NU
3395         //          HL x NU
3396         //          NU x AL
3397         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3398             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3399             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3400             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3401             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3402             continue;
3403         }
3404 
3405         // LB 24  Do not break between prefix and letters or ideographs.
3406         //        PR x ID
3407         //        PR x (AL | HL)
3408         //        PO x (AL | HL)
3409         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3410             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3411             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3412             continue;
3413         }
3414 
3415 
3416 
3417         // LB 25    Numbers
3418         if (fNumberMatcher->lookingAt(prevPos, status)) {
3419             if (U_FAILURE(status)) {
3420                 break;
3421             }
3422             // Matched a number.  But could have been just a single digit, which would
3423             //    not represent a "no break here" between prevChar and thisChar
3424             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3425             if (numEndIdx > pos) {
3426                 // Number match includes at least our two chars being checked
3427                 if (numEndIdx > nextPos) {
3428                     // Number match includes additional chars.  Update pos and nextPos
3429                     //   so that next loop iteration will continue at the end of the number,
3430                     //   checking for breaks between last char in number & whatever follows.
3431                     pos = nextPos = numEndIdx;
3432                     do {
3433                         pos = fText->moveIndex32(pos, -1);
3434                         thisChar = fText->char32At(pos);
3435                     } while (fCM->contains(thisChar));
3436                 }
3437                 continue;
3438             }
3439         }
3440 
3441 
3442         // LB 26 Do not break a Korean syllable.
3443         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3444                                         fJV->contains(thisChar) ||
3445                                         fH2->contains(thisChar) ||
3446                                         fH3->contains(thisChar))) {
3447                                             continue;
3448                                         }
3449 
3450         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3451             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3452                 continue;
3453         }
3454 
3455         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3456             fJT->contains(thisChar)) {
3457                 continue;
3458         }
3459 
3460         // LB 27 Treat a Korean Syllable Block the same as ID.
3461         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3462             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3463             fIN->contains(thisChar)) {
3464                 continue;
3465             }
3466         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3467             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3468             fPO->contains(thisChar)) {
3469                 continue;
3470             }
3471         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3472             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3473                 continue;
3474             }
3475 
3476 
3477 
3478         // LB 28  Do not break between alphabetics ("at").
3479         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3480             continue;
3481         }
3482 
3483         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3484         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3485             continue;
3486         }
3487 
3488         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3489         //          (AL | NU) x OP
3490         //          CP x (AL | NU)
3491         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3492             continue;
3493         }
3494         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3495             continue;
3496         }
3497 
3498         // LB30a  Do not break between regional indicators.
3499         //        RI x RI
3500         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3501             continue;
3502         }
3503 
3504         // LB 31    Break everywhere else
3505         break;
3506 
3507     }
3508 
3509     return pos;
3510 }
3511 
3512 
charClasses()3513 UVector  *RBBILineMonkey::charClasses() {
3514     return fSets;
3515 }
3516 
3517 
~RBBILineMonkey()3518 RBBILineMonkey::~RBBILineMonkey() {
3519     delete fSets;
3520 
3521     delete fBK;
3522     delete fCR;
3523     delete fLF;
3524     delete fCM;
3525     delete fNL;
3526     delete fWJ;
3527     delete fZW;
3528     delete fGL;
3529     delete fCB;
3530     delete fSP;
3531     delete fB2;
3532     delete fBA;
3533     delete fBB;
3534     delete fHY;
3535     delete fH2;
3536     delete fH3;
3537     delete fCL;
3538     delete fCP;
3539     delete fEX;
3540     delete fIN;
3541     delete fJL;
3542     delete fJV;
3543     delete fJT;
3544     delete fNS;
3545     delete fOP;
3546     delete fQU;
3547     delete fIS;
3548     delete fNU;
3549     delete fPO;
3550     delete fPR;
3551     delete fSY;
3552     delete fAI;
3553     delete fAL;
3554     delete fCJ;
3555     delete fHL;
3556     delete fID;
3557     delete fRI;
3558     delete fSA;
3559     delete fSG;
3560     delete fXX;
3561 
3562     delete fCharBI;
3563     delete fNumberMatcher;
3564 }
3565 
3566 
3567 //-------------------------------------------------------------------------------------------
3568 //
3569 //   TestMonkey
3570 //
3571 //     params
3572 //       seed=nnnnn        Random number starting seed.
3573 //                         Setting the seed allows errors to be reproduced.
3574 //       loop=nnn          Looping count.  Controls running time.
3575 //                         -1:  run forever.
3576 //                          0 or greater:  run length.
3577 //
3578 //       type = char | word | line | sent | title
3579 //
3580 //-------------------------------------------------------------------------------------------
3581 
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3582 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3583     int32_t val = defaultVal;
3584     name.append(" *= *(-?\\d+)");
3585     UErrorCode status = U_ZERO_ERROR;
3586     RegexMatcher m(name, params, 0, status);
3587     if (m.find()) {
3588         // The param exists.  Convert the string to an int.
3589         char valString[100];
3590         int32_t paramLength = m.end(1, status) - m.start(1, status);
3591         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3592             paramLength = (int32_t)(sizeof(valString)-2);
3593         }
3594         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3595         val = strtol(valString,  NULL, 10);
3596 
3597         // Delete this parameter from the params string.
3598         m.reset();
3599         params = m.replaceFirst("", status);
3600     }
3601     U_ASSERT(U_SUCCESS(status));
3602     return val;
3603 }
3604 #endif
3605 
3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3607 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3608                                     BreakIterator *bi,
3609                                     int expected[],
3610                                     int expectedcount)
3611 {
3612     int count = 0;
3613     int i = 0;
3614     int forward[50];
3615     bi->setText(ustr);
3616     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3617         forward[count] = i;
3618         if (count < expectedcount && expected[count] != i) {
3619             test->errln("break forward test failed: expected %d but got %d",
3620                         expected[count], i);
3621             break;
3622         }
3623         count ++;
3624     }
3625     if (count != expectedcount) {
3626         printStringBreaks(ustr, expected, expectedcount);
3627         test->errln("break forward test failed: missed %d match",
3628                     expectedcount - count);
3629         return;
3630     }
3631     // testing boundaries
3632     for (i = 1; i < expectedcount; i ++) {
3633         int j = expected[i - 1];
3634         if (!bi->isBoundary(j)) {
3635             printStringBreaks(ustr, expected, expectedcount);
3636             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3637             return;
3638         }
3639         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3640             if (bi->isBoundary(j)) {
3641                 printStringBreaks(ustr, expected, expectedcount);
3642                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3643                 return;
3644             }
3645         }
3646     }
3647 
3648     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3649         count --;
3650         if (forward[count] != i) {
3651             printStringBreaks(ustr, expected, expectedcount);
3652             test->errln("happy break test previous() failed: expected %d but got %d",
3653                         forward[count], i);
3654             break;
3655         }
3656     }
3657     if (count != 0) {
3658         printStringBreaks(ustr, expected, expectedcount);
3659         test->errln("break test previous() failed: missed a match");
3660         return;
3661     }
3662 
3663     // testing preceding
3664     for (i = 0; i < expectedcount - 1; i ++) {
3665         // int j = expected[i] + 1;
3666         int j = ustr.moveIndex32(expected[i], 1);
3667         for (; j <= expected[i + 1]; j ++) {
3668             if (bi->preceding(j) != expected[i]) {
3669                 printStringBreaks(ustr, expected, expectedcount);
3670                 test->errln("preceding(): Not expecting boundary at position %d", j);
3671                 return;
3672             }
3673         }
3674     }
3675 }
3676 #endif
3677 
TestWordBreaks(void)3678 void RBBITest::TestWordBreaks(void)
3679 {
3680 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3681 
3682     Locale        locale("en");
3683     UErrorCode    status = U_ZERO_ERROR;
3684     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3685     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3686     // Replaced any C+J characters in a row with a random sequence of characters
3687     // of the same length to make our C+J segmentation not get in the way.
3688     static const char *strlist[] =
3689     {
3690     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3691     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3692     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3693     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3694     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3695     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3696     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3697     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3698     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3699     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3700     "\\u2027\\U000e0067\\u0a47\\u00b7",
3701     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3702     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3703     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3704     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3705     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3706     "\\u0027\\u11af\\U000e0057\\u0602",
3707     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3708     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3709     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3710     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3711     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3712     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3713     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3714     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3715     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3716     "\\u18f4\\U000e0049\\u20e7\\u2027",
3717     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3718     "\\ua183\\u102d\\u0bec\\u003a",
3719     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3720     "\\u003a\\u0e57\\u0fad\\u002e",
3721     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3722     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3723     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3724     "\\u003a\\u0664\\u00b7\\u1fba",
3725     "\\u003b\\u0027\\u00b7\\u47a3",
3726     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3727     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3728     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3729     };
3730     int loop;
3731     if (U_FAILURE(status)) {
3732         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3733         return;
3734     }
3735     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3736         // printf("looping %d\n", loop);
3737         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3738         // RBBICharMonkey monkey;
3739         RBBIWordMonkey monkey;
3740 
3741         int expected[50];
3742         int expectedcount = 0;
3743 
3744         monkey.setText(ustr);
3745         int i;
3746         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3747             expected[expectedcount ++] = i;
3748         }
3749 
3750         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3751     }
3752     delete bi;
3753 #endif
3754 }
3755 
TestWordBoundary(void)3756 void RBBITest::TestWordBoundary(void)
3757 {
3758     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3759     Locale        locale("en");
3760     UErrorCode    status = U_ZERO_ERROR;
3761     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3762     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3763     UChar         str[50];
3764     static const char *strlist[] =
3765     {
3766     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3767     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3768     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3769     "\\u2027\\U000e0067\\u0a47\\u00b7",
3770     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3771     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3772     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3773     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3774     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3775     "\\u0027\\u11af\\U000e0057\\u0602",
3776     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3777     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3778     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3779     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3780     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3781     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3782     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3783     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3784     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3785     "\\u58f4\\U000e0049\\u20e7\\u2027",
3786     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3787     "\\ua183\\u102d\\u0bec\\u003a",
3788     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3789     "\\u003a\\u0e57\\u0fad\\u002e",
3790     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3791     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3792     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3793     "\\u003a\\u0664\\u00b7\\u1fba",
3794     "\\u003b\\u0027\\u00b7\\u47a3",
3795     };
3796     int loop;
3797     if (U_FAILURE(status)) {
3798         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3799         return;
3800     }
3801     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3802         // printf("looping %d\n", loop);
3803         u_unescape(strlist[loop], str, 20);
3804         UnicodeString ustr(str);
3805         int forward[50];
3806         int count = 0;
3807 
3808         bi->setText(ustr);
3809         int prev = 0;
3810         int i;
3811         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3812             forward[count ++] = i;
3813             if (i > prev) {
3814                 int j;
3815                 for (j = prev + 1; j < i; j ++) {
3816                     if (bi->isBoundary(j)) {
3817                         printStringBreaks(ustr, forward, count);
3818                         errln("happy boundary test failed: expected %d not a boundary",
3819                                j);
3820                         return;
3821                     }
3822                 }
3823             }
3824             if (!bi->isBoundary(i)) {
3825                 printStringBreaks(ustr, forward, count);
3826                 errln("happy boundary test failed: expected %d a boundary",
3827                        i);
3828                 return;
3829             }
3830             prev = i;
3831         }
3832     }
3833     delete bi;
3834 }
3835 
TestLineBreaks(void)3836 void RBBITest::TestLineBreaks(void)
3837 {
3838 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3839     Locale        locale("en");
3840     UErrorCode    status = U_ZERO_ERROR;
3841     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3842     const int32_t  STRSIZE = 50;
3843     UChar         str[STRSIZE];
3844     static const char *strlist[] =
3845     {
3846      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3847      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3848              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3849      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3850              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3851      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3852      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3853      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3854      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3855      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3856      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3857      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3858      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3859      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3860      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3861      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3862      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3863      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3864      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3865      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3866      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3867      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3868      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3869      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3870      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3871      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3872      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3873      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3874      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3875      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3876      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3877      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3878      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3879      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3880      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3881      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3882      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3883      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3884      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3885      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3886      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3887      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3888          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3889          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3890          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3891      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3892          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3893     };
3894     int loop;
3895     TEST_ASSERT_SUCCESS(status);
3896     if (U_FAILURE(status)) {
3897         return;
3898     }
3899     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3900         // printf("looping %d\n", loop);
3901         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3902         if (t >= STRSIZE) {
3903             TEST_ASSERT(FALSE);
3904             continue;
3905         }
3906 
3907 
3908         UnicodeString ustr(str);
3909         RBBILineMonkey monkey;
3910         if (U_FAILURE(monkey.deferredStatus)) {
3911             continue;
3912         }
3913 
3914         const int EXPECTEDSIZE = 50;
3915         int expected[EXPECTEDSIZE];
3916         int expectedcount = 0;
3917 
3918         monkey.setText(ustr);
3919         int i;
3920         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3921             if (expectedcount >= EXPECTEDSIZE) {
3922                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3923                 return;
3924             }
3925             expected[expectedcount ++] = i;
3926         }
3927 
3928         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3929     }
3930     delete bi;
3931 #endif
3932 }
3933 
TestSentBreaks(void)3934 void RBBITest::TestSentBreaks(void)
3935 {
3936 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3937     Locale        locale("en");
3938     UErrorCode    status = U_ZERO_ERROR;
3939     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3940     UChar         str[200];
3941     static const char *strlist[] =
3942     {
3943      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3944      "This\n",
3945      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3946      "\"Sentence ending with a quote.\" Bye.",
3947      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3948      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3949      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3950      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3951      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3952      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3953      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3954              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3955              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3956              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3957      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3958              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3959              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3960              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3961              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3962              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3963     };
3964     int loop;
3965     if (U_FAILURE(status)) {
3966         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3967         return;
3968     }
3969     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3970         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3971         UnicodeString ustr(str);
3972 
3973         RBBISentMonkey monkey;
3974         if (U_FAILURE(monkey.deferredStatus)) {
3975             continue;
3976         }
3977 
3978         const int EXPECTEDSIZE = 50;
3979         int expected[EXPECTEDSIZE];
3980         int expectedcount = 0;
3981 
3982         monkey.setText(ustr);
3983         int i;
3984         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3985             if (expectedcount >= EXPECTEDSIZE) {
3986                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3987                 return;
3988             }
3989             expected[expectedcount ++] = i;
3990         }
3991 
3992         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3993     }
3994     delete bi;
3995 #endif
3996 }
3997 
TestMonkey(char * params)3998 void RBBITest::TestMonkey(char *params) {
3999 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4000 
4001     UErrorCode     status    = U_ZERO_ERROR;
4002     int32_t        loopCount = 500;
4003     int32_t        seed      = 1;
4004     UnicodeString  breakType = "all";
4005     Locale         locale("en");
4006     UBool          useUText  = FALSE;
4007 
4008     if (quick == FALSE) {
4009         loopCount = 10000;
4010     }
4011 
4012     if (params) {
4013         UnicodeString p(params);
4014         loopCount = getIntParam("loop", p, loopCount);
4015         seed      = getIntParam("seed", p, seed);
4016 
4017         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4018         if (m.find()) {
4019             breakType = m.group(1, status);
4020             m.reset();
4021             p = m.replaceFirst("", status);
4022         }
4023 
4024         RegexMatcher u(" *utext", p, 0, status);
4025         if (u.find()) {
4026             useUText = TRUE;
4027             u.reset();
4028             p = u.replaceFirst("", status);
4029         }
4030 
4031 
4032         // m.reset(p);
4033         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4034             // Each option is stripped out of the option string as it is processed.
4035             // All options have been checked.  The option string should have been completely emptied..
4036             char buf[100];
4037             p.extract(buf, sizeof(buf), NULL, status);
4038             buf[sizeof(buf)-1] = 0;
4039             errln("Unrecognized or extra parameter:  %s\n", buf);
4040             return;
4041         }
4042 
4043     }
4044 
4045     if (breakType == "char" || breakType == "all") {
4046         RBBICharMonkey  m;
4047         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4048         if (U_SUCCESS(status)) {
4049             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4050             if (breakType == "all" && useUText==FALSE) {
4051                 // Also run a quick test with UText when "all" is specified
4052                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4053             }
4054         }
4055         else {
4056             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4057         }
4058         delete bi;
4059     }
4060 
4061     if (breakType == "word" || breakType == "all") {
4062         logln("Word Break Monkey Test");
4063         RBBIWordMonkey  m;
4064         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4065         if (U_SUCCESS(status)) {
4066             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4067         }
4068         else {
4069             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4070         }
4071         delete bi;
4072     }
4073 
4074     if (breakType == "line" || breakType == "all") {
4075         logln("Line Break Monkey Test");
4076         RBBILineMonkey  m;
4077         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4078         if (loopCount >= 10) {
4079             loopCount = loopCount / 5;   // Line break runs slower than the others.
4080         }
4081         if (U_SUCCESS(status)) {
4082             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4083         }
4084         else {
4085             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4086         }
4087         delete bi;
4088     }
4089 
4090     if (breakType == "sent" || breakType == "all"  ) {
4091         logln("Sentence Break Monkey Test");
4092         RBBISentMonkey  m;
4093         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4094         if (loopCount >= 10) {
4095             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4096         }
4097         if (U_SUCCESS(status)) {
4098             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4099         }
4100         else {
4101             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4102         }
4103         delete bi;
4104     }
4105 
4106 #endif
4107 }
4108 
4109 //
4110 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4111 //    Parameters:
4112 //       bi      - the break iterator to use
4113 //       mk      - MonkeyKind, abstraction for obtaining expected results
4114 //       name    - Name of test (char, word, etc.) for use in error messages
4115 //       seed    - Seed for starting random number generator (parameter from user)
4116 //       numIterations
4117 //
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4118 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4119                          int32_t numIterations, UBool useUText) {
4120 
4121 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4122 
4123     const int32_t    TESTSTRINGLEN = 500;
4124     UnicodeString    testText;
4125     int32_t          numCharClasses;
4126     UVector          *chClasses;
4127     int              expected[TESTSTRINGLEN*2 + 1];
4128     int              expectedCount = 0;
4129     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4130     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4131     char             reverseBreaks[TESTSTRINGLEN*2+1];
4132     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4133     char             followingBreaks[TESTSTRINGLEN*2+1];
4134     char             precedingBreaks[TESTSTRINGLEN*2+1];
4135     int              i;
4136     int              loopCount = 0;
4137 
4138     m_seed = seed;
4139 
4140     numCharClasses = mk.charClasses()->size();
4141     chClasses      = mk.charClasses();
4142 
4143     // Check for errors that occured during the construction of the MonkeyKind object.
4144     //  Can't report them where they occured because errln() is a method coming from intlTest,
4145     //  and is not visible outside of RBBITest :-(
4146     if (U_FAILURE(mk.deferredStatus)) {
4147         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4148         return;
4149     }
4150 
4151     // Verify that the character classes all have at least one member.
4152     for (i=0; i<numCharClasses; i++) {
4153         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4154         if (s == NULL || s->size() == 0) {
4155             errln("Character Class #%d is null or of zero size.", i);
4156             return;
4157         }
4158     }
4159 
4160     while (loopCount < numIterations || numIterations == -1) {
4161         if (numIterations == -1 && loopCount % 10 == 0) {
4162             // If test is running in an infinite loop, display a periodic tic so
4163             //   we can tell that it is making progress.
4164             fprintf(stderr, ".");
4165         }
4166         // Save current random number seed, so that we can recreate the random numbers
4167         //   for this loop iteration in event of an error.
4168         seed = m_seed;
4169 
4170         // Populate a test string with data.
4171         testText.truncate(0);
4172         for (i=0; i<TESTSTRINGLEN; i++) {
4173             int32_t  aClassNum = m_rand() % numCharClasses;
4174             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4175             int32_t   charIdx = m_rand() % classSet->size();
4176             UChar32   c = classSet->charAt(charIdx);
4177             if (c < 0) {   // TODO:  deal with sets containing strings.
4178                 errln("c < 0");
4179                 break;
4180             }
4181             testText.append(c);
4182         }
4183 
4184         // Calculate the expected results for this test string.
4185         mk.setText(testText);
4186         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4187         expectedBreaks[0] = 1;
4188         int32_t breakPos = 0;
4189         expectedCount = 0;
4190         for (;;) {
4191             breakPos = mk.next(breakPos);
4192             if (breakPos == -1) {
4193                 break;
4194             }
4195             if (breakPos > testText.length()) {
4196                 errln("breakPos > testText.length()");
4197             }
4198             expectedBreaks[breakPos] = 1;
4199             U_ASSERT(expectedCount<testText.length());
4200             expected[expectedCount ++] = breakPos;
4201             (void)expected;   // Set but not used warning.
4202                               // TODO (andy): check it out.
4203         }
4204 
4205         // Find the break positions using forward iteration
4206         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4207         if (useUText) {
4208             UErrorCode status = U_ZERO_ERROR;
4209             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4210             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4211             bi->setText(testUText, status);
4212             TEST_ASSERT_SUCCESS(status);
4213             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4214                                       //  This UText can be closed immediately, so long as the
4215                                       //  testText string continues to exist.
4216         } else {
4217             bi->setText(testText);
4218         }
4219 
4220         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4221             if (i < 0 || i > testText.length()) {
4222                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4223                 break;
4224             }
4225             forwardBreaks[i] = 1;
4226         }
4227 
4228         // Find the break positions using reverse iteration
4229         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4230         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4231             if (i < 0 || i > testText.length()) {
4232                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4233                 break;
4234             }
4235             reverseBreaks[i] = 1;
4236         }
4237 
4238         // Find the break positions using isBoundary() tests.
4239         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4240         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4241         for (i=0; i<=testText.length(); i++) {
4242             isBoundaryBreaks[i] = bi->isBoundary(i);
4243         }
4244 
4245 
4246         // Find the break positions using the following() function.
4247         // printf(".");
4248         memset(followingBreaks, 0, sizeof(followingBreaks));
4249         int32_t   lastBreakPos = 0;
4250         followingBreaks[0] = 1;
4251         for (i=0; i<testText.length(); i++) {
4252             breakPos = bi->following(i);
4253             if (breakPos <= i ||
4254                 breakPos < lastBreakPos ||
4255                 breakPos > testText.length() ||
4256                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4257                 errln("%s break monkey test: "
4258                     "Out of range value returned by BreakIterator::following().\n"
4259                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4260                          name, seed, i, breakPos, lastBreakPos);
4261                 break;
4262             }
4263             followingBreaks[breakPos] = 1;
4264             lastBreakPos = breakPos;
4265         }
4266 
4267         // Find the break positions using the preceding() function.
4268         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4269         lastBreakPos = testText.length();
4270         precedingBreaks[testText.length()] = 1;
4271         for (i=testText.length(); i>0; i--) {
4272             breakPos = bi->preceding(i);
4273             if (breakPos >= i ||
4274                 breakPos > lastBreakPos ||
4275                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4276                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4277                 errln("%s break monkey test: "
4278                     "Out of range value returned by BreakIterator::preceding().\n"
4279                     "index=%d;  prev returned %d; lastBreak=%d" ,
4280                     name,  i, breakPos, lastBreakPos);
4281                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4282                     precedingBreaks[i] = 2;   // Forces an error.
4283                 }
4284             } else {
4285                 if (breakPos >= 0) {
4286                     precedingBreaks[breakPos] = 1;
4287                 }
4288                 lastBreakPos = breakPos;
4289             }
4290         }
4291 
4292         // Compare the expected and actual results.
4293         for (i=0; i<=testText.length(); i++) {
4294             const char *errorType = NULL;
4295             if  (forwardBreaks[i] != expectedBreaks[i]) {
4296                 errorType = "next()";
4297             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4298                 errorType = "previous()";
4299             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4300                 errorType = "isBoundary()";
4301             } else if (followingBreaks[i] != expectedBreaks[i]) {
4302                 errorType = "following()";
4303             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4304                 errorType = "preceding()";
4305             }
4306 
4307 
4308             if (errorType != NULL) {
4309                 // Format a range of the test text that includes the failure as
4310                 //  a data item that can be included in the rbbi test data file.
4311 
4312                 // Start of the range is the last point where expected and actual results
4313                 //   both agreed that there was a break position.
4314                 int startContext = i;
4315                 int32_t count = 0;
4316                 for (;;) {
4317                     if (startContext==0) { break; }
4318                     startContext --;
4319                     if (expectedBreaks[startContext] != 0) {
4320                         if (count == 2) break;
4321                         count ++;
4322                     }
4323                 }
4324 
4325                 // End of range is two expected breaks past the start position.
4326                 int endContext = i + 1;
4327                 int ci;
4328                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4329                     for (;;) {
4330                         if (endContext >= testText.length()) {break;}
4331                         if (expectedBreaks[endContext-1] != 0) {
4332                             if (count == 0) break;
4333                             count --;
4334                         }
4335                         endContext ++;
4336                     }
4337                 }
4338 
4339                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4340                 UnicodeString errorText = "<data>";
4341                 /***if (strcmp(errorType, "next()") == 0) {
4342                     startContext = 0;
4343                     endContext = testText.length();
4344 
4345                     printStringBreaks(testText, expected, expectedCount);
4346                 }***/
4347 
4348                 for (ci=startContext; ci<endContext;) {
4349                     UnicodeString hexChars("0123456789abcdef");
4350                     UChar32  c;
4351                     int      bn;
4352                     c = testText.char32At(ci);
4353                     if (ci == i) {
4354                         // This is the location of the error.
4355                         errorText.append("<?>");
4356                     } else if (expectedBreaks[ci] != 0) {
4357                         // This a non-error expected break position.
4358                         errorText.append("\\");
4359                     }
4360                     if (c < 0x10000) {
4361                         errorText.append("\\u");
4362                         for (bn=12; bn>=0; bn-=4) {
4363                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4364                         }
4365                     } else {
4366                         errorText.append("\\U");
4367                         for (bn=28; bn>=0; bn-=4) {
4368                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4369                         }
4370                     }
4371                     ci = testText.moveIndex32(ci, 1);
4372                 }
4373                 errorText.append("\\");
4374                 errorText.append("</data>\n");
4375 
4376                 // Output the error
4377                 char  charErrorTxt[500];
4378                 UErrorCode status = U_ZERO_ERROR;
4379                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4380                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4381                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4382 
4383                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4384                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4385                     errorType, seed, i, charErrorTxt);
4386                 break;
4387             }
4388         }
4389 
4390         loopCount++;
4391     }
4392 #endif
4393 }
4394 
4395 
4396 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4397 //             This test checks the initial patch,
4398 //             which is to just keep it from crashing.  Correct word boundaries
4399 //             await a proper fix to the dictionary code.
4400 //
TestBug5532(void)4401 void RBBITest::TestBug5532(void)  {
4402    // Text includes a mixture of Thai and Latin.
4403    const unsigned char utf8Data[] = {
4404            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4405            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4406            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4407            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4408            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4409            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4410            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4411            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4412            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4413            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4414            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4415 
4416     UErrorCode status = U_ZERO_ERROR;
4417     UText utext=UTEXT_INITIALIZER;
4418     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4419     TEST_ASSERT_SUCCESS(status);
4420 
4421     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4422     TEST_ASSERT_SUCCESS(status);
4423     if (U_SUCCESS(status)) {
4424         bi->setText(&utext, status);
4425         TEST_ASSERT_SUCCESS(status);
4426 
4427         int32_t breakCount = 0;
4428         int32_t previousBreak = -1;
4429         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4430             // For now, just make sure that the break iterator doesn't hang.
4431             TEST_ASSERT(previousBreak < bi->current());
4432             previousBreak = bi->current();
4433         }
4434         TEST_ASSERT(breakCount > 0);
4435     }
4436     delete bi;
4437     utext_close(&utext);
4438 }
4439 
4440 
TestBug9983(void)4441 void RBBITest::TestBug9983(void)  {
4442     UnicodeString text = UnicodeString("\\u002A"  // * Other
4443                                        "\\uFF65"  //   Other
4444                                        "\\u309C"  //   Katakana
4445                                        "\\uFF9F"  //   Extend
4446                                        "\\uFF65"  //   Other
4447                                        "\\u0020"  //   Other
4448                                        "\\u0000").unescape();
4449 
4450     UErrorCode status = U_ZERO_ERROR;
4451     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4452         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4453     TEST_ASSERT_SUCCESS(status);
4454     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4455         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4456     TEST_ASSERT_SUCCESS(status);
4457     if (U_FAILURE(status)) {
4458         return;
4459     }
4460     int32_t offset, rstatus, iterationCount;
4461 
4462     brkiter->setText(text);
4463     brkiter->last();
4464     iterationCount = 0;
4465     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4466         iterationCount++;
4467         rstatus = brkiter->getRuleStatus();
4468         (void)rstatus;     // Suppress set but not used warning.
4469         if (iterationCount >= 10) {
4470            break;
4471         }
4472     }
4473     TEST_ASSERT(iterationCount == 6);
4474 
4475     brkiterPOSIX->setText(text);
4476     brkiterPOSIX->last();
4477     iterationCount = 0;
4478     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4479         iterationCount++;
4480         rstatus = brkiterPOSIX->getRuleStatus();
4481         (void)rstatus;     // Suppress set but not used warning.
4482         if (iterationCount >= 10) {
4483            break;
4484         }
4485     }
4486     TEST_ASSERT(iterationCount == 6);
4487 }
4488 
4489 
4490 //
4491 //  TestDebug    -  A place-holder test for debugging purposes.
4492 //                  For putting in fragments of other tests that can be invoked
4493 //                  for tracing  without a lot of unwanted extra stuff happening.
4494 //
TestDebug(void)4495 void RBBITest::TestDebug(void) {
4496 #if 0
4497     UErrorCode   status = U_ZERO_ERROR;
4498     int pos = 0;
4499     int ruleStatus = 0;
4500 
4501     RuleBasedBreakIterator* bi =
4502        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4503        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4504        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4505     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4506     // UnicodeString s("Aaa.  Bcd");
4507     s = s.unescape();
4508     bi->setText(s);
4509     UBool r = bi->isBoundary(8);
4510     printf("%s", r?"true":"false");
4511     return;
4512     pos = bi->last();
4513     do {
4514         // ruleStatus = bi->getRuleStatus();
4515         printf("%d\t%d\n", pos, ruleStatus);
4516         pos = bi->previous();
4517     } while (pos != BreakIterator::DONE);
4518 #endif
4519 }
4520 
TestProperties()4521 void RBBITest::TestProperties() {
4522     UErrorCode errorCode = U_ZERO_ERROR;
4523     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4524     if (!prependSet.isEmpty()) {
4525         errln(
4526             "[:GCB=Prepend:] is not empty any more. "
4527             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4528             "change this test to the opposite condition.");
4529     }
4530 }
4531 
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4533