• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  // Copyright (C) 2016 and later: Unicode, Inc. and others.
2  // License & terms of use: http://www.unicode.org/copyright.html
3  /********************************************************************
4   * COPYRIGHT:
5   * Copyright (c) 1999-2016, International Business Machines Corporation and
6   * others. All Rights Reserved.
7   ********************************************************************/
8  /************************************************************************
9  *   Date        Name        Description
10  *   12/15/99    Madhu        Creation.
11  *   01/12/2000  Madhu        Updated for changed API and added new tests
12  ************************************************************************/
13  
14  #include "unicode/utypes.h"
15  #if !UCONFIG_NO_BREAK_ITERATION
16  
17  #include <stdio.h>
18  #include <stdlib.h>
19  #include <string.h>
20  
21  #include "unicode/brkiter.h"
22  #include "unicode/localpointer.h"
23  #include "unicode/numfmt.h"
24  #include "unicode/rbbi.h"
25  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
26  #include "unicode/regex.h"
27  #endif
28  #include "unicode/schriter.h"
29  #include "unicode/uchar.h"
30  #include "unicode/utf16.h"
31  #include "unicode/ucnv.h"
32  #include "unicode/uniset.h"
33  #include "unicode/uscript.h"
34  #include "unicode/ustring.h"
35  #include "unicode/utext.h"
36  
37  #include "charstr.h"
38  #include "cmemory.h"
39  #include "intltest.h"
40  #include "rbbitst.h"
41  #include "utypeinfo.h"  // for 'typeid' to work
42  #include "uvector.h"
43  #include "uvectr32.h"
44  
45  #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
46  #include "unicode/filteredbrk.h"
47  #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
48  
// TEST_ASSERT(x)
//    Report a test failure, tagged with the current file and line, when the
//    expression x evaluates to false.  Must be used inside a member function of
//    a class providing errln().
#define TEST_ASSERT(x) { \
    if (!(x)) { \
        errln("Failure in file %s, line %d", __FILE__, __LINE__); \
    } \
}

// TEST_ASSERT_SUCCESS(errcode)
//    Report a failure, tagged with the current file, line and the name of the
//    error code, when errcode is an ICU failure code.  Reporting goes through
//    errcheckln().
#define TEST_ASSERT_SUCCESS(errcode) { \
    if (U_FAILURE(errcode)) { \
        errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode)); \
    } \
}
54  
55  
56  //---------------------------------------------
57  // runIndexedTest
58  //---------------------------------------------
59  
60  
61  //  Note:  Before adding new tests to this file, check whether the desired test data can
62  //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
63  //         it's much less work than writing a new test, diagnostic output in the event of failures
64  //         is good, and the test data file will is shared with ICU4J, so eventually the test
65  //         will run there as well, without additional effort.
66  
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)67  void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
68  {
69      if (exec) logln("TestSuite RuleBasedBreakIterator: ");
70      fTestParams = params;
71  
72      TESTCASE_AUTO_BEGIN;
73  #if !UCONFIG_NO_FILE_IO
74      TESTCASE_AUTO(TestBug4153072);
75  #endif
76      TESTCASE_AUTO(TestStatusReturn);
77  #if !UCONFIG_NO_FILE_IO
78      TESTCASE_AUTO(TestUnicodeFiles);
79      TESTCASE_AUTO(TestEmptyString);
80  #endif
81      TESTCASE_AUTO(TestGetAvailableLocales);
82      TESTCASE_AUTO(TestGetDisplayName);
83  #if !UCONFIG_NO_FILE_IO
84      TESTCASE_AUTO(TestEndBehaviour);
85      TESTCASE_AUTO(TestWordBreaks);
86      TESTCASE_AUTO(TestWordBoundary);
87      TESTCASE_AUTO(TestLineBreaks);
88      TESTCASE_AUTO(TestSentBreaks);
89      TESTCASE_AUTO(TestExtended);
90  #endif
91  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
92      TESTCASE_AUTO(TestMonkey);
93  #endif
94  #if !UCONFIG_NO_FILE_IO
95      TESTCASE_AUTO(TestBug3818);
96  #endif
97      TESTCASE_AUTO(TestDebug);
98  #if !UCONFIG_NO_FILE_IO
99      TESTCASE_AUTO(TestBug5775);
100  #endif
101      TESTCASE_AUTO(TestBug9983);
102      TESTCASE_AUTO(TestDictRules);
103      TESTCASE_AUTO(TestBug5532);
104      TESTCASE_AUTO(TestBug7547);
105      TESTCASE_AUTO(TestBug12797);
106      TESTCASE_AUTO(TestBug12918);
107      TESTCASE_AUTO_END;
108  }
109  
110  
111  //---------------------------------------------------------------------------
112  //
113  //   class BITestData   Holds a set of Break iterator test data and results
114  //                      Includes
115  //                         - the string data to be broken
116  //                         - a vector of the expected break positions.
117  //                         - a vector of source line numbers for the data,
118  //                               (to help see where errors occured.)
119  //                         - The expected break tag values.
120  //                         - Vectors of actual break positions and tag values.
121  //                         - Functions for comparing actual with expected and
122  //                            reporting errors.
123  //
124  //----------------------------------------------------------------------------
125  class BITestData {
126  public:
127      UnicodeString    fDataToBreak;
128      UVector          fExpectedBreakPositions;
129      UVector          fExpectedTags;
130      UVector          fLineNum;
131      UVector          fActualBreakPositions;   // Test Results.
132      UVector          fActualTags;
133  
134      BITestData(UErrorCode &status);
135      void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
136      void             checkResults(const char *heading, RBBITest *test);
137      void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
138      void             clearResults();
139  };
140  
141  //
142  // Constructor.
143  //
BITestData(UErrorCode & status)144  BITestData::BITestData(UErrorCode &status)
145  : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
146    fActualTags(status)
147  {
148  }
149  
150  //
151  // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
152  //                 The macro form collects the line number, which is helpful
153  //                 when tracking down failures.
154  //
155  //                 A null data item is inserted at the start of each test's data
156  //                  to put the starting zero into the data list.  The position saved for
157  //                  each non-null item is its ending position.
158  //
159  #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
addDataChunk(const char * data,int32_t tag,int32_t lineNum,UErrorCode status)160  void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
161      if (U_FAILURE(status)) {return;}
162      if (data != NULL) {
163          fDataToBreak.append(CharsToUnicodeString(data));
164      }
165      fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
166      fExpectedTags.addElement(tag, status);
167      fLineNum.addElement(lineNum, status);
168  }
169  
170  
171  //
172  //  checkResults.   Compare the actual and expected break positions, report any differences.
173  //
checkResults(const char * heading,RBBITest * test)174  void BITestData::checkResults(const char *heading, RBBITest *test) {
175      int32_t   expectedIndex = 0;
176      int32_t   actualIndex = 0;
177  
178      for (;;) {
179          // If we've run through both the expected and actual results vectors, we're done.
180          //   break out of the loop.
181          if (expectedIndex >= fExpectedBreakPositions.size() &&
182              actualIndex   >= fActualBreakPositions.size()) {
183              break;
184          }
185  
186  
187          if (expectedIndex >= fExpectedBreakPositions.size()) {
188              err(heading, test, expectedIndex-1, actualIndex);
189              actualIndex++;
190              continue;
191          }
192  
193          if (actualIndex >= fActualBreakPositions.size()) {
194              err(heading, test, expectedIndex, actualIndex-1);
195              expectedIndex++;
196              continue;
197          }
198  
199          if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
200              err(heading, test, expectedIndex, actualIndex);
201              // Try to resync the positions of the indices, to avoid a rash of spurious erros.
202              if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
203                  actualIndex++;
204              } else {
205                  expectedIndex++;
206              }
207              continue;
208          }
209  
210          if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
211              test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
212                  heading, fLineNum.elementAt(expectedIndex),
213                  fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
214          }
215  
216          actualIndex++;
217          expectedIndex++;
218      }
219  }
220  
221  //
222  //  err   -  An error was found.  Report it, along with information about where the
223  //                                incorrectly broken test data appeared in the source file.
224  //
err(const char * heading,RBBITest * test,int32_t expectedIdx,int32_t actualIdx)225  void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
226  {
227      int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
228      int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
229      int32_t   o        = 0;
230      int32_t   line     = fLineNum.elementAti(expectedIdx);
231      if (expectedIdx > 0) {
232          // The line numbers are off by one because a premature break occurs somewhere
233          //    within the previous item, rather than at the start of the current (expected) item.
234          //    We want to report the offset of the unexpected break from the start of
235          //      this previous item.
236          o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
237      }
238      if (actual < expected) {
239          test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
240      } else {
241          test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
242      }
243  }
244  
245  
clearResults()246  void BITestData::clearResults() {
247      fActualBreakPositions.removeAllElements();
248      fActualTags.removeAllElements();
249  }
250  
251  
252  //--------------------------------------------------------------------------------------
253  //
254  //    RBBITest    constructor and destructor
255  //
256  //--------------------------------------------------------------------------------------
257  
RBBITest()258  RBBITest::RBBITest() {
259      fTestParams = NULL;
260  }
261  
262  
~RBBITest()263  RBBITest::~RBBITest() {
264  }
265  
266  //-----------------------------------------------------------------------------------
267  //
268  //   Test for status {tag} return value from break rules.
269  //        TODO:  a more thorough test.
270  //
271  //-----------------------------------------------------------------------------------
TestStatusReturn()272  void RBBITest::TestStatusReturn() {
273       UnicodeString rulesString1("$Letters = [:L:];\n"
274                                    "$Numbers = [:N:];\n"
275                                    "$Letters+{1};\n"
276                                    "$Numbers+{2};\n"
277                                    "Help\\ /me\\!{4};\n"
278                                    "[^$Letters $Numbers];\n"
279                                    "!.*;\n", -1, US_INV);
280       UnicodeString testString1  = "abc123..abc Help me Help me!";
281                                  // 01234567890123456789012345678
282       int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
283       int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
284  
285       UErrorCode status=U_ZERO_ERROR;
286       UParseError    parseError;
287  
288       LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
289       if(U_FAILURE(status)) {
290           dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
291           return;
292       }
293       int32_t  pos;
294       int32_t  i = 0;
295       bi->setText(testString1);
296       for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
297           if (pos != bounds1[i]) {
298               errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
299               break;
300           }
301  
302           int tag = bi->getRuleStatus();
303           if (tag != brkStatus[i]) {
304               errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
305               break;
306           }
307           i++;
308       }
309  }
310  
311  
printStringBreaks(UText * tstr,int expected[],int expectedCount)312  static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
313      UErrorCode status = U_ZERO_ERROR;
314      char name[100];
315      printf("code    alpha extend alphanum type word sent line name\n");
316      int nextExpectedIndex = 0;
317      utext_setNativeIndex(tstr, 0);
318      for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
319          if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
320              printf("------------------------------------------------ %d\n", j);
321              ++nextExpectedIndex;
322          }
323  
324          UChar32 c = utext_next32(tstr);
325          u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
326          printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
327                             u_isUAlphabetic(c),
328                             u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
329                             u_isalnum(c),
330                             u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
331                                                    u_charType(c),
332                                                    U_SHORT_PROPERTY_NAME),
333                             u_getPropertyValueName(UCHAR_WORD_BREAK,
334                                                    u_getIntPropertyValue(c,
335                                                            UCHAR_WORD_BREAK),
336                                                    U_SHORT_PROPERTY_NAME),
337                             u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
338                                     u_getIntPropertyValue(c,
339                                             UCHAR_SENTENCE_BREAK),
340                                     U_SHORT_PROPERTY_NAME),
341                             u_getPropertyValueName(UCHAR_LINE_BREAK,
342                                     u_getIntPropertyValue(c,
343                                             UCHAR_LINE_BREAK),
344                                     U_SHORT_PROPERTY_NAME),
345                             name);
346      }
347  }
348  
349  
printStringBreaks(const UnicodeString & ustr,int expected[],int expectedCount)350  static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
351     UErrorCode status = U_ZERO_ERROR;
352     UText *tstr = NULL;
353     tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
354     if (U_FAILURE(status)) {
355         printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
356         return;
357      }
358     printStringBreaks(tstr, expected, expectedCount);
359     utext_close(tstr);
360  }
361  
362  
TestBug3818()363  void RBBITest::TestBug3818() {
364      UErrorCode  status = U_ZERO_ERROR;
365  
366      // Four Thai words...
367      static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
368                                             0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
369      UnicodeString  thaiStr(thaiWordData);
370  
371      BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
372      if (U_FAILURE(status) || bi == NULL) {
373          errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
374          return;
375      }
376      bi->setText(thaiStr);
377  
378      int32_t  startOfSecondWord = bi->following(1);
379      if (startOfSecondWord != 4) {
380          errln("Fail at file %s, line %d expected start of word at 4, got %d",
381              __FILE__, __LINE__, startOfSecondWord);
382      }
383      startOfSecondWord = bi->following(0);
384      if (startOfSecondWord != 4) {
385          errln("Fail at file %s, line %d expected start of word at 4, got %d",
386              __FILE__, __LINE__, startOfSecondWord);
387      }
388      delete bi;
389  }
390  
391  //----------------------------------------------------------------------------
392  //
393  // generalIteratorTest      Given a break iterator and a set of test data,
394  //                          Run the tests and report the results.
395  //
396  //----------------------------------------------------------------------------
generalIteratorTest(RuleBasedBreakIterator & bi,BITestData & td)397  void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
398  {
399  
400      bi.setText(td.fDataToBreak);
401  
402      testFirstAndNext(bi, td);
403  
404      testLastAndPrevious(bi, td);
405  
406      testFollowing(bi, td);
407      testPreceding(bi, td);
408      testIsBoundary(bi, td);
409      doMultipleSelectionTest(bi, td);
410  }
411  
412  
413  //
414  //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
415  //                       kind of loop.
416  //
testFirstAndNext(RuleBasedBreakIterator & bi,BITestData & td)417  void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
418  {
419      UErrorCode  status = U_ZERO_ERROR;
420      int32_t     p;
421      int32_t     lastP = -1;
422      int32_t     tag;
423  
424      logln("Test first and next");
425      bi.setText(td.fDataToBreak);
426      td.clearResults();
427  
428      for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
429          td.fActualBreakPositions.addElement(p, status);  // Save result.
430          tag = bi.getRuleStatus();
431          td.fActualTags.addElement(tag, status);
432          if (p <= lastP) {
433              // If the iterator is not making forward progress, stop.
434              //  No need to raise an error here, it'll be detected in the normal check of results.
435              break;
436          }
437          lastP = p;
438      }
439      td.checkResults("testFirstAndNext", this);
440  }
441  
442  
443  //
444  //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
445  //
testLastAndPrevious(RuleBasedBreakIterator & bi,BITestData & td)446  void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
447  {
448      UErrorCode  status = U_ZERO_ERROR;
449      int32_t     p;
450      int32_t     lastP  = 0x7ffffffe;
451      int32_t     tag;
452  
453      logln("Test last and previous");
454      bi.setText(td.fDataToBreak);
455      td.clearResults();
456  
457      for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
458          // Save break position.  Insert it at start of vector of results, shoving
459          //    already-saved results further towards the end.
460          td.fActualBreakPositions.insertElementAt(p, 0, status);
461          // bi.previous();   // TODO:  Why does this fix things up????
462          // bi.next();
463          tag = bi.getRuleStatus();
464          td.fActualTags.insertElementAt(tag, 0, status);
465          if (p >= lastP) {
466              // If the iterator is not making progress, stop.
467              //  No need to raise an error here, it'll be detected in the normal check of results.
468              break;
469          }
470          lastP = p;
471      }
472      td.checkResults("testLastAndPrevious", this);
473  }
474  
475  
testFollowing(RuleBasedBreakIterator & bi,BITestData & td)476  void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
477  {
478      UErrorCode  status = U_ZERO_ERROR;
479      int32_t     p;
480      int32_t     tag;
481      int32_t     lastP  = -2;     // A value that will never be returned as a break position.
482                                   //   cannot be -1; that is returned for DONE.
483      int         i;
484  
485      logln("testFollowing():");
486      bi.setText(td.fDataToBreak);
487      td.clearResults();
488  
489      // Save the starting point, since we won't get that out of following.
490      p = bi.first();
491      td.fActualBreakPositions.addElement(p, status);  // Save result.
492      tag = bi.getRuleStatus();
493      td.fActualTags.addElement(tag, status);
494  
495      for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
496          p = bi.following(i);
497          if (p != lastP) {
498              if (p == RuleBasedBreakIterator::DONE) {
499                  break;
500              }
501              // We've reached a new break position.  Save it.
502              td.fActualBreakPositions.addElement(p, status);  // Save result.
503              tag = bi.getRuleStatus();
504              td.fActualTags.addElement(tag, status);
505              lastP = p;
506          }
507      }
508      // The loop normally exits by means of the break in the middle.
509      // Make sure that the index was at the correct position for the break iterator to have
510      //   returned DONE.
511      if (i != td.fDataToBreak.length()) {
512          errln("testFollowing():  iterator returned DONE prematurely.");
513      }
514  
515      // Full check of all results.
516      td.checkResults("testFollowing", this);
517  }
518  
519  
520  
testPreceding(RuleBasedBreakIterator & bi,BITestData & td)521  void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
522      UErrorCode  status = U_ZERO_ERROR;
523      int32_t     p;
524      int32_t     tag;
525      int32_t     lastP  = 0x7ffffffe;
526      int         i;
527  
528      logln("testPreceding():");
529      bi.setText(td.fDataToBreak);
530      td.clearResults();
531  
532      p = bi.last();
533      td.fActualBreakPositions.addElement(p, status);
534      tag = bi.getRuleStatus();
535      td.fActualTags.addElement(tag, status);
536  
537      for (i = td.fDataToBreak.length(); i>=-1; i--) {
538          p = bi.preceding(i);
539          if (p != lastP) {
540              if (p == RuleBasedBreakIterator::DONE) {
541                  break;
542              }
543              // We've reached a new break position.  Save it.
544              td.fActualBreakPositions.insertElementAt(p, 0, status);
545              lastP = p;
546              tag = bi.getRuleStatus();
547              td.fActualTags.insertElementAt(tag, 0, status);
548          }
549      }
550      // The loop normally exits by means of the break in the middle.
551      // Make sure that the index was at the correct position for the break iterator to have
552      //   returned DONE.
553      if (i != 0) {
554          errln("testPreceding():  iterator returned DONE prematurely.");
555      }
556  
557      // Full check of all results.
558      td.checkResults("testPreceding", this);
559  }
560  
561  
562  
testIsBoundary(RuleBasedBreakIterator & bi,BITestData & td)563  void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
564      UErrorCode  status = U_ZERO_ERROR;
565      int         i;
566      int32_t     tag;
567  
568      logln("testIsBoundary():");
569      bi.setText(td.fDataToBreak);
570      td.clearResults();
571  
572      for (i = 0; i <= td.fDataToBreak.length(); i++) {
573          if (bi.isBoundary(i)) {
574              td.fActualBreakPositions.addElement(i, status);  // Save result.
575              tag = bi.getRuleStatus();
576              td.fActualTags.addElement(tag, status);
577          }
578      }
579      td.checkResults("testIsBoundary: ", this);
580  }
581  
582  
583  
doMultipleSelectionTest(RuleBasedBreakIterator & iterator,BITestData & td)584  void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
585  {
586      iterator.setText(td.fDataToBreak);
587  
588      RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
589      int32_t offset = iterator.first();
590      int32_t testOffset;
591      int32_t count = 0;
592  
593      logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
594  
595      if (*testIterator != iterator)
596          errln("clone() or operator!= failed: two clones compared unequal");
597  
598      do {
599          testOffset = testIterator->first();
600          testOffset = testIterator->next(count);
601          if (offset != testOffset)
602              errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
603  
604          if (offset != RuleBasedBreakIterator::DONE) {
605              count++;
606              offset = iterator.next();
607  
608              if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
609                  errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
610                  if (count > 10000 || offset == -1) {
611                      errln("operator== failed too many times. Stopping test.");
612                      if (offset == -1) {
613                          errln("Does (RuleBasedBreakIterator::DONE == -1)?");
614                      }
615                      return;
616                  }
617              }
618          }
619      } while (offset != RuleBasedBreakIterator::DONE);
620  
621      // now do it backwards...
622      offset = iterator.last();
623      count = 0;
624  
625      do {
626          testOffset = testIterator->last();
627          testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
628          if (offset != testOffset)
629              errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
630  
631          if (offset != RuleBasedBreakIterator::DONE) {
632              count--;
633              offset = iterator.previous();
634          }
635      } while (offset != RuleBasedBreakIterator::DONE);
636  
637      delete testIterator;
638  }
639  
640  
641  //---------------------------------------------
642  //
643  //     other tests
644  //
645  //---------------------------------------------
TestEmptyString()646  void RBBITest::TestEmptyString()
647  {
648      UnicodeString text = "";
649      UErrorCode status = U_ZERO_ERROR;
650  
651      BITestData x(status);
652      ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
653      RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
654      if (U_FAILURE(status))
655      {
656          errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
657          return;
658      }
659      generalIteratorTest(*bi, x);
660      delete bi;
661  }
662  
TestGetAvailableLocales()663  void RBBITest::TestGetAvailableLocales()
664  {
665      int32_t locCount = 0;
666      const Locale* locList = BreakIterator::getAvailableLocales(locCount);
667  
668      if (locCount == 0)
669          dataerrln("getAvailableLocales() returned an empty list!");
670      // Just make sure that it's returning good memory.
671      int32_t i;
672      for (i = 0; i < locCount; ++i) {
673          logln(locList[i].getName());
674      }
675  }
676  
677  //Testing the BreakIterator::getDisplayName() function
TestGetDisplayName()678  void RBBITest::TestGetDisplayName()
679  {
680      UnicodeString   result;
681  
682      BreakIterator::getDisplayName(Locale::getUS(), result);
683      if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
684          dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
685                  + result);
686  
687      BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
688      if (result != "French (France)")
689          dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
690                  + result);
691  }
692  /**
693   * Test End Behaviour
694   * @bug 4068137
695   */
TestEndBehaviour()696  void RBBITest::TestEndBehaviour()
697  {
698      UErrorCode status = U_ZERO_ERROR;
699      UnicodeString testString("boo.");
700      BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
701      if (U_FAILURE(status))
702      {
703          errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
704          return;
705      }
706      wb->setText(testString);
707  
708      if (wb->first() != 0)
709          errln("Didn't get break at beginning of string.");
710      if (wb->next() != 3)
711          errln("Didn't get break before period in \"boo.\"");
712      if (wb->current() != 4 && wb->next() != 4)
713          errln("Didn't get break at end of string.");
714      delete wb;
715  }
716  /*
717   * @bug 4153072
718   */
TestBug4153072()719  void RBBITest::TestBug4153072() {
720      UErrorCode status = U_ZERO_ERROR;
721      BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
722      if (U_FAILURE(status))
723      {
724          errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
725          return;
726      }
727      UnicodeString str("...Hello, World!...");
728      int32_t begin = 3;
729      int32_t end = str.length() - 3;
730      UBool onBoundary;
731  
732      StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
733      iter->adoptText(textIterator);
734      int index;
735      // Note: with the switch to UText, there is no way to restrict the
736      //       iteration range to begin at an index other than zero.
737      //       String character iterators created with a non-zero bound are
738      //         treated by RBBI as being empty.
739      for (index = -1; index < begin + 1; ++index) {
740          onBoundary = iter->isBoundary(index);
741          if (index == 0?  !onBoundary : onBoundary) {
742              errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
743                              " and begin index = " + begin);
744          }
745      }
746      delete iter;
747  }
748  
749  
750  //
751  // Test for problem reported by Ashok Matoria on 9 July 2007
752  //    One.<kSoftHyphen><kSpace>Two.
753  //
754  //    Sentence break at start (0) and then on calling next() it breaks at
755  //   'T' of "Two". Now, at this point if I do next() and
756  //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
757  //
TestBug5775()758  void RBBITest::TestBug5775() {
759      UErrorCode status = U_ZERO_ERROR;
760      BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
761      TEST_ASSERT_SUCCESS(status);
762      if (U_FAILURE(status)) {
763          return;
764      }
765  // Check for status first for better handling of no data errors.
766      TEST_ASSERT(bi != NULL);
767      if (bi == NULL) {
768          return;
769      }
770  
771      UnicodeString s("One.\\u00ad Two.", -1, US_INV);
772      //               01234      56789
773      s = s.unescape();
774      bi->setText(s);
775      int pos = bi->next();
776      TEST_ASSERT(pos == 6);
777      pos = bi->next();
778      TEST_ASSERT(pos == 10);
779      pos = bi->previous();
780      TEST_ASSERT(pos == 6);
781      delete bi;
782  }
783  
784  
785  
786  //------------------------------------------------------------------------------
787  //
788  //   RBBITest::Extended    Run  RBBI Tests from an external test data file
789  //
790  //------------------------------------------------------------------------------
791  
792  struct TestParams {
793      BreakIterator   *bi;                   // Break iterator is set while parsing test source.
794                                             //   Changed out whenever test data changes break type.
795  
796      UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
797      UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
798      UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
799      UVector32       *srcCol;
800  
801      UText           *textToBreak;          // UText, could be UTF8 or UTF16.
802      UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
803      CharString       utf8String;           // UTF-8 form of text to break.
804  
TestParamsTestParams805      TestParams(UErrorCode &status) : dataToBreak() {
806          bi               = NULL;
807          expectedBreaks   = new UVector32(status);
808          srcLine          = new UVector32(status);
809          srcCol           = new UVector32(status);
810          textToBreak      = NULL;
811          textMap          = new UVector32(status);
812      }
813  
~TestParamsTestParams814      ~TestParams() {
815          delete bi;
816          delete expectedBreaks;
817          delete srcLine;
818          delete srcCol;
819          utext_close(textToBreak);
820          delete textMap;
821      }
822  
823      int32_t getSrcLine(int32_t bp);
824      int32_t getExpectedBreak(int32_t bp);
825      int32_t getSrcCol(int32_t bp);
826  
827      void setUTF16(UErrorCode &status);
828      void setUTF8(UErrorCode &status);
829  };
830  
831  // Append a UnicodeString to a CharString with UTF-8 encoding.
832  // Substitute any invalid chars.
833  //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
CharStringAppend(CharString & dest,const UnicodeString & src,UErrorCode & status)834  static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
835      if (U_FAILURE(status)) {
836          return;
837      }
838      int32_t utf8Length;
839      u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
840                         src.getBuffer(), src.length(),   // UTF-16 data
841                         0xfffd, NULL,                    // Substitution char, number of subs.
842                         &status);
843      if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
844          return;
845      }
846      status = U_ZERO_ERROR;
847      int32_t capacity;
848      char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
849      u_strToUTF8WithSub(buffer, utf8Length, NULL,
850                         src.getBuffer(), src.length(),
851                         0xfffd, NULL, &status);
852      dest.append(buffer, utf8Length, status);
853  }
854  
855  
setUTF16(UErrorCode & status)856  void TestParams::setUTF16(UErrorCode &status) {
857      textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
858      textMap->removeAllElements();
859      for (int32_t i=0; i<dataToBreak.length(); i++) {
860          if (i == dataToBreak.getChar32Start(i)) {
861              textMap->addElement(i, status);
862          } else {
863              textMap->addElement(-1, status);
864          }
865      }
866      textMap->addElement(dataToBreak.length(), status);
867      U_ASSERT(dataToBreak.length() + 1 == textMap->size());
868  }
869  
870  
setUTF8(UErrorCode & status)871  void TestParams::setUTF8(UErrorCode &status) {
872      if (U_FAILURE(status)) {
873          return;
874      }
875      utf8String.clear();
876      CharStringAppend(utf8String, dataToBreak, status);
877      textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
878      if (U_FAILURE(status)) {
879          return;
880      }
881  
882      textMap->removeAllElements();
883      int32_t utf16Index = 0;
884      for (;;) {
885          textMap->addElement(utf16Index, status);
886          UChar32 c32 = utext_current32(textToBreak);
887          if (c32 < 0) {
888              break;
889          }
890          utf16Index += U16_LENGTH(c32);
891          utext_next32(textToBreak);
892          while (textMap->size() < utext_getNativeIndex(textToBreak)) {
893              textMap->addElement(-1, status);
894          }
895      }
896      U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
897  }
898  
899  
getSrcLine(int32_t bp)900  int32_t TestParams::getSrcLine(int32_t bp) {
901      if (bp >= textMap->size()) {
902          bp = textMap->size() - 1;
903      }
904      int32_t i = 0;
905      for(; bp >= 0 ; --bp) {
906          // Move to a character boundary if we are not on one already.
907          i = textMap->elementAti(bp);
908          if (i >= 0) {
909              break;
910          }
911      }
912      return srcLine->elementAti(i);
913  }
914  
915  
getExpectedBreak(int32_t bp)916  int32_t TestParams::getExpectedBreak(int32_t bp) {
917      if (bp >= textMap->size()) {
918          return 0;
919      }
920      int32_t i = textMap->elementAti(bp);
921      int32_t retVal = 0;
922      if (i >= 0) {
923          retVal = expectedBreaks->elementAti(i);
924      }
925      return retVal;
926  }
927  
928  
getSrcCol(int32_t bp)929  int32_t TestParams::getSrcCol(int32_t bp) {
930      if (bp >= textMap->size()) {
931          bp = textMap->size() - 1;
932      }
933      int32_t i = 0;
934      for(; bp >= 0; --bp) {
935          // Move bp to a character boundary if we are not on one already.
936          i = textMap->elementAti(bp);
937          if (i >= 0) {
938              break;
939          }
940      }
941      return srcCol->elementAti(i);
942  }
943  
944  
executeTest(TestParams * t,UErrorCode & status)945  void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
946      int32_t    bp;
947      int32_t    prevBP;
948      int32_t    i;
949  
950      TEST_ASSERT_SUCCESS(status);
951      if (U_FAILURE(status)) {
952          return;
953      }
954  
955      if (t->bi == NULL) {
956          return;
957      }
958  
959      t->bi->setText(t->textToBreak, status);
960      //
961      //  Run the iterator forward
962      //
963      prevBP = -1;
964      for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
965          if (prevBP ==  bp) {
966              // Fail for lack of forward progress.
967              errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
968                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
969              break;
970          }
971  
972          // Check that there we didn't miss an expected break between the last one
973          //  and this one.
974          for (i=prevBP+1; i<bp; i++) {
975              if (t->getExpectedBreak(i) != 0) {
976                  int expected[] = {0, i};
977                  printStringBreaks(t->dataToBreak, expected, 2);
978                  errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
979                        i, t->getSrcLine(i), t->getSrcCol(i));
980              }
981          }
982  
983          // Check that the break we did find was expected
984          if (t->getExpectedBreak(bp) == 0) {
985              int expected[] = {0, bp};
986              printStringBreaks(t->textToBreak, expected, 2);
987              errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
988                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
989          } else {
990              // The break was expected.
991              //   Check that the {nnn} tag value is correct.
992              int32_t expectedTagVal = t->getExpectedBreak(bp);
993              if (expectedTagVal == -1) {
994                  expectedTagVal = 0;
995              }
996              int32_t line = t->getSrcLine(bp);
997              int32_t rs = t->bi->getRuleStatus();
998              if (rs != expectedTagVal) {
999                  errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1000                        "          Actual, Expected status = %4d, %4d",
1001                      bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1002              }
1003          }
1004  
1005          prevBP = bp;
1006      }
1007  
1008      // Verify that there were no missed expected breaks after the last one found
1009      for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1010          if (t->getExpectedBreak(i) != 0) {
1011              errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1012                        i, t->getSrcLine(i), t->getSrcCol(i));
1013          }
1014      }
1015  
1016      //
1017      //  Run the iterator backwards, verify that the same breaks are found.
1018      //
1019      prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1020      for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1021          if (prevBP ==  bp) {
1022              // Fail for lack of progress.
1023              errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1024                  bp, t->getSrcLine(bp), t->getSrcCol(bp));
1025              break;
1026          }
1027  
1028          // Check that we didn't miss an expected break between the last one
1029          //  and this one.  (UVector returns zeros for index out of bounds.)
1030          for (i=prevBP-1; i>bp; i--) {
1031              if (t->getExpectedBreak(i) != 0) {
1032                  errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1033                        i, t->getSrcLine(i), t->getSrcCol(i));
1034              }
1035          }
1036  
1037          // Check that the break we did find was expected
1038          if (t->getExpectedBreak(bp) == 0) {
1039              errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1040                     bp, t->getSrcLine(bp), t->getSrcCol(bp));
1041          } else {
1042              // The break was expected.
1043              //   Check that the {nnn} tag value is correct.
1044              int32_t expectedTagVal = t->getExpectedBreak(bp);
1045              if (expectedTagVal == -1) {
1046                  expectedTagVal = 0;
1047              }
1048              int line = t->getSrcLine(bp);
1049              int32_t rs = t->bi->getRuleStatus();
1050              if (rs != expectedTagVal) {
1051                  errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1052                        "          Actual, Expected status = %4d, %4d",
1053                      bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1054              }
1055          }
1056  
1057          prevBP = bp;
1058      }
1059  
1060      // Verify that there were no missed breaks prior to the last one found
1061      for (i=prevBP-1; i>=0; i--) {
1062          if (t->getExpectedBreak(i) != 0) {
1063              errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1064                        i, t->getSrcLine(i), t->getSrcCol(i));
1065          }
1066      }
1067  
1068      // Check isBoundary()
1069      for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1070          UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1071          UBool boundaryFound    = t->bi->isBoundary(i);
1072          if (boundaryExpected != boundaryFound) {
1073              errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1074                    "        Expected, Actual= %s, %s",
1075                    i, t->getSrcLine(i), t->getSrcCol(i),
1076                    boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1077          }
1078      }
1079  
1080      // Check following()
1081      for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1082          int32_t actualBreak = t->bi->following(i);
1083          int32_t expectedBreak = BreakIterator::DONE;
1084          for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1085              if (t->getExpectedBreak(j) != 0) {
1086                  expectedBreak = j;
1087                  break;
1088              }
1089          }
1090          if (expectedBreak != actualBreak) {
1091              errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1092                    "        Expected, Actual= %d, %d",
1093                    i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1094          }
1095      }
1096  
1097      // Check preceding()
1098      for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1099          int32_t actualBreak = t->bi->preceding(i);
1100          int32_t expectedBreak = BreakIterator::DONE;
1101  
1102          // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1103          // preceding(trailing byte) will return the index of some preceding code point,
1104          // not the lead byte of the current code point, even though that has a smaller index.
1105          // Therefore, start looking at the expected break data not at i-1, but at
1106          // the start of code point index - 1.
1107          utext_setNativeIndex(t->textToBreak, i);
1108          int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1109          for (; j >= 0; j--) {
1110              if (t->getExpectedBreak(j) != 0) {
1111                  expectedBreak = j;
1112                  break;
1113              }
1114          }
1115          if (expectedBreak != actualBreak) {
1116              errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1117                    "        Expected, Actual= %d, %d",
1118                    i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1119          }
1120      }
1121  }
1122  
1123  
TestExtended()1124  void RBBITest::TestExtended() {
1125  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1126      UErrorCode      status  = U_ZERO_ERROR;
1127      Locale          locale("");
1128  
1129      UnicodeString       rules;
1130      TestParams          tp(status);
1131  
1132      RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1133      if (U_FAILURE(status)) {
1134          dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1135      }
1136  
1137  
1138      //
1139      //  Open and read the test data file.
1140      //
1141      const char *testDataDirectory = IntlTest::getSourceTestData(status);
1142      char testFileName[1000];
1143      if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1144          errln("Can't open test data.  Path too long.");
1145          return;
1146      }
1147      strcpy(testFileName, testDataDirectory);
1148      strcat(testFileName, "rbbitst.txt");
1149  
1150      int    len;
1151      UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1152      if (U_FAILURE(status)) {
1153          return; /* something went wrong, error already output */
1154      }
1155  
1156  
1157      bool skipTest = false; // Skip this test?
1158  
1159      //
1160      //  Put the test data into a UnicodeString
1161      //
1162      UnicodeString testString(FALSE, testFile, len);
1163  
1164      enum EParseState{
1165          PARSE_COMMENT,
1166          PARSE_TAG,
1167          PARSE_DATA,
1168          PARSE_NUM
1169      }
1170      parseState = PARSE_TAG;
1171  
1172      EParseState savedState = PARSE_TAG;
1173  
1174      static const UChar CH_LF        = 0x0a;
1175      static const UChar CH_CR        = 0x0d;
1176      static const UChar CH_HASH      = 0x23;
1177      /*static const UChar CH_PERIOD    = 0x2e;*/
1178      static const UChar CH_LT        = 0x3c;
1179      static const UChar CH_GT        = 0x3e;
1180      static const UChar CH_BACKSLASH = 0x5c;
1181      static const UChar CH_BULLET    = 0x2022;
1182  
1183      int32_t    lineNum  = 1;
1184      int32_t    colStart = 0;
1185      int32_t    column   = 0;
1186      int32_t    charIdx  = 0;
1187  
1188      int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1189  
1190      for (charIdx = 0; charIdx < len; ) {
1191          status = U_ZERO_ERROR;
1192          UChar  c = testString.charAt(charIdx);
1193          charIdx++;
1194          if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1195              // treat CRLF as a unit
1196              c = CH_LF;
1197              charIdx++;
1198          }
1199          if (c == CH_LF || c == CH_CR) {
1200              lineNum++;
1201              colStart = charIdx;
1202          }
1203          column = charIdx - colStart + 1;
1204  
1205          switch (parseState) {
1206          case PARSE_COMMENT:
1207              if (c == 0x0a || c == 0x0d) {
1208                  parseState = savedState;
1209              }
1210              break;
1211  
1212          case PARSE_TAG:
1213              {
1214              if (c == CH_HASH) {
1215                  parseState = PARSE_COMMENT;
1216                  savedState = PARSE_TAG;
1217                  break;
1218              }
1219              if (u_isUWhiteSpace(c)) {
1220                  break;
1221              }
1222              if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1223                  delete tp.bi;
1224                  tp.bi = BreakIterator::createWordInstance(locale,  status);
1225                  skipTest = false;
1226                  charIdx += 5;
1227                  break;
1228              }
1229              if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1230                  delete tp.bi;
1231                  tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1232                  skipTest = false;
1233                  charIdx += 5;
1234                  break;
1235              }
1236              if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1237                  delete tp.bi;
1238                  tp.bi = BreakIterator::createLineInstance(locale,  status);
1239                  skipTest = false;
1240                  charIdx += 5;
1241                  break;
1242              }
1243              if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1244                  delete tp.bi;
1245                  tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1246                  skipTest = false;
1247                  charIdx += 5;
1248                  break;
1249              }
1250              if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1251                  delete tp.bi;
1252                  tp.bi = BreakIterator::createTitleInstance(locale,  status);
1253                  charIdx += 6;
1254                  break;
1255              }
1256  
1257              // <locale  loc_name>
1258              localeMatcher.reset(testString);
1259              if (localeMatcher.lookingAt(charIdx-1, status)) {
1260                  UnicodeString localeName = localeMatcher.group(1, status);
1261                  char localeName8[100];
1262                  localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1263                  locale = Locale::createFromName(localeName8);
1264                  charIdx += localeMatcher.group(0, status).length() - 1;
1265                  TEST_ASSERT_SUCCESS(status);
1266                  break;
1267              }
1268              if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1269                  parseState = PARSE_DATA;
1270                  charIdx += 5;
1271                  tp.dataToBreak = "";
1272                  tp.expectedBreaks->removeAllElements();
1273                  tp.srcCol ->removeAllElements();
1274                  tp.srcLine->removeAllElements();
1275                  break;
1276              }
1277  
1278              errln("line %d: Tag expected in test file.", lineNum);
1279              parseState = PARSE_COMMENT;
1280              savedState = PARSE_DATA;
1281              goto end_test; // Stop the test.
1282              }
1283              break;
1284  
1285          case PARSE_DATA:
1286              if (c == CH_BULLET) {
1287                  int32_t  breakIdx = tp.dataToBreak.length();
1288                  tp.expectedBreaks->setSize(breakIdx+1);
1289                  tp.expectedBreaks->setElementAt(-1, breakIdx);
1290                  tp.srcLine->setSize(breakIdx+1);
1291                  tp.srcLine->setElementAt(lineNum, breakIdx);
1292                  tp.srcCol ->setSize(breakIdx+1);
1293                  tp.srcCol ->setElementAt(column, breakIdx);
1294                  break;
1295              }
1296  
1297              if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1298                  // Add final entry to mappings from break location to source file position.
1299                  //  Need one extra because last break position returned is after the
1300                  //    last char in the data, not at the last char.
1301                  tp.srcLine->addElement(lineNum, status);
1302                  tp.srcCol ->addElement(column, status);
1303  
1304                  parseState = PARSE_TAG;
1305                  charIdx += 6;
1306  
1307                  if (!skipTest) {
1308                      // RUN THE TEST!
1309                      status = U_ZERO_ERROR;
1310                      tp.setUTF16(status);
1311                      executeTest(&tp, status);
1312                      TEST_ASSERT_SUCCESS(status);
1313  
1314                      // Run again, this time with UTF-8 text wrapped in a UText.
1315                      status = U_ZERO_ERROR;
1316                      tp.setUTF8(status);
1317                      TEST_ASSERT_SUCCESS(status);
1318                      executeTest(&tp, status);
1319                  }
1320                  break;
1321              }
1322  
1323              if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1324                  // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1325                  // Get the code point from the name and insert it into the test data.
1326                  //   (Damn, no API takes names in Unicode  !!!
1327                  //    we've got to take it back to char *)
1328                  int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1329                  int32_t nameLength = nameEndIdx - (charIdx+2);
1330                  char charNameBuf[200];
1331                  UChar32 theChar = -1;
1332                  if (nameEndIdx != -1) {
1333                      UErrorCode status = U_ZERO_ERROR;
1334                      testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1335                      charNameBuf[sizeof(charNameBuf)-1] = 0;
1336                      theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1337                      if (U_FAILURE(status)) {
1338                          theChar = -1;
1339                      }
1340                  }
1341                  if (theChar == -1) {
1342                      errln("Error in named character in test file at line %d, col %d",
1343                          lineNum, column);
1344                  } else {
1345                      // Named code point was recognized.  Insert it
1346                      //   into the test data.
1347                      tp.dataToBreak.append(theChar);
1348                      while (tp.dataToBreak.length() > tp.srcLine->size()) {
1349                          tp.srcLine->addElement(lineNum, status);
1350                          tp.srcCol ->addElement(column, status);
1351                      }
1352                  }
1353                  if (nameEndIdx > charIdx) {
1354                      charIdx = nameEndIdx+1;
1355  
1356                  }
1357                  break;
1358              }
1359  
1360  
1361  
1362  
1363              if (testString.compare(charIdx-1, 2, "<>") == 0) {
1364                  charIdx++;
1365                  int32_t  breakIdx = tp.dataToBreak.length();
1366                  tp.expectedBreaks->setSize(breakIdx+1);
1367                  tp.expectedBreaks->setElementAt(-1, breakIdx);
1368                  tp.srcLine->setSize(breakIdx+1);
1369                  tp.srcLine->setElementAt(lineNum, breakIdx);
1370                  tp.srcCol ->setSize(breakIdx+1);
1371                  tp.srcCol ->setElementAt(column, breakIdx);
1372                  break;
1373              }
1374  
1375              if (c == CH_LT) {
1376                  tagValue   = 0;
1377                  parseState = PARSE_NUM;
1378                  break;
1379              }
1380  
1381              if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1382                  parseState = PARSE_COMMENT;
1383                  savedState = PARSE_DATA;
1384                  break;
1385              }
1386  
1387              if (c == CH_BACKSLASH) {
1388                  // Check for \ at end of line, a line continuation.
1389                  //     Advance over (discard) the newline
1390                  UChar32 cp = testString.char32At(charIdx);
1391                  if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1392                      // We have a CR LF
1393                      //  Need an extra increment of the input ptr to move over both of them
1394                      charIdx++;
1395                  }
1396                  if (cp == CH_LF || cp == CH_CR) {
1397                      lineNum++;
1398                      colStart = charIdx;
1399                      charIdx++;
1400                      break;
1401                  }
1402  
1403                  // Let unescape handle the back slash.
1404                  cp = testString.unescapeAt(charIdx);
1405                  if (cp != -1) {
1406                      // Escape sequence was recognized.  Insert the char
1407                      //   into the test data.
1408                      tp.dataToBreak.append(cp);
1409                      while (tp.dataToBreak.length() > tp.srcLine->size()) {
1410                          tp.srcLine->addElement(lineNum, status);
1411                          tp.srcCol ->addElement(column, status);
1412                      }
1413                      break;
1414                  }
1415  
1416  
1417                  // Not a recognized backslash escape sequence.
1418                  // Take the next char as a literal.
1419                  //  TODO:  Should this be an error?
1420                  c = testString.charAt(charIdx);
1421                  charIdx = testString.moveIndex32(charIdx, 1);
1422              }
1423  
1424              // Normal, non-escaped data char.
1425              tp.dataToBreak.append(c);
1426  
1427              // Save the mapping from offset in the data to line/column numbers in
1428              //   the original input file.  Will be used for better error messages only.
1429              //   If there's an expected break before this char, the slot in the mapping
1430              //     vector will already be set for this char; don't overwrite it.
1431              if (tp.dataToBreak.length() > tp.srcLine->size()) {
1432                  tp.srcLine->addElement(lineNum, status);
1433                  tp.srcCol ->addElement(column, status);
1434              }
1435              break;
1436  
1437  
1438          case PARSE_NUM:
1439              // We are parsing an expected numeric tag value, like <1234>,
1440              //   within a chunk of data.
1441              if (u_isUWhiteSpace(c)) {
1442                  break;
1443              }
1444  
1445              if (c == CH_GT) {
1446                  // Finished the number.  Add the info to the expected break data,
1447                  //   and switch parse state back to doing plain data.
1448                  parseState = PARSE_DATA;
1449                  if (tagValue == 0) {
1450                      tagValue = -1;
1451                  }
1452                  int32_t  breakIdx = tp.dataToBreak.length();
1453                  tp.expectedBreaks->setSize(breakIdx+1);
1454                  tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1455                  tp.srcLine->setSize(breakIdx+1);
1456                  tp.srcLine->setElementAt(lineNum, breakIdx);
1457                  tp.srcCol ->setSize(breakIdx+1);
1458                  tp.srcCol ->setElementAt(column, breakIdx);
1459                  break;
1460              }
1461  
1462              if (u_isdigit(c)) {
1463                  tagValue = tagValue*10 + u_charDigitValue(c);
1464                  break;
1465              }
1466  
1467              errln("Syntax Error in test file at line %d, col %d",
1468                  lineNum, column);
1469              parseState = PARSE_COMMENT;
1470              goto end_test; // Stop the test
1471              break;
1472          }
1473  
1474  
1475          if (U_FAILURE(status)) {
1476              dataerrln("ICU Error %s while parsing test file at line %d.",
1477                  u_errorName(status), lineNum);
1478              status = U_ZERO_ERROR;
1479              goto end_test; // Stop the test
1480          }
1481  
1482      }
1483  
1484  end_test:
1485      delete [] testFile;
1486  #endif
1487  }
1488  
1489  
1490  //-------------------------------------------------------------------------------
1491  //
1492  //  TestDictRules   create a break iterator from source rules that includes a
1493  //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  but the dictionary code, without a type, would loop.
1496  //
1497  //-------------------------------------------------------------------------------
TestDictRules()1498  void RBBITest::TestDictRules() {
1499      const char *rules =  "$dictionary = [a-z]; \n"
1500                           "!!forward; \n"
1501                           "$dictionary $dictionary; \n"
1502                           "!!reverse; \n"
1503                           "$dictionary $dictionary; \n";
1504      const char *text = "aa";
1505      UErrorCode status = U_ZERO_ERROR;
1506      UParseError parseError;
1507  
1508      RuleBasedBreakIterator bi(rules, parseError, status);
1509      if (U_SUCCESS(status)) {
1510          UnicodeString utext = text;
1511          bi.setText(utext);
1512          int32_t position;
1513          int32_t loops;
1514          for (loops = 0; loops<10; loops++) {
1515              position = bi.next();
1516              if (position == RuleBasedBreakIterator::DONE) {
1517                  break;
1518              }
1519          }
1520          TEST_ASSERT(loops == 1);
1521      } else {
1522          dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1523      }
1524  }
1525  
1526  
1527  
1528  //-------------------------------------------------------------------------------
1529  //
1530  //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1531  //    return the data in one big UChar * buffer, which the caller must delete.
1532  //
1533  //    parameters:
1534  //          fileName:   the name of the file, with no directory part.  The test data directory
1535  //                      is assumed.
1536  //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1537  //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1538  //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1539  //                      Pass NULL for the system default encoding.
1540  //          status
1541  //    returns:
1542  //                      The file data, converted to UChar.
1543  //                      The caller must delete this when done with
1544  //                           delete [] theBuffer;
1545  //
1546  //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1547  //           Move this function to some common place.
1548  //
1549  //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int & ulen,const char * encoding,UErrorCode & status)1550  UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1551      UChar       *retPtr  = NULL;
1552      char        *fileBuf = NULL;
1553      UConverter* conv     = NULL;
1554      FILE        *f       = NULL;
1555  
1556      ulen = 0;
1557      if (U_FAILURE(status)) {
1558          return retPtr;
1559      }
1560  
1561      //
1562      //  Open the file.
1563      //
1564      f = fopen(fileName, "rb");
1565      if (f == 0) {
1566          dataerrln("Error opening test data file %s\n", fileName);
1567          status = U_FILE_ACCESS_ERROR;
1568          return NULL;
1569      }
1570      //
1571      //  Read it in
1572      //
1573      int   fileSize;
1574      int   amt_read;
1575  
1576      fseek( f, 0, SEEK_END);
1577      fileSize = ftell(f);
1578      fileBuf = new char[fileSize];
1579      fseek(f, 0, SEEK_SET);
1580      amt_read = fread(fileBuf, 1, fileSize, f);
1581      if (amt_read != fileSize || fileSize <= 0) {
1582          errln("Error reading test data file.");
1583          goto cleanUpAndReturn;
1584      }
1585  
1586      //
1587      // Look for a Unicode Signature (BOM) on the data just read
1588      //
1589      int32_t        signatureLength;
1590      const char *   fileBufC;
1591      const char*    bomEncoding;
1592  
1593      fileBufC = fileBuf;
1594      bomEncoding = ucnv_detectUnicodeSignature(
1595          fileBuf, fileSize, &signatureLength, &status);
1596      if(bomEncoding!=NULL ){
1597          fileBufC  += signatureLength;
1598          fileSize  -= signatureLength;
1599          encoding = bomEncoding;
1600      }
1601  
1602      //
1603      // Open a converter to take the rule file to UTF-16
1604      //
1605      conv = ucnv_open(encoding, &status);
1606      if (U_FAILURE(status)) {
1607          goto cleanUpAndReturn;
1608      }
1609  
1610      //
1611      // Convert the rules to UChar.
1612      //  Preflight first to determine required buffer size.
1613      //
1614      ulen = ucnv_toUChars(conv,
1615          NULL,           //  dest,
1616          0,              //  destCapacity,
1617          fileBufC,
1618          fileSize,
1619          &status);
1620      if (status == U_BUFFER_OVERFLOW_ERROR) {
1621          // Buffer Overflow is expected from the preflight operation.
1622          status = U_ZERO_ERROR;
1623  
1624          retPtr = new UChar[ulen+1];
1625          ucnv_toUChars(conv,
1626              retPtr,       //  dest,
1627              ulen+1,
1628              fileBufC,
1629              fileSize,
1630              &status);
1631      }
1632  
1633  cleanUpAndReturn:
1634      fclose(f);
1635      delete []fileBuf;
1636      ucnv_close(conv);
1637      if (U_FAILURE(status)) {
1638          errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1639          delete []retPtr;
1640          retPtr = 0;
1641          ulen   = 0;
1642      };
1643      return retPtr;
1644  }
1645  
1646  
1647  
1648  //--------------------------------------------------------------------------------------------
1649  //
1650  //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1651  //
1652  //-------------------------------------------------------------------------------------------
TestUnicodeFiles()1653  void RBBITest::TestUnicodeFiles() {
1654      RuleBasedBreakIterator  *bi;
1655      UErrorCode               status = U_ZERO_ERROR;
1656  
1657      bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1658      TEST_ASSERT_SUCCESS(status);
1659      if (U_SUCCESS(status)) {
1660          runUnicodeTestData("GraphemeBreakTest.txt", bi);
1661      }
1662      delete bi;
1663  
1664      bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1665      TEST_ASSERT_SUCCESS(status);
1666      if (U_SUCCESS(status)) {
1667          runUnicodeTestData("WordBreakTest.txt", bi);
1668      }
1669      delete bi;
1670  
1671      bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1672      TEST_ASSERT_SUCCESS(status);
1673      if (U_SUCCESS(status)) {
1674          runUnicodeTestData("SentenceBreakTest.txt", bi);
1675      }
1676      delete bi;
1677  
1678      bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1679      TEST_ASSERT_SUCCESS(status);
1680      if (U_SUCCESS(status)) {
1681          runUnicodeTestData("LineBreakTest.txt", bi);
1682      }
1683      delete bi;
1684  }
1685  
1686  
1687  // Check for test cases from the Unicode test data files that are known to fail
1688  // and should be skipped because ICU is not yet able to fully implement the spec.
1689  // See ticket #7270.
1690  
testCaseIsKnownIssue(const UnicodeString & testCase,const char * fileName)1691  UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1692      static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1693          {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1694          {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1695          {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1696          {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1697          {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1698          {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1699      };
1700      if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1701          return FALSE;
1702      }
1703  
1704      for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1705          if (testCase == UnicodeString(badTestCases[i])) {
1706              return logKnownIssue("7270");
1707          }
1708      }
1709      return FALSE;
1710  }
1711  
1712  
1713  //--------------------------------------------------------------------------------------------
1714  //
1715  //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1716  //
1717  //-------------------------------------------------------------------------------------------
runUnicodeTestData(const char * fileName,RuleBasedBreakIterator * bi)1718  void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1719  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1720      UErrorCode  status = U_ZERO_ERROR;
1721  
1722      //
1723      //  Open and read the test data file, put it into a UnicodeString.
1724      //
1725      const char *testDataDirectory = IntlTest::getSourceTestData(status);
1726      char testFileName[1000];
1727      if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1728          dataerrln("Can't open test data.  Path too long.");
1729          return;
1730      }
1731      strcpy(testFileName, testDataDirectory);
1732      strcat(testFileName, fileName);
1733  
1734      logln("Opening data file %s\n", fileName);
1735  
1736      int    len;
1737      UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1738      if (status != U_FILE_ACCESS_ERROR) {
1739          TEST_ASSERT_SUCCESS(status);
1740          TEST_ASSERT(testFile != NULL);
1741      }
1742      if (U_FAILURE(status) || testFile == NULL) {
1743          return; /* something went wrong, error already output */
1744      }
1745      UnicodeString testFileAsString(TRUE, testFile, len);
1746  
1747      //
1748      //  Parse the test data file using a regular expression.
1749      //  Each kind of token is recognized in its own capture group; what type of item was scanned
1750      //     is identified by which group had a match.
1751      //
1752      //    Caputure Group #                  1          2            3            4           5
1753      //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1754      //
1755      UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1756      RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1757      UnicodeString   testString;
1758      UVector32       breakPositions(status);
1759      int             lineNumber = 1;
1760      TEST_ASSERT_SUCCESS(status);
1761      if (U_FAILURE(status)) {
1762          return;
1763      }
1764  
1765      //
1766      //  Scan through each test case, building up the string to be broken in testString,
1767      //   and the positions that should be boundaries in the breakPositions vector.
1768      //
1769      int spin = 0;
1770      while (tokenMatcher.find()) {
1771        	if(tokenMatcher.hitEnd()) {
1772            /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1773               This occurred when the text file was corrupt (wasn't marked as UTF-8)
1774               and caused an infinite loop here on EBCDIC systems!
1775            */
1776            fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1777            //	   return;
1778        	}
1779          if (tokenMatcher.start(1, status) >= 0) {
1780              // Scanned a divide sign, indicating a break position in the test data.
1781              if (testString.length()>0) {
1782                  breakPositions.addElement(testString.length(), status);
1783              }
1784          }
1785          else if (tokenMatcher.start(2, status) >= 0) {
1786              // Scanned an 'x', meaning no break at this position in the test data
1787              //   Nothing to be done here.
1788              }
1789          else if (tokenMatcher.start(3, status) >= 0) {
1790              // Scanned Hex digits.  Convert them to binary, append to the character data string.
1791              const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1792              int length = hexNumber.length();
1793              if (length<=8) {
1794                  char buf[10];
1795                  hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1796                  UChar32 c = (UChar32)strtol(buf, NULL, 16);
1797                  if (c<=0x10ffff) {
1798                      testString.append(c);
1799                  } else {
1800                      errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1801                         fileName, lineNumber);
1802                  }
1803              } else {
1804                  errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1805                         fileName, lineNumber);
1806               }
1807          }
1808          else if (tokenMatcher.start(4, status) >= 0) {
1809              // Scanned to end of a line, possibly skipping over a comment in the process.
1810              //   If the line from the file contained test data, run the test now.
1811              if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1812                  checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1813              }
1814  
1815              // Clear out this test case.
1816              //    The string and breakPositions vector will be refilled as the next
1817              //       test case is parsed.
1818              testString.remove();
1819              breakPositions.removeAllElements();
1820              lineNumber++;
1821          } else {
1822              // Scanner catchall.  Something unrecognized appeared on the line.
1823              char token[16];
1824              UnicodeString uToken = tokenMatcher.group(0, status);
1825              uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1826              token[sizeof(token)-1] = 0;
1827              errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1828  
1829              // Clean up, in preparation for continuing with the next line.
1830              testString.remove();
1831              breakPositions.removeAllElements();
1832              lineNumber++;
1833          }
1834          TEST_ASSERT_SUCCESS(status);
1835          if (U_FAILURE(status)) {
1836              break;
1837          }
1838      }
1839  
1840      delete [] testFile;
1841   #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1842  }
1843  
1844  //--------------------------------------------------------------------------------------------
1845  //
1846  //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1847  //                            test data files.  Do only a simple, forward-only check -
1848  //                            this test is mostly to check that ICU and the Unicode
1849  //                            data agree with each other.
1850  //
1851  //--------------------------------------------------------------------------------------------
checkUnicodeTestCase(const char * testFileName,int lineNumber,const UnicodeString & testString,UVector32 * breakPositions,RuleBasedBreakIterator * bi)1852  void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1853                           const UnicodeString &testString,   // Text data to be broken
1854                           UVector32 *breakPositions,         // Positions where breaks should be found.
1855                           RuleBasedBreakIterator *bi) {
1856      int32_t pos;                 // Break Position in the test string
1857      int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1858      int32_t expectedPos;         // Expected break position (index into test string)
1859  
1860      bi->setText(testString);
1861      pos = bi->first();
1862      pos = bi->next();
1863  
1864      while (pos != BreakIterator::DONE) {
1865          if (expectedI >= breakPositions->size()) {
1866              errln("Test file \"%s\", line %d, unexpected break found at position %d",
1867                  testFileName, lineNumber, pos);
1868              break;
1869          }
1870          expectedPos = breakPositions->elementAti(expectedI);
1871          if (pos < expectedPos) {
1872              errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873                  testFileName, lineNumber, pos);
1874              break;
1875          }
1876          if (pos > expectedPos) {
1877              errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1878                  testFileName, lineNumber, expectedPos);
1879              break;
1880          }
1881          pos = bi->next();
1882          expectedI++;
1883      }
1884  
1885      if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1886          errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1887              testFileName, lineNumber, breakPositions->elementAti(expectedI));
1888      }
1889  }
1890  
1891  
1892  
1893  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
1906  class RBBIMonkeyKind {
1907  public:
1908      // Return a UVector of UnicodeSets, representing the character classes used
1909      //   for this type of iterator.
1910      virtual  UVector  *charClasses() = 0;
1911  
1912      // Set the test text on which subsequent calls to next() will operate
1913      virtual  void      setText(const UnicodeString &s) = 0;
1914  
1915      // Find the next break postion, starting from the prev break position, or from zero.
1916      // Return -1 after reaching end of string.
1917      virtual  int32_t   next(int32_t i) = 0;
1918  
1919      virtual ~RBBIMonkeyKind();
1920      UErrorCode       deferredStatus;
1921  
1922  
1923  protected:
1924      RBBIMonkeyKind();
1925  
1926  private:
1927  };
1928  
RBBIMonkeyKind()1929  RBBIMonkeyKind::RBBIMonkeyKind() {
1930      deferredStatus = U_ZERO_ERROR;
1931  }
1932  
~RBBIMonkeyKind()1933  RBBIMonkeyKind::~RBBIMonkeyKind() {
1934  }
1935  
1936  
1937  //----------------------------------------------------------------------------------------
1938  //
1939  //   Random Numbers.  Similar to standard lib rand() and srand()
1940  //                    Not using library to
1941  //                      1.  Get same results on all platforms.
1942  //                      2.  Get access to current seed, to more easily reproduce failures.
1943  //
1944  //---------------------------------------------------------------------------------------
// Current state of the generator.  Kept at file scope so that it can be read
// back or reset in order to replay a sequence.
static uint32_t m_seed = 1;

// Advances the generator one step and returns a value in the range 0..32767,
// taken from bits 16..30 of the new state.  All arithmetic is unsigned 32 bit,
// so it wraps modulo 2^32.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245u + 12345u;
    return (m_seed >> 16) & 0x7FFFu;
}
1952  
1953  
1954  //
1955  // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
1956  //
// UnicodeSet pattern listing the Extended Pictographic code points.  The
// doubled backslashes put literal \uXXXX and \UXXXXXXXX escapes into the C
// string; they are left for the UnicodeSet pattern parser to resolve.
// Both the characters and the pointer are const: this is read-only data.
static const char * const gExtended_Pict = "["
    "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
    "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
    "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
    "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
    "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
    "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
    "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
    "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
    "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
    "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
    "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
    "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
    "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
    "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
    "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
    "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
    "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
    "]";
1978  
1979  //------------------------------------------------------------------------------------------
1980  //
1981  //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1982  //                             of RBBIMonkeyKind.
1983  //
1984  //------------------------------------------------------------------------------------------
1985  class RBBICharMonkey: public RBBIMonkeyKind {
1986  public:
1987      RBBICharMonkey();
1988      virtual          ~RBBICharMonkey();
1989      virtual  UVector *charClasses();
1990      virtual  void     setText(const UnicodeString &s);
1991      virtual  int32_t  next(int32_t i);
1992  private:
1993      UVector   *fSets;
1994  
1995      UnicodeSet  *fCRLFSet;
1996      UnicodeSet  *fControlSet;
1997      UnicodeSet  *fExtendSet;
1998      UnicodeSet  *fZWJSet;
1999      UnicodeSet  *fRegionalIndicatorSet;
2000      UnicodeSet  *fPrependSet;
2001      UnicodeSet  *fSpacingSet;
2002      UnicodeSet  *fLSet;
2003      UnicodeSet  *fVSet;
2004      UnicodeSet  *fTSet;
2005      UnicodeSet  *fLVSet;
2006      UnicodeSet  *fLVTSet;
2007      UnicodeSet  *fHangulSet;
2008      UnicodeSet  *fEmojiBaseSet;
2009      UnicodeSet  *fEmojiModifierSet;
2010      UnicodeSet  *fExtendedPictSet;
2011      UnicodeSet  *fEBGSet;
2012      UnicodeSet  *fEmojiNRKSet;
2013      UnicodeSet  *fAnySet;
2014  
2015      const UnicodeString *fText;
2016  };
2017  
2018  
RBBICharMonkey()2019  RBBICharMonkey::RBBICharMonkey() {
2020      UErrorCode  status = U_ZERO_ERROR;
2021  
2022      fText = NULL;
2023  
2024      fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2025      fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2026      fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2027      fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2028      fRegionalIndicatorSet =
2029                    new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2030      fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2031      fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2032      fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2033      fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2034      fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2035      fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2036      fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2037      fHangulSet  = new UnicodeSet();
2038      fHangulSet->addAll(*fLSet);
2039      fHangulSet->addAll(*fVSet);
2040      fHangulSet->addAll(*fTSet);
2041      fHangulSet->addAll(*fLVSet);
2042      fHangulSet->addAll(*fLVTSet);
2043  
2044      fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2045      fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2046      fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2047      fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2048      fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2049                  "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2050      fAnySet           = new UnicodeSet(0, 0x10ffff);
2051  
2052      fSets             = new UVector(status);
2053      fSets->addElement(fCRLFSet,    status);
2054      fSets->addElement(fControlSet, status);
2055      fSets->addElement(fExtendSet,  status);
2056      fSets->addElement(fRegionalIndicatorSet, status);
2057      if (!fPrependSet->isEmpty()) {
2058          fSets->addElement(fPrependSet, status);
2059      }
2060      fSets->addElement(fSpacingSet, status);
2061      fSets->addElement(fHangulSet,  status);
2062      fSets->addElement(fAnySet,     status);
2063      fSets->addElement(fEmojiBaseSet, status);
2064      fSets->addElement(fEmojiModifierSet, status);
2065      fSets->addElement(fZWJSet,     status);
2066      fSets->addElement(fExtendedPictSet, status);
2067      fSets->addElement(fEBGSet,     status);
2068      fSets->addElement(fEmojiNRKSet,status);
2069      if (U_FAILURE(status)) {
2070          deferredStatus = status;
2071      }
2072  }
2073  
2074  
setText(const UnicodeString & s)2075  void RBBICharMonkey::setText(const UnicodeString &s) {
2076      fText = &s;
2077  }
2078  
2079  
2080  
next(int32_t prevPos)2081  int32_t RBBICharMonkey::next(int32_t prevPos) {
2082      int    p0, p1, p2, p3;    // Indices of the significant code points around the
2083                                //   break position being tested.  The candidate break
2084                                //   location is before p2.
2085  
2086      int     breakPos = -1;
2087  
2088      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2089      UChar32 cBase;            // for (X Extend*) patterns, the X character.
2090  
2091      if (U_FAILURE(deferredStatus)) {
2092          return -1;
2093      }
2094  
2095      // Previous break at end of string.  return DONE.
2096      if (prevPos >= fText->length()) {
2097          return -1;
2098      }
2099      p0 = p1 = p2 = p3 = prevPos;
2100      c3 =  fText->char32At(prevPos);
2101      c0 = c1 = c2 = cBase = 0;
2102      (void)p0;   // suppress set but not used warning.
2103      (void)c0;
2104  
2105      // Loop runs once per "significant" character position in the input text.
2106      for (;;) {
2107          // Move all of the positions forward in the input string.
2108          p0 = p1;  c0 = c1;
2109          p1 = p2;  c1 = c2;
2110          p2 = p3;  c2 = c3;
2111  
2112          // Advancd p3 by one codepoint
2113          p3 = fText->moveIndex32(p3, 1);
2114          c3 = fText->char32At(p3);
2115  
2116          if (p1 == p2) {
2117              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2118              continue;
2119          }
2120          if (p2 == fText->length()) {
2121              // Reached end of string.  Always a break position.
2122              break;
2123          }
2124  
2125          // Rule  GB3   CR x LF
2126          //     No Extend or Format characters may appear between the CR and LF,
2127          //     which requires the additional check for p2 immediately following p1.
2128          //
2129          if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2130              continue;
2131          }
2132  
2133          // Rule (GB4).   ( Control | CR | LF ) <break>
2134          if (fControlSet->contains(c1) ||
2135              c1 == 0x0D ||
2136              c1 == 0x0A)  {
2137              break;
2138          }
2139  
2140          // Rule (GB5)    <break>  ( Control | CR | LF )
2141          //
2142          if (fControlSet->contains(c2) ||
2143              c2 == 0x0D ||
2144              c2 == 0x0A)  {
2145              break;
2146          }
2147  
2148  
2149          // Rule (GB6)  L x ( L | V | LV | LVT )
2150          if (fLSet->contains(c1) &&
2151                 (fLSet->contains(c2)  ||
2152                  fVSet->contains(c2)  ||
2153                  fLVSet->contains(c2) ||
2154                  fLVTSet->contains(c2))) {
2155              continue;
2156          }
2157  
2158          // Rule (GB7)    ( LV | V )  x  ( V | T )
2159          if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2160              (fVSet->contains(c2) || fTSet->contains(c2)))  {
2161              continue;
2162          }
2163  
2164          // Rule (GB8)    ( LVT | T)  x T
2165          if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2166              fTSet->contains(c2))  {
2167              continue;
2168          }
2169  
2170          // Rule (GB9)    x (Extend | ZWJ)
2171          if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
2172              if (!fExtendSet->contains(c1)) {
2173                  cBase = c1;
2174              }
2175              continue;
2176          }
2177  
2178          // Rule (GB9a)   x  SpacingMark
2179          if (fSpacingSet->contains(c2)) {
2180              continue;
2181          }
2182  
2183          // Rule (GB9b)   Prepend x
2184          if (fPrependSet->contains(c1)) {
2185              continue;
2186          }
2187  
2188          // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
2189          if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2190              continue;
2191          }
2192          if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
2193                  fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2194              continue;
2195          }
2196  
2197          // Rule (GB11)   (Glue_After_ZWJ | Emoji) ZWJ x (Glue_After_ZWJ | Emoji)
2198          if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
2199                  (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2200              continue;
2201          }
2202  
2203          // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
2204          //                   Note: The first if condition is a little tricky. We only need to force
2205          //                      a break if there are three or more contiguous RIs. If there are
2206          //                      only two, a break following will occur via other rules, and will include
2207          //                      any trailing extend characters, which is needed behavior.
2208          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2209                  && fRegionalIndicatorSet->contains(c2)) {
2210              break;
2211          }
2212          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2213              continue;
2214          }
2215  
2216          // Rule (GB999)  Any  <break>  Any
2217          break;
2218      }
2219  
2220      breakPos = p2;
2221      return breakPos;
2222  }
2223  
2224  
2225  
charClasses()2226  UVector  *RBBICharMonkey::charClasses() {
2227      return fSets;
2228  }
2229  
2230  
~RBBICharMonkey()2231  RBBICharMonkey::~RBBICharMonkey() {
2232      delete fSets;
2233      delete fCRLFSet;
2234      delete fControlSet;
2235      delete fExtendSet;
2236      delete fRegionalIndicatorSet;
2237      delete fPrependSet;
2238      delete fSpacingSet;
2239      delete fLSet;
2240      delete fVSet;
2241      delete fTSet;
2242      delete fLVSet;
2243      delete fLVTSet;
2244      delete fHangulSet;
2245      delete fAnySet;
2246      delete fEmojiBaseSet;
2247      delete fEmojiModifierSet;
2248      delete fZWJSet;
2249      delete fExtendedPictSet;
2250      delete fEBGSet;
2251      delete fEmojiNRKSet;
2252  }
2253  
2254  //------------------------------------------------------------------------------------------
2255  //
2256  //   class RBBIWordMonkey      Word Break specific implementation
2257  //                             of RBBIMonkeyKind.
2258  //
2259  //------------------------------------------------------------------------------------------
2260  class RBBIWordMonkey: public RBBIMonkeyKind {
2261  public:
2262      RBBIWordMonkey();
2263      virtual          ~RBBIWordMonkey();
2264      virtual  UVector *charClasses();
2265      virtual  void     setText(const UnicodeString &s);
2266      virtual int32_t   next(int32_t i);
2267  private:
2268      UVector      *fSets;
2269  
2270      UnicodeSet  *fCRSet;
2271      UnicodeSet  *fLFSet;
2272      UnicodeSet  *fNewlineSet;
2273      UnicodeSet  *fRegionalIndicatorSet;
2274      UnicodeSet  *fKatakanaSet;
2275      UnicodeSet  *fHebrew_LetterSet;
2276      UnicodeSet  *fALetterSet;
2277      UnicodeSet  *fSingle_QuoteSet;
2278      UnicodeSet  *fDouble_QuoteSet;
2279      UnicodeSet  *fMidNumLetSet;
2280      UnicodeSet  *fMidLetterSet;
2281      UnicodeSet  *fMidNumSet;
2282      UnicodeSet  *fNumericSet;
2283      UnicodeSet  *fFormatSet;
2284      UnicodeSet  *fOtherSet;
2285      UnicodeSet  *fExtendSet;
2286      UnicodeSet  *fExtendNumLetSet;
2287      UnicodeSet  *fDictionarySet;
2288      UnicodeSet  *fEBaseSet;
2289      UnicodeSet  *fEBGSet;
2290      UnicodeSet  *fEModifierSet;
2291      UnicodeSet  *fZWJSet;
2292      UnicodeSet  *fExtendedPictSet;
2293      UnicodeSet  *fEmojiNRKSet;
2294  
2295      const UnicodeString  *fText;
2296  };
2297  
2298  
RBBIWordMonkey()2299  RBBIWordMonkey::RBBIWordMonkey()
2300  {
2301      UErrorCode  status = U_ZERO_ERROR;
2302  
2303      fSets            = new UVector(status);
2304  
2305      fCRSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2306      fLFSet            = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2307      fNewlineSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2308      fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2309      fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2310      fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2311      fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2312      fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2313      fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2314      fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2315      fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2316      fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2317      fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2318      fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2319      fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2320      fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2321  
2322      fEBaseSet         = new UnicodeSet(UNICODE_STRING_SIMPLE(
2323              "[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
2324      fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"),          status);
2325      fEModifierSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"),           status);
2326      fZWJSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"),          status);
2327      fExtendedPictSet  = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
2328      fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2329              "[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2330  
2331      fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
2332      fDictionarySet->addAll(*fKatakanaSet);
2333      fDictionarySet->addAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2334  
2335      fALetterSet->removeAll(*fDictionarySet);
2336  
2337      fOtherSet        = new UnicodeSet();
2338      if(U_FAILURE(status)) {
2339        deferredStatus = status;
2340        return;
2341      }
2342  
2343      fOtherSet->complement();
2344      fOtherSet->removeAll(*fCRSet);
2345      fOtherSet->removeAll(*fLFSet);
2346      fOtherSet->removeAll(*fNewlineSet);
2347      fOtherSet->removeAll(*fKatakanaSet);
2348      fOtherSet->removeAll(*fHebrew_LetterSet);
2349      fOtherSet->removeAll(*fALetterSet);
2350      fOtherSet->removeAll(*fSingle_QuoteSet);
2351      fOtherSet->removeAll(*fDouble_QuoteSet);
2352      fOtherSet->removeAll(*fMidLetterSet);
2353      fOtherSet->removeAll(*fMidNumSet);
2354      fOtherSet->removeAll(*fNumericSet);
2355      fOtherSet->removeAll(*fExtendNumLetSet);
2356      fOtherSet->removeAll(*fFormatSet);
2357      fOtherSet->removeAll(*fExtendSet);
2358      fOtherSet->removeAll(*fRegionalIndicatorSet);
2359      fOtherSet->removeAll(*fEBaseSet);
2360      fOtherSet->removeAll(*fEBGSet);
2361      fOtherSet->removeAll(*fEModifierSet);
2362      fOtherSet->removeAll(*fZWJSet);
2363      fOtherSet->removeAll(*fExtendedPictSet);
2364      fOtherSet->removeAll(*fEmojiNRKSet);
2365  
2366      // Inhibit dictionary characters from being tested at all.
2367      fOtherSet->removeAll(*fDictionarySet);
2368  
2369      fSets->addElement(fCRSet,                status);
2370      fSets->addElement(fLFSet,                status);
2371      fSets->addElement(fNewlineSet,           status);
2372      fSets->addElement(fRegionalIndicatorSet, status);
2373      fSets->addElement(fHebrew_LetterSet,     status);
2374      fSets->addElement(fALetterSet,           status);
2375      fSets->addElement(fSingle_QuoteSet,      status);
2376      fSets->addElement(fDouble_QuoteSet,      status);
2377      //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
2378                                                          // from the test data. They are all in the dictionary set,
2379                                                          // which this (old, to be retired) monkey test cannot handle.
2380      fSets->addElement(fMidLetterSet,         status);
2381      fSets->addElement(fMidNumLetSet,         status);
2382      fSets->addElement(fMidNumSet,            status);
2383      fSets->addElement(fNumericSet,           status);
2384      fSets->addElement(fFormatSet,            status);
2385      fSets->addElement(fExtendSet,            status);
2386      fSets->addElement(fOtherSet,             status);
2387      fSets->addElement(fExtendNumLetSet,      status);
2388  
2389      fSets->addElement(fEBaseSet,             status);
2390      fSets->addElement(fEBGSet,               status);
2391      fSets->addElement(fEModifierSet,         status);
2392      fSets->addElement(fZWJSet,               status);
2393      fSets->addElement(fExtendedPictSet,      status);
2394      fSets->addElement(fEmojiNRKSet,          status);
2395  
2396      if (U_FAILURE(status)) {
2397          deferredStatus = status;
2398      }
2399  }
2400  
setText(const UnicodeString & s)2401  void RBBIWordMonkey::setText(const UnicodeString &s) {
2402      fText       = &s;
2403  }
2404  
2405  
next(int32_t prevPos)2406  int32_t RBBIWordMonkey::next(int32_t prevPos) {
2407      int    p0, p1, p2, p3;    // Indices of the significant code points around the
2408                                //   break position being tested.  The candidate break
2409                                //   location is before p2.
2410  
2411      int     breakPos = -1;
2412  
2413      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2414  
2415      if (U_FAILURE(deferredStatus)) {
2416          return -1;
2417      }
2418  
2419      // Prev break at end of string.  return DONE.
2420      if (prevPos >= fText->length()) {
2421          return -1;
2422      }
2423      p0 = p1 = p2 = p3 = prevPos;
2424      c3 =  fText->char32At(prevPos);
2425      c0 = c1 = c2 = 0;
2426      (void)p0;       // Suppress set but not used warning.
2427  
2428      // Loop runs once per "significant" character position in the input text.
2429      for (;;) {
2430          // Move all of the positions forward in the input string.
2431          p0 = p1;  c0 = c1;
2432          p1 = p2;  c1 = c2;
2433          p2 = p3;  c2 = c3;
2434  
2435          // Advancd p3 by    X(Extend | Format)*   Rule 4
2436          //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2437          do {
2438              p3 = fText->moveIndex32(p3, 1);
2439              c3 = fText->char32At(p3);
2440              if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2441                 break;
2442              };
2443          }
2444          while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2445  
2446  
2447          if (p1 == p2) {
2448              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2449              continue;
2450          }
2451          if (p2 == fText->length()) {
2452              // Reached end of string.  Always a break position.
2453              break;
2454          }
2455  
2456          // Rule  (3)   CR x LF
2457          //     No Extend or Format characters may appear between the CR and LF,
2458          //     which requires the additional check for p2 immediately following p1.
2459          //
2460          if (c1==0x0D && c2==0x0A) {
2461              continue;
2462          }
2463  
2464          // Rule (3a)  Break before and after newlines (including CR and LF)
2465          //
2466          if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2467              break;
2468          };
2469          if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2470              break;
2471          };
2472  
2473          // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
2474          //              Not ignoring extend chars, so peek into input text to
2475          //              get the potential ZWJ, the character immediately preceding c2.
2476          //              Sloppy UChar32 indexing: p2-1 may reference trail half
2477          //              but char32At will get the full code point.
2478          if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2479              continue;
2480          }
2481  
2482          // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2483          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2484              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2485              continue;
2486          }
2487  
2488          // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2489          //
2490          if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2491               (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2492               (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2493              continue;
2494          }
2495  
2496          // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2497          if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2498              (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2499              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2500              continue;
2501          }
2502  
2503          // Rule (7a)     Hebrew_Letter x Single_Quote
2504          if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2505              continue;
2506          }
2507  
2508          // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2509          if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2510              continue;
2511          }
2512  
2513          // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2514          if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2515              continue;
2516          }
2517  
2518          // Rule (8)    Numeric x Numeric
2519          if (fNumericSet->contains(c1) &&
2520              fNumericSet->contains(c2))  {
2521              continue;
2522          }
2523  
2524          // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2525          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2526              fNumericSet->contains(c2))  {
2527              continue;
2528          }
2529  
2530          // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2531          if (fNumericSet->contains(c1) &&
2532              (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2533              continue;
2534          }
2535  
2536          // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2537          if (fNumericSet->contains(c0) &&
2538              (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2539              fNumericSet->contains(c2)) {
2540              continue;
2541          }
2542  
2543          // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2544          if (fNumericSet->contains(c1) &&
2545              (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2546              fNumericSet->contains(c3)) {
2547              continue;
2548          }
2549  
2550          // Rule (13)  Katakana x Katakana
2551          //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2552          //                  all Katakana are handled by the dictionary breaker.
2553          if (fKatakanaSet->contains(c1) &&
2554              fKatakanaSet->contains(c2))  {
2555              continue;
2556          }
2557  
2558          // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2559          if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2560               fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2561               fExtendNumLetSet->contains(c2)) {
2562                  continue;
2563          }
2564  
2565          // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2566          if (fExtendNumLetSet->contains(c1) &&
2567                  (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2568                   fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2569              continue;
2570          }
2571  
2572          // WB 14  (E_Base | EBG) x E_Modifier
2573          if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
2574              continue;
2575          }
2576  
2577          // Rule 15 - 17   Group pairs of Regional Indicators.
2578          if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2579              break;
2580          }
2581          if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2582              continue;
2583          }
2584  
2585          // Rule 999.  Break found here.
2586          break;
2587      }
2588  
2589      breakPos = p2;
2590      return breakPos;
2591  }
2592  
2593  
charClasses()2594  UVector  *RBBIWordMonkey::charClasses() {
2595      return fSets;
2596  }
2597  
2598  
~RBBIWordMonkey()2599  RBBIWordMonkey::~RBBIWordMonkey() {
2600      delete fSets;
2601      delete fCRSet;
2602      delete fLFSet;
2603      delete fNewlineSet;
2604      delete fKatakanaSet;
2605      delete fHebrew_LetterSet;
2606      delete fALetterSet;
2607      delete fSingle_QuoteSet;
2608      delete fDouble_QuoteSet;
2609      delete fMidNumLetSet;
2610      delete fMidLetterSet;
2611      delete fMidNumSet;
2612      delete fNumericSet;
2613      delete fFormatSet;
2614      delete fExtendSet;
2615      delete fExtendNumLetSet;
2616      delete fRegionalIndicatorSet;
2617      delete fDictionarySet;
2618      delete fOtherSet;
2619      delete fEBaseSet;
2620      delete fEBGSet;
2621      delete fEModifierSet;
2622      delete fZWJSet;
2623      delete fExtendedPictSet;
2624      delete fEmojiNRKSet;
2625  }
2626  
2627  
2628  
2629  
2630  //------------------------------------------------------------------------------------------
2631  //
2632  //   class RBBISentMonkey      Sentence Break specific implementation
2633  //                             of RBBIMonkeyKind.
2634  //
2635  //------------------------------------------------------------------------------------------
2636  class RBBISentMonkey: public RBBIMonkeyKind {
2637  public:
2638      RBBISentMonkey();
2639      virtual          ~RBBISentMonkey();
2640      virtual  UVector *charClasses();
2641      virtual  void     setText(const UnicodeString &s);
2642      virtual int32_t   next(int32_t i);
2643  private:
2644      int               moveBack(int posFrom);
2645      int               moveForward(int posFrom);
2646      UChar32           cAt(int pos);
2647  
2648      UVector      *fSets;
2649  
2650      UnicodeSet  *fSepSet;
2651      UnicodeSet  *fFormatSet;
2652      UnicodeSet  *fSpSet;
2653      UnicodeSet  *fLowerSet;
2654      UnicodeSet  *fUpperSet;
2655      UnicodeSet  *fOLetterSet;
2656      UnicodeSet  *fNumericSet;
2657      UnicodeSet  *fATermSet;
2658      UnicodeSet  *fSContinueSet;
2659      UnicodeSet  *fSTermSet;
2660      UnicodeSet  *fCloseSet;
2661      UnicodeSet  *fOtherSet;
2662      UnicodeSet  *fExtendSet;
2663  
2664      const UnicodeString  *fText;
2665  
2666  };
2667  
RBBISentMonkey()2668  RBBISentMonkey::RBBISentMonkey()
2669  {
2670      UErrorCode  status = U_ZERO_ERROR;
2671  
2672      fSets            = new UVector(status);
2673  
2674      //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2675      //                       set and made into character classes of their own.  For the monkey impl,
2676      //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2677      fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2678      fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2679      fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2680      fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2681      fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2682      fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2683      fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2684      fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2685      fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2686      fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2687      fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2688      fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2689      fOtherSet        = new UnicodeSet();
2690  
2691      if(U_FAILURE(status)) {
2692        deferredStatus = status;
2693        return;
2694      }
2695  
2696      fOtherSet->complement();
2697      fOtherSet->removeAll(*fSepSet);
2698      fOtherSet->removeAll(*fFormatSet);
2699      fOtherSet->removeAll(*fSpSet);
2700      fOtherSet->removeAll(*fLowerSet);
2701      fOtherSet->removeAll(*fUpperSet);
2702      fOtherSet->removeAll(*fOLetterSet);
2703      fOtherSet->removeAll(*fNumericSet);
2704      fOtherSet->removeAll(*fATermSet);
2705      fOtherSet->removeAll(*fSContinueSet);
2706      fOtherSet->removeAll(*fSTermSet);
2707      fOtherSet->removeAll(*fCloseSet);
2708      fOtherSet->removeAll(*fExtendSet);
2709  
2710      fSets->addElement(fSepSet,       status);
2711      fSets->addElement(fFormatSet,    status);
2712      fSets->addElement(fSpSet,        status);
2713      fSets->addElement(fLowerSet,     status);
2714      fSets->addElement(fUpperSet,     status);
2715      fSets->addElement(fOLetterSet,   status);
2716      fSets->addElement(fNumericSet,   status);
2717      fSets->addElement(fATermSet,     status);
2718      fSets->addElement(fSContinueSet, status);
2719      fSets->addElement(fSTermSet,     status);
2720      fSets->addElement(fCloseSet,     status);
2721      fSets->addElement(fOtherSet,     status);
2722      fSets->addElement(fExtendSet,    status);
2723  
2724      if (U_FAILURE(status)) {
2725          deferredStatus = status;
2726      }
2727  }
2728  
2729  
2730  
setText(const UnicodeString & s)2731  void RBBISentMonkey::setText(const UnicodeString &s) {
2732      fText       = &s;
2733  }
2734  
charClasses()2735  UVector  *RBBISentMonkey::charClasses() {
2736      return fSets;
2737  }
2738  
2739  
2740  //  moveBack()   Find the "significant" code point preceding the index i.
2741  //               Skips over ($Extend | $Format)* .
2742  //
moveBack(int i)2743  int RBBISentMonkey::moveBack(int i) {
2744      if (i <= 0) {
2745          return -1;
2746      }
2747      UChar32   c;
2748      int32_t   j = i;
2749      do {
2750          j = fText->moveIndex32(j, -1);
2751          c = fText->char32At(j);
2752      }
2753      while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2754      return j;
2755  
2756   }
2757  
2758  
moveForward(int i)2759  int RBBISentMonkey::moveForward(int i) {
2760      if (i>=fText->length()) {
2761          return fText->length();
2762      }
2763      UChar32   c;
2764      int32_t   j = i;
2765      do {
2766          j = fText->moveIndex32(j, 1);
2767          c = cAt(j);
2768      }
2769      while (fFormatSet->contains(c) || fExtendSet->contains(c));
2770      return j;
2771  }
2772  
cAt(int pos)2773  UChar32 RBBISentMonkey::cAt(int pos) {
2774      if (pos<0 || pos>=fText->length()) {
2775          return -1;
2776      } else {
2777          return fText->char32At(pos);
2778      }
2779  }
2780  
next(int32_t prevPos)2781  int32_t RBBISentMonkey::next(int32_t prevPos) {
2782      int    p0, p1, p2, p3;    // Indices of the significant code points around the
2783                                //   break position being tested.  The candidate break
2784                                //   location is before p2.
2785  
2786      int     breakPos = -1;
2787  
2788      UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2789      UChar32 c;
2790  
2791      if (U_FAILURE(deferredStatus)) {
2792          return -1;
2793      }
2794  
2795      // Prev break at end of string.  return DONE.
2796      if (prevPos >= fText->length()) {
2797          return -1;
2798      }
2799      p0 = p1 = p2 = p3 = prevPos;
2800      c3 =  fText->char32At(prevPos);
2801      c0 = c1 = c2 = 0;
2802      (void)p0;     // Suppress set but not used warning.
2803  
2804      // Loop runs once per "significant" character position in the input text.
2805      for (;;) {
2806          // Move all of the positions forward in the input string.
2807          p0 = p1;  c0 = c1;
2808          p1 = p2;  c1 = c2;
2809          p2 = p3;  c2 = c3;
2810  
2811          // Advancd p3 by    X(Extend | Format)*   Rule 4
2812          p3 = moveForward(p3);
2813          c3 = cAt(p3);
2814  
2815          // Rule (3)  CR x LF
2816          if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2817              continue;
2818          }
2819  
2820          // Rule (4).   Sep  <break>
2821          if (fSepSet->contains(c1)) {
2822              p2 = p1+1;   // Separators don't combine with Extend or Format.
2823              break;
2824          }
2825  
2826          if (p2 >= fText->length()) {
2827              // Reached end of string.  Always a break position.
2828              break;
2829          }
2830  
2831          if (p2 == prevPos) {
2832              // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2833              continue;
2834          }
2835  
2836          // Rule (6).   ATerm x Numeric
2837          if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2838              continue;
2839          }
2840  
2841          // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2842          if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2843                  fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2844              continue;
2845          }
2846  
2847          // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2848          //           Note:  STerm | ATerm are added to the negated part of the expression by a
2849          //                  note to the Unicode 5.0 documents.
2850          int p8 = p1;
2851          while (fSpSet->contains(cAt(p8))) {
2852              p8 = moveBack(p8);
2853          }
2854          while (fCloseSet->contains(cAt(p8))) {
2855              p8 = moveBack(p8);
2856          }
2857          if (fATermSet->contains(cAt(p8))) {
2858              p8=p2;
2859              for (;;) {
2860                  c = cAt(p8);
2861                  if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2862                      fLowerSet->contains(c) || fSepSet->contains(c) ||
2863                      fATermSet->contains(c) || fSTermSet->contains(c))  {
2864                      break;
2865                  }
2866                  p8 = moveForward(p8);
2867              }
2868              if (fLowerSet->contains(cAt(p8))) {
2869                  continue;
2870              }
2871          }
2872  
2873          // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2874          if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2875              p8 = p1;
2876              while (fSpSet->contains(cAt(p8))) {
2877                  p8 = moveBack(p8);
2878              }
2879              while (fCloseSet->contains(cAt(p8))) {
2880                  p8 = moveBack(p8);
2881              }
2882              c = cAt(p8);
2883              if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2884                  continue;
2885              }
2886          }
2887  
2888          // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2889          int p9 = p1;
2890          while (fCloseSet->contains(cAt(p9))) {
2891              p9 = moveBack(p9);
2892          }
2893          c = cAt(p9);
2894          if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2895              if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2896                  continue;
2897              }
2898          }
2899  
2900          // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2901          int p10 = p1;
2902          while (fSpSet->contains(cAt(p10))) {
2903              p10 = moveBack(p10);
2904          }
2905          while (fCloseSet->contains(cAt(p10))) {
2906              p10 = moveBack(p10);
2907          }
2908          if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2909              if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2910                  continue;
2911              }
2912          }
2913  
2914          // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2915          int p11 = p1;
2916          if (fSepSet->contains(cAt(p11))) {
2917              p11 = moveBack(p11);
2918          }
2919          while (fSpSet->contains(cAt(p11))) {
2920              p11 = moveBack(p11);
2921          }
2922          while (fCloseSet->contains(cAt(p11))) {
2923              p11 = moveBack(p11);
2924          }
2925          if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2926              break;
2927          }
2928  
2929          //  Rule (12)  Any x Any
2930          continue;
2931      }
2932      breakPos = p2;
2933      return breakPos;
2934  }
2935  
~RBBISentMonkey()2936  RBBISentMonkey::~RBBISentMonkey() {
2937      delete fSets;
2938      delete fSepSet;
2939      delete fFormatSet;
2940      delete fSpSet;
2941      delete fLowerSet;
2942      delete fUpperSet;
2943      delete fOLetterSet;
2944      delete fNumericSet;
2945      delete fATermSet;
2946      delete fSContinueSet;
2947      delete fSTermSet;
2948      delete fCloseSet;
2949      delete fOtherSet;
2950      delete fExtendSet;
2951  }
2952  
2953  
2954  
2955  //-------------------------------------------------------------------------------------------
2956  //
2957  //  RBBILineMonkey
2958  //
2959  //-------------------------------------------------------------------------------------------
2960  
2961  class RBBILineMonkey: public RBBIMonkeyKind {
2962  public:
2963      RBBILineMonkey();
2964      virtual          ~RBBILineMonkey();
2965      virtual  UVector *charClasses();
2966      virtual  void     setText(const UnicodeString &s);
2967      virtual  int32_t  next(int32_t i);
2968      virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2969  private:
2970      UVector      *fSets;
2971  
2972      UnicodeSet  *fBK;
2973      UnicodeSet  *fCR;
2974      UnicodeSet  *fLF;
2975      UnicodeSet  *fCM;
2976      UnicodeSet  *fNL;
2977      UnicodeSet  *fSG;
2978      UnicodeSet  *fWJ;
2979      UnicodeSet  *fZW;
2980      UnicodeSet  *fGL;
2981      UnicodeSet  *fCB;
2982      UnicodeSet  *fSP;
2983      UnicodeSet  *fB2;
2984      UnicodeSet  *fBA;
2985      UnicodeSet  *fBB;
2986      UnicodeSet  *fHY;
2987      UnicodeSet  *fH2;
2988      UnicodeSet  *fH3;
2989      UnicodeSet  *fCL;
2990      UnicodeSet  *fCP;
2991      UnicodeSet  *fEX;
2992      UnicodeSet  *fIN;
2993      UnicodeSet  *fJL;
2994      UnicodeSet  *fJV;
2995      UnicodeSet  *fJT;
2996      UnicodeSet  *fNS;
2997      UnicodeSet  *fOP;
2998      UnicodeSet  *fQU;
2999      UnicodeSet  *fIS;
3000      UnicodeSet  *fNU;
3001      UnicodeSet  *fPO;
3002      UnicodeSet  *fPR;
3003      UnicodeSet  *fSY;
3004      UnicodeSet  *fAI;
3005      UnicodeSet  *fAL;
3006      UnicodeSet  *fCJ;
3007      UnicodeSet  *fHL;
3008      UnicodeSet  *fID;
3009      UnicodeSet  *fRI;
3010      UnicodeSet  *fXX;
3011      UnicodeSet  *fEB;
3012      UnicodeSet  *fEM;
3013      UnicodeSet  *fZJ;
3014      UnicodeSet  *fExtendedPict;
3015      UnicodeSet  *fEmojiNRK;
3016  
3017      BreakIterator        *fCharBI;
3018      const UnicodeString  *fText;
3019      RegexMatcher         *fNumberMatcher;
3020  };
3021  
RBBILineMonkey()3022  RBBILineMonkey::RBBILineMonkey() :
3023      RBBIMonkeyKind(),
3024      fSets(NULL),
3025  
3026      fCharBI(NULL),
3027      fText(NULL),
3028      fNumberMatcher(NULL)
3029  
3030  {
3031      if (U_FAILURE(deferredStatus)) {
3032          return;
3033      }
3034  
3035      UErrorCode  status = U_ZERO_ERROR;
3036  
3037      fSets  = new UVector(status);
3038  
3039      fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3040      fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3041      fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3042      fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3043      fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3044      fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3045      fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3046      fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3047      fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3048      fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3049      fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3050      fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3051      fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3052      fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3053      fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3054      fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3055      fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3056      fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3057      fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3058      fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3059      fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3060      fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3061      fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3062      fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3063      fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3064      fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3065      fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3066      fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3067      fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3068      fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3069      fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3070      fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3071      fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3072      fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3073      fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3074      fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3075      fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3076      fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3077      fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3078      fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE(
3079              "[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
3080      fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3081      fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3082      fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3083      fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
3084  
3085      if (U_FAILURE(status)) {
3086          deferredStatus = status;
3087          return;
3088      }
3089  
3090      fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3091      fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3092      fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3093  
3094      fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3095      fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
3096  
3097      fSets->addElement(fBK, status);
3098      fSets->addElement(fCR, status);
3099      fSets->addElement(fLF, status);
3100      fSets->addElement(fCM, status);
3101      fSets->addElement(fNL, status);
3102      fSets->addElement(fWJ, status);
3103      fSets->addElement(fZW, status);
3104      fSets->addElement(fGL, status);
3105      fSets->addElement(fCB, status);
3106      fSets->addElement(fSP, status);
3107      fSets->addElement(fB2, status);
3108      fSets->addElement(fBA, status);
3109      fSets->addElement(fBB, status);
3110      fSets->addElement(fHY, status);
3111      fSets->addElement(fH2, status);
3112      fSets->addElement(fH3, status);
3113      fSets->addElement(fCL, status);
3114      fSets->addElement(fCP, status);
3115      fSets->addElement(fEX, status);
3116      fSets->addElement(fIN, status);
3117      fSets->addElement(fJL, status);
3118      fSets->addElement(fJT, status);
3119      fSets->addElement(fJV, status);
3120      fSets->addElement(fNS, status);
3121      fSets->addElement(fOP, status);
3122      fSets->addElement(fQU, status);
3123      fSets->addElement(fIS, status);
3124      fSets->addElement(fNU, status);
3125      fSets->addElement(fPO, status);
3126      fSets->addElement(fPR, status);
3127      fSets->addElement(fSY, status);
3128      fSets->addElement(fAI, status);
3129      fSets->addElement(fAL, status);
3130      fSets->addElement(fHL, status);
3131      fSets->addElement(fID, status);
3132      fSets->addElement(fWJ, status);
3133      fSets->addElement(fRI, status);
3134      fSets->addElement(fSG, status);
3135      fSets->addElement(fEB, status);
3136      fSets->addElement(fEM, status);
3137      fSets->addElement(fZJ, status);
3138      fSets->addElement(fExtendedPict, status);
3139      fSets->addElement(fEmojiNRK, status);
3140  
3141  
3142      const char *rules =
3143              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3144              "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3145              "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3146              "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3147              "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3148              "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3149  
3150      fNumberMatcher = new RegexMatcher(
3151          UnicodeString(rules, -1, US_INV), 0, status);
3152  
3153      fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3154  
3155      if (U_FAILURE(status)) {
3156          deferredStatus = status;
3157      }
3158  }
3159  
3160  
setText(const UnicodeString & s)3161  void RBBILineMonkey::setText(const UnicodeString &s) {
3162      fText       = &s;
3163      fCharBI->setText(s);
3164      fNumberMatcher->reset(s);
3165  }
3166  
3167  //
3168  //  rule9Adjust
3169  //     Line Break TR rules 9 and 10 implementation.
3170  //     This deals with combining marks and other sequences that
3171  //     that must be treated as if they were something other than what they actually are.
3172  //
3173  //     This is factored out into a separate function because it must be applied twice for
3174  //     each potential break, once to the chars before the position being checked, then
3175  //     again to the text following the possible break.
3176  //
rule9Adjust(int32_t pos,UChar32 * posChar,int32_t * nextPos,UChar32 * nextChar)3177  void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3178      if (pos == -1) {
3179          // Invalid initial position.  Happens during the warmup iteration of the
3180          //   main loop in next().
3181          return;
3182      }
3183  
3184      int32_t  nPos = *nextPos;
3185  
3186      // LB 9  Keep combining sequences together.
3187      //  advance over any CM class chars.  Note that Line Break CM is different
3188      //  from the normal Grapheme Extend property.
3189      if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3190            *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3191          for (;;) {
3192              *nextChar = fText->char32At(nPos);
3193              if (!fCM->contains(*nextChar)) {
3194                  break;
3195              }
3196              nPos = fText->moveIndex32(nPos, 1);
3197          }
3198      }
3199  
3200  
3201      // LB 9 Treat X CM* as if it were x.
3202      //       No explicit action required.
3203  
3204      // LB 10  Treat any remaining combining mark as AL
3205      if (fCM->contains(*posChar)) {
3206          *posChar = 0x41;   // thisChar = 'A';
3207      }
3208  
3209      // Push the updated nextPos and nextChar back to our caller.
3210      // This only makes a difference if posChar got bigger by consuming a
3211      // combining sequence.
3212      *nextPos  = nPos;
3213      *nextChar = fText->char32At(nPos);
3214  }
3215  
3216  
3217  
next(int32_t startPos)3218  int32_t RBBILineMonkey::next(int32_t startPos) {
3219      UErrorCode status = U_ZERO_ERROR;
3220      int32_t    pos;       //  Index of the char following a potential break position
3221      UChar32    thisChar;  //  Character at above position "pos"
3222  
3223      int32_t    prevPos;   //  Index of the char preceding a potential break position
3224      UChar32    prevChar;  //  Character at above position.  Note that prevChar
3225                            //   and thisChar may not be adjacent because combining
3226                            //   characters between them will be ignored.
3227  
3228      int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3229      UChar32    prevCharX2;
3230  
3231      int32_t    nextPos;   //  Index of the next character following pos.
3232                            //     Usually skips over combining marks.
3233      int32_t    nextCPPos; //  Index of the code point following "pos."
3234                            //     May point to a combining mark.
3235      int32_t    tPos;      //  temp value.
3236      UChar32    c;
3237  
3238      if (U_FAILURE(deferredStatus)) {
3239          return -1;
3240      }
3241  
3242      if (startPos >= fText->length()) {
3243          return -1;
3244      }
3245  
3246  
3247      // Initial values for loop.  Loop will run the first time without finding breaks,
3248      //                           while the invalid values shift out and the "this" and
3249      //                           "prev" positions are filled in with good values.
3250      pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3251      thisChar = prevChar  = prevCharX2 = 0;
3252      nextPos  = nextCPPos = startPos;
3253  
3254  
3255      // Loop runs once per position in the test text, until a break position
3256      //  is found.
3257      for (;;) {
3258          prevPosX2 = prevPos;
3259          prevCharX2 = prevChar;
3260  
3261          prevPos   = pos;
3262          prevChar  = thisChar;
3263  
3264          pos       = nextPos;
3265          thisChar  = fText->char32At(pos);
3266  
3267          nextCPPos = fText->moveIndex32(pos, 1);
3268          nextPos   = nextCPPos;
3269  
3270          // Rule LB2 - Break at end of text.
3271          if (pos >= fText->length()) {
3272              break;
3273          }
3274  
3275          // Rule LB 9 - adjust for combining sequences.
3276          //             We do this one out-of-order because the adjustment does not change anything
3277          //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3278          //             be applied.
3279          rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3280          nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3281          c = fText->char32At(nextPos);
3282          rule9Adjust(pos,     &thisChar, &nextPos, &c);
3283  
3284          // If the loop is still warming up - if we haven't shifted the initial
3285          //   -1 positions out of prevPos yet - loop back to advance the
3286          //    position in the input without any further looking for breaks.
3287          if (prevPos == -1) {
3288              continue;
3289          }
3290  
3291          // LB 4  Always break after hard line breaks,
3292          if (fBK->contains(prevChar)) {
3293              break;
3294          }
3295  
3296          // LB 5  Break after CR, LF, NL, but not inside CR LF
3297          if (prevChar == 0x0d && thisChar == 0x0a) {
3298              continue;
3299          }
3300          if (prevChar == 0x0d ||
3301              prevChar == 0x0a ||
3302              prevChar == 0x85)  {
3303              break;
3304          }
3305  
3306          // LB 6  Don't break before hard line breaks
3307          if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3308              fBK->contains(thisChar)) {
3309                  continue;
3310          }
3311  
3312  
3313          // LB 7  Don't break before spaces or zero-width space.
3314          if (fSP->contains(thisChar)) {
3315              continue;
3316          }
3317  
3318          if (fZW->contains(thisChar)) {
3319              continue;
3320          }
3321  
3322          // LB 8  Break after zero width space
3323          if (fZW->contains(prevChar)) {
3324              break;
3325          }
3326  
3327          // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3328          //       The monkey test's way of ignoring combining characters doesn't work
3329          //       for this rule. ZJ is also a CM. Need to get the actual character
3330          //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3331          {
3332              int32_t prevIdx = fText->moveIndex32(pos, -1);
3333              UChar32 prevC = fText->char32At(prevIdx);
3334              if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3335                  continue;
3336              }
3337          }
3338  
3339          // LB 9, 10  Already done, at top of loop.
3340          //
3341  
3342  
3343          // LB 11  Do not break before or after WORD JOINER and related characters.
3344          //    x  WJ
3345          //    WJ  x
3346          //
3347          if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3348              continue;
3349          }
3350  
3351          // LB 12
3352          //    GL  x
3353          if (fGL->contains(prevChar)) {
3354              continue;
3355          }
3356  
3357          // LB 12a
3358          //    [^SP BA HY] x GL
3359          if (!(fSP->contains(prevChar) ||
3360                fBA->contains(prevChar) ||
3361                fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3362              continue;
3363          }
3364  
3365  
3366  
3367          // LB 13  Don't break before closings.
3368          //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3369          //        fall into LB 17 and the more general number regular expression.
3370          //
3371          if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3372              (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3373                                           fEX->contains(thisChar)  ||
3374              (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3375              (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3376              continue;
3377          }
3378  
3379          // LB 14 Don't break after OP SP*
3380          //       Scan backwards, checking for this sequence.
3381          //       The OP char could include combining marks, so we actually check for
3382          //           OP CM* SP*
3383          //       Another Twist: The Rule 67 fixes may have changed a SP CM
3384          //       sequence into a ID char, so before scanning back through spaces,
3385          //       verify that prevChar is indeed a space.  The prevChar variable
3386          //       may differ from fText[prevPos]
3387          tPos = prevPos;
3388          if (fSP->contains(prevChar)) {
3389              while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3390                  tPos=fText->moveIndex32(tPos, -1);
3391              }
3392          }
3393          while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3394              tPos=fText->moveIndex32(tPos, -1);
3395          }
3396          if (fOP->contains(fText->char32At(tPos))) {
3397              continue;
3398          }
3399  
3400  
3401          // LB 15    QU SP* x OP
3402          if (fOP->contains(thisChar)) {
3403              // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3404              int tPos = prevPos;
3405              while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3406                  tPos = fText->moveIndex32(tPos, -1);
3407              }
3408              while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3409                  tPos = fText->moveIndex32(tPos, -1);
3410              }
3411              if (fQU->contains(fText->char32At(tPos))) {
3412                  continue;
3413              }
3414          }
3415  
3416  
3417  
3418          // LB 16   (CL | CP) SP* x NS
3419          //    Scan backwards for SP* CM* (CL | CP)
3420          if (fNS->contains(thisChar)) {
3421              int tPos = prevPos;
3422              while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3423                  tPos = fText->moveIndex32(tPos, -1);
3424              }
3425              while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3426                  tPos = fText->moveIndex32(tPos, -1);
3427              }
3428              if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3429                  continue;
3430              }
3431          }
3432  
3433  
3434          // LB 17        B2 SP* x B2
3435          if (fB2->contains(thisChar)) {
3436              //  Scan backwards, checking for the B2 CM* SP* sequence.
3437              tPos = prevPos;
3438              if (fSP->contains(prevChar)) {
3439                  while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3440                      tPos=fText->moveIndex32(tPos, -1);
3441                  }
3442              }
3443              while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3444                  tPos=fText->moveIndex32(tPos, -1);
3445              }
3446              if (fB2->contains(fText->char32At(tPos))) {
3447                  continue;
3448              }
3449          }
3450  
3451  
3452          // LB 18    break after space
3453          if (fSP->contains(prevChar)) {
3454              break;
3455          }
3456  
3457          // LB 19
3458          //    x   QU
3459          //    QU  x
3460          if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3461              continue;
3462          }
3463  
3464          // LB 20  Break around a CB
3465          if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3466              break;
3467          }
3468  
3469          // LB 21
3470          if (fBA->contains(thisChar) ||
3471              fHY->contains(thisChar) ||
3472              fNS->contains(thisChar) ||
3473              fBB->contains(prevChar) )   {
3474              continue;
3475          }
3476  
3477          // LB 21a
3478          //   HL (HY | BA) x
3479          if (fHL->contains(prevCharX2) &&
3480                  (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3481              continue;
3482          }
3483  
3484          // LB 21b
3485          //   SY x HL
3486          if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3487              continue;
3488          }
3489  
3490          // LB 22
3491          if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3492              (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3493              (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3494              ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3495              (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3496              (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3497              continue;
3498          }
3499  
3500  
3501          // LB 23    (AL | HL) x NU
3502          //          NU x (AL | HL)
3503          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3504              continue;
3505          }
3506          if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3507              continue;
3508          }
3509  
3510          // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3511          //      PR x (ID | EB | EM)
3512          //     (ID | EB | EM) x PO
3513          if (fPR->contains(prevChar) &&
3514                  (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3515              continue;
3516          }
3517          if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3518                  fPO->contains(thisChar)) {
3519              continue;
3520          }
3521  
3522          // LB 24  Do not break between prefix and letters or ideographs.
3523          //         (PR | PO) x (AL | HL)
3524          //         (AL | HL) x (PR | PO)
3525          if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3526                  (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3527              continue;
3528          }
3529          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3530                  (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3531              continue;
3532          }
3533  
3534  
3535  
3536          // LB 25    Numbers
3537          if (fNumberMatcher->lookingAt(prevPos, status)) {
3538              if (U_FAILURE(status)) {
3539                  break;
3540              }
3541              // Matched a number.  But could have been just a single digit, which would
3542              //    not represent a "no break here" between prevChar and thisChar
3543              int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3544              if (numEndIdx > pos) {
3545                  // Number match includes at least our two chars being checked
3546                  if (numEndIdx > nextPos) {
3547                      // Number match includes additional chars.  Update pos and nextPos
3548                      //   so that next loop iteration will continue at the end of the number,
3549                      //   checking for breaks between last char in number & whatever follows.
3550                      pos = nextPos = numEndIdx;
3551                      do {
3552                          pos = fText->moveIndex32(pos, -1);
3553                          thisChar = fText->char32At(pos);
3554                      } while (fCM->contains(thisChar));
3555                  }
3556                  continue;
3557              }
3558          }
3559  
3560  
3561          // LB 26 Do not break a Korean syllable.
3562          if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3563                                          fJV->contains(thisChar) ||
3564                                          fH2->contains(thisChar) ||
3565                                          fH3->contains(thisChar))) {
3566                                              continue;
3567                                          }
3568  
3569          if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3570              (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3571                  continue;
3572          }
3573  
3574          if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3575              fJT->contains(thisChar)) {
3576                  continue;
3577          }
3578  
3579          // LB 27 Treat a Korean Syllable Block the same as ID.
3580          if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3581              fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3582              fIN->contains(thisChar)) {
3583                  continue;
3584              }
3585          if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3586              fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3587              fPO->contains(thisChar)) {
3588                  continue;
3589              }
3590          if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3591              fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3592                  continue;
3593              }
3594  
3595  
3596  
3597          // LB 28  Do not break between alphabetics ("at").
3598          if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3599              continue;
3600          }
3601  
3602          // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3603          if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3604              continue;
3605          }
3606  
3607          // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3608          //          (AL | NU) x OP
3609          //          CP x (AL | NU)
3610          if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3611              continue;
3612          }
3613          if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3614              continue;
3615          }
3616  
3617          // LB30a    RI RI <break> RI
3618          //             RI    x    RI
3619          if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3620              break;
3621          }
3622          if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3623              continue;
3624          }
3625  
3626          // LB30b    Emoji Base x Emoji Modifier
3627          if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3628              continue;
3629          }
3630  
3631          // LB 31    Break everywhere else
3632          break;
3633  
3634      }
3635  
3636      return pos;
3637  }
3638  
3639  
charClasses()3640  UVector  *RBBILineMonkey::charClasses() {
3641      return fSets;
3642  }
3643  
3644  
~RBBILineMonkey()3645  RBBILineMonkey::~RBBILineMonkey() {
3646      delete fSets;
3647  
3648      delete fBK;
3649      delete fCR;
3650      delete fLF;
3651      delete fCM;
3652      delete fNL;
3653      delete fWJ;
3654      delete fZW;
3655      delete fGL;
3656      delete fCB;
3657      delete fSP;
3658      delete fB2;
3659      delete fBA;
3660      delete fBB;
3661      delete fHY;
3662      delete fH2;
3663      delete fH3;
3664      delete fCL;
3665      delete fCP;
3666      delete fEX;
3667      delete fIN;
3668      delete fJL;
3669      delete fJV;
3670      delete fJT;
3671      delete fNS;
3672      delete fOP;
3673      delete fQU;
3674      delete fIS;
3675      delete fNU;
3676      delete fPO;
3677      delete fPR;
3678      delete fSY;
3679      delete fAI;
3680      delete fAL;
3681      delete fCJ;
3682      delete fHL;
3683      delete fID;
3684      delete fRI;
3685      delete fSG;
3686      delete fXX;
3687      delete fEB;
3688      delete fEM;
3689      delete fZJ;
3690      delete fExtendedPict;
3691      delete fEmojiNRK;
3692  
3693      delete fCharBI;
3694      delete fNumberMatcher;
3695  }
3696  
3697  
3698  //-------------------------------------------------------------------------------------------
3699  //
3700  //   TestMonkey
3701  //
3702  //     params
3703  //       seed=nnnnn        Random number starting seed.
3704  //                         Setting the seed allows errors to be reproduced.
3705  //       loop=nnn          Looping count.  Controls running time.
3706  //                         -1:  run forever.
3707  //                          0 or greater:  run length.
3708  //
3709  //       type = char | word | line | sent | title
3710  //
3711  //  Example:
3712  //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3713  //
3714  //-------------------------------------------------------------------------------------------
3715  
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)3716  static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3717      int32_t val = defaultVal;
3718      name.append(" *= *(-?\\d+)");
3719      UErrorCode status = U_ZERO_ERROR;
3720      RegexMatcher m(name, params, 0, status);
3721      if (m.find()) {
3722          // The param exists.  Convert the string to an int.
3723          char valString[100];
3724          int32_t paramLength = m.end(1, status) - m.start(1, status);
3725          if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3726              paramLength = (int32_t)(sizeof(valString)-2);
3727          }
3728          params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3729          val = strtol(valString,  NULL, 10);
3730  
3731          // Delete this parameter from the params string.
3732          m.reset();
3733          params = m.replaceFirst("", status);
3734      }
3735      U_ASSERT(U_SUCCESS(status));
3736      return val;
3737  }
3738  #endif
3739  
3740  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
testBreakBoundPreceding(RBBITest * test,UnicodeString ustr,BreakIterator * bi,int expected[],int expectedcount)3741  static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3742                                      BreakIterator *bi,
3743                                      int expected[],
3744                                      int expectedcount)
3745  {
3746      int count = 0;
3747      int i = 0;
3748      int forward[50];
3749      bi->setText(ustr);
3750      for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3751          forward[count] = i;
3752          if (count < expectedcount && expected[count] != i) {
3753              test->errln("break forward test failed: expected %d but got %d",
3754                          expected[count], i);
3755              break;
3756          }
3757          count ++;
3758      }
3759      if (count != expectedcount) {
3760          printStringBreaks(ustr, expected, expectedcount);
3761          test->errln("break forward test failed: missed %d match",
3762                      expectedcount - count);
3763          return;
3764      }
3765      // testing boundaries
3766      for (i = 1; i < expectedcount; i ++) {
3767          int j = expected[i - 1];
3768          if (!bi->isBoundary(j)) {
3769              printStringBreaks(ustr, expected, expectedcount);
3770              test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3771              return;
3772          }
3773          for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3774              if (bi->isBoundary(j)) {
3775                  printStringBreaks(ustr, expected, expectedcount);
3776                  test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3777                  return;
3778              }
3779          }
3780      }
3781  
3782      for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3783          count --;
3784          if (forward[count] != i) {
3785              printStringBreaks(ustr, expected, expectedcount);
3786              test->errln("happy break test previous() failed: expected %d but got %d",
3787                          forward[count], i);
3788              break;
3789          }
3790      }
3791      if (count != 0) {
3792          printStringBreaks(ustr, expected, expectedcount);
3793          test->errln("break test previous() failed: missed a match");
3794          return;
3795      }
3796  
3797      // testing preceding
3798      for (i = 0; i < expectedcount - 1; i ++) {
3799          // int j = expected[i] + 1;
3800          int j = ustr.moveIndex32(expected[i], 1);
3801          for (; j <= expected[i + 1]; j ++) {
3802              if (bi->preceding(j) != expected[i]) {
3803                  printStringBreaks(ustr, expected, expectedcount);
3804                  test->errln("preceding(): Not expecting boundary at position %d", j);
3805                  return;
3806              }
3807          }
3808      }
3809  }
3810  #endif
3811  
TestWordBreaks(void)3812  void RBBITest::TestWordBreaks(void)
3813  {
3814  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3815  
3816      Locale        locale("en");
3817      UErrorCode    status = U_ZERO_ERROR;
3818      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3819      BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3820      // Replaced any C+J characters in a row with a random sequence of characters
3821      // of the same length to make our C+J segmentation not get in the way.
3822      static const char *strlist[] =
3823      {
3824      "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3825      "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3826      "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3827      "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3828      "\\uac00\\u3588\\u009c\\u0953\\u194b",
3829      "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3830      "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3831      "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3832      "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3833      "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3834      "\\u2027\\U000e0067\\u0a47\\u00b7",
3835      "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3836      "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3837      "\\u0589\\U000e006e\\u0a42\\U000104a5",
3838      "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3839      "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3840      "\\u0027\\u11af\\U000e0057\\u0602",
3841      "\\U0001d7f2\\U000e007\\u0004\\u0589",
3842      "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3843      "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3844      "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3845      "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3846      "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3847      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3848      "\\u0233\\U000e0020\\u0a69\\u0d6a",
3849      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3850      "\\u18f4\\U000e0049\\u20e7\\u2027",
3851      "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3852      "\\ua183\\u102d\\u0bec\\u003a",
3853      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3854      "\\u003a\\u0e57\\u0fad\\u002e",
3855      "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3856      "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3857      "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3858      "\\u003a\\u0664\\u00b7\\u1fba",
3859      "\\u003b\\u0027\\u00b7\\u47a3",
3860      "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3861      "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3862      "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3863      };
3864      int loop;
3865      if (U_FAILURE(status)) {
3866          errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3867          return;
3868      }
3869      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3870          // printf("looping %d\n", loop);
3871          UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3872          // RBBICharMonkey monkey;
3873          RBBIWordMonkey monkey;
3874  
3875          int expected[50];
3876          int expectedcount = 0;
3877  
3878          monkey.setText(ustr);
3879          int i;
3880          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3881              expected[expectedcount ++] = i;
3882          }
3883  
3884          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3885      }
3886      delete bi;
3887  #endif
3888  }
3889  
TestWordBoundary(void)3890  void RBBITest::TestWordBoundary(void)
3891  {
3892      // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3893      Locale        locale("en");
3894      UErrorCode    status = U_ZERO_ERROR;
3895      // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3896      BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3897      UChar         str[50];
3898      static const char *strlist[] =
3899      {
3900      "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3901      "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3902      "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3903      "\\u2027\\U000e0067\\u0a47\\u00b7",
3904      "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3905      "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3906      "\\u0589\\U000e006e\\u0a42\\U000104a5",
3907      "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3908      "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3909      "\\u0027\\u11af\\U000e0057\\u0602",
3910      "\\U0001d7f2\\U000e007\\u0004\\u0589",
3911      "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3912      "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3913      "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3914      "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3915      "\\U000e0065\\u302c\\u09ee\\U000e0068",
3916      "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3917      "\\u0233\\U000e0020\\u0a69\\u0d6a",
3918      "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3919      "\\u58f4\\U000e0049\\u20e7\\u2027",
3920      "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3921      "\\ua183\\u102d\\u0bec\\u003a",
3922      "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3923      "\\u003a\\u0e57\\u0fad\\u002e",
3924      "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3925      "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3926      "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3927      "\\u003a\\u0664\\u00b7\\u1fba",
3928      "\\u003b\\u0027\\u00b7\\u47a3",
3929      };
3930      int loop;
3931      if (U_FAILURE(status)) {
3932          errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3933          return;
3934      }
3935      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3936          // printf("looping %d\n", loop);
3937          u_unescape(strlist[loop], str, 20);
3938          UnicodeString ustr(str);
3939          int forward[50];
3940          int count = 0;
3941  
3942          bi->setText(ustr);
3943          int prev = 0;
3944          int i;
3945          for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3946              forward[count ++] = i;
3947              if (i > prev) {
3948                  int j;
3949                  for (j = prev + 1; j < i; j ++) {
3950                      if (bi->isBoundary(j)) {
3951                          printStringBreaks(ustr, forward, count);
3952                          errln("happy boundary test failed: expected %d not a boundary",
3953                                 j);
3954                          return;
3955                      }
3956                  }
3957              }
3958              if (!bi->isBoundary(i)) {
3959                  printStringBreaks(ustr, forward, count);
3960                  errln("happy boundary test failed: expected %d a boundary",
3961                         i);
3962                  return;
3963              }
3964              prev = i;
3965          }
3966      }
3967      delete bi;
3968  }
3969  
TestLineBreaks(void)3970  void RBBITest::TestLineBreaks(void)
3971  {
3972  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3973      Locale        locale("en");
3974      UErrorCode    status = U_ZERO_ERROR;
3975      BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3976      const int32_t  STRSIZE = 50;
3977      UChar         str[STRSIZE];
3978      static const char *strlist[] =
3979      {
3980       "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3981       "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3982               "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3983       "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3984               "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3985       "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3986       "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3987       "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3988       "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3989       "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3990       "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3991       "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3992       "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3993       "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3994       "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3995       "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3996       "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3997       "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3998       "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3999       "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4000       "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4001       "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4002       "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4003       "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4004       "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4005       "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4006       "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4007       "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4008       "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4009       "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4010       "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4011       "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4012       "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4013       "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4014       "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4015       "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4016       "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4017       "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4018       "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4019           "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4020      };
4021      int loop;
4022      TEST_ASSERT_SUCCESS(status);
4023      if (U_FAILURE(status)) {
4024          return;
4025      }
4026      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4027          // printf("looping %d\n", loop);
4028          int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4029          if (t >= STRSIZE) {
4030              TEST_ASSERT(FALSE);
4031              continue;
4032          }
4033  
4034  
4035          UnicodeString ustr(str);
4036          RBBILineMonkey monkey;
4037          if (U_FAILURE(monkey.deferredStatus)) {
4038              continue;
4039          }
4040  
4041          const int EXPECTEDSIZE = 50;
4042          int expected[EXPECTEDSIZE];
4043          int expectedcount = 0;
4044  
4045          monkey.setText(ustr);
4046          int i;
4047          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4048              if (expectedcount >= EXPECTEDSIZE) {
4049                  TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4050                  return;
4051              }
4052              expected[expectedcount ++] = i;
4053          }
4054  
4055          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4056      }
4057      delete bi;
4058  #endif
4059  }
4060  
TestSentBreaks(void)4061  void RBBITest::TestSentBreaks(void)
4062  {
4063  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4064      Locale        locale("en");
4065      UErrorCode    status = U_ZERO_ERROR;
4066      BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4067      UChar         str[200];
4068      static const char *strlist[] =
4069      {
4070       "Now\ris\nthe\r\ntime\n\rfor\r\r",
4071       "This\n",
4072       "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4073       "\"Sentence ending with a quote.\" Bye.",
4074       "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4075       "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4076       "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4077       "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4078       "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4079       "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4080       "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4081               "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4082               "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4083               "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4084       "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4085               "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4086               "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4087               "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4088               "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4089               "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4090      };
4091      int loop;
4092      if (U_FAILURE(status)) {
4093          errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4094          return;
4095      }
4096      for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4097          u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4098          UnicodeString ustr(str);
4099  
4100          RBBISentMonkey monkey;
4101          if (U_FAILURE(monkey.deferredStatus)) {
4102              continue;
4103          }
4104  
4105          const int EXPECTEDSIZE = 50;
4106          int expected[EXPECTEDSIZE];
4107          int expectedcount = 0;
4108  
4109          monkey.setText(ustr);
4110          int i;
4111          for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4112              if (expectedcount >= EXPECTEDSIZE) {
4113                  TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4114                  return;
4115              }
4116              expected[expectedcount ++] = i;
4117          }
4118  
4119          testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4120      }
4121      delete bi;
4122  #endif
4123  }
4124  
TestMonkey()4125  void RBBITest::TestMonkey() {
4126  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4127  
4128      UErrorCode     status    = U_ZERO_ERROR;
4129      int32_t        loopCount = 500;
4130      int32_t        seed      = 1;
4131      UnicodeString  breakType = "all";
4132      Locale         locale("en");
4133      UBool          useUText  = FALSE;
4134  
4135      if (quick == FALSE) {
4136          loopCount = 10000;
4137      }
4138  
4139      if (fTestParams) {
4140          UnicodeString p(fTestParams);
4141          loopCount = getIntParam("loop", p, loopCount);
4142          seed      = getIntParam("seed", p, seed);
4143  
4144          RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4145          if (m.find()) {
4146              breakType = m.group(1, status);
4147              m.reset();
4148              p = m.replaceFirst("", status);
4149          }
4150  
4151          RegexMatcher u(" *utext", p, 0, status);
4152          if (u.find()) {
4153              useUText = TRUE;
4154              u.reset();
4155              p = u.replaceFirst("", status);
4156          }
4157  
4158  
4159          // m.reset(p);
4160          if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4161              // Each option is stripped out of the option string as it is processed.
4162              // All options have been checked.  The option string should have been completely emptied..
4163              char buf[100];
4164              p.extract(buf, sizeof(buf), NULL, status);
4165              buf[sizeof(buf)-1] = 0;
4166              errln("Unrecognized or extra parameter:  %s\n", buf);
4167              return;
4168          }
4169  
4170      }
4171  
4172      if (breakType == "char" || breakType == "all") {
4173          RBBICharMonkey  m;
4174          BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4175          if (U_SUCCESS(status)) {
4176              RunMonkey(bi, m, "char", seed, loopCount, useUText);
4177              if (breakType == "all" && useUText==FALSE) {
4178                  // Also run a quick test with UText when "all" is specified
4179                  RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4180              }
4181          }
4182          else {
4183              errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4184          }
4185          delete bi;
4186      }
4187  
4188      if (breakType == "word" || breakType == "all") {
4189          logln("Word Break Monkey Test");
4190          RBBIWordMonkey  m;
4191          BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4192          if (U_SUCCESS(status)) {
4193              RunMonkey(bi, m, "word", seed, loopCount, useUText);
4194          }
4195          else {
4196              errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4197          }
4198          delete bi;
4199      }
4200  
4201      if (breakType == "line" || breakType == "all") {
4202          logln("Line Break Monkey Test");
4203          RBBILineMonkey  m;
4204          BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4205          if (loopCount >= 10) {
4206              loopCount = loopCount / 5;   // Line break runs slower than the others.
4207          }
4208          if (U_SUCCESS(status)) {
4209              RunMonkey(bi, m, "line", seed, loopCount, useUText);
4210          }
4211          else {
4212              errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4213          }
4214          delete bi;
4215      }
4216  
4217      if (breakType == "sent" || breakType == "all"  ) {
4218          logln("Sentence Break Monkey Test");
4219          RBBISentMonkey  m;
4220          BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4221          if (loopCount >= 10) {
4222              loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4223          }
4224          if (U_SUCCESS(status)) {
4225              RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4226          }
4227          else {
4228              errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4229          }
4230          delete bi;
4231      }
4232  
4233  #endif
4234  }
4235  
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check.
//                       -1 means loop forever.
//       useUText - If TRUE, the test text is given to the break iterator through
//                  a UText rather than directly as a UnicodeString.
//
RunMonkey(BreakIterator * bi,RBBIMonkeyKind & mk,const char * name,uint32_t seed,int32_t numIterations,UBool useUText)4245  void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4246                           int32_t numIterations, UBool useUText) {
4247  
4248  #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4249  
4250      const int32_t    TESTSTRINGLEN = 500;
4251      UnicodeString    testText;
4252      int32_t          numCharClasses;
4253      UVector          *chClasses;
4254      int              expected[TESTSTRINGLEN*2 + 1];
4255      int              expectedCount = 0;
4256      char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4257      char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4258      char             reverseBreaks[TESTSTRINGLEN*2+1];
4259      char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4260      char             followingBreaks[TESTSTRINGLEN*2+1];
4261      char             precedingBreaks[TESTSTRINGLEN*2+1];
4262      int              i;
4263      int              loopCount = 0;
4264  
4265      m_seed = seed;
4266  
4267      numCharClasses = mk.charClasses()->size();
4268      chClasses      = mk.charClasses();
4269  
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
4273      if (U_FAILURE(mk.deferredStatus)) {
4274          errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4275          return;
4276      }
4277  
4278      // Verify that the character classes all have at least one member.
4279      for (i=0; i<numCharClasses; i++) {
4280          UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4281          if (s == NULL || s->size() == 0) {
4282              errln("Character Class #%d is null or of zero size.", i);
4283              return;
4284          }
4285      }
4286  
4287      while (loopCount < numIterations || numIterations == -1) {
4288          if (numIterations == -1 && loopCount % 10 == 0) {
4289              // If test is running in an infinite loop, display a periodic tic so
4290              //   we can tell that it is making progress.
4291              fprintf(stderr, ".");
4292          }
4293          // Save current random number seed, so that we can recreate the random numbers
4294          //   for this loop iteration in event of an error.
4295          seed = m_seed;
4296  
4297          // Populate a test string with data.
4298          testText.truncate(0);
4299          for (i=0; i<TESTSTRINGLEN; i++) {
4300              int32_t  aClassNum = m_rand() % numCharClasses;
4301              UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4302              int32_t   charIdx = m_rand() % classSet->size();
4303              UChar32   c = classSet->charAt(charIdx);
4304              if (c < 0) {   // TODO:  deal with sets containing strings.
4305                  errln("%s:%d c < 0", __FILE__, __LINE__);
4306                  break;
4307              }
4308              // Do not assemble a supplementary character from randomly generated separate surrogates.
4309              //   (It could be a dictionary character)
4310              if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4311                  continue;
4312              }
4313  
4314              testText.append(c);
4315          }
4316  
4317          // Calculate the expected results for this test string.
4318          mk.setText(testText);
4319          memset(expectedBreaks, 0, sizeof(expectedBreaks));
4320          expectedBreaks[0] = 1;
4321          int32_t breakPos = 0;
4322          expectedCount = 0;
4323          for (;;) {
4324              breakPos = mk.next(breakPos);
4325              if (breakPos == -1) {
4326                  break;
4327              }
4328              if (breakPos > testText.length()) {
4329                  errln("breakPos > testText.length()");
4330              }
4331              expectedBreaks[breakPos] = 1;
4332              U_ASSERT(expectedCount<testText.length());
4333              expected[expectedCount ++] = breakPos;
4334              (void)expected;   // Set but not used warning.
4335                                // TODO (andy): check it out.
4336          }
4337  
4338          // Find the break positions using forward iteration
4339          memset(forwardBreaks, 0, sizeof(forwardBreaks));
4340          if (useUText) {
4341              UErrorCode status = U_ZERO_ERROR;
4342              UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4343              // testUText = utext_openUnicodeString(testUText, &testText, &status);
4344              bi->setText(testUText, status);
4345              TEST_ASSERT_SUCCESS(status);
4346              utext_close(testUText);   // The break iterator does a shallow clone of the UText
4347                                        //  This UText can be closed immediately, so long as the
4348                                        //  testText string continues to exist.
4349          } else {
4350              bi->setText(testText);
4351          }
4352  
4353          for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4354              if (i < 0 || i > testText.length()) {
4355                  errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4356                  break;
4357              }
4358              forwardBreaks[i] = 1;
4359          }
4360  
4361          // Find the break positions using reverse iteration
4362          memset(reverseBreaks, 0, sizeof(reverseBreaks));
4363          for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4364              if (i < 0 || i > testText.length()) {
4365                  errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4366                  break;
4367              }
4368              reverseBreaks[i] = 1;
4369          }
4370  
4371          // Find the break positions using isBoundary() tests.
4372          memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4373          U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4374          for (i=0; i<=testText.length(); i++) {
4375              isBoundaryBreaks[i] = bi->isBoundary(i);
4376          }
4377  
4378  
4379          // Find the break positions using the following() function.
4380          // printf(".");
4381          memset(followingBreaks, 0, sizeof(followingBreaks));
4382          int32_t   lastBreakPos = 0;
4383          followingBreaks[0] = 1;
4384          for (i=0; i<testText.length(); i++) {
4385              breakPos = bi->following(i);
4386              if (breakPos <= i ||
4387                  breakPos < lastBreakPos ||
4388                  breakPos > testText.length() ||
4389                  (breakPos > lastBreakPos && lastBreakPos > i)) {
4390                  errln("%s break monkey test: "
4391                      "Out of range value returned by BreakIterator::following().\n"
4392                          "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4393                           name, seed, i, breakPos, lastBreakPos);
4394                  break;
4395              }
4396              followingBreaks[breakPos] = 1;
4397              lastBreakPos = breakPos;
4398          }
4399  
4400          // Find the break positions using the preceding() function.
4401          memset(precedingBreaks, 0, sizeof(precedingBreaks));
4402          lastBreakPos = testText.length();
4403          precedingBreaks[testText.length()] = 1;
4404          for (i=testText.length(); i>0; i--) {
4405              breakPos = bi->preceding(i);
4406              if (breakPos >= i ||
4407                  breakPos > lastBreakPos ||
4408                  (breakPos < 0 && testText.getChar32Start(i)>0) ||
4409                  (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4410                  errln("%s break monkey test: "
4411                      "Out of range value returned by BreakIterator::preceding().\n"
4412                      "index=%d;  prev returned %d; lastBreak=%d" ,
4413                      name,  i, breakPos, lastBreakPos);
4414                  if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4415                      precedingBreaks[i] = 2;   // Forces an error.
4416                  }
4417              } else {
4418                  if (breakPos >= 0) {
4419                      precedingBreaks[breakPos] = 1;
4420                  }
4421                  lastBreakPos = breakPos;
4422              }
4423          }
4424  
4425          // Compare the expected and actual results.
4426          for (i=0; i<=testText.length(); i++) {
4427              const char *errorType = NULL;
4428              if  (forwardBreaks[i] != expectedBreaks[i]) {
4429                  errorType = "next()";
4430              } else if (reverseBreaks[i] != forwardBreaks[i]) {
4431                  errorType = "previous()";
4432              } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4433                  errorType = "isBoundary()";
4434              } else if (followingBreaks[i] != expectedBreaks[i]) {
4435                  errorType = "following()";
4436              } else if (precedingBreaks[i] != expectedBreaks[i]) {
4437                  errorType = "preceding()";
4438              }
4439  
4440  
4441              if (errorType != NULL) {
4442                  // Format a range of the test text that includes the failure as
4443                  //  a data item that can be included in the rbbi test data file.
4444  
4445                  // Start of the range is the last point where expected and actual results
4446                  //   both agreed that there was a break position.
4447                  int startContext = i;
4448                  int32_t count = 0;
4449                  for (;;) {
4450                      if (startContext==0) { break; }
4451                      startContext --;
4452                      if (expectedBreaks[startContext] != 0) {
4453                          if (count == 2) break;
4454                          count ++;
4455                      }
4456                  }
4457  
4458                  // End of range is two expected breaks past the start position.
4459                  int endContext = i + 1;
4460                  int ci;
4461                  for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4462                      for (;;) {
4463                          if (endContext >= testText.length()) {break;}
4464                          if (expectedBreaks[endContext-1] != 0) {
4465                              if (count == 0) break;
4466                              count --;
4467                          }
4468                          endContext ++;
4469                      }
4470                  }
4471  
4472                  // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4473                  UnicodeString errorText = "<data>";
4474                  /***if (strcmp(errorType, "next()") == 0) {
4475                      startContext = 0;
4476                      endContext = testText.length();
4477  
4478                      printStringBreaks(testText, expected, expectedCount);
4479                  }***/
4480  
4481                  for (ci=startContext; ci<endContext;) {
4482                      UnicodeString hexChars("0123456789abcdef");
4483                      UChar32  c;
4484                      int      bn;
4485                      c = testText.char32At(ci);
4486                      if (ci == i) {
4487                          // This is the location of the error.
4488                          errorText.append("<?>");
4489                      } else if (expectedBreaks[ci] != 0) {
4490                          // This a non-error expected break position.
4491                          errorText.append("\\");
4492                      }
4493                      if (c < 0x10000) {
4494                          errorText.append("\\u");
4495                          for (bn=12; bn>=0; bn-=4) {
4496                              errorText.append(hexChars.charAt((c>>bn)&0xf));
4497                          }
4498                      } else {
4499                          errorText.append("\\U");
4500                          for (bn=28; bn>=0; bn-=4) {
4501                              errorText.append(hexChars.charAt((c>>bn)&0xf));
4502                          }
4503                      }
4504                      ci = testText.moveIndex32(ci, 1);
4505                  }
4506                  errorText.append("\\");
4507                  errorText.append("</data>\n");
4508  
4509                  // Output the error
4510                  char  charErrorTxt[500];
4511                  UErrorCode status = U_ZERO_ERROR;
4512                  errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4513                  charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4514                  const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4515  
4516                  errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4517                      name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4518                      errorType, seed, i, charErrorTxt);
4519                  break;
4520              }
4521          }
4522  
4523          loopCount++;
4524      }
4525  #endif
4526  }
4527  
4528  
4529  //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4530  //             This test checks the initial patch,
4531  //             which is to just keep it from crashing.  Correct word boundaries
4532  //             await a proper fix to the dictionary code.
4533  //
TestBug5532(void)4534  void RBBITest::TestBug5532(void)  {
4535     // Text includes a mixture of Thai and Latin.
4536     const unsigned char utf8Data[] = {
4537             0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4538             0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4539             0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4540             0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4541             0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4542             0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4543             0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4544             0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4545             0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4546             0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4547             0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4548  
4549      UErrorCode status = U_ZERO_ERROR;
4550      UText utext=UTEXT_INITIALIZER;
4551      utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4552      TEST_ASSERT_SUCCESS(status);
4553  
4554      BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4555      TEST_ASSERT_SUCCESS(status);
4556      if (U_SUCCESS(status)) {
4557          bi->setText(&utext, status);
4558          TEST_ASSERT_SUCCESS(status);
4559  
4560          int32_t breakCount = 0;
4561          int32_t previousBreak = -1;
4562          for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4563              // For now, just make sure that the break iterator doesn't hang.
4564              TEST_ASSERT(previousBreak < bi->current());
4565              previousBreak = bi->current();
4566          }
4567          TEST_ASSERT(breakCount > 0);
4568      }
4569      delete bi;
4570      utext_close(&utext);
4571  }
4572  
4573  
TestBug9983(void)4574  void RBBITest::TestBug9983(void)  {
4575      UnicodeString text = UnicodeString("\\u002A"  // * Other
4576                                         "\\uFF65"  //   Other
4577                                         "\\u309C"  //   Katakana
4578                                         "\\uFF9F"  //   Extend
4579                                         "\\uFF65"  //   Other
4580                                         "\\u0020"  //   Other
4581                                         "\\u0000").unescape();
4582  
4583      UErrorCode status = U_ZERO_ERROR;
4584      LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4585          BreakIterator::createWordInstance(Locale::getRoot(), status)));
4586      TEST_ASSERT_SUCCESS(status);
4587      LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4588          BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4589      TEST_ASSERT_SUCCESS(status);
4590      if (U_FAILURE(status)) {
4591          return;
4592      }
4593      int32_t offset, rstatus, iterationCount;
4594  
4595      brkiter->setText(text);
4596      brkiter->last();
4597      iterationCount = 0;
4598      while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4599          iterationCount++;
4600          rstatus = brkiter->getRuleStatus();
4601          (void)rstatus;     // Suppress set but not used warning.
4602          if (iterationCount >= 10) {
4603             break;
4604          }
4605      }
4606      TEST_ASSERT(iterationCount == 6);
4607  
4608      brkiterPOSIX->setText(text);
4609      brkiterPOSIX->last();
4610      iterationCount = 0;
4611      while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4612          iterationCount++;
4613          rstatus = brkiterPOSIX->getRuleStatus();
4614          (void)rstatus;     // Suppress set but not used warning.
4615          if (iterationCount >= 10) {
4616             break;
4617          }
4618      }
4619      TEST_ASSERT(iterationCount == 6);
4620  }
4621  
// Bug 7547 - verify that building a break iterator from empty rules produces an error.
4623  //
TestBug7547()4624  void RBBITest::TestBug7547() {
4625      UnicodeString rules;
4626      UErrorCode status = U_ZERO_ERROR;
4627      UParseError parseError;
4628      RuleBasedBreakIterator breakIterator(rules, parseError, status);
4629      if (status != U_BRK_RULE_SYNTAX) {
4630          errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4631      }
4632      if (parseError.line != 1 || parseError.offset != 0) {
4633          errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4634      }
4635  }
4636  
4637  
TestBug12797()4638  void RBBITest::TestBug12797() {
4639      UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4640      UErrorCode status = U_ZERO_ERROR;
4641      UParseError parseError;
4642      RuleBasedBreakIterator bi(rules, parseError, status);
4643      if (U_FAILURE(status)) {
4644          errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4645          return;
4646      }
4647      UnicodeString text = "abc";
4648      bi.setText(text);
4649      bi.first();
4650      int32_t boundary = bi.next();
4651      if (boundary != 3) {
4652          errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4653      }
4654  }
4655  
TestBug12918()4656  void RBBITest::TestBug12918() {
4657      // This test triggers an assertion failure in dictbe.cpp
4658      const UChar crasherString[] = { 0x3325, 0x4a16, 0 };
4659      UErrorCode status = U_ZERO_ERROR;
4660      UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4661      if (U_FAILURE(status)) {
4662          errln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4663          return;
4664      }
4665      ubrk_first(iter);
4666      int32_t pos = 0;
4667      int32_t lastPos = -1;
4668      while((pos = ubrk_next(iter)) != UBRK_DONE) {
4669          if (pos <= lastPos) {
4670              errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4671              break;
4672          }
4673      }
4674      ubrk_close(iter);
4675  }
4676  
4677  //
4678  //  TestDebug    -  A place-holder test for debugging purposes.
4679  //                  For putting in fragments of other tests that can be invoked
4680  //                  for tracing  without a lot of unwanted extra stuff happening.
4681  //
TestDebug(void)4682  void RBBITest::TestDebug(void) {
4683  
4684  }
4685  
TestProperties()4686  void RBBITest::TestProperties() {
4687      UErrorCode errorCode = U_ZERO_ERROR;
4688      UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4689      if (!prependSet.isEmpty()) {
4690          errln(
4691              "[:GCB=Prepend:] is not empty any more. "
4692              "Uncomment relevant lines in source/data/brkitr/char.txt and "
4693              "change this test to the opposite condition.");
4694      }
4695  }
4696  
4697  #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
4698