1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * Copyright (c) 2016, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  ********************************************************************/
7 
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
12 
13 #include "rbbimonkeytest.h"
14 #include "unicode/utypes.h"
15 #include "unicode/brkiter.h"
16 #include "unicode/utf16.h"
17 #include "unicode/uniset.h"
18 #include "unicode/unistr.h"
19 
20 #include "charstr.h"
21 #include "cmemory.h"
22 #include "cstr.h"
23 #include "uelement.h"
24 #include "uhash.h"
25 
26 #include <iostream>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string>
30 
31 using namespace icu;
32 
33 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)34 void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
35     fParams = params;            // Work around TESTCASE_AUTO not being able to pass params to test function.
36 
37     TESTCASE_AUTO_BEGIN;
38     TESTCASE_AUTO(testMonkey);
39     TESTCASE_AUTO_END;
40 }
41 
42 //---------------------------------------------------------------------------------------
43 //
44 //   class BreakRule implementation.
45 //
46 //---------------------------------------------------------------------------------------
47 
BreakRule()48 BreakRule::BreakRule()      // :  all field default initialized.
49 {
50 }
51 
~BreakRule()52 BreakRule::~BreakRule() {}
53 
54 
55 //---------------------------------------------------------------------------------------
56 //
57 //   class BreakRules implementation.
58 //
59 //---------------------------------------------------------------------------------------
BreakRules(RBBIMonkeyImpl * monkeyImpl,UErrorCode & status)60 BreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
61         fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
62     fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
63                                          uhash_compareUnicodeString,
64                                          NULL,      // value comparator.
65                                          &status));
66     if (U_FAILURE(status)) {
67         return;
68     }
69     uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
70     uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
71     fBreakRules.setDeleter(uprv_deleteUObject);
72 
73     fCharClassList.adoptInstead(new UVector(status));
74 
75     fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
76              "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative look behind for '{' or '=' or '[:'
77                                                           //   (the identifier is a unicode property name or value)
78              "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"),     // The char class name
79         0, status));
80 
81     // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
82     fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
83                 "(^|(?<=;))"                    // Start either at start of line, or just after a ';' (look-behind for ';')
84                 "[ \\t]*+"                      //   Match white space.
85                 "(#.*)?+"                       //   Optional # plus whatever follows
86                 "\\R$"                          //   new-line at end of line.
87             ), 0, status));
88 
89     // Match (initial parse) of a character class definition line.
90     fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
91                 "[ \\t]*"                                // leading white space
92                 "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"   // The char class name
93                 "[ \\t]*=[ \\t]*"                        //   =
94                 "(?<ClassDef>.*?)"                       // The char class UnicodeSet expression
95                 "[ \\t]*;$"),                     // ; <end of line>
96             0, status));
97 
98     // Match (initial parse) of a break rule line.
99     fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
100                 "[ \\t]*"                                // leading white space
101                 "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"    // The rule name
102                 "[ \\t]*:[ \\t]*"                        //   :
103                 "(?<RuleDef>.*?)"                        // The rule definition
104                 "[ \\t]*;$"),                            // ; <end of line>
105             0, status));
106 
107 }
108 
109 
~BreakRules()110 BreakRules::~BreakRules() {}
111 
112 
addCharClass(const UnicodeString & name,const UnicodeString & definition,UErrorCode & status)113 CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
114 
115     // Create the expanded definition for this char class,
116     // replacing any set references with the corresponding definition.
117 
118     UnicodeString expandedDef;
119     UnicodeString emptyString;
120     fSetRefsMatcher->reset(definition);
121     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
122         const UnicodeString name =
123                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
124         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
125         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
126 
127         fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
128         expandedDef.append(expansionForName);
129     }
130     fSetRefsMatcher->appendTail(expandedDef);
131 
132     // Verify that the expanded set definition is valid.
133 
134     if (fMonkeyImpl->fDumpExpansions) {
135         printf("epandedDef: %s\n", CStr(expandedDef)());
136     }
137 
138     UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
139     if (U_FAILURE(status)) {
140         IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
141                                u_errorName(status), CStr(name)());
142         return NULL;
143     }
144     CharClass *cclass = new CharClass(name, definition, expandedDef, s);
145     CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
146                                                         new UnicodeString(name),   // Key, owned by hash table.
147                                                         cclass,                    // Value, owned by hash table.
148                                                         &status));
149 
150     if (previousClass != NULL) {
151         // Duplicate class def.
152         // These are legitimate, they are adjustments of an existing class.
153         // TODO: will need to keep the old around when we handle tailorings.
154         IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
155         delete previousClass;
156     }
157     return cclass;
158 }
159 
160 
addRule(const UnicodeString & name,const UnicodeString & definition,UErrorCode & status)161 void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
162     LocalPointer<BreakRule> thisRule(new BreakRule);
163     thisRule->fName = name;
164     thisRule->fRule = definition;
165 
166     // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
167     // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
168     UnicodeString emptyString;
169 
170     // Expand the char class definitions within the rule.
171     fSetRefsMatcher->reset(definition);
172     while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
173         const UnicodeString name =
174                 fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
175         CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
176         if (!nameClass) {
177             IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
178                 __FILE__, __LINE__, CStr(name)(), CStr(definition)());
179         }
180         const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
181 
182         fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
183         thisRule->fExpandedRule.append(expansionForName);
184     }
185     fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
186 
187     // If rule begins with a '^' rule chaining is disallowed.
188     // Strip off the '^' from the rule expression, and set the flag.
189     if (thisRule->fExpandedRule.charAt(0) == u'^') {
190         thisRule->fInitialMatchOnly = true;
191         thisRule->fExpandedRule.remove(0, 1);
192         thisRule->fExpandedRule.trim();
193     }
194 
195     // Replace the divide sign (\u00f7) with a regular expression named capture.
196     // When running the rules, a match that includes this group means we found a break position.
197 
198     int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
199     if (dividePos >= 0) {
200         thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
201     }
202     if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
203         status = U_ILLEGAL_ARGUMENT_ERROR;   // TODO: produce a good error message.
204     }
205 
206     // UAX break rule set definitions can be empty, just [].
207     // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
208     // also matches nothing.
209 
210     static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
211     int32_t where = 0;
212     while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
213         thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
214     }
215     if (fMonkeyImpl->fDumpExpansions) {
216         printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
217     }
218 
219     // Compile a regular expression for this rule.
220     thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
221     if (U_FAILURE(status)) {
222         IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
223                 __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
224         return;
225     }
226 
227     // Put this new rule into the vector of all Rules.
228     fBreakRules.addElement(thisRule.orphan(), status);
229 }
230 
231 
setKeywordParameter(const UnicodeString & keyword,const UnicodeString & value,UErrorCode & status)232 bool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
233     if (keyword == UnicodeString("locale")) {
234         CharString localeName;
235         localeName.append(CStr(value)(), -1, status);
236         fLocale = Locale::createFromName(localeName.data());
237         return true;
238     }
239     if (keyword == UnicodeString("type")) {
240         if (value == UnicodeString("grapheme")) {
241             fType = UBRK_CHARACTER;
242         } else if (value == UnicodeString("word")) {
243             fType = UBRK_WORD;
244         } else if (value == UnicodeString("line")) {
245             fType = UBRK_LINE;
246         } else if (value == UnicodeString("sentence")) {
247             fType = UBRK_SENTENCE;
248         } else {
249             IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__,  CStr(value)());
250         }
251         return true;
252     }
253     // TODO: add tailoring base setting here.
254     return false;
255 }
256 
createICUBreakIterator(UErrorCode & status)257 RuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
258     if (U_FAILURE(status)) {
259         return NULL;
260     }
261     RuleBasedBreakIterator *bi = NULL;
262     switch(fType) {
263         case UBRK_CHARACTER:
264             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
265             break;
266         case UBRK_WORD:
267             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
268             break;
269         case UBRK_LINE:
270             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
271             break;
272         case UBRK_SENTENCE:
273             bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
274             break;
275         default:
276             IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
277             status = U_ILLEGAL_ARGUMENT_ERROR;
278     }
279     return bi;
280 }
281 
282 
compileRules(UCHARBUF * rules,UErrorCode & status)283 void BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
284     if (U_FAILURE(status)) {
285         return;
286     }
287 
288     UnicodeString emptyString;
289     for (int32_t lineNumber=0; ;lineNumber++) {    // Loop once per input line.
290         if (U_FAILURE(status)) {
291             return;
292         }
293         int32_t lineLength = 0;
294         const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
295         if (lineBuf == NULL) {
296             break;
297         }
298         UnicodeString line(lineBuf, lineLength);
299 
300         // Strip comment lines.
301         fCommentsMatcher->reset(line);
302         line = fCommentsMatcher->replaceFirst(emptyString, status);
303         if (line.isEmpty()) {
304             continue;
305         }
306 
307         // Recognize character class definition and keyword lines
308         fClassDefMatcher->reset(line);
309         if (fClassDefMatcher->matches(status)) {
310             UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
311             UnicodeString classDef  = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
312             if (fMonkeyImpl->fDumpExpansions) {
313                 printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
314             }
315             if (setKeywordParameter(className, classDef, status)) {
316                 // The scanned item was "type = ..." or "locale = ...", etc.
317                 //   which are not actual character classes.
318                 continue;
319             }
320             addCharClass(className, classDef, status);
321             continue;
322         }
323 
324         // Recognize rule lines.
325         fRuleDefMatcher->reset(line);
326         if (fRuleDefMatcher->matches(status)) {
327             UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
328             UnicodeString ruleDef  = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
329             if (fMonkeyImpl->fDumpExpansions) {
330                 printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
331             }
332             addRule(ruleName, ruleDef, status);
333             continue;
334         }
335 
336         IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
337             __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
338     }
339 
340     // Build the vector of char classes, omitting the dictionary class if there is one.
341     // This will be used when constructing the random text to be tested.
342 
343     // Also compute the "other" set, consisting of any characters not included in
344     // one or more of the user defined sets.
345 
346     UnicodeSet otherSet((UChar32)0, 0x10ffff);
347     int32_t pos = UHASH_FIRST;
348     const UHashElement *el = NULL;
349     while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
350         const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
351         CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
352         // printf("    Adding %s\n", CStr(*ccName)());
353         if (*ccName != cclass->fName) {
354             IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
355                     __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
356         }
357         const UnicodeSet *set = cclass->fSet.getAlias();
358         otherSet.removeAll(*set);
359         if (*ccName == UnicodeString("dictionary")) {
360             fDictionarySet = *set;
361         } else {
362             fCharClassList->addElement(cclass, status);
363         }
364     }
365 
366     if (!otherSet.isEmpty()) {
367         // fprintf(stderr, "have an other set.\n");
368         UnicodeString pattern;
369         CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
370         fCharClassList->addElement(cclass, status);
371     }
372 }
373 
374 
getClassForChar(UChar32 c,int32_t * iter) const375 const CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
376    int32_t localIter = 0;
377    int32_t &it = iter? *iter : localIter;
378 
379    while (it < fCharClassList->size()) {
380        const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
381        ++it;
382        if (cc->fSet->contains(c)) {
383            return cc;
384        }
385     }
386     return NULL;
387 }
388 
389 //---------------------------------------------------------------------------------------
390 //
391 //   class MonkeyTestData implementation.
392 //
393 //---------------------------------------------------------------------------------------
394 
set(BreakRules * rules,IntlTest::icu_rand & rand,UErrorCode & status)395 void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
396     const int32_t dataLength = 1000;
397 
398     // Fill the test string with random characters.
399     // First randomly pick a char class, then randomly pick a character from that class.
400     // Exclude any characters from the dictionary set.
401 
402     // std::cout << "Populating Test Data" << std::endl;
403     fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
404                                           // allowing recreation of failing data.
405     fBkRules = rules;
406     fString.remove();
407     for (int32_t n=0; n<dataLength;) {
408         int charClassIndex = rand() % rules->fCharClassList->size();
409         const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
410         if (cclass->fSet->size() == 0) {
411             // Some rules or tailorings do end up with empty char classes.
412             continue;
413         }
414         int32_t charIndex = rand() % cclass->fSet->size();
415         UChar32 c = cclass->fSet->charAt(charIndex);
416         if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
417             // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
418             // Don't let random unpaired surrogates combine in the test data because they might
419             // produce an unwanted dictionary character.
420             continue;
421         }
422 
423         if (!rules->fDictionarySet.contains(c)) {
424             fString.append(c);
425             ++n;
426         }
427     }
428 
429     // Reset each rule matcher regex with this new string.
430     //    (Although we are always using the same string object, ICU regular expressions
431     //    don't like the underlying string data changing without doing a reset).
432 
433     for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
434         BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
435             rule->fRuleMatcher->reset(fString);
436     }
437 
438     // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
439     // Expected and Actual breaks are one longer than the input string; a non-zero value
440     // will indicate a boundary preceding that position.
441 
442     clearActualBreaks();
443     fExpectedBreaks  = fActualBreaks;
444     fRuleForPosition = fActualBreaks;
445     f2ndRuleForPos   = fActualBreaks;
446 
447     // Apply reference rules to find the expected breaks.
448 
449     fExpectedBreaks.setCharAt(0, (UChar)1);  // Force an expected break before the start of the text.
450                                              // ICU always reports a break there.
451                                              // The reference rules do not have a means to do so.
452     int32_t strIdx = 0;
453     bool    initialMatch = true;             // True at start of text, and immediately after each boundary,
454                                              // for control over rule chaining.
455     while (strIdx < fString.length()) {
456         BreakRule *matchingRule = NULL;
457         UBool      hasBreak = FALSE;
458         int32_t ruleNum = 0;
459         int32_t matchStart = 0;
460         int32_t matchEnd = 0;
461         int32_t breakGroup = 0;
462         for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
463             BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
464             if (rule->fInitialMatchOnly && !initialMatch) {
465                 // Skip checking this '^' rule. (No rule chaining)
466                 continue;
467             }
468             rule->fRuleMatcher->reset();
469             if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
470                 // A candidate rule match, check further to see if we take it or continue to check other rules.
471                 // Matches of zero or one codepoint count only if they also specify a break.
472                 matchStart = rule->fRuleMatcher->start(status);
473                 matchEnd = rule->fRuleMatcher->end(status);
474                 breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
475                 hasBreak = U_SUCCESS(status);
476                 if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
477                     status = U_ZERO_ERROR;
478                 }
479                 if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
480                     matchingRule = rule;
481                     break;
482                 }
483             }
484         }
485         if (matchingRule == NULL) {
486             // No reference rule matched. This is an error in the rules that should never happen.
487             IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
488                  __FILE__, __LINE__, strIdx);
489             dump(strIdx);
490             status = U_INVALID_FORMAT_ERROR;
491             return;
492         }
493         if (matchingRule->fRuleMatcher->group(status).length() == 0) {
494             // Zero length rule match. This is also an error in the rule expressions.
495             IntlTest::gTest->errln("%s:%d Zero length rule match.",
496                 __FILE__, __LINE__);
497             status =  U_INVALID_FORMAT_ERROR;
498             return;
499         }
500 
501         // Record which rule matched over the length of the match.
502         for (int i = matchStart; i < matchEnd; i++) {
503             if (fRuleForPosition.charAt(i) == 0) {
504                 fRuleForPosition.setCharAt(i, (UChar)ruleNum);
505             } else {
506                 f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
507             }
508         }
509 
510         // Break positions appear in rules as a matching named capture of zero length at the break position,
511         //   the adjusted pattern contains (?<BreakPosition>)
512         if (hasBreak) {
513             int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
514             if (U_FAILURE(status) || breakPos < 0) {
515                 // Rule specified a break, but that break wasn't part of the match, even
516                 // though the rule as a whole matched.
517                 // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
518                 // Shouldn't get here.
519                 IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
520                 status =  U_INVALID_FORMAT_ERROR;
521                 break;
522             }
523             fExpectedBreaks.setCharAt(breakPos, (UChar)1);
524             // printf("recording break at %d\n", breakPos);
525             // For the next iteration, pick up applying rules immediately after the break,
526             // which may differ from end of the match. The matching rule may have included
527             // context following the boundary that needs to be looked at again.
528             strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
529             initialMatch = true;
530         } else {
531             // Original rule didn't specify a break.
532             // Continue applying rules starting on the last code point of this match.
533             strIdx = fString.moveIndex32(matchEnd, -1);
534             initialMatch = false;
535             if (strIdx == matchStart) {
536                 // Match was only one code point, no progress if we continue.
537                 // Shouldn't get here, case is filtered out at top of loop.
538                 CharString ruleName;
539                 ruleName.appendInvariantChars(matchingRule->fName, status);
540                 IntlTest::gTest->errln("%s:%d Rule %s internal error",
541                         __FILE__, __LINE__, ruleName.data());
542                 status = U_INVALID_FORMAT_ERROR;
543                 break;
544             }
545         }
546         if (U_FAILURE(status)) {
547             IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
548                 __FILE__, __LINE__, u_errorName(status));
549             break;
550         }
551     }
552 }
553 
clearActualBreaks()554 void MonkeyTestData::clearActualBreaks() {
555     fActualBreaks.remove();
556     // Actual Breaks length is one longer than the data string length, allowing
557     //    for breaks before the first and after the last character in the data.
558     for (int32_t i=0; i<=fString.length(); i++) {
559         fActualBreaks.append((UChar)0);
560     }
561 }
562 
dump(int32_t around) const563 void MonkeyTestData::dump(int32_t around) const {
564     printf("\n"
565            "         char                        break  Rule                     Character\n"
566            "   pos   code   class                 R I   name                     name\n"
567            "---------------------------------------------------------------------------------------------\n");
568 
569     int32_t start;
570     int32_t end;
571 
572     if (around == -1) {
573         start = 0;
574         end = fString.length();
575     } else {
576         // Display context around a failure.
577         start = fString.moveIndex32(around, -30);
578         end = fString.moveIndex32(around, +30);
579     }
580 
581     for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
582         UErrorCode status = U_ZERO_ERROR;
583         UChar32 c = fString.char32At(charIdx);
584         const CharClass *cc = fBkRules->getClassForChar(c);
585         CharString ccName;
586         ccName.appendInvariantChars(cc->fName, status);
587         CharString ruleName, secondRuleName;
588         const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
589         ruleName.appendInvariantChars(rule->fName, status);
590         if (f2ndRuleForPos.charAt(charIdx) > 0) {
591             const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
592             secondRuleName.appendInvariantChars(secondRule->fName, status);
593         }
594         char cName[200];
595         u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
596 
597         printf("  %4.1d %6.4x   %-20s  %c %c   %-10s %-10s    %s\n",
598             charIdx, c, ccName.data(),
599             fExpectedBreaks.charAt(charIdx) ? '*' : '.',
600             fActualBreaks.charAt(charIdx) ? '*' : '.',
601             ruleName.data(), secondRuleName.data(), cName
602         );
603     }
604 }
605 
606 
607 //---------------------------------------------------------------------------------------
608 //
609 //   class RBBIMonkeyImpl
610 //
611 //---------------------------------------------------------------------------------------
612 
RBBIMonkeyImpl(UErrorCode & status)613 RBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
614     (void)status;    // suppress unused parameter compiler warning.
615 }
616 
617 
618 // RBBIMonkeyImpl setup       does all of the setup for a single rule set - compiling the
619 //                            reference rules and creating the icu breakiterator to test,
620 //                            with its type and locale coming from the reference rules.
621 
setup(const char * ruleFile,UErrorCode & status)622 void RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
623     fRuleFileName = ruleFile;
624     openBreakRules(ruleFile, status);
625     if (U_FAILURE(status)) {
626         IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
627         return;
628     }
629     fRuleSet.adoptInstead(new BreakRules(this, status));
630     fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
631     if (U_FAILURE(status)) {
632         IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
633         return;
634     }
635     fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
636     fTestData.adoptInstead(new MonkeyTestData());
637 }
638 
639 
~RBBIMonkeyImpl()640 RBBIMonkeyImpl::~RBBIMonkeyImpl() {
641 }
642 
643 
openBreakRules(const char * fileName,UErrorCode & status)644 void RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
645     CharString path;
646     path.append(IntlTest::getSourceTestData(status), status);
647     path.append("break_rules" U_FILE_SEP_STRING, status);
648     path.appendPathPart(fileName, status);
649     const char *codePage = "UTF-8";
650     fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
651 }
652 
653 
startTest()654 void RBBIMonkeyImpl::startTest() {
655     fThread.start();   // invokes runTest() in a separate thread.
656 }
657 
join()658 void RBBIMonkeyImpl::join() {
659     fThread.join();
660 }
661 
662 
// Report a monkey test failure, including the parameters needed to reproduce it,
// dump the test data around the failure when running verbose, and set the error status.
// For use inside RBBIMonkeyImpl member functions that have a UErrorCode variable named "status".
#define MONKEY_ERROR(msg, index) {                                                                              \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                           __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed);          \
    if (fVerbose) {                                                                                             \
        fTestData->dump(index);                                                                                 \
    }                                                                                                           \
    status = U_INVALID_STATE_ERROR;                                                                             \
}
669 
runTest()670 void RBBIMonkeyImpl::runTest() {
671     UErrorCode status = U_ZERO_ERROR;
672     int32_t errorCount = 0;
673     for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
674         status = U_ZERO_ERROR;
675         fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
676         if (fBI.isNull()) {
677             IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
678             return;
679         }
680         // fTestData->dump();
681         testForwards(status);
682         testPrevious(status);
683         testFollowing(status);
684         testPreceding(status);
685         testIsBoundary(status);
686         testIsBoundaryRandom(status);
687 
688         if (fLoopCount < 0 && loopCount % 100 == 0) {
689             fprintf(stderr, ".");
690         }
691         if (U_FAILURE(status)) {
692             if (++errorCount > 10) {
693                 return;
694             }
695         }
696     }
697 }
698 
testForwards(UErrorCode & status)699 void RBBIMonkeyImpl::testForwards(UErrorCode &status) {
700     if (U_FAILURE(status)) {
701         return;
702     }
703     fTestData->clearActualBreaks();
704     fBI->setText(fTestData->fString);
705     int32_t previousBreak = -2;
706     for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
707         if (bk <= previousBreak) {
708             MONKEY_ERROR("Break Iterator Stall", bk);
709             return;
710         }
711         if (bk < 0 || bk > fTestData->fString.length()) {
712             MONKEY_ERROR("Boundary out of bounds", bk);
713             return;
714         }
715         fTestData->fActualBreaks.setCharAt(bk, 1);
716     }
717     checkResults("testForwards", FORWARD, status);
718 }
719 
testFollowing(UErrorCode & status)720 void RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
721     if (U_FAILURE(status)) {
722         return;
723     }
724     fTestData->clearActualBreaks();
725     fBI->setText(fTestData->fString);
726     int32_t nextBreak = -1;
727     for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
728         int32_t bk = fBI->following(i);
729         if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
730             continue;
731         }
732         if (bk == nextBreak && bk > i) {
733             // i is in the gap between two breaks.
734             continue;
735         }
736         if (i == nextBreak && bk > nextBreak) {
737             fTestData->fActualBreaks.setCharAt(bk, 1);
738             nextBreak = bk;
739             continue;
740         }
741         MONKEY_ERROR("following(i)", i);
742         return;
743     }
744     checkResults("testFollowing", FORWARD, status);
745 }
746 
747 
748 
testPrevious(UErrorCode & status)749 void RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
750     if (U_FAILURE(status)) {return;}
751 
752     fTestData->clearActualBreaks();
753     fBI->setText(fTestData->fString);
754     int32_t previousBreak = INT32_MAX;
755     for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
756          if (bk >= previousBreak) {
757             MONKEY_ERROR("Break Iterator Stall", bk);
758             return;
759         }
760         if (bk < 0 || bk > fTestData->fString.length()) {
761             MONKEY_ERROR("Boundary out of bounds", bk);
762             return;
763         }
764         fTestData->fActualBreaks.setCharAt(bk, 1);
765     }
766     checkResults("testPrevius", REVERSE, status);
767 }
768 
769 
testPreceding(UErrorCode & status)770 void RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
771     if (U_FAILURE(status)) {
772         return;
773     }
774     fTestData->clearActualBreaks();
775     fBI->setText(fTestData->fString);
776     int32_t nextBreak = fTestData->fString.length()+1;
777     for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
778         int32_t bk = fBI->preceding(i);
779         // printf("i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
780         if (bk == BreakIterator::DONE && i == 0) {
781             continue;
782         }
783         if (bk == nextBreak && bk < i) {
784             // i is in the gap between two breaks.
785             continue;
786         }
787         if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
788             // i indexes to a trailing surrogate.
789             // Break Iterators treat an index to either half as referring to the supplemental code point,
790             // with preceding going to some preceding code point.
791             if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
792                 MONKEY_ERROR("preceding of trailing surrogate error", i);
793             }
794             continue;
795         }
796         if (i == nextBreak && bk < nextBreak) {
797             fTestData->fActualBreaks.setCharAt(bk, 1);
798             nextBreak = bk;
799             continue;
800         }
801         MONKEY_ERROR("preceding(i)", i);
802         return;
803     }
804     checkResults("testPreceding", REVERSE, status);
805 }
806 
807 
testIsBoundary(UErrorCode & status)808 void RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
809     if (U_FAILURE(status)) {
810         return;
811     }
812     fTestData->clearActualBreaks();
813     fBI->setText(fTestData->fString);
814     for (int i=fTestData->fString.length(); i>=0; --i) {
815         if (fBI->isBoundary(i)) {
816             fTestData->fActualBreaks.setCharAt(i, 1);
817         }
818     }
819     checkResults("testForwards", FORWARD, status);
820 }
821 
testIsBoundaryRandom(UErrorCode & status)822 void RBBIMonkeyImpl::testIsBoundaryRandom(UErrorCode &status) {
823     if (U_FAILURE(status)) {
824         return;
825     }
826     fBI->setText(fTestData->fString);
827 
828     int stringLen = fTestData->fString.length();
829     for (int i=stringLen; i>=0; --i) {
830         int strIdx = fRandomGenerator() % stringLen;
831         if (fTestData->fExpectedBreaks.charAt(strIdx) != fBI->isBoundary(strIdx)) {
832             IntlTest::gTest->errln("%s:%d testIsBoundaryRandom failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
833                     __FILE__, __LINE__, strIdx, fRuleFileName, fTestData->fRandomSeed);
834             if (fVerbose) {
835                 fTestData->dump(i);
836             }
837             status = U_INVALID_STATE_ERROR;
838             break;
839         }
840     }
841 }
842 
843 
844 
checkResults(const char * msg,CheckDirection direction,UErrorCode & status)845 void RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
846     if (U_FAILURE(status)) {
847         return;
848     }
849     if (direction == FORWARD) {
850         for (int i=0; i<=fTestData->fString.length(); ++i) {
851             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
852                 IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
853                         __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
854                 if (fVerbose) {
855                     fTestData->dump(i);
856                 }
857                 status = U_INVALID_STATE_ERROR;   // Prevent the test from continuing, which would likely
858                 break;                            // produce many redundant errors.
859             }
860         }
861     } else {
862         for (int i=fTestData->fString.length(); i>=0; i--) {
863             if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
864                 IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
865                         __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
866                 if (fVerbose) {
867                     fTestData->dump(i);
868                 }
869                 status = U_INVALID_STATE_ERROR;
870                 break;
871             }
872         }
873     }
874 }
875 
876 
877 
878 //---------------------------------------------------------------------------------------
879 //
880 //   class RBBIMonkeyTest implementation.
881 //
882 //---------------------------------------------------------------------------------------
// Constructor. The body is empty; no members are set up here.
RBBIMonkeyTest::RBBIMonkeyTest() {
}
885 
// Destructor. The body is empty; nothing is released by hand here.
RBBIMonkeyTest::~RBBIMonkeyTest() {
}
888 
889 
890 //     params, taken from this->fParams.
891 //       rules=file_name   Name of file containing the reference rules.
892 //       seed=nnnnn        Random number starting seed.
893 //                         Setting the seed allows errors to be reproduced.
894 //       loop=nnn          Looping count.  Controls running time.
895 //                         -1:  run forever.
896 //                          0 or greater:  run length.
897 //       expansions        debug option, show expansions of rules and sets.
898 //       verbose           Display details of the failure.
899 //
900 //     Parameters on the intltest command line follow the test name, and are preceded by '@'.
901 //     For example,
902 //           intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
903 //
testMonkey()904 void RBBIMonkeyTest::testMonkey() {
905     // printf("Test parameters: %s\n", fParams);
906     UnicodeString params(fParams);
907     UErrorCode status = U_ZERO_ERROR;
908 
909     const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
910                            "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
911                            NULL };
912     CharString testNameFromParams;
913     if (getStringParam("rules", params, testNameFromParams, status)) {
914         tests[0] = testNameFromParams.data();
915         tests[1] = NULL;
916     }
917 
918     int64_t loopCount = quick? 100 : 5000;
919     getIntParam("loop", params, loopCount, status);
920 
921     UBool dumpExpansions = FALSE;
922     getBoolParam("expansions", params, dumpExpansions, status);
923 
924     UBool verbose = FALSE;
925     getBoolParam("verbose", params, verbose, status);
926 
927     int64_t seed = 0;
928     getIntParam("seed", params, seed, status);
929 
930     if (params.length() != 0) {
931         // Options processing did not consume all of the parameters. Something unrecognized was present.
932         CharString unrecognizedParameters;
933         unrecognizedParameters.append(CStr(params)(), -1, status);
934         errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
935         return;
936     }
937 
938     UVector startedTests(status);
939     if (U_FAILURE(status)) {
940         errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
941         return;
942     }
943 
944     // Monkey testing is multi-threaded.
945     // Each set of break rules to be tested is run in a separate thread.
946     // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
947     int32_t i;
948     for (i=0; tests[i] != NULL; ++i) {
949         logln("beginning testing of %s", tests[i]);
950         LocalPointer<RBBIMonkeyImpl> test(new RBBIMonkeyImpl(status));
951         if (U_FAILURE(status)) {
952             dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
953             break;
954         }
955         test->fDumpExpansions = dumpExpansions;
956         test->fVerbose = verbose;
957         test->fRandomGenerator.seed((uint32_t)seed);
958         test->fLoopCount = loopCount;
959         test->setup(tests[i], status);
960         if (U_FAILURE(status)) {
961             dataerrln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
962             break;
963         }
964         test->startTest();
965         startedTests.addElement(test.orphan(), status);
966         if (U_FAILURE(status)) {
967             errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
968             break;
969         }
970     }
971 
972     for (i=0; i<startedTests.size(); ++i) {
973         RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
974         test->join();
975         delete test;
976     }
977 }
978 
979 
getIntParam(UnicodeString name,UnicodeString & params,int64_t & val,UErrorCode & status)980 UBool  RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
981     name.append(" *= *(-?\\d+) *,? *");
982     RegexMatcher m(name, params, 0, status);
983     if (m.find()) {
984         // The param exists.  Convert the string to an int.
985         CharString str;
986         str.append(CStr(m.group(1, status))(), -1, status);
987         val = strtol(str.data(),  NULL, 10);
988 
989         // Delete this parameter from the params string.
990         m.reset();
991         params = m.replaceFirst(UnicodeString(), status);
992         return TRUE;
993     }
994     return FALSE;
995 }
996 
getStringParam(UnicodeString name,UnicodeString & params,CharString & dest,UErrorCode & status)997 UBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
998     name.append(" *= *([^ ,]*) *,? *");
999     RegexMatcher m(name, params, 0, status);
1000     if (m.find()) {
1001         // The param exists.
1002         dest.append(CStr(m.group(1, status))(), -1, status);
1003 
1004         // Delete this parameter from the params string.
1005         m.reset();
1006         params = m.replaceFirst(UnicodeString(), status);
1007         return TRUE;
1008     }
1009     return FALSE;
1010 }
1011 
getBoolParam(UnicodeString name,UnicodeString & params,UBool & dest,UErrorCode & status)1012 UBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
1013     name.append("(?: *= *(true|false))? *,? *");
1014     RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
1015     if (m.find()) {
1016         if (m.start(1, status) > 0) {
1017             // user option included a value.
1018             dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
1019         } else {
1020             // No explicit user value, implies true.
1021             dest = TRUE;
1022         }
1023 
1024         // Delete this parameter from the params string.
1025         m.reset();
1026         params = m.replaceFirst(UnicodeString(), status);
1027         return TRUE;
1028     }
1029     return FALSE;
1030 }
1031 
1032 #endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
1033