1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2014, International Business Machines Corporation and    *
4  * others. All Rights Reserved.                                                *
5  *******************************************************************************
6  */
7 package com.ibm.icu.dev.test.rbbi;
8 
9 import java.text.StringCharacterIterator;
10 import java.util.ArrayList;
11 import java.util.List;
12 import java.util.Locale;
13 
14 import com.ibm.icu.dev.test.TestFmwk;
15 import com.ibm.icu.text.BreakIterator;
16 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
17 import com.ibm.icu.util.ULocale;
18 
19 public class BreakIteratorTest extends TestFmwk
20 {
21     private BreakIterator characterBreak;
22     private BreakIterator wordBreak;
23     private BreakIterator lineBreak;
24     private BreakIterator sentenceBreak;
25     private BreakIterator titleBreak;
26 
main(String[] args)27     public static void main(String[] args) throws Exception {
28         new BreakIteratorTest().run(args);
29     }
BreakIteratorTest()30     public BreakIteratorTest()
31     {
32 
33     }
init()34     protected void init(){
35         characterBreak = BreakIterator.getCharacterInstance();
36         wordBreak = BreakIterator.getWordInstance();
37         lineBreak = BreakIterator.getLineInstance();
38         //logln("Creating sentence iterator...");
39         sentenceBreak = BreakIterator.getSentenceInstance();
40         //logln("Finished creating sentence iterator...");
41         titleBreak = BreakIterator.getTitleInstance();
42     }
43     //=========================================================================
44     // general test subroutines
45     //=========================================================================
46 
generalIteratorTest(BreakIterator bi, List<String> expectedResult)47     private void generalIteratorTest(BreakIterator bi, List<String> expectedResult) {
48         StringBuffer buffer = new StringBuffer();
49         String text;
50         for (int i = 0; i < expectedResult.size(); i++) {
51             text = expectedResult.get(i);
52             buffer.append(text);
53         }
54         text = buffer.toString();
55 
56         bi.setText(text);
57 
58         List<String> nextResults = _testFirstAndNext(bi, text);
59         List<String> previousResults = _testLastAndPrevious(bi, text);
60 
61         logln("comparing forward and backward...");
62         int errs = getErrorCount();
63         compareFragmentLists("forward iteration", "backward iteration", nextResults,
64                         previousResults);
65         if (getErrorCount() == errs) {
66             logln("comparing expected and actual...");
67             compareFragmentLists("expected result", "actual result", expectedResult,
68                             nextResults);
69         }
70 
71         int[] boundaries = new int[expectedResult.size() + 3];
72         boundaries[0] = BreakIterator.DONE;
73         boundaries[1] = 0;
74         for (int i = 0; i < expectedResult.size(); i++)
75             boundaries[i + 2] = boundaries[i + 1] + (expectedResult.get(i)).
76                             length();
77         boundaries[boundaries.length - 1] = BreakIterator.DONE;
78 
79         _testFollowing(bi, text, boundaries);
80         _testPreceding(bi, text, boundaries);
81         _testIsBoundary(bi, text, boundaries);
82 
83         doMultipleSelectionTest(bi, text);
84     }
85 
_testFirstAndNext(BreakIterator bi, String text)86     private List<String> _testFirstAndNext(BreakIterator bi, String text) {
87         int p = bi.first();
88         int lastP = p;
89         List<String> result = new ArrayList<String>();
90 
91         if (p != 0)
92             errln("first() returned " + p + " instead of 0");
93         while (p != BreakIterator.DONE) {
94             p = bi.next();
95             if (p != BreakIterator.DONE) {
96                 if (p <= lastP)
97                     errln("next() failed to move forward: next() on position "
98                                     + lastP + " yielded " + p);
99 
100                 result.add(text.substring(lastP, p));
101             }
102             else {
103                 if (lastP != text.length())
104                     errln("next() returned DONE prematurely: offset was "
105                                     + lastP + " instead of " + text.length());
106             }
107             lastP = p;
108         }
109         return result;
110     }
111 
_testLastAndPrevious(BreakIterator bi, String text)112     private List<String> _testLastAndPrevious(BreakIterator bi, String text) {
113         int p = bi.last();
114         int lastP = p;
115         List<String> result = new ArrayList<String>();
116 
117         if (p != text.length())
118             errln("last() returned " + p + " instead of " + text.length());
119         while (p != BreakIterator.DONE) {
120             p = bi.previous();
121             if (p != BreakIterator.DONE) {
122                 if (p >= lastP)
123                     errln("previous() failed to move backward: previous() on position "
124                                     + lastP + " yielded " + p);
125 
126                 result.add(0, text.substring(p, lastP));
127             }
128             else {
129                 if (lastP != 0)
130                     errln("previous() returned DONE prematurely: offset was "
131                                     + lastP + " instead of 0");
132             }
133             lastP = p;
134         }
135         return result;
136     }
137 
compareFragmentLists(String f1Name, String f2Name, List<String> f1, List<String> f2)138     private void compareFragmentLists(String f1Name, String f2Name, List<String> f1, List<String> f2) {
139         int p1 = 0;
140         int p2 = 0;
141         String s1;
142         String s2;
143         int t1 = 0;
144         int t2 = 0;
145 
146         while (p1 < f1.size() && p2 < f2.size()) {
147             s1 = f1.get(p1);
148             s2 = f2.get(p2);
149             t1 += s1.length();
150             t2 += s2.length();
151 
152             if (s1.equals(s2)) {
153                 debugLogln("   >" + s1 + "<");
154                 ++p1;
155                 ++p2;
156             }
157             else {
158                 int tempT1 = t1;
159                 int tempT2 = t2;
160                 int tempP1 = p1;
161                 int tempP2 = p2;
162 
163                 while (tempT1 != tempT2 && tempP1 < f1.size() && tempP2 < f2.size()) {
164                     while (tempT1 < tempT2 && tempP1 < f1.size()) {
165                         tempT1 += (f1.get(tempP1)).length();
166                         ++tempP1;
167                     }
168                     while (tempT2 < tempT1 && tempP2 < f2.size()) {
169                         tempT2 += (f2.get(tempP2)).length();
170                         ++tempP2;
171                     }
172                 }
173                 logln("*** " + f1Name + " has:");
174                 while (p1 <= tempP1 && p1 < f1.size()) {
175                     s1 = f1.get(p1);
176                     t1 += s1.length();
177                     debugLogln(" *** >" + s1 + "<");
178                     ++p1;
179                 }
180                 logln("***** " + f2Name + " has:");
181                 while (p2 <= tempP2 && p2 < f2.size()) {
182                     s2 = f2.get(p2);
183                     t2 += s2.length();
184                     debugLogln(" ***** >" + s2 + "<");
185                     ++p2;
186                 }
187                 errln("Discrepancy between " + f1Name + " and " + f2Name);
188             }
189         }
190     }
191 
_testFollowing(BreakIterator bi, String text, int[] boundaries)192     private void _testFollowing(BreakIterator bi, String text, int[] boundaries) {
193         logln("testFollowing():");
194         int p = 2;
195         for (int i = 0; i <= text.length(); i++) {
196             if (i == boundaries[p])
197                 ++p;
198 
199             int b = bi.following(i);
200             logln("bi.following(" + i + ") -> " + b);
201             if (b != boundaries[p])
202                 errln("Wrong result from following() for " + i + ": expected " + boundaries[p]
203                                 + ", got " + b);
204         }
205     }
206 
_testPreceding(BreakIterator bi, String text, int[] boundaries)207     private void _testPreceding(BreakIterator bi, String text, int[] boundaries) {
208         logln("testPreceding():");
209         int p = 0;
210         for (int i = 0; i <= text.length(); i++) {
211             int b = bi.preceding(i);
212             logln("bi.preceding(" + i + ") -> " + b);
213             if (b != boundaries[p])
214                 errln("Wrong result from preceding() for " + i + ": expected " + boundaries[p]
215                                 + ", got " + b);
216 
217             if (i == boundaries[p + 1])
218                 ++p;
219         }
220     }
221 
_testIsBoundary(BreakIterator bi, String text, int[] boundaries)222     private void _testIsBoundary(BreakIterator bi, String text, int[] boundaries) {
223         logln("testIsBoundary():");
224         int p = 1;
225         boolean isB;
226         for (int i = 0; i <= text.length(); i++) {
227             isB = bi.isBoundary(i);
228             logln("bi.isBoundary(" + i + ") -> " + isB);
229 
230             if (i == boundaries[p]) {
231                 if (!isB)
232                     errln("Wrong result from isBoundary() for " + i + ": expected true, got false");
233                 ++p;
234             }
235             else {
236                 if (isB)
237                     errln("Wrong result from isBoundary() for " + i + ": expected false, got true");
238             }
239         }
240     }
241 
doMultipleSelectionTest(BreakIterator iterator, String testText)242     private void doMultipleSelectionTest(BreakIterator iterator, String testText)
243     {
244         logln("Multiple selection test...");
245         BreakIterator testIterator = (BreakIterator)iterator.clone();
246         int offset = iterator.first();
247         int testOffset;
248         int count = 0;
249 
250         do {
251             testOffset = testIterator.first();
252             testOffset = testIterator.next(count);
253             logln("next(" + count + ") -> " + testOffset);
254             if (offset != testOffset)
255                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
256 
257             if (offset != BreakIterator.DONE) {
258                 count++;
259                 offset = iterator.next();
260             }
261         } while (offset != BreakIterator.DONE);
262 
263         // now do it backwards...
264         offset = iterator.last();
265         count = 0;
266 
267         do {
268             testOffset = testIterator.last();
269             testOffset = testIterator.next(count);
270             logln("next(" + count + ") -> " + testOffset);
271             if (offset != testOffset)
272                 errln("next(n) and next() not returning consistent results: for step " + count + ", next(n) returned " + testOffset + " and next() had " + offset);
273 
274             if (offset != BreakIterator.DONE) {
275                 count--;
276                 offset = iterator.previous();
277             }
278         } while (offset != BreakIterator.DONE);
279     }
280 
281 
doOtherInvariantTest(BreakIterator tb, String testChars)282     private void doOtherInvariantTest(BreakIterator tb, String testChars)
283     {
284         StringBuffer work = new StringBuffer("a\r\na");
285         int errorCount = 0;
286 
287         // a break should never occur between CR and LF
288         for (int i = 0; i < testChars.length(); i++) {
289             work.setCharAt(0, testChars.charAt(i));
290             for (int j = 0; j < testChars.length(); j++) {
291                 work.setCharAt(3, testChars.charAt(j));
292                 tb.setText(work.toString());
293                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
294                     if (k == 2) {
295                         errln("Break between CR and LF in string U+" + Integer.toHexString(
296                                 (int)(work.charAt(0))) + ", U+d U+a U+" + Integer.toHexString(
297                                 (int)(work.charAt(3))));
298                         errorCount++;
299                         if (errorCount >= 75)
300                             return;
301                     }
302             }
303         }
304 
305         // a break should never occur before a non-spacing mark, unless it's preceded
306         // by a line terminator
307         work.setLength(0);
308         work.append("aaaa");
309         for (int i = 0; i < testChars.length(); i++) {
310             char c = testChars.charAt(i);
311             if (c == '\n' || c == '\r' || c == '\u2029' || c == '\u2028' || c == '\u0003')
312                 continue;
313             work.setCharAt(1, c);
314             for (int j = 0; j < testChars.length(); j++) {
315                 c = testChars.charAt(j);
316                 if (Character.getType(c) != Character.NON_SPACING_MARK && Character.getType(c)
317                         != Character.ENCLOSING_MARK)
318                     continue;
319                 work.setCharAt(2, c);
320                 tb.setText(work.toString());
321                 for (int k = tb.first(); k != BreakIterator.DONE; k = tb.next())
322                     if (k == 2) {
323                         errln("Break between U+" + Integer.toHexString((int)(work.charAt(1)))
324                                 + " and U+" + Integer.toHexString((int)(work.charAt(2))));
325                         errorCount++;
326                         if (errorCount >= 75)
327                             return;
328                     }
329             }
330         }
331     }
332 
debugLogln(String s)333     public void debugLogln(String s) {
334         final String zeros = "0000";
335         String temp;
336         StringBuffer out = new StringBuffer();
337         for (int i = 0; i < s.length(); i++) {
338             char c = s.charAt(i);
339             if (c >= ' ' && c < '\u007f')
340                 out.append(c);
341             else {
342                 out.append("\\u");
343                 temp = Integer.toHexString((int)c);
344                 out.append(zeros.substring(0, 4 - temp.length()));
345                 out.append(temp);
346             }
347         }
348         logln(out.toString());
349     }
350 
351     //=========================================================================
352     // tests
353     //=========================================================================
354 
355 
356     /**
357      * @bug 4097779
358      */
TestBug4097779()359     public void TestBug4097779() {
360         List<String> wordSelectionData = new ArrayList<String>(2);
361 
362         wordSelectionData.add("aa\u0300a");
363         wordSelectionData.add(" ");
364 
365         generalIteratorTest(wordBreak, wordSelectionData);
366     }
367 
368     /**
369      * @bug 4098467
370      */
TestBug4098467Words()371     public void TestBug4098467Words() {
372         List<String> wordSelectionData = new ArrayList<String>();
373 
374         // What follows is a string of Korean characters (I found it in the Yellow Pages
375         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
376         // it correctly), first as precomposed syllables, and then as conjoining jamo.
377         // Both sequences should be semantically identical and break the same way.
378         // precomposed syllables...
379         wordSelectionData.add("\uc0c1\ud56d");
380         wordSelectionData.add(" ");
381         wordSelectionData.add("\ud55c\uc778");
382         wordSelectionData.add(" ");
383         wordSelectionData.add("\uc5f0\ud569");
384         wordSelectionData.add(" ");
385         wordSelectionData.add("\uc7a5\ub85c\uad50\ud68c");
386         wordSelectionData.add(" ");
387         // conjoining jamo...
388         wordSelectionData.add("\u1109\u1161\u11bc\u1112\u1161\u11bc");
389         wordSelectionData.add(" ");
390         wordSelectionData.add("\u1112\u1161\u11ab\u110b\u1175\u11ab");
391         wordSelectionData.add(" ");
392         wordSelectionData.add("\u110b\u1167\u11ab\u1112\u1161\u11b8");
393         wordSelectionData.add(" ");
394         wordSelectionData.add("\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c");
395         wordSelectionData.add(" ");
396 
397         generalIteratorTest(wordBreak, wordSelectionData);
398     }
399 
400 
401     /**
402      * @bug 4111338
403      */
TestBug4111338()404     public void TestBug4111338() {
405         List<String> sentenceSelectionData = new ArrayList<String>();
406 
407         // test for bug #4111338: Don't break sentences at the boundary between CJK
408         // and other letters
409         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:\"JAVA\u821c"
410                 + "\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba"
411                 + "\u611d\u57b6\u2510\u5d46\".\u2029");
412         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8"
413                 + "\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0"
414                 + "\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
415         sentenceSelectionData.add("\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4"
416                 + "\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8"
417                 + "\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2029");
418         sentenceSelectionData.add("He said, \"I can go there.\"\u2029");
419 
420         generalIteratorTest(sentenceBreak, sentenceSelectionData);
421     }
422 
423 
424     /**
425      * @bug 4143071
426      */
TestBug4143071()427     public void TestBug4143071() {
428         List<String> sentenceSelectionData = new ArrayList<String>(3);
429 
430         // Make sure sentences that end with digits work right
431         sentenceSelectionData.add("Today is the 27th of May, 1998.  ");
432         sentenceSelectionData.add("Tomorrow will be 28 May 1998.  ");
433         sentenceSelectionData.add("The day after will be the 30th.\u2029");
434 
435         generalIteratorTest(sentenceBreak, sentenceSelectionData);
436     }
437 
438     /**
439      * @bug 4152416
440      */
TestBug4152416()441     public void TestBug4152416() {
442         List<String> sentenceSelectionData = new ArrayList<String>(2);
443 
444         // Make sure sentences ending with a capital letter are treated correctly
445         sentenceSelectionData.add("The type of all primitive "
446                 + "<code>boolean</code> values accessed in the target VM.  ");
447         sentenceSelectionData.add("Calls to xxx will return an "
448                 + "implementor of this interface.\u2029");
449 
450         generalIteratorTest(sentenceBreak, sentenceSelectionData);
451     }
452 
453     /**
454      * @bug 4152117
455      */
TestBug4152117()456     public void TestBug4152117() {
457         List<String> sentenceSelectionData = new ArrayList<String>(3);
458 
459         // Make sure sentence breaking is handling punctuation correctly
460         // [COULD NOT REPRODUCE THIS BUG, BUT TEST IS HERE TO MAKE SURE
461         // IT DOESN'T CROP UP]
462         sentenceSelectionData.add("Constructs a randomly generated "
463                 + "BigInteger, uniformly distributed over the range <tt>0</tt> "
464                 + "to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive.  ");
465         sentenceSelectionData.add("The uniformity of the distribution "
466                 + "assumes that a fair source of random bits is provided in "
467                 + "<tt>rnd</tt>.  ");
468         sentenceSelectionData.add("Note that this constructor always "
469                 + "constructs a non-negative BigInteger.\u2029");
470 
471         generalIteratorTest(sentenceBreak, sentenceSelectionData);
472     }
473 
TestLineBreak()474     public void TestLineBreak() {
475         List<String> lineSelectionData = new ArrayList<String>();
476 
477         lineSelectionData.add("Multi-");
478         lineSelectionData.add("Level ");
479         lineSelectionData.add("example ");
480         lineSelectionData.add("of ");
481         lineSelectionData.add("a ");
482         lineSelectionData.add("semi-");
483         lineSelectionData.add("idiotic ");
484         lineSelectionData.add("non-");
485         lineSelectionData.add("sensical ");
486         lineSelectionData.add("(non-");
487         lineSelectionData.add("important) ");
488         lineSelectionData.add("sentence. ");
489 
490         lineSelectionData.add("Hi  ");
491         lineSelectionData.add("Hello ");
492         lineSelectionData.add("How\n");
493         lineSelectionData.add("are\r");
494         lineSelectionData.add("you\u2028");
495         lineSelectionData.add("fine.\t");
496         lineSelectionData.add("good.  ");
497 
498         lineSelectionData.add("Now\r");
499         lineSelectionData.add("is\n");
500         lineSelectionData.add("the\r\n");
501         lineSelectionData.add("time\n");
502         lineSelectionData.add("\r");
503         lineSelectionData.add("for\r");
504         lineSelectionData.add("\r");
505         lineSelectionData.add("all");
506 
507         generalIteratorTest(lineBreak, lineSelectionData);
508     }
509 
510     /**
511      * @bug 4068133
512      */
TestBug4068133()513     public void TestBug4068133() {
514         List<String> lineSelectionData = new ArrayList<String>(9);
515 
516         lineSelectionData.add("\u96f6");
517         lineSelectionData.add("\u4e00\u3002");
518         lineSelectionData.add("\u4e8c\u3001");
519         lineSelectionData.add("\u4e09\u3002\u3001");
520         lineSelectionData.add("\u56db\u3001\u3002\u3001");
521         lineSelectionData.add("\u4e94,");
522         lineSelectionData.add("\u516d.");
523         lineSelectionData.add("\u4e03.\u3001,\u3002");
524         lineSelectionData.add("\u516b");
525 
526         generalIteratorTest(lineBreak, lineSelectionData);
527     }
528 
529     /**
530      * @bug 4086052
531      */
TestBug4086052()532     public void TestBug4086052() {
533         List<String> lineSelectionData = new ArrayList<String>(1);
534 
535         lineSelectionData.add("foo\u00a0bar ");
536 //        lineSelectionData.addElement("foo\ufeffbar");
537 
538         generalIteratorTest(lineBreak, lineSelectionData);
539     }
540 
541     /**
542      * @bug 4097920
543      */
TestBug4097920()544     public void TestBug4097920() {
545         List<String> lineSelectionData = new ArrayList<String>(3);
546 
547         lineSelectionData.add("dog,cat,mouse ");
548         lineSelectionData.add("(one)");
549         lineSelectionData.add("(two)\n");
550         generalIteratorTest(lineBreak, lineSelectionData);
551     }
552 
553 
554 
555     /**
556      * @bug 4117554
557      */
TestBug4117554Lines()558     public void TestBug4117554Lines() {
559         List<String> lineSelectionData = new ArrayList<String>(3);
560 
561         // Fullwidth .!? should be treated as postJwrd
562         lineSelectionData.add("\u4e01\uff0e");
563         lineSelectionData.add("\u4e02\uff01");
564         lineSelectionData.add("\u4e03\uff1f");
565 
566         generalIteratorTest(lineBreak, lineSelectionData);
567     }
568 
TestLettersAndDigits()569     public void TestLettersAndDigits() {
570         // a character sequence such as "X11" or "30F3" or "native2ascii" should
571         // be kept together as a single word
572         List<String> lineSelectionData = new ArrayList<String>(3);
573 
574         lineSelectionData.add("X11 ");
575         lineSelectionData.add("30F3 ");
576         lineSelectionData.add("native2ascii");
577 
578         generalIteratorTest(lineBreak, lineSelectionData);
579     }
580 
581 
    // Base letter + combining mark pairs; TestCharacterBreak expects each
    // pair to come back as a single fragment.
    private static final String graveS = "S\u0300";       // S + U+0300
    private static final String acuteBelowI = "i\u0317";  // i + U+0317
    private static final String acuteE = "e\u0301";       // e + U+0301
    private static final String circumflexA = "a\u0302";  // a + U+0302
    private static final String tildeE = "e\u0303";       // e + U+0303
587 
TestCharacterBreak()588     public void TestCharacterBreak() {
589         List<String> characterSelectionData = new ArrayList<String>();
590 
591         characterSelectionData.add(graveS);
592         characterSelectionData.add(acuteBelowI);
593         characterSelectionData.add("m");
594         characterSelectionData.add("p");
595         characterSelectionData.add("l");
596         characterSelectionData.add(acuteE);
597         characterSelectionData.add(" ");
598         characterSelectionData.add("s");
599         characterSelectionData.add(circumflexA);
600         characterSelectionData.add("m");
601         characterSelectionData.add("p");
602         characterSelectionData.add("l");
603         characterSelectionData.add(tildeE);
604         characterSelectionData.add(".");
605         characterSelectionData.add("w");
606         characterSelectionData.add(circumflexA);
607         characterSelectionData.add("w");
608         characterSelectionData.add("a");
609         characterSelectionData.add("f");
610         characterSelectionData.add("q");
611         characterSelectionData.add("\n");
612         characterSelectionData.add("\r");
613         characterSelectionData.add("\r\n");
614         characterSelectionData.add("\n");
615 
616         generalIteratorTest(characterBreak, characterSelectionData);
617     }
618 
619     /**
620      * @bug 4098467
621      */
TestBug4098467Characters()622     public void TestBug4098467Characters() {
623         List<String> characterSelectionData = new ArrayList<String>();
624 
625         // What follows is a string of Korean characters (I found it in the Yellow Pages
626         // ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
627         // it correctly), first as precomposed syllables, and then as conjoining jamo.
628         // Both sequences should be semantically identical and break the same way.
629         // precomposed syllables...
630         characterSelectionData.add("\uc0c1");
631         characterSelectionData.add("\ud56d");
632         characterSelectionData.add(" ");
633         characterSelectionData.add("\ud55c");
634         characterSelectionData.add("\uc778");
635         characterSelectionData.add(" ");
636         characterSelectionData.add("\uc5f0");
637         characterSelectionData.add("\ud569");
638         characterSelectionData.add(" ");
639         characterSelectionData.add("\uc7a5");
640         characterSelectionData.add("\ub85c");
641         characterSelectionData.add("\uad50");
642         characterSelectionData.add("\ud68c");
643         characterSelectionData.add(" ");
644         // conjoining jamo...
645         characterSelectionData.add("\u1109\u1161\u11bc");
646         characterSelectionData.add("\u1112\u1161\u11bc");
647         characterSelectionData.add(" ");
648         characterSelectionData.add("\u1112\u1161\u11ab");
649         characterSelectionData.add("\u110b\u1175\u11ab");
650         characterSelectionData.add(" ");
651         characterSelectionData.add("\u110b\u1167\u11ab");
652         characterSelectionData.add("\u1112\u1161\u11b8");
653         characterSelectionData.add(" ");
654         characterSelectionData.add("\u110c\u1161\u11bc");
655         characterSelectionData.add("\u1105\u1169");
656         characterSelectionData.add("\u1100\u116d");
657         characterSelectionData.add("\u1112\u116c");
658 
659         generalIteratorTest(characterBreak, characterSelectionData);
660     }
661 
TestTitleBreak()662     public void TestTitleBreak()
663     {
664         List<String> titleData = new ArrayList<String>();
665         titleData.add("   ");
666         titleData.add("This ");
667         titleData.add("is ");
668         titleData.add("a ");
669         titleData.add("simple ");
670         titleData.add("sample ");
671         titleData.add("sentence. ");
672         titleData.add("This ");
673 
674         generalIteratorTest(titleBreak, titleData);
675     }
676 
677 
678 
679     /*
680      * @bug 4153072
681      */
TestBug4153072()682     public void TestBug4153072() {
683         BreakIterator iter = BreakIterator.getWordInstance();
684         String str = "...Hello, World!...";
685         int begin = 3;
686         int end = str.length() - 3;
687         // not used boolean gotException = false;
688 
689 
690         iter.setText(new StringCharacterIterator(str, begin, end, begin));
691         for (int index = -1; index < begin + 1; ++index) {
692             try {
693                 iter.isBoundary(index);
694                 if (index < begin)
695                     errln("Didn't get exception with offset = " + index +
696                                     " and begin index = " + begin);
697             }
698             catch (IllegalArgumentException e) {
699                 if (index >= begin)
700                     errln("Got exception with offset = " + index +
701                                     " and begin index = " + begin);
702             }
703         }
704     }
705 
706 
TestBug4146175Lines()707     public void TestBug4146175Lines() {
708         List<String> lineSelectionData = new ArrayList<String>(2);
709 
710         // the fullwidth comma should stick to the preceding Japanese character
711         lineSelectionData.add("\u7d42\uff0c");
712         lineSelectionData.add("\u308f");
713 
714         generalIteratorTest(lineBreak, lineSelectionData);
715     }
716 
    // Pool of sample characters (controls, ASCII, Latin-1 symbols, combining
    // marks, Hebrew, Devanagari, Tibetan, general punctuation, enclosing
    // marks, Roman numerals). TestSentenceInvariants appends a few more
    // characters and passes the result to doOtherInvariantTest, which
    // substitutes the characters pairwise into its template strings.
    private static final String cannedTestChars
        = "\u0000\u0001\u0002\u0003\u0004 !\"#$%&()+-01234<=>ABCDE[]^_`abcde{}|\u00a0\u00a2"
        + "\u00a3\u00a4\u00a5\u00a6\u00a7\u00a8\u00a9\u00ab\u00ad\u00ae\u00af\u00b0\u00b2\u00b3"
        + "\u00b4\u00b9\u00bb\u00bc\u00bd\u02b0\u02b1\u02b2\u02b3\u02b4\u0300\u0301\u0302\u0303"
        + "\u0304\u05d0\u05d1\u05d2\u05d3\u05d4\u0903\u093e\u093f\u0940\u0949\u0f3a\u0f3b\u2000"
        + "\u2001\u2002\u200c\u200d\u200e\u200f\u2010\u2011\u2012\u2028\u2029\u202a\u203e\u203f"
        + "\u2040\u20dd\u20de\u20df\u20e0\u2160\u2161\u2162\u2163\u2164";
724 
TestSentenceInvariants()725     public void TestSentenceInvariants()
726     {
727         BreakIterator e = BreakIterator.getSentenceInstance();
728         doOtherInvariantTest(e, cannedTestChars + ".,\u3001\u3002\u3041\u3042\u3043\ufeff");
729     }
730 
TestEmptyString()731     public void TestEmptyString()
732     {
733         String text = "";
734         List<String> x = new ArrayList<String>(1);
735         x.add(text);
736 
737         generalIteratorTest(lineBreak, x);
738     }
739 
TestGetAvailableLocales()740     public void TestGetAvailableLocales()
741     {
742         Locale[] locList = BreakIterator.getAvailableLocales();
743 
744         if (locList.length == 0)
745             errln("getAvailableLocales() returned an empty list!");
746         // I have no idea how to test this function...
747 
748         com.ibm.icu.util.ULocale[] ulocList = BreakIterator.getAvailableULocales();
749         if (ulocList.length == 0) {
750             errln("getAvailableULocales() returned an empty list!");
751         } else {
752             logln("getAvailableULocales() returned " + ulocList.length + " locales");
753         }
754     }
755 
756 
757     /**
758      * @bug 4068137
759      */
TestEndBehavior()760     public void TestEndBehavior()
761     {
762         String testString = "boo.";
763         BreakIterator wb = BreakIterator.getWordInstance();
764         wb.setText(testString);
765 
766         if (wb.first() != 0)
767             errln("Didn't get break at beginning of string.");
768         if (wb.next() != 3)
769             errln("Didn't get break before period in \"boo.\"");
770         if (wb.current() != 4 && wb.next() != 4)
771             errln("Didn't get break at end of string.");
772     }
773 
774     // The Following two tests are ported from ICU4C 1.8.1 [Richard/GCL]
775     /**
776      * Port From:   ICU4C v1.8.1 : textbounds : IntlTestTextBoundary
777      * Source File: $ICU4CRoot/source/test/intltest/ittxtbd.cpp
778      **/
779     /**
780      * test methods preceding, following and isBoundary
781      **/
TestPreceding()782     public void TestPreceding() {
783         String words3 = "aaa bbb ccc";
784         BreakIterator e = BreakIterator.getWordInstance(Locale.getDefault());
785         e.setText( words3 );
786         e.first();
787         int p1 = e.next();
788         int p2 = e.next();
789         int p3 = e.next();
790         int p4 = e.next();
791 
792         int f = e.following(p2+1);
793         int p = e.preceding(p2+1);
794         if (f!=p3)
795             errln("IntlTestTextBoundary::TestPreceding: f!=p3");
796         if (p!=p2)
797             errln("IntlTestTextBoundary::TestPreceding: p!=p2");
798 
799         if (p1+1!=p2)
800             errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
801 
802         if (p3+1!=p4)
803             errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
804 
805         if (!e.isBoundary(p2) || e.isBoundary(p2+1) || !e.isBoundary(p3))
806         {
807             errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
808         }
809     }
810 
811 
812     /**
813      * Bug 4450804
814      */
TestLineBreakContractions()815     public void TestLineBreakContractions() {
816         List<String> expected = new ArrayList<String>(7);
817         expected.add("These ");
818         expected.add("are ");
819         expected.add("'foobles'. ");
820         expected.add("Don't ");
821         expected.add("you ");
822         expected.add("like ");
823         expected.add("them?");
824         generalIteratorTest(lineBreak, expected);
825     }
826 
827     /**
828      * Ticket#5615
829      */
TestT5615()830     public void TestT5615() {
831         com.ibm.icu.util.ULocale[] ulocales = BreakIterator.getAvailableULocales();
832         int type = 0;
833         com.ibm.icu.util.ULocale loc = null;
834         try {
835             for (int i = 0; i < ulocales.length; i++) {
836                 loc = ulocales[i];
837                 for (type = 0; type < 5 /* 5 = BreakIterator.KIND_COUNT */; ++type) {
838                     BreakIterator brk = BreakIterator.getBreakInstance(loc, type);
839                     if (brk == null) {
840                         errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc);
841                     }
842                 }
843             }
844         } catch (Exception e) {
845             errln("ERR: Failed to create an instance type: " + type + " / locale: " + loc + " / exception: " + e.getMessage());
846         }
847     }
848 
849     /*
850      * Test case for Ticket#10721. BreakIterator factory method should throw NPE
851      * when specified locale is null.
852      */
TestNullLocale()853     public void TestNullLocale() {
854         Locale loc = null;
855         ULocale uloc = null;
856 
857         @SuppressWarnings("unused")
858         BreakIterator brk;
859 
860         // Character
861         try {
862             brk = BreakIterator.getCharacterInstance(loc);
863             errln("getCharacterInstance((Locale)null) did not throw NPE.");
864         } catch (NullPointerException e) { /* OK */ }
865         try {
866             brk = BreakIterator.getCharacterInstance(uloc);
867             errln("getCharacterInstance((ULocale)null) did not throw NPE.");
868         } catch (NullPointerException e) { /* OK */ }
869 
870         // Line
871         try {
872             brk = BreakIterator.getLineInstance(loc);
873             errln("getLineInstance((Locale)null) did not throw NPE.");
874         } catch (NullPointerException e) { /* OK */ }
875         try {
876             brk = BreakIterator.getLineInstance(uloc);
877             errln("getLineInstance((ULocale)null) did not throw NPE.");
878         } catch (NullPointerException e) { /* OK */ }
879 
880         // Sentence
881         try {
882             brk = BreakIterator.getSentenceInstance(loc);
883             errln("getSentenceInstance((Locale)null) did not throw NPE.");
884         } catch (NullPointerException e) { /* OK */ }
885         try {
886             brk = BreakIterator.getSentenceInstance(uloc);
887             errln("getSentenceInstance((ULocale)null) did not throw NPE.");
888         } catch (NullPointerException e) { /* OK */ }
889 
890         // Title
891         try {
892             brk = BreakIterator.getTitleInstance(loc);
893             errln("getTitleInstance((Locale)null) did not throw NPE.");
894         } catch (NullPointerException e) { /* OK */ }
895         try {
896             brk = BreakIterator.getTitleInstance(uloc);
897             errln("getTitleInstance((ULocale)null) did not throw NPE.");
898         } catch (NullPointerException e) { /* OK */ }
899 
900         // Word
901         try {
902             brk = BreakIterator.getWordInstance(loc);
903             errln("getWordInstance((Locale)null) did not throw NPE.");
904         } catch (NullPointerException e) { /* OK */ }
905         try {
906             brk = BreakIterator.getWordInstance(uloc);
907             errln("getWordInstance((ULocale)null) did not throw NPE.");
908         } catch (NullPointerException e) { /* OK */ }
909     }
910 
911     /**
912      * Test FilteredBreakIteratorBuilder newly introduced
913      */
TestFilteredBreakIteratorBuilder()914     public void TestFilteredBreakIteratorBuilder() {
915         FilteredBreakIteratorBuilder builder;
916         BreakIterator baseBI;
917         BreakIterator filteredBI;
918 
919         String text = "In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."; // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
920         String ABBR_MR = "Mr.";
921         String ABBR_CAPT = "Capt.";
922 
923         {
924             logln("Constructing empty builder\n");
925             builder = FilteredBreakIteratorBuilder.createInstance();
926 
927             logln("Constructing base BI\n");
928             baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
929 
930             logln("Building new BI\n");
931             filteredBI = builder.build(baseBI);
932 
933             logln("Testing:");
934             filteredBI.setText(text);
935             assertEquals("1st next", 20, filteredBI.next());
936             assertEquals("1st next", 84, filteredBI.next());
937             assertEquals("1st next", 90, filteredBI.next());
938             assertEquals("1st next", 181, filteredBI.next());
939             assertEquals("1st next", 278, filteredBI.next());
940             filteredBI.first();
941         }
942 
943         {
944             logln("Constructing empty builder\n");
945             builder = FilteredBreakIteratorBuilder.createInstance();
946 
947             logln("Adding Mr. as an exception\n");
948 
949             assertEquals("2.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
950             assertEquals("2.2 suppressBreakAfter", false, builder.suppressBreakAfter(ABBR_MR));
951             assertEquals("2.3 unsuppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_MR));
952             assertEquals("2.4 unsuppressBreakAfter", false, builder.unsuppressBreakAfter(ABBR_MR));
953             assertEquals("2.5 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
954 
955             logln("Constructing base BI\n");
956             baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
957 
958             logln("Building new BI\n");
959             filteredBI = builder.build(baseBI);
960 
961             logln("Testing:");
962             filteredBI.setText(text);
963             assertEquals("2nd next", 84, filteredBI.next());
964             assertEquals("2nd next", 90, filteredBI.next());
965             assertEquals("2nd next", 278, filteredBI.next());
966             filteredBI.first();
967         }
968 
969 
970         {
971           logln("Constructing empty builder\n");
972           builder = FilteredBreakIteratorBuilder.createInstance();
973 
974           logln("Adding Mr. and Capt as an exception\n");
975           assertEquals("3.1 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_MR));
976           assertEquals("3.2 suppressBreakAfter", true, builder.suppressBreakAfter(ABBR_CAPT));
977 
978           logln("Constructing base BI\n");
979           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
980 
981           logln("Building new BI\n");
982           filteredBI = builder.build(baseBI);
983 
984           logln("Testing:");
985           filteredBI.setText(text);
986           assertEquals("3rd next", 84, filteredBI.next());
987           assertEquals("3rd next", 278, filteredBI.next());
988           filteredBI.first();
989         }
990 
991         {
992           logln("Constructing English builder\n");
993           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH);
994 
995           logln("Constructing base BI\n");
996           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
997 
998           logln("unsuppressing 'Capt'");
999           assertEquals("1st suppressBreakAfter", true, builder.unsuppressBreakAfter(ABBR_CAPT));
1000 
1001           logln("Building new BI\n");
1002           filteredBI = builder.build(baseBI);
1003 
1004           if(filteredBI != null) {
1005             logln("Testing:");
1006             filteredBI.setText(text);
1007             assertEquals("4th next", 84, filteredBI.next());
1008             assertEquals("4th next", 90, filteredBI.next());
1009             assertEquals("4th next", 278, filteredBI.next());
1010             filteredBI.first();
1011           }
1012         }
1013 
1014         {
1015           logln("Constructing English builder\n");
1016           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.ENGLISH);
1017 
1018           logln("Constructing base BI\n");
1019           baseBI = BreakIterator.getSentenceInstance(Locale.ENGLISH);
1020 
1021           logln("Building new BI\n");
1022           filteredBI = builder.build(baseBI);
1023 
1024           if(filteredBI != null) {
1025             logln("Testing:");
1026             filteredBI.setText(text);
1027 
1028             assertEquals("5th next", 84, filteredBI.next());
1029             assertEquals("5th next", 278, filteredBI.next());
1030             filteredBI.first();
1031           }
1032         }
1033 
1034         {
1035           logln("Constructing French builder");
1036           builder = FilteredBreakIteratorBuilder.createInstance(ULocale.FRENCH);
1037 
1038           logln("Constructing base BI\n");
1039           baseBI = BreakIterator.getSentenceInstance(Locale.FRENCH);
1040 
1041           logln("Building new BI\n");
1042           filteredBI = builder.build(baseBI);
1043 
1044           if(filteredBI != null) {
1045             logln("Testing:");
1046             filteredBI.setText(text);
1047             assertEquals("6th next", 20, filteredBI.next());
1048             assertEquals("6th next", 84, filteredBI.next());
1049             filteredBI.first();
1050           }
1051         }
1052     }
1053 }
1054