1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
6  * and others. All Rights Reserved.                                           *
7  ******************************************************************************
8  */
9 
10 #include <errno.h>
11 #include <stdio.h>
12 #include <string.h>
13 
14 #include "unicode/utypes.h"
15 #include "unicode/uchar.h"
16 #include "unicode/uchriter.h"
17 #include "unicode/brkiter.h"
18 #include "unicode/locid.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 
23 /*
24  * This program takes a Unicode text file containing Thai text with
25  * spaces inserted where the word breaks are. It computes a copy of
26  * the text without spaces and uses a word instance of a Thai BreakIterator
27  * to compute the word breaks. The program reports any differences in the
28  * breaks.
29  *
 * NOTE: by its very nature, Thai word breaking is not exact, so it is
 * expected that this program will always report some differences.
32  */
33 
34 /*
35  * This class is a break iterator that counts words and spaces.
36  */
37 class SpaceBreakIterator
38 {
39 public:
40     // The constructor:
41     // text  - pointer to an array of UChars to iterate over
42     // count - the number of UChars in text
43     SpaceBreakIterator(const UChar *text, int32_t count);
44 
45     // the destructor
46     ~SpaceBreakIterator();
47 
48     // return next break position
49     int32_t next();
50 
51     // return current word count
52     int32_t getWordCount();
53 
54     // return current space count
55     int32_t getSpaceCount();
56 
57 private:
58     // No arg constructor: private so clients can't call it.
59     SpaceBreakIterator();
60 
61     // The underlying BreakIterator
62     BreakIterator *fBreakIter;
63 
64     // address of the UChar array
65     const UChar *fText;
66 
67     // number of UChars in fText
68     int32_t fTextCount;
69 
70     // current word count
71     int32_t fWordCount;
72 
73     // current space count
74     int32_t fSpaceCount;
75 
76     // UnicodeSet of SA characters
77     UnicodeSet fComplexContext;
78 
79     // true when fBreakIter has returned DONE
80     UBool fDone;
81 };
82 
83 /*
84  * This is the main class. It compares word breaks and reports the differences.
85  */
86 class ThaiWordbreakTest
87 {
88 public:
89     // The main constructor:
90     // spaces       - pointer to a UChar array for the text with spaces
91     // spaceCount   - the number of characters in the spaces array
92     // noSpaces     - pointer to a UChar array for the text without spaces
93     // noSpaceCount - the number of characters in the noSpaces array
94     // verbose      - report all breaks if true, otherwise just report differences
95     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
96     ~ThaiWordbreakTest();
97 
98     // returns the number of breaks that are in the spaces array
99     // but aren't found in the noSpaces array
100     int32_t getBreaksNotFound();
101 
102     // returns the number of breaks which are found in the noSpaces
103     // array but aren't in the spaces array
104     int32_t getInvalidBreaks();
105 
106     // returns the number of words found in the spaces array
107     int32_t getWordCount();
108 
109     // reads the input Unicode text file:
110     // fileName  - the path name of the file
111     // charCount - set to the number of UChars read from the file
112     // returns   - the address of the UChar array containing the characters
113     static const UChar *readFile(char *fileName, int32_t &charCount);
114 
115     // removes spaces form the input UChar array:
116     // spaces        - pointer to the input UChar array
117     // count         - number of UChars in the spaces array
118     // nonSpaceCount - the number of UChars in the result array
119     // returns       - the address of the UChar array with spaces removed
120     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
121 
122 private:
123     // The no arg constructor - private so clients can't call it
124     ThaiWordbreakTest();
125 
126     // This does the actual comparison:
127     // spaces - the address of the UChar array for the text with spaces
128     // spaceCount - the number of UChars in the spaces array
129     // noSpaces   - the address of the UChar array for the text without spaces
130     // noSpaceCount - the number of UChars in the noSpaces array
131     // returns      - true if all breaks match, FALSE otherwise
132     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
133                             const UChar *noSpaces, int32_t noSpaceCount);
134 
135     // helper method to report a break in the spaces
136     // array that's not found in the noSpaces array
137     void breakNotFound(int32_t br);
138 
139     // helper method to report a break that's found in
140     // the noSpaces array that's not in the spaces array
141     void foundInvalidBreak(int32_t br);
142 
143     // count of breaks in the spaces array that
144     // aren't found in the noSpaces array
145     int32_t fBreaksNotFound;
146 
147     // count of breaks found in the noSpaces array
148     // that aren't in the spaces array
149     int32_t fInvalidBreaks;
150 
151     // number of words found in the spaces array
152     int32_t fWordCount;
153 
154     // report all breaks if true, otherwise just report differences
155     UBool fVerbose;
156 };
157 
158 /*
159  * The main constructor: it calls compareWordBreaks and reports any differences
160  */
ThaiWordbreakTest(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount,UBool verbose)161 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
162                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
163 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
164 {
165     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
166 }
167 
168 /*
169  * The no arg constructor
170  */
ThaiWordbreakTest()171 ThaiWordbreakTest::ThaiWordbreakTest()
172 {
173     // nothing
174 }
175 
176 /*
177  * The destructor
178  */
~ThaiWordbreakTest()179 ThaiWordbreakTest::~ThaiWordbreakTest()
180 {
181     // nothing?
182 }
183 
184 /*
185  * returns the number of breaks in the spaces array
186  * that aren't found in the noSpaces array
187  */
getBreaksNotFound()188 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
189 {
190     return fBreaksNotFound;
191 }
192 
193 /*
194  * Returns the number of breaks found in the noSpaces
195  * array that aren't in the spaces array
196  */
getInvalidBreaks()197 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
198 {
199     return fInvalidBreaks;
200 }
201 
202 /*
203  * Returns the number of words found in the spaces array
204  */
getWordCount()205 inline int32_t ThaiWordbreakTest::getWordCount()
206 {
207     return fWordCount;
208 }
209 
210 /*
211  * This method does the acutal break comparison and reports the results.
212  * It uses a SpaceBreakIterator to iterate over the text with spaces,
213  * and a word instance of a Thai BreakIterator to iterate over the text
214  * without spaces.
215  */
compareWordBreaks(const UChar * spaces,int32_t spaceCount,const UChar * noSpaces,int32_t noSpaceCount)216 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
217                                            const UChar *noSpaces, int32_t noSpaceCount)
218 {
219     UBool result = TRUE;
220     Locale thai("th");
221     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
222     UErrorCode status = U_ZERO_ERROR;
223 
224     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
225     breakIter->adoptText(noSpaceIter);
226 
227     SpaceBreakIterator spaceIter(spaces, spaceCount);
228 
229     int32_t nextBreak = 0;
230     int32_t nextSpaceBreak = 0;
231     int32_t iterCount = 0;
232 
233     while (TRUE) {
234         nextSpaceBreak = spaceIter.next();
235         nextBreak = breakIter->next();
236 
237         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
238             if (nextBreak != BreakIterator::DONE) {
239                 fprintf(stderr, "break iterator didn't end.\n");
240             } else if (nextSpaceBreak != BreakIterator::DONE) {
241                 fprintf(stderr, "premature break iterator end.\n");
242             }
243 
244             break;
245         }
246 
247         while (nextSpaceBreak != nextBreak &&
248                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
249             if (nextSpaceBreak < nextBreak) {
250                 breakNotFound(nextSpaceBreak);
251                 result = FALSE;
252                 nextSpaceBreak = spaceIter.next();
253             } else if (nextSpaceBreak > nextBreak) {
254                 foundInvalidBreak(nextBreak);
255                 result = FALSE;
256                 nextBreak = breakIter->next();
257             }
258         }
259 
260         if (fVerbose) {
261             printf("%d   %d\n", nextSpaceBreak, nextBreak);
262         }
263     }
264 
265 
266     fWordCount = spaceIter.getWordCount();
267 
268     delete breakIter;
269 
270     return result;
271 }
272 
273 /*
274  * Report a break that's in the text with spaces but
275  * not found in the text without spaces.
276  */
breakNotFound(int32_t br)277 void ThaiWordbreakTest::breakNotFound(int32_t br)
278 {
279     if (fVerbose) {
280         printf("%d   ****\n", br);
281     } else {
282         fprintf(stderr, "break not found: %d\n", br);
283     }
284 
285     fBreaksNotFound += 1;
286 }
287 
288 /*
289  * Report a break that's found in the text without spaces
290  * that isn't in the text with spaces.
291  */
foundInvalidBreak(int32_t br)292 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
293 {
294     if (fVerbose) {
295         printf("****   %d\n", br);
296     } else {
297         fprintf(stderr, "found invalid break: %d\n", br);
298     }
299 
300     fInvalidBreaks += 1;
301 }
302 
303 /*
304  * Read the text from a file. The text must start with a Unicode Byte
305  * Order Mark (BOM) so that we know what order to read the bytes in.
306  */
readFile(char * fileName,int32_t & charCount)307 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
308 {
309     FILE *f;
310     int32_t fileSize;
311 
312     UChar *buffer;
313     char *bufferChars;
314 
315     f = fopen(fileName, "rb");
316 
317     if( f == NULL ) {
318         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
319         return 0;
320     }
321 
322     fseek(f, 0, SEEK_END);
323     fileSize = ftell(f);
324 
325     fseek(f, 0, SEEK_SET);
326     bufferChars = new char[fileSize];
327 
328     if(bufferChars == 0) {
329         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
330         fclose(f);
331         return 0;
332     }
333 
334     fread(bufferChars, sizeof(char), fileSize, f);
335     if( ferror(f) ) {
336         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
337         fclose(f);
338         delete[] bufferChars;
339         return 0;
340     }
341     fclose(f);
342 
343     UnicodeString myText(bufferChars, fileSize, "UTF-8");
344 
345     delete[] bufferChars;
346 
347     charCount = myText.length();
348     buffer = new UChar[charCount];
349     if(buffer == 0) {
350         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
351         return 0;
352     }
353 
354     myText.extract(1, myText.length(), buffer);
355     charCount--;  // skip the BOM
356     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
357 
358     return buffer;
359 }
360 
361 /*
362  * Remove spaces from the input UChar array.
363  *
364  * We check explicitly for a Unicode code value of 0x0020
365  * because Unicode::isSpaceChar returns true for CR, LF, etc.
366  *
367  */
crunchSpaces(const UChar * spaces,int32_t count,int32_t & nonSpaceCount)368 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
369 {
370     int32_t i, out, spaceCount;
371 
372     spaceCount = 0;
373     for (i = 0; i < count; i += 1) {
374         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
375             spaceCount += 1;
376         }
377     }
378 
379     nonSpaceCount = count - spaceCount;
380     UChar *noSpaces = new UChar[nonSpaceCount];
381 
382     if (noSpaces == 0) {
383         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
384         return 0;
385     }
386 
387     for (out = 0, i = 0; i < count; i += 1) {
388         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
389             noSpaces[out++] = spaces[i];
390         }
391     }
392 
393     return noSpaces;
394 }
395 
396 /*
397  * Generate a text file with spaces in it from a file without.
398  */
generateFile(const UChar * chars,int32_t length)399 int generateFile(const UChar *chars, int32_t length) {
400     Locale root("");
401     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
402     UErrorCode status = U_ZERO_ERROR;
403 
404     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
405     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
406     breakIter->adoptText(noSpaceIter);
407     char outbuf[1024];
408     int32_t strlength;
409     UChar bom = 0xFEFF;
410 
411     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
412     int32_t prevbreak = 0;
413     while (U_SUCCESS(status)) {
414         int32_t nextbreak = breakIter->next();
415         if (nextbreak == BreakIterator::DONE) {
416             break;
417         }
418         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
419                                     nextbreak-prevbreak, &status));
420         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
421             && complexContext.contains(chars[nextbreak])) {
422             printf(" ");
423         }
424         prevbreak = nextbreak;
425     }
426 
427     if (U_FAILURE(status)) {
428         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
429         return status;
430     }
431     else {
432         return 0;
433     }
434 }
435 
436 /*
437  * The main routine. Read the command line arguments, read the text file,
438  * remove the spaces, do the comparison and report the final results
439  */
main(int argc,char ** argv)440 int main(int argc, char **argv)
441 {
442     char *fileName = "space.txt";
443     int arg = 1;
444     UBool verbose = FALSE;
445     UBool generate = FALSE;
446 
447     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
448         generate = TRUE;
449         arg += 1;
450     }
451 
452     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
453         verbose = TRUE;
454         arg += 1;
455     }
456 
457     if (arg == argc - 1) {
458         fileName = argv[arg++];
459     }
460 
461     if (arg != argc) {
462         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
463         return 1;
464     }
465 
466     int32_t spaceCount, nonSpaceCount;
467     const UChar *spaces, *noSpaces;
468 
469     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
470 
471     if (spaces == 0) {
472         return 1;
473     }
474 
475     if (generate) {
476         return generateFile(spaces, spaceCount);
477     }
478 
479     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
480 
481     if (noSpaces == 0) {
482         return 1;
483     }
484 
485     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
486 
487     printf("word count: %d\n", test.getWordCount());
488     printf("breaks not found: %d\n", test.getBreaksNotFound());
489     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
490 
491     return 0;
492 }
493 
494 /*
495  * The main constructor. Clear all the counts and construct a default
496  * word instance of a BreakIterator.
497  */
SpaceBreakIterator(const UChar * text,int32_t count)498 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
499   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
500 {
501     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
502     UErrorCode status = U_ZERO_ERROR;
503     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
504     Locale root("");
505 
506     fBreakIter = BreakIterator::createWordInstance(root, status);
507     fBreakIter->adoptText(iter);
508 }
509 
SpaceBreakIterator()510 SpaceBreakIterator::SpaceBreakIterator()
511 {
512     // nothing
513 }
514 
515 /*
516  * The destructor. delete the underlying BreakIterator
517  */
~SpaceBreakIterator()518 SpaceBreakIterator::~SpaceBreakIterator()
519 {
520     delete fBreakIter;
521 }
522 
523 /*
524  * Return the next break, counting words and spaces.
525  */
next()526 int32_t SpaceBreakIterator::next()
527 {
528     if (fDone) {
529         return BreakIterator::DONE;
530     }
531 
532     int32_t nextBreak;
533     do {
534         nextBreak = fBreakIter->next();
535 
536         if (nextBreak == BreakIterator::DONE) {
537             fDone = TRUE;
538             return BreakIterator::DONE;
539         }
540     }
541     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
542             && fComplexContext.contains(fText[nextBreak]));
543 
544    int32_t result = nextBreak - fSpaceCount;
545 
546     if (nextBreak < fTextCount) {
547         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
548             fSpaceCount += fBreakIter->next() - nextBreak;
549         }
550     }
551 
552     fWordCount += 1;
553 
554     return result;
555 }
556 
557 /*
558  * Returns the current space count
559  */
getSpaceCount()560 int32_t SpaceBreakIterator::getSpaceCount()
561 {
562     return fSpaceCount;
563 }
564 
565 /*
566  * Returns the current word count
567  */
getWordCount()568 int32_t SpaceBreakIterator::getWordCount()
569 {
570     return fWordCount;
571 }
572 
573 
574