1 /***********************************************************************
2 * Copyright (C) 2016 and later: Unicode, Inc. and others.
3 * License & terms of use: http://www.unicode.org/copyright.html#License
4 *
5 ***********************************************************************
6 ***********************************************************************
7 * COPYRIGHT:
8 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
9 *
10 ***********************************************************************/
11 /********************************************************************************
12 *
13 * File ubrkperf.cpp
14 *
15 * Modification History:
16 * Name Description
17 * Vladimir Weinstein First Version, based on collperf
18 *
19 *********************************************************************************
20 */
21
22 //
23 // This program tests break iterator performance
24 // Currently we test only ICU APIs with the future possibility of testing *nix & win32 APIs
25 // (if any)
26 // A text file is required as input. It must be in utf-8 or utf-16 format,
27 // and include a byte order mark. Either LE or BE format is OK.
28 //
29
// Usage text.  Keep it in step with the opts[] table: every option listed
// here should be one that ProcessOptions() will actually accept.
// The text must not contain '%' characters.
const char gUsageString[] =
    "usage:  ubrkperf options...\n"
    "-help            Display this message.\n"
    "-file file_name  utf-16/utf-8 format file.\n"
    "-locale name     ICU locale to use.  Default is en_US\n"
    "-langid 0x1234   Windows Language ID number.  Default to value for -locale option\n"
    "                 see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win             Run test using Windows native services. (currently not working) (ICU is default)\n"
    "-unix            Run test using Unix word breaking services. (currently not working) \n"
    "-mac             Run test using MacOSX word breaking services.\n"
    "-uselen          Use API with string lengths.  Default is null-terminated strings\n"
    "-char            Use character break iterator\n"
    "-word            Use word break iterator\n"
    "-line            Use line break iterator\n"
    "-sentence        Use sentence break iterator\n"
    "-loop nnnn       Loopcount for test.  Adjust for reasonable total running time.\n"
    "-passes n        Number of times the selected test is run.  Default = 1.\n"
    "-time n          Accepted, but currently ignored.\n"
    "-terse           Terse numbers-only output.  Intended for use by scripts.\n"
    "-dump            Display stuff.\n"
    "-capi            Use C APIs instead of C++ APIs (currently not working)\n"
    "-next            Do the next test\n"
    "-isBound         Do the isBound test\n"
    ;
54
55
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>
#include <errno.h>
#include <sys/stat.h>

#include <unicode/utypes.h>
#include <unicode/ucol.h>
#include <unicode/ucoleitr.h>
#include <unicode/uloc.h>
#include <unicode/ustring.h>
#include <unicode/ures.h>
#include <unicode/uchar.h>
#include <unicode/ucnv.h>
#include <unicode/utf8.h>
#include <unicode/utf16.h>

#include <unicode/brkiter.h>
75
76
77 #if U_PLATFORM_HAS_WIN32_API
78 #include <windows.h>
79 #else
80 //
81 // Stubs for Windows API functions when building on UNIXes.
82 //
83 #include <sys/time.h>
// Stand-in for the Win32 timeGetTime(): the current time of day in
// milliseconds.  The value is allowed to wrap around; the tests only ever
// subtract two readings, and unsigned subtraction gives the right difference
// across a single wrap.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, 0);
    // Convert to unsigned before scaling, so that wrapping is well-defined
    // unsigned arithmetic rather than signed overflow of the seconds field.
    unsigned long millis = static_cast<unsigned long>(now.tv_sec) * 1000UL;
    millis += static_cast<unsigned long>(now.tv_usec) / 1000UL;
    return millis;
}
91 #define MAKELCID(a,b) 0
92 #endif
93
94
95 //
96 // Command line option variables
97 // These global variables are set according to the options specified
98 // on the command line by the user.
99 char * opt_fName = 0;
100 char * opt_locale = "en_US";
101 int opt_langid = 0; // Defaults to value corresponding to opt_locale.
102 char * opt_rules = 0;
103 UBool opt_help = FALSE;
104 int opt_time = 0;
105 int opt_loopCount = 0;
106 int opt_passesCount= 1;
107 UBool opt_terse = FALSE;
108 UBool opt_icu = TRUE;
109 UBool opt_win = FALSE; // Run with Windows native functions.
110 UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
111 UBool opt_mac = FALSE; // Run with MacOSX word break services.
112 UBool opt_uselen = FALSE;
113 UBool opt_dump = FALSE;
114 UBool opt_char = FALSE;
115 UBool opt_word = FALSE;
116 UBool opt_line = FALSE;
117 UBool opt_sentence = FALSE;
118 UBool opt_capi = FALSE;
119
120 UBool opt_next = FALSE;
121 UBool opt_isBound = FALSE;
122
123
124
//
//   Definitions for the command line options
//
//   An OptSpec describes one recognized option: how it is spelled, what kind
//   of value it takes, and which variable receives that value.
//
struct OptSpec {
    enum Kind {
        FLAG,       // no value; the UBool variable is set to true
        NUM,        // next argument is an integer, stored in an int
        STRING      // next argument is stored as a const char *
    };

    const char *name;   // option text, including the leading '-'
    Kind        type;
    void       *pVar;   // address of the variable to store into
};
133
134 OptSpec opts[] = {
135 {"-file", OptSpec::STRING, &opt_fName},
136 {"-locale", OptSpec::STRING, &opt_locale},
137 {"-langid", OptSpec::NUM, &opt_langid},
138 {"-win", OptSpec::FLAG, &opt_win},
139 {"-unix", OptSpec::FLAG, &opt_unix},
140 {"-mac", OptSpec::FLAG, &opt_mac},
141 {"-uselen", OptSpec::FLAG, &opt_uselen},
142 {"-loop", OptSpec::NUM, &opt_loopCount},
143 {"-time", OptSpec::NUM, &opt_time},
144 {"-passes", OptSpec::NUM, &opt_passesCount},
145 {"-char", OptSpec::FLAG, &opt_char},
146 {"-word", OptSpec::FLAG, &opt_word},
147 {"-line", OptSpec::FLAG, &opt_line},
148 {"-sentence", OptSpec::FLAG, &opt_sentence},
149 {"-terse", OptSpec::FLAG, &opt_terse},
150 {"-dump", OptSpec::FLAG, &opt_dump},
151 {"-capi", OptSpec::FLAG, &opt_capi},
152 {"-next", OptSpec::FLAG, &opt_next},
153 {"-isBound", OptSpec::FLAG, &opt_isBound},
154 {"-help", OptSpec::FLAG, &opt_help},
155 {"-?", OptSpec::FLAG, &opt_help},
156 {0, OptSpec::FLAG, 0}
157 };
158
159
160 //---------------------------------------------------------------------------
161 //
162 // Global variables pointing to and describing the test file
163 //
164 //---------------------------------------------------------------------------
165
166 //DWORD gWinLCID;
167 BreakIterator *brkit = NULL;
168 UChar *text = NULL;
169 int32_t textSize = 0;
170
171
172
173 #if U_PLATFORM_IS_DARWIN_BASED
174 #include <ApplicationServices/ApplicationServices.h>
175 enum{
176 kUCTextBreakAllMask = (kUCTextBreakClusterMask | kUCTextBreakWordMask | kUCTextBreakLineMask)
177 };
178 UCTextBreakType breakTypes[4] = {kUCTextBreakCharMask, kUCTextBreakClusterMask, kUCTextBreakWordMask, kUCTextBreakLineMask};
179 TextBreakLocatorRef breakRef;
180 UCTextBreakType macBreakType;
181
createMACBrkIt()182 void createMACBrkIt() {
183 OSStatus status = noErr;
184 LocaleRef lref;
185 status = LocaleRefFromLocaleString(opt_locale, &lref);
186 status = UCCreateTextBreakLocator(lref, 0, kUCTextBreakAllMask, (TextBreakLocatorRef*)&breakRef);
187 if(opt_char == TRUE) {
188 macBreakType = kUCTextBreakClusterMask;
189 } else if(opt_word == TRUE) {
190 macBreakType = kUCTextBreakWordMask;
191 } else if(opt_line == TRUE) {
192 macBreakType = kUCTextBreakLineMask;
193 } else if(opt_sentence == TRUE) {
194 // error
195 // brkit = BreakIterator::createSentenceInstance(opt_locale, status);
196 } else {
197 // default is character iterator
198 macBreakType = kUCTextBreakClusterMask;
199 }
200 }
201 #endif
202
createICUBrkIt()203 void createICUBrkIt() {
204 //
205 // Set up an ICU break iterator
206 //
207 UErrorCode status = U_ZERO_ERROR;
208 if(opt_char == TRUE) {
209 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
210 } else if(opt_word == TRUE) {
211 brkit = BreakIterator::createWordInstance(opt_locale, status);
212 } else if(opt_line == TRUE) {
213 brkit = BreakIterator::createLineInstance(opt_locale, status);
214 } else if(opt_sentence == TRUE) {
215 brkit = BreakIterator::createSentenceInstance(opt_locale, status);
216 } else {
217 // default is character iterator
218 brkit = BreakIterator::createCharacterInstance(opt_locale, status);
219 }
220 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
221 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
222 }
223 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
224 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
225 }
226
227 }
228
229 //---------------------------------------------------------------------------
230 //
231 // ProcessOptions() Function to read the command line options.
232 //
233 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])234 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
235 {
236 int i;
237 int argNum;
238 const char *pArgName;
239 OptSpec *pOpt;
240
241 for (argNum=1; argNum<argc; argNum++) {
242 pArgName = argv[argNum];
243 for (pOpt = opts; pOpt->name != 0; pOpt++) {
244 if (strcmp(pOpt->name, pArgName) == 0) {
245 switch (pOpt->type) {
246 case OptSpec::FLAG:
247 *(UBool *)(pOpt->pVar) = TRUE;
248 break;
249 case OptSpec::STRING:
250 argNum ++;
251 if (argNum >= argc) {
252 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
253 return FALSE;
254 }
255 *(const char **)(pOpt->pVar) = argv[argNum];
256 break;
257 case OptSpec::NUM:
258 argNum ++;
259 if (argNum >= argc) {
260 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
261 return FALSE;
262 }
263 char *endp;
264 i = strtol(argv[argNum], &endp, 0);
265 if (endp == argv[argNum]) {
266 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
267 return FALSE;
268 }
269 *(int *)(pOpt->pVar) = i;
270 }
271 break;
272 }
273 }
274 if (pOpt->name == 0)
275 {
276 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
277 return FALSE;
278 }
279 }
280 return TRUE;
281 }
282
283
doForwardTest()284 void doForwardTest() {
285 if (opt_terse == FALSE) {
286 printf("Doing the forward test\n");
287 }
288 int32_t noBreaks = 0;
289 int32_t i = 0;
290 unsigned long startTime = timeGetTime();
291 unsigned long elapsedTime = 0;
292 if(opt_icu) {
293 createICUBrkIt();
294 brkit->setText(UnicodeString(text, textSize));
295 brkit->first();
296 if (opt_terse == FALSE) {
297 printf("Warmup\n");
298 }
299 int j;
300 while((j = brkit->next()) != BreakIterator::DONE) {
301 noBreaks++;
302 //fprintf(stderr, "%d ", j);
303 }
304
305 if (opt_terse == FALSE) {
306 printf("Measure\n");
307 }
308 startTime = timeGetTime();
309 for(i = 0; i < opt_loopCount; i++) {
310 brkit->first();
311 while(brkit->next() != BreakIterator::DONE) {
312 }
313 }
314
315 elapsedTime = timeGetTime()-startTime;
316 } else if(opt_mac) {
317 #if U_PLATFORM_IS_DARWIN_BASED
318 createMACBrkIt();
319 UniChar* filePtr = text;
320 OSStatus status = noErr;
321 UniCharCount startOffset = 0, breakOffset = 0, numUniChars = textSize;
322 startOffset = 0;
323 //printf("\t---Search forward--\n");
324
325 while (startOffset < numUniChars)
326 {
327 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
328 startOffset, &breakOffset);
329 //require_action(status == noErr, EXIT, printf( "**UCFindTextBreak failed: startOffset %d, status %d\n", (int)startOffset, (int)status));
330 //require_action((breakOffset <= numUniChars),EXIT, printf("**UCFindTextBreak breakOffset too big: startOffset %d, breakOffset %d\n", (int)startOffset, (int)breakOffset));
331
332 // Output break
333 //printf("\t%d\n", (int)breakOffset);
334
335 // Increment counters
336 noBreaks++;
337 startOffset = breakOffset;
338 }
339 startTime = timeGetTime();
340 for(i = 0; i < opt_loopCount; i++) {
341 startOffset = 0;
342
343 while (startOffset < numUniChars)
344 {
345 status = UCFindTextBreak(breakRef, macBreakType, kUCTextBreakLeadingEdgeMask, filePtr, numUniChars,
346 startOffset, &breakOffset);
347 // Increment counters
348 startOffset = breakOffset;
349 }
350 }
351 elapsedTime = timeGetTime()-startTime;
352 UCDisposeTextBreakLocator(&breakRef);
353 #endif
354
355
356 }
357
358
359 if (opt_terse == FALSE) {
360 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
361 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
362 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
363 printf("forward break iteration average loop time %d\n", loopTime);
364 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
365 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
366 } else {
367 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
368 }
369
370
371 }
372
doIsBoundTest()373 void doIsBoundTest() {
374 int32_t noBreaks = 0, hit = 0;
375 int32_t i = 0, j = 0;
376 unsigned long startTime = timeGetTime();
377 unsigned long elapsedTime = 0;
378 createICUBrkIt();
379 brkit->setText(UnicodeString(text, textSize));
380 brkit->first();
381 for(j = 0; j < textSize; j++) {
382 if(brkit->isBoundary(j)) {
383 noBreaks++;
384 //fprintf(stderr, "%d ", j);
385 }
386 }
387 /*
388 while(brkit->next() != BreakIterator::DONE) {
389 noBreaks++;
390 }
391 */
392
393 startTime = timeGetTime();
394 for(i = 0; i < opt_loopCount; i++) {
395 for(j = 0; j < textSize; j++) {
396 if(brkit->isBoundary(j)) {
397 hit++;
398 }
399 }
400 }
401
402 elapsedTime = timeGetTime()-startTime;
403 int32_t loopTime = (int)(float(1000) * ((float)elapsedTime/(float)opt_loopCount));
404 if (opt_terse == FALSE) {
405 int32_t timePerCU = (int)(float(1000) * ((float)loopTime/(float)textSize));
406 int32_t timePerBreak = (int)(float(1000) * ((float)loopTime/(float)noBreaks));
407 printf("forward break iteration average loop time %d\n", loopTime);
408 printf("number of code units %d average time per code unit %d\n", textSize, timePerCU);
409 printf("number of breaks %d average time per break %d\n", noBreaks, timePerBreak);
410 } else {
411 printf("time=%d\nevents=%d\nsize=%d\n", elapsedTime, noBreaks, textSize);
412 }
413 }
414
//----------------------------------------------------------------------------------------
//
//   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
//                    Since it appears that Unicode support is going in the general
//                    direction of the use of UTF-8 locales, that is the approach
//                    that is used here.
//
//                    Currently this function does nothing: its whole body is
//                    compiled out.  The disabled code works on gFileLines and
//                    gNumFileLines, which are not declared in the code visible
//                    here (presumably left over from collperf, which this
//                    program is based on), so it cannot be enabled as it stands.
//
//----------------------------------------------------------------------------------------
void UnixConvert() {
#if 0
    int    line;

    UConverter   *cvrtr;    // An ICU code page converter.
    UErrorCode    status = U_ZERO_ERROR;


    cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
    if (U_FAILURE(status)) {
        fprintf(stderr, "ICU Converter open failed.: %d\n", status);
        exit(-1);
    }
    // redo for unix
    for (line=0; line < gNumFileLines; line++) {
        int sizeNeeded = ucnv_fromUChars(cvrtr,
                                         0,            // ptr to target buffer.
                                         0,            // length of target buffer.
                                         gFileLines[line].name,
                                         -1,           //  source is null terminated
                                         &status);
        if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
            fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
            exit(-1);
        }
        status = U_ZERO_ERROR;
        gFileLines[line].unixName = new char[sizeNeeded+1];
        sizeNeeded = ucnv_fromUChars(cvrtr,
                                     gFileLines[line].unixName, // ptr to target buffer.
                                     sizeNeeded+1, // length of target buffer.
                                     gFileLines[line].name,
                                     -1,           //  source is null terminated
                                     &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
            exit(-1);
        }
        gFileLines[line].unixName[sizeNeeded] = 0;
    }
    ucnv_close(cvrtr);
#endif
}
465
466
467 //----------------------------------------------------------------------------------------
468 //
469 // class UCharFile Class to hide all the gorp to read a file in
470 // and produce a stream of UChars.
471 //
472 //----------------------------------------------------------------------------------------
473 class UCharFile {
474 public:
475 UCharFile(const char *fileName);
476 ~UCharFile();
477 UChar get();
eof()478 UBool eof() {return fEof;};
error()479 UBool error() {return fError;};
size()480 int32_t size() { return fFileSize; };
481
482 private:
UCharFile(const UCharFile & other)483 UCharFile (const UCharFile &other) {}; // No copy constructor.
operator =(const UCharFile & other)484 UCharFile & operator = (const UCharFile &other) {return *this;}; // No assignment op
485
486 FILE *fFile;
487 const char *fName;
488 UBool fEof;
489 UBool fError;
490 UChar fPending2ndSurrogate;
491 int32_t fFileSize;
492
493 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
494 };
495
UCharFile(const char * fileName)496 UCharFile::UCharFile(const char * fileName) {
497 fEof = FALSE;
498 fError = FALSE;
499 fName = fileName;
500 struct stat buf;
501 int32_t result = stat(fileName, &buf);
502 if(result != 0) {
503 fprintf(stderr, "Error getting info\n");
504 fFileSize = -1;
505 } else {
506 fFileSize = buf.st_size;
507 }
508 fFile = fopen(fName, "rb");
509 fPending2ndSurrogate = 0;
510 if (fFile == NULL) {
511 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
512 fError = TRUE;
513 return;
514 }
515 //
516 // Look for the byte order mark at the start of the file.
517 //
518 int BOMC1, BOMC2, BOMC3;
519 BOMC1 = fgetc(fFile);
520 BOMC2 = fgetc(fFile);
521
522 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
523 fEncoding = UTF16LE; }
524 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
525 fEncoding = UTF16BE; }
526 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
527 fEncoding = UTF8; }
528 else
529 {
530 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
531 "must include a BOM.\n", fileName);
532 fError = true;
533 return;
534 }
535 }
536
537
~UCharFile()538 UCharFile::~UCharFile() {
539 fclose(fFile);
540 }
541
542
543
get()544 UChar UCharFile::get() {
545 UChar c;
546 switch (fEncoding) {
547 case UTF16LE:
548 {
549 int cL, cH;
550 cL = fgetc(fFile);
551 cH = fgetc(fFile);
552 c = cL | (cH << 8);
553 if (cH == EOF) {
554 c = 0;
555 fEof = TRUE;
556 }
557 break;
558 }
559 case UTF16BE:
560 {
561 int cL, cH;
562 cH = fgetc(fFile);
563 cL = fgetc(fFile);
564 c = cL | (cH << 8);
565 if (cL == EOF) {
566 c = 0;
567 fEof = TRUE;
568 }
569 break;
570 }
571 case UTF8:
572 {
573 if (fPending2ndSurrogate != 0) {
574 c = fPending2ndSurrogate;
575 fPending2ndSurrogate = 0;
576 break;
577 }
578
579 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
580 if (ch == EOF) {
581 c = 0;
582 fEof = TRUE;
583 break;
584 }
585
586 if (ch <= 0x7f) {
587 // It's ascii. No further utf-8 conversion.
588 c = ch;
589 break;
590 }
591
592 // Figure out the lenght of the char and read the rest of the bytes
593 // into a temp array.
594 int nBytes;
595 if (ch >= 0xF0) {nBytes=4;}
596 else if (ch >= 0xE0) {nBytes=3;}
597 else if (ch >= 0xC0) {nBytes=2;}
598 else {
599 fprintf(stderr, "not likely utf-8 encoded file %s contains corrupt data at offset %d.\n", fName, ftell(fFile));
600 fError = TRUE;
601 return 0;
602 }
603
604 unsigned char bytes[10];
605 bytes[0] = (unsigned char)ch;
606 int i;
607 for (i=1; i<nBytes; i++) {
608 bytes[i] = fgetc(fFile);
609 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
610 fprintf(stderr, "utf-8 encoded file %s contains corrupt data at offset %d. Expected %d bytes, byte %d is invalid. First byte is %02X\n", fName, ftell(fFile), nBytes, i, ch);
611 fError = TRUE;
612 return 0;
613 }
614 }
615
616 // Convert the bytes from the temp array to a Unicode char.
617 i = 0;
618 uint32_t cp;
619 U8_NEXT_UNSAFE(bytes, i, cp);
620 c = (UChar)cp;
621
622 if (cp >= 0x10000) {
623 // The code point needs to be broken up into a utf-16 surrogate pair.
624 // Process first half this time through the main loop, and
625 // remember the other half for the next time through.
626 UChar utf16Buf[3];
627 i = 0;
628 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
629 fPending2ndSurrogate = utf16Buf[1];
630 c = utf16Buf[0];
631 }
632 break;
633 };
634 }
635 return c;
636 }
637
638
639 //----------------------------------------------------------------------------------------
640 //
641 // Main -- process command line, read in and pre-process the test file,
642 // call other functions to do the actual tests.
643 //
644 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)645 int main(int argc, const char** argv) {
646 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
647 printf(gUsageString);
648 exit (1);
649 }
650 // Make sure that we've only got one API selected.
651 if (opt_mac || opt_unix || opt_win) opt_icu = FALSE;
652 if (opt_mac || opt_unix) opt_win = FALSE;
653 if (opt_mac) opt_unix = FALSE;
654
655 UErrorCode status = U_ZERO_ERROR;
656
657
658
659 //
660 // Set up a Windows LCID
661 //
662 /*
663 if (opt_langid != 0) {
664 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
665 }
666 else {
667 gWinLCID = uloc_getLCID(opt_locale);
668 }
669 */
670
671 //
672 // Set the UNIX locale
673 //
674 if (opt_unix) {
675 if (setlocale(LC_ALL, opt_locale) == 0) {
676 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
677 exit(-1);
678 }
679 }
680
681 // Read in the input file.
682 // File assumed to be utf-16.
683 // Lines go onto heap buffers. Global index array to line starts is created.
684 // Lines themselves are null terminated.
685 //
686
687 UCharFile f(opt_fName);
688 if (f.error()) {
689 exit(-1);
690 }
691 int32_t fileSize = f.size();
692 const int STARTSIZE = 70000;
693 int32_t bufSize = 0;
694 int32_t charCount = 0;
695 if(fileSize != -1) {
696 text = (UChar *)malloc(fileSize*sizeof(UChar));
697 bufSize = fileSize;
698 } else {
699 text = (UChar *)malloc(STARTSIZE*sizeof(UChar));
700 bufSize = STARTSIZE;
701 }
702 if(text == NULL) {
703 fprintf(stderr, "Allocating buffer failed\n");
704 exit(-1);
705 }
706
707
708 // Read the file, split into lines, and save in memory.
709 // Loop runs once per utf-16 value from the input file,
710 // (The number of bytes read from file per loop iteration depends on external encoding.)
711 for (;;) {
712
713 UChar c = f.get();
714 if(f.eof()) {
715 break;
716 }
717 if (f.error()){
718 exit(-1);
719 }
720 // We now have a good UTF-16 value in c.
721 text[charCount++] = c;
722 if(charCount == bufSize) {
723 text = (UChar *)realloc(text, 2*bufSize*sizeof(UChar));
724 if(text == NULL) {
725 fprintf(stderr, "Reallocating buffer failed\n");
726 exit(-1);
727 }
728 bufSize *= 2;
729 }
730 }
731
732
733 if (opt_terse == FALSE) {
734 printf("file \"%s\", %d charCount code units.\n", opt_fName, charCount);
735 }
736
737 textSize = charCount;
738
739
740
741
742 //
743 // Dump file contents if requested.
744 //
745 if (opt_dump) {
746 // dump file, etc... possibly
747 }
748
749
750 //
751 // We've got the file read into memory. Go do something with it.
752 //
753 int32_t i = 0;
754 for(i = 0; i < opt_passesCount; i++) {
755 if(opt_loopCount != 0) {
756 if(opt_next) {
757 doForwardTest();
758 } else if(opt_isBound) {
759 doIsBoundTest();
760 } else {
761 doForwardTest();
762 }
763 } else if(opt_time != 0) {
764
765 }
766 }
767
768 if(text != NULL) {
769 free(text);
770 }
771 if(brkit != NULL) {
772 delete brkit;
773 }
774
775 return 0;
776 }
777