1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (C) 2001-2012 IBM, Inc. All Rights Reserved.
4 *
5 ********************************************************************/
6 /********************************************************************************
7 *
8 * File CALLCOLL.C
9 *
10 * Modification History:
11 * Name Description
12 * Andy Heninger First Version
13 *
14 *********************************************************************************
15 */
16
17 //
18 // This program tests string collation and sort key generation performance.
// Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
20 // A file of names is required as input, one per line. It must be in utf-8 or utf-16 format,
21 // and include a byte order mark. Either LE or BE format is OK.
22 //
23
24 const char gUsageString[] =
25 "usage: collperf options...\n"
26 "-help Display this message.\n"
27 "-file file_name utf-16 format file of names.\n"
28 "-locale name ICU locale to use. Default is en_US\n"
29 "-rules file_name Collation rules file (overrides locale)\n"
30 "-langid 0x1234 Windows Language ID number. Default to value for -locale option\n"
31 " see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
32 "-win Run test using Windows native services. (ICU is default)\n"
33 "-unix Run test using Unix strxfrm, strcoll services.\n"
34 "-uselen Use API with string lengths. Default is null-terminated strings\n"
35 "-usekeys Run tests using sortkeys rather than strcoll\n"
36 "-strcmp Run tests using u_strcmp rather than strcoll\n"
37 "-strcmpCPO Run tests using u_strcmpCodePointOrder rather than strcoll\n"
38 "-loop nnnn Loopcount for test. Adjust for reasonable total running time.\n"
39 "-iloop n Inner Loop Count. Default = 1. Number of calls to function\n"
40 " under test at each call point. For measuring test overhead.\n"
41 "-terse Terse numbers-only output. Intended for use by scripts.\n"
42 "-french French accent ordering\n"
43 "-frenchoff No French accent ordering (for use with French locales.)\n"
44 "-norm Normalizing mode on\n"
45 "-shifted Shifted mode\n"
46 "-lower Lower case first\n"
47 "-upper Upper case first\n"
48 "-case Enable separate case level\n"
49 "-level n Sort level, 1 to 5, for Primary, Secndary, Tertiary, Quaternary, Identical\n"
50 "-keyhist Produce a table sort key size vs. string length\n"
51 "-binsearch Binary Search timing test\n"
52 "-keygen Sort Key Generation timing test\n"
53 "-qsort Quicksort timing test\n"
54 "-iter Iteration Performance Test\n"
55 "-dump Display strings, sort keys and CEs.\n"
56 ;
57
58
59
60 #include <stdio.h>
61 #include <string.h>
62 #include <stdlib.h>
63 #include <math.h>
64 #include <locale.h>
65 #include <errno.h>
66
67 #include <unicode/utypes.h>
68 #include <unicode/ucol.h>
69 #include <unicode/ucoleitr.h>
70 #include <unicode/uloc.h>
71 #include <unicode/ustring.h>
72 #include <unicode/ures.h>
73 #include <unicode/uchar.h>
74 #include <unicode/ucnv.h>
75 #include <unicode/utf8.h>
76
77 #ifdef WIN32
78 #include <windows.h>
79 #else
80 //
81 // Stubs for Windows API functions when building on UNIXes.
82 //
// Stand-in for the Windows DWORD type on non-Windows builds.
typedef int DWORD;
// Non-Windows stub for CompareStringW: ignores its arguments and always returns 0.
inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
85 #include <sys/time.h>
// Non-Windows replacement for timeGetTime(): returns a millisecond tick count
// derived from gettimeofday(). Only the difference between two readings is
// meaningful. The arithmetic is carried out in unsigned long so that any
// wrap-around is well-defined modular arithmetic instead of signed overflow.
unsigned long timeGetTime() {
    struct timeval t;
    gettimeofday(&t, 0);
    unsigned long val = (unsigned long)t.tv_sec * 1000UL;   // seconds -> ms, may wrap
    val += (unsigned long)t.tv_usec / 1000UL;               // microseconds -> ms
    return val;
}
// Non-Windows stub for LCMapStringW: ignores its arguments and always returns 0.
inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
// Placeholder values so code written against the Windows API compiles here.
const int LCMAP_SORTKEY = 0;
#define MAKELCID(a,b) 0
const int SORT_DEFAULT = 0;
97 #endif
98
99
100
101 //
102 // Command line option variables
103 // These global variables are set according to the options specified
104 // on the command line by the user.
char * opt_fName = 0;              // -file: input file name.
const char * opt_locale = "en_US"; // -locale
int opt_langid = 0; // Defaults to value corresponding to opt_locale.
char * opt_rules = 0;              // -rules: collation rules file name.
UBool opt_help = FALSE;            // -help / -?
int opt_loopCount = 1;             // -loop
int opt_iLoopCount = 1;            // -iloop
UBool opt_terse = FALSE;           // -terse
UBool opt_qsort = FALSE;           // -qsort
UBool opt_binsearch = FALSE;       // -binsearch
UBool opt_icu = TRUE;              // No table entry sets this; ICU is the default API.
UBool opt_win = FALSE; // Run with Windows native functions.
UBool opt_unix = FALSE; // Run with UNIX strcoll, strxfrm functions.
UBool opt_uselen = FALSE;          // -uselen
UBool opt_usekeys = FALSE;         // -usekeys
UBool opt_strcmp = FALSE;          // -strcmp
UBool opt_strcmpCPO = FALSE;       // -strcmpCPO
UBool opt_norm = FALSE;            // -norm
UBool opt_keygen = FALSE;          // -keygen
UBool opt_french = FALSE;          // -french
UBool opt_frenchoff = FALSE;       // -frenchoff
UBool opt_shifted = FALSE;         // -shifted
UBool opt_lower = FALSE;           // -lower
UBool opt_upper = FALSE;           // -upper
UBool opt_case = FALSE;            // -case
int opt_level = 0;                 // -level
UBool opt_keyhist = FALSE;         // -keyhist
UBool opt_itertest = FALSE;        // -iter
UBool opt_dump = FALSE;            // -dump
134
135
136
137 //
138 // Definitions for the command line options
139 //
// Describes one command line option: its spelling, the kind of value it
// takes, and the variable that receives the value.
struct OptSpec {
    const char *name;                   // Option text as typed, e.g. "-file". 0 ends a table.
    enum {FLAG, NUM, STRING} type;      // FLAG takes no value; NUM and STRING consume the next argument.
    void *pVar;                         // Target variable: UBool* for FLAG, int* for NUM, char** for STRING.
};
145
// Table of every recognized option, mapping its name to the global it sets.
// The entry with a null name terminates the table.
OptSpec opts[] = {
    {"-file", OptSpec::STRING, &opt_fName},
    {"-locale", OptSpec::STRING, &opt_locale},
    {"-langid", OptSpec::NUM, &opt_langid},
    {"-rules", OptSpec::STRING, &opt_rules},
    {"-qsort", OptSpec::FLAG, &opt_qsort},
    {"-binsearch", OptSpec::FLAG, &opt_binsearch},
    {"-iter", OptSpec::FLAG, &opt_itertest},
    {"-win", OptSpec::FLAG, &opt_win},
    {"-unix", OptSpec::FLAG, &opt_unix},
    {"-uselen", OptSpec::FLAG, &opt_uselen},
    {"-usekeys", OptSpec::FLAG, &opt_usekeys},
    {"-strcmp", OptSpec::FLAG, &opt_strcmp},
    {"-strcmpCPO", OptSpec::FLAG, &opt_strcmpCPO},
    {"-norm", OptSpec::FLAG, &opt_norm},
    {"-french", OptSpec::FLAG, &opt_french},
    {"-frenchoff", OptSpec::FLAG, &opt_frenchoff},
    {"-shifted", OptSpec::FLAG, &opt_shifted},
    {"-lower", OptSpec::FLAG, &opt_lower},
    {"-upper", OptSpec::FLAG, &opt_upper},
    {"-case", OptSpec::FLAG, &opt_case},
    {"-level", OptSpec::NUM, &opt_level},
    {"-keyhist", OptSpec::FLAG, &opt_keyhist},
    {"-keygen", OptSpec::FLAG, &opt_keygen},
    {"-loop", OptSpec::NUM, &opt_loopCount},
    {"-iloop", OptSpec::NUM, &opt_iLoopCount},
    {"-terse", OptSpec::FLAG, &opt_terse},
    {"-dump", OptSpec::FLAG, &opt_dump},
    {"-help", OptSpec::FLAG, &opt_help},
    {"-?", OptSpec::FLAG, &opt_help},      // alias for -help
    {0, OptSpec::FLAG, 0}                  // end-of-table marker
};
178
179
180 //---------------------------------------------------------------------------
181 //
182 // Global variables pointing to and describing the test file
183 //
184 //---------------------------------------------------------------------------
185
186 //
187 // struct Line
188 //
189 // Each line from the source file (containing a name, presumably) gets
190 // one of these structs.
191 //
// One input line together with the per-API data derived from it.
struct Line {
    UChar *name;          // The line's text as UChars; the tests pass it both with len and with -1.
    int len;              // Length of name, used when the -uselen option is in effect.
    char *winSortKey;     // Buffer for the Windows (LCMapStringW) sort key.
    char *icuSortKey;     // Buffer for the ICU (ucol_getSortKey) sort key.
    char *unixSortKey;    // Buffer for the Unix (strxfrm) sort key.
    char *unixName;       // The line's text in the char form given to strcoll/strxfrm.
};
200
201
202
Line *gFileLines; // Ptr to array of Line structs, one per line in the file.
int gNumFileLines;      // Number of entries in gFileLines.
UCollator *gCol;        // ICU collator used by every ICU-based test.
DWORD gWinLCID;         // Locale ID handed to the Windows compare / sort key calls.

Line **gSortedLines;    // Line pointers searched by the binary search test.
Line **gRandomLines;    // Line pointers used as the unsorted input of the qsort test.
int gCount;             // Running count of comparisons / collation elements in the current test.
211
212
213
214 //---------------------------------------------------------------------------
215 //
216 // ProcessOptions() Function to read the command line options.
217 //
218 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])219 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
220 {
221 int i;
222 int argNum;
223 const char *pArgName;
224 OptSpec *pOpt;
225
226 for (argNum=1; argNum<argc; argNum++) {
227 pArgName = argv[argNum];
228 for (pOpt = opts; pOpt->name != 0; pOpt++) {
229 if (strcmp(pOpt->name, pArgName) == 0) {
230 switch (pOpt->type) {
231 case OptSpec::FLAG:
232 *(UBool *)(pOpt->pVar) = TRUE;
233 break;
234 case OptSpec::STRING:
235 argNum ++;
236 if (argNum >= argc) {
237 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
238 return FALSE;
239 }
240 *(const char **)(pOpt->pVar) = argv[argNum];
241 break;
242 case OptSpec::NUM:
243 argNum ++;
244 if (argNum >= argc) {
245 fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
246 return FALSE;
247 }
248 char *endp;
249 i = strtol(argv[argNum], &endp, 0);
250 if (endp == argv[argNum]) {
251 fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
252 return FALSE;
253 }
254 *(int *)(pOpt->pVar) = i;
255 }
256 break;
257 }
258 }
259 if (pOpt->name == 0)
260 {
261 fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
262 return FALSE;
263 }
264 }
265 return TRUE;
266 }
267
268 //---------------------------------------------------------------------------------------
269 //
270 // Comparison functions for use by qsort.
271 //
272 // Six flavors, ICU or Windows, SortKey or String Compare, Strings with length
273 // or null terminated.
274 //
275 //---------------------------------------------------------------------------------------
ICUstrcmpK(const void * a,const void * b)276 int ICUstrcmpK(const void *a, const void *b) {
277 gCount++;
278 int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
279 return t;
280 }
281
282
ICUstrcmpL(const void * a,const void * b)283 int ICUstrcmpL(const void *a, const void *b) {
284 gCount++;
285 UCollationResult t;
286 t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
287 if (t == UCOL_LESS) return -1;
288 if (t == UCOL_GREATER) return +1;
289 return 0;
290 }
291
292
ICUstrcmp(const void * a,const void * b)293 int ICUstrcmp(const void *a, const void *b) {
294 gCount++;
295 UCollationResult t;
296 t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
297 if (t == UCOL_LESS) return -1;
298 if (t == UCOL_GREATER) return +1;
299 return 0;
300 }
301
302
Winstrcmp(const void * a,const void * b)303 int Winstrcmp(const void *a, const void *b) {
304 gCount++;
305 int t;
306 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
307 return t-2;
308 }
309
310
UNIXstrcmp(const void * a,const void * b)311 int UNIXstrcmp(const void *a, const void *b) {
312 gCount++;
313 int t;
314 t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
315 return t;
316 }
317
318
WinstrcmpL(const void * a,const void * b)319 int WinstrcmpL(const void *a, const void *b) {
320 gCount++;
321 int t;
322 t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
323 return t-2;
324 }
325
326
WinstrcmpK(const void * a,const void * b)327 int WinstrcmpK(const void *a, const void *b) {
328 gCount++;
329 int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
330 return t;
331 }
332
333
334 //---------------------------------------------------------------------------------------
335 //
336 // Function for sorting the names (lines) into a random order.
337 // Order is based on a hash of the ICU Sort key for the lines
338 // The randomized order is used as input for the sorting timing tests.
339 //
340 //---------------------------------------------------------------------------------------
ICURandomCmp(const void * a,const void * b)341 int ICURandomCmp(const void *a, const void *b) {
342 char *ask = (*(Line **)a)->icuSortKey;
343 char *bsk = (*(Line **)b)->icuSortKey;
344 int aVal = 0;
345 int bVal = 0;
346 int retVal;
347 while (*ask != 0) {
348 aVal += aVal*37 + *ask++;
349 }
350 while (*bsk != 0) {
351 bVal += bVal*37 + *bsk++;
352 }
353 retVal = -1;
354 if (aVal == bVal) {
355 retVal = 0;
356 }
357 else if (aVal > bVal) {
358 retVal = 1;
359 }
360 return retVal;
361 }
362
363 //---------------------------------------------------------------------------------------
364 //
365 // doKeyGen() Key Generation Timing Test
366 //
367 //---------------------------------------------------------------------------------------
doKeyGen()368 void doKeyGen()
369 {
370 int line;
371 int loops = 0;
372 int iLoop;
373 int t;
374 int len=-1;
375
376 // Adjust loop count to compensate for file size. Should be order n
377 double dLoopCount = double(opt_loopCount) * (1000. / double(gNumFileLines));
378 int adj_loopCount = int(dLoopCount);
379 if (adj_loopCount < 1) adj_loopCount = 1;
380
381
382 unsigned long startTime = timeGetTime();
383
384 if (opt_win) {
385 for (loops=0; loops<adj_loopCount; loops++) {
386 for (line=0; line < gNumFileLines; line++) {
387 if (opt_uselen) {
388 len = gFileLines[line].len;
389 }
390 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
391 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY,
392 gFileLines[line].name, len,
393 (unsigned short *)gFileLines[line].winSortKey, 5000); // TODO something with length.
394 }
395 }
396 }
397 }
398 else if (opt_icu)
399 {
400 for (loops=0; loops<adj_loopCount; loops++) {
401 for (line=0; line < gNumFileLines; line++) {
402 if (opt_uselen) {
403 len = gFileLines[line].len;
404 }
405 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
406 t = ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
407 }
408 }
409 }
410 }
411 else if (opt_unix)
412 {
413 for (loops=0; loops<adj_loopCount; loops++) {
414 for (line=0; line < gNumFileLines; line++) {
415 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
416 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
417 }
418 }
419 }
420 }
421
422 unsigned long elapsedTime = timeGetTime() - startTime;
423 int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
424
425 if (opt_terse == FALSE) {
426 printf("Sort Key Generation: total # of keys = %d\n", loops*gNumFileLines);
427 printf("Sort Key Generation: time per key = %d ns\n", ns);
428 }
429 else {
430 printf("%d, ", ns);
431 }
432
433 int totalKeyLen = 0;
434 int totalChars = 0;
435 for (line=0; line<gNumFileLines; line++) {
436 totalChars += u_strlen(gFileLines[line].name);
437 if (opt_win) {
438 totalKeyLen += strlen(gFileLines[line].winSortKey);
439 }
440 else if (opt_icu) {
441 totalKeyLen += strlen(gFileLines[line].icuSortKey);
442 }
443 else if (opt_unix) {
444 totalKeyLen += strlen(gFileLines[line].unixSortKey);
445 }
446
447 }
448 if (opt_terse == FALSE) {
449 printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
450 } else {
451 printf("%f, ", (float)totalKeyLen / (float)totalChars);
452 }
453 }
454
455
456
457 //---------------------------------------------------------------------------------------
458 //
459 // doBinarySearch() Binary Search timing test. Each name from the list
460 // is looked up in the full sorted list of names.
461 //
462 //---------------------------------------------------------------------------------------
doBinarySearch()463 void doBinarySearch()
464 {
465
466 gCount = 0;
467 int line;
468 int loops = 0;
469 int iLoop = 0;
470 unsigned long elapsedTime = 0;
471
472 // Adjust loop count to compensate for file size. Should be order n (lookups) * log n (compares/lookup)
473 // Accurate timings do not depend on this being perfect. The correction is just to try to
474 // get total running times of about the right order, so the that user doesn't need to
475 // manually adjust the loop count for every different file size.
476 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
477 if (opt_usekeys) dLoopCount *= 5;
478 int adj_loopCount = int(dLoopCount);
479 if (adj_loopCount < 1) adj_loopCount = 1;
480
481
482 for (;;) { // not really a loop, just allows "break" to work, to simplify
483 // inadvertantly running more than one test through here.
484 if (opt_strcmp || opt_strcmpCPO)
485 {
486 unsigned long startTime = timeGetTime();
487 typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
488 PF pf = u_strcmp;
489 if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
490 //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;} // Damn the difference between int32_t and int
491 // which forces the use of a cast here.
492
493 int r = 0;
494 for (loops=0; loops<adj_loopCount; loops++) {
495
496 for (line=0; line < gNumFileLines; line++) {
497 int hi = gNumFileLines-1;
498 int lo = 0;
499 int guess = -1;
500 for (;;) {
501 int newGuess = (hi + lo) / 2;
502 if (newGuess == guess)
503 break;
504 guess = newGuess;
505 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
506 r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
507 }
508 gCount++;
509 if (r== 0)
510 break;
511 if (r < 0)
512 hi = guess;
513 else
514 lo = guess;
515 }
516 }
517 }
518 elapsedTime = timeGetTime() - startTime;
519 break;
520 }
521
522
523 if (opt_icu)
524 {
525 unsigned long startTime = timeGetTime();
526 UCollationResult r = UCOL_EQUAL;
527 for (loops=0; loops<adj_loopCount; loops++) {
528
529 for (line=0; line < gNumFileLines; line++) {
530 int lineLen = -1;
531 int guessLen = -1;
532 if (opt_uselen) {
533 lineLen = (gSortedLines[line])->len;
534 }
535 int hi = gNumFileLines-1;
536 int lo = 0;
537 int guess = -1;
538 for (;;) {
539 int newGuess = (hi + lo) / 2;
540 if (newGuess == guess)
541 break;
542 guess = newGuess;
543 int ri = 0;
544 if (opt_usekeys) {
545 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
546 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
547 }
548 gCount++;
549 r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
550 }
551 else
552 {
553 if (opt_uselen) {
554 guessLen = (gSortedLines[guess])->len;
555 }
556 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
557 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
558 }
559 gCount++;
560 }
561 if (r== UCOL_EQUAL)
562 break;
563 if (r == UCOL_LESS)
564 hi = guess;
565 else
566 lo = guess;
567 }
568 }
569 }
570 elapsedTime = timeGetTime() - startTime;
571 break;
572 }
573
574 if (opt_win)
575 {
576 unsigned long startTime = timeGetTime();
577 int r = 0;
578 for (loops=0; loops<adj_loopCount; loops++) {
579
580 for (line=0; line < gNumFileLines; line++) {
581 int lineLen = -1;
582 int guessLen = -1;
583 if (opt_uselen) {
584 lineLen = (gSortedLines[line])->len;
585 }
586 int hi = gNumFileLines-1;
587 int lo = 0;
588 int guess = -1;
589 for (;;) {
590 int newGuess = (hi + lo) / 2;
591 if (newGuess == guess)
592 break;
593 guess = newGuess;
594 if (opt_usekeys) {
595 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
596 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
597 }
598 gCount++;
599 r+=2;
600 }
601 else
602 {
603 if (opt_uselen) {
604 guessLen = (gSortedLines[guess])->len;
605 }
606 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
607 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
608 }
609 if (r == 0) {
610 if (opt_terse == FALSE) {
611 fprintf(stderr, "Error returned from Windows CompareStringW.\n");
612 }
613 exit(-1);
614 }
615 gCount++;
616 }
617 if (r== 2) // strings ==
618 break;
619 if (r == 1) // line < guess
620 hi = guess;
621 else // line > guess
622 lo = guess;
623 }
624 }
625 }
626 elapsedTime = timeGetTime() - startTime;
627 break;
628 }
629
630 if (opt_unix)
631 {
632 unsigned long startTime = timeGetTime();
633 int r = 0;
634 for (loops=0; loops<adj_loopCount; loops++) {
635
636 for (line=0; line < gNumFileLines; line++) {
637 int hi = gNumFileLines-1;
638 int lo = 0;
639 int guess = -1;
640 for (;;) {
641 int newGuess = (hi + lo) / 2;
642 if (newGuess == guess)
643 break;
644 guess = newGuess;
645 if (opt_usekeys) {
646 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
647 r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
648 }
649 gCount++;
650 }
651 else
652 {
653 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
654 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
655 }
656 errno = 0;
657 if (errno != 0) {
658 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
659 exit(-1);
660 }
661 gCount++;
662 }
663 if (r == 0) // strings ==
664 break;
665 if (r < 0) // line < guess
666 hi = guess;
667 else // line > guess
668 lo = guess;
669 }
670 }
671 }
672 elapsedTime = timeGetTime() - startTime;
673 break;
674 }
675 break;
676 }
677
678 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
679 if (opt_terse == FALSE) {
680 printf("binary search: total # of string compares = %d\n", gCount);
681 printf("binary search: compares per loop = %d\n", gCount / loops);
682 printf("binary search: time per compare = %d ns\n", ns);
683 } else {
684 printf("%d, ", ns);
685 }
686
687 }
688
689
690
691
692 //---------------------------------------------------------------------------------------
693 //
694 // doQSort() The quick sort timing test. Uses the C library qsort function.
695 //
696 //---------------------------------------------------------------------------------------
doQSort()697 void doQSort() {
698 int i;
699 Line **sortBuf = new Line *[gNumFileLines];
700
701 // Adjust loop count to compensate for file size. QSort should be n log(n)
702 double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
703 if (opt_usekeys) dLoopCount *= 5;
704 int adj_loopCount = int(dLoopCount);
705 if (adj_loopCount < 1) adj_loopCount = 1;
706
707
708 gCount = 0;
709 unsigned long startTime = timeGetTime();
710 if (opt_win && opt_usekeys) {
711 for (i=0; i<opt_loopCount; i++) {
712 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
713 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
714 }
715 }
716
717 else if (opt_win && opt_uselen) {
718 for (i=0; i<adj_loopCount; i++) {
719 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
720 qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
721 }
722 }
723
724
725 else if (opt_win && !opt_uselen) {
726 for (i=0; i<adj_loopCount; i++) {
727 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
728 qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
729 }
730 }
731
732 else if (opt_icu && opt_usekeys) {
733 for (i=0; i<adj_loopCount; i++) {
734 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
735 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
736 }
737 }
738
739 else if (opt_icu && opt_uselen) {
740 for (i=0; i<adj_loopCount; i++) {
741 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
742 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
743 }
744 }
745
746
747 else if (opt_icu && !opt_uselen) {
748 for (i=0; i<adj_loopCount; i++) {
749 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
750 qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
751 }
752 }
753
754 else if (opt_unix && !opt_usekeys) {
755 for (i=0; i<adj_loopCount; i++) {
756 memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
757 qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
758 }
759 }
760
761 unsigned long elapsedTime = timeGetTime() - startTime;
762 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
763 if (opt_terse == FALSE) {
764 printf("qsort: total # of string compares = %d\n", gCount);
765 printf("qsort: time per compare = %d ns\n", ns);
766 } else {
767 printf("%d, ", ns);
768 }
769 }
770
771
772
773 //---------------------------------------------------------------------------------------
774 //
775 // doKeyHist() Output a table of data for
776 // average sort key size vs. string length.
777 //
778 //---------------------------------------------------------------------------------------
doKeyHist()779 void doKeyHist() {
780 int i;
781 int maxLen = 0;
782
783 // Find the maximum string length
784 for (i=0; i<gNumFileLines; i++) {
785 if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
786 }
787
788 // Allocate arrays to hold the histogram data
789 int *accumulatedLen = new int[maxLen+1];
790 int *numKeysOfSize = new int[maxLen+1];
791 for (i=0; i<=maxLen; i++) {
792 accumulatedLen[i] = 0;
793 numKeysOfSize[i] = 0;
794 }
795
796 // Fill the arrays...
797 for (i=0; i<gNumFileLines; i++) {
798 int len = gFileLines[i].len;
799 accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
800 numKeysOfSize[len] += 1;
801 }
802
803 // And write out averages
804 printf("String Length, Avg Key Length, Avg Key Len per char\n");
805 for (i=1; i<=maxLen; i++) {
806 if (numKeysOfSize[i] > 0) {
807 printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
808 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
809 }
810 }
811 delete []accumulatedLen;
812 delete []numKeysOfSize ;
813 }
814
815 //---------------------------------------------------------------------------------------
816 //
817 // doForwardIterTest(UBool) Forward iteration test
818 // argument null-terminated string used
819 //
820 //---------------------------------------------------------------------------------------
doForwardIterTest(UBool haslen)821 void doForwardIterTest(UBool haslen) {
822 int count = 0;
823
824 UErrorCode error = U_ZERO_ERROR;
825 printf("\n\nPerforming forward iteration performance test with ");
826
827 if (haslen) {
828 printf("non-null terminated data -----------\n");
829 }
830 else {
831 printf("null terminated data -----------\n");
832 }
833 printf("performance test on strings from file -----------\n");
834
835 UChar dummytext[] = {0, 0};
836 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
837 ucol_setText(iter, dummytext, 1, &error);
838
839 gCount = 0;
840 unsigned long startTime = timeGetTime();
841 while (count < opt_loopCount) {
842 int linecount = 0;
843 while (linecount < gNumFileLines) {
844 UChar *str = gFileLines[linecount].name;
845 int strlen = haslen?gFileLines[linecount].len:-1;
846 ucol_setText(iter, str, strlen, &error);
847 while (ucol_next(iter, &error) != UCOL_NULLORDER) {
848 gCount++;
849 }
850
851 linecount ++;
852 }
853 count ++;
854 }
855 unsigned long elapsedTime = timeGetTime() - startTime;
856 printf("elapsedTime %ld\n", elapsedTime);
857
858 // empty loop recalculation
859 count = 0;
860 startTime = timeGetTime();
861 while (count < opt_loopCount) {
862 int linecount = 0;
863 while (linecount < gNumFileLines) {
864 UChar *str = gFileLines[linecount].name;
865 int strlen = haslen?gFileLines[linecount].len:-1;
866 ucol_setText(iter, str, strlen, &error);
867 linecount ++;
868 }
869 count ++;
870 }
871 elapsedTime -= (timeGetTime() - startTime);
872 printf("elapsedTime %ld\n", elapsedTime);
873
874 ucol_closeElements(iter);
875
876 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
877 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
878 opt_loopCount);
879 printf("Average time per ucol_next() nano seconds %d\n", ns);
880
881 printf("performance test on skipped-5 concatenated strings from file -----------\n");
882
883 UChar *str;
884 int strlen = 0;
885 // appending all the strings
886 int linecount = 0;
887 while (linecount < gNumFileLines) {
888 strlen += haslen?gFileLines[linecount].len:
889 u_strlen(gFileLines[linecount].name);
890 linecount ++;
891 }
892 str = (UChar *)malloc(sizeof(UChar) * strlen);
893 int strindex = 0;
894 linecount = 0;
895 while (strindex < strlen) {
896 int len = 0;
897 len += haslen?gFileLines[linecount].len:
898 u_strlen(gFileLines[linecount].name);
899 memcpy(str + strindex, gFileLines[linecount].name,
900 sizeof(UChar) * len);
901 strindex += len;
902 linecount ++;
903 }
904
905 printf("Total size of strings %d\n", strlen);
906
907 gCount = 0;
908 count = 0;
909
910 if (!haslen) {
911 strlen = -1;
912 }
913 iter = ucol_openElements(gCol, str, strlen, &error);
914 if (!haslen) {
915 strlen = u_strlen(str);
916 }
917 strlen -= 5; // any left over characters are not iterated,
918 // this is to ensure the backwards and forwards iterators
919 // gets the same position
920 startTime = timeGetTime();
921 while (count < opt_loopCount) {
922 int count5 = 5;
923 strindex = 0;
924 ucol_setOffset(iter, strindex, &error);
925 while (TRUE) {
926 if (ucol_next(iter, &error) == UCOL_NULLORDER) {
927 break;
928 }
929 gCount++;
930 count5 --;
931 if (count5 == 0) {
932 strindex += 10;
933 if (strindex > strlen) {
934 break;
935 }
936 ucol_setOffset(iter, strindex, &error);
937 count5 = 5;
938 }
939 }
940 count ++;
941 }
942
943 elapsedTime = timeGetTime() - startTime;
944 printf("elapsedTime %ld\n", elapsedTime);
945
946 // empty loop recalculation
947 int tempgCount = 0;
948 count = 0;
949 startTime = timeGetTime();
950 while (count < opt_loopCount) {
951 int count5 = 5;
952 strindex = 0;
953 ucol_setOffset(iter, strindex, &error);
954 while (TRUE) {
955 tempgCount ++;
956 count5 --;
957 if (count5 == 0) {
958 strindex += 10;
959 if (strindex > strlen) {
960 break;
961 }
962 ucol_setOffset(iter, strindex, &error);
963 count5 = 5;
964 }
965 }
966 count ++;
967 }
968 elapsedTime -= (timeGetTime() - startTime);
969 printf("elapsedTime %ld\n", elapsedTime);
970
971 ucol_closeElements(iter);
972
973 printf("gCount %d\n", gCount);
974 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
975 printf("Average time per ucol_next() nano seconds %d\n", ns);
976 }
977
978 //---------------------------------------------------------------------------------------
979 //
980 // doBackwardIterTest(UBool) Backwards iteration test
981 // argument null-terminated string used
982 //
983 //---------------------------------------------------------------------------------------
doBackwardIterTest(UBool haslen)984 void doBackwardIterTest(UBool haslen) {
985 int count = 0;
986 UErrorCode error = U_ZERO_ERROR;
987 printf("\n\nPerforming backward iteration performance test with ");
988
989 if (haslen) {
990 printf("non-null terminated data -----------\n");
991 }
992 else {
993 printf("null terminated data -----------\n");
994 }
995
996 printf("performance test on strings from file -----------\n");
997
998 UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
999 UChar dummytext[] = {0, 0};
1000 ucol_setText(iter, dummytext, 1, &error);
1001
1002 gCount = 0;
1003 unsigned long startTime = timeGetTime();
1004 while (count < opt_loopCount) {
1005 int linecount = 0;
1006 while (linecount < gNumFileLines) {
1007 UChar *str = gFileLines[linecount].name;
1008 int strlen = haslen?gFileLines[linecount].len:-1;
1009 ucol_setText(iter, str, strlen, &error);
1010 while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1011 gCount ++;
1012 }
1013
1014 linecount ++;
1015 }
1016 count ++;
1017 }
1018 unsigned long elapsedTime = timeGetTime() - startTime;
1019
1020 printf("elapsedTime %ld\n", elapsedTime);
1021
1022 // empty loop recalculation
1023 count = 0;
1024 startTime = timeGetTime();
1025 while (count < opt_loopCount) {
1026 int linecount = 0;
1027 while (linecount < gNumFileLines) {
1028 UChar *str = gFileLines[linecount].name;
1029 int strlen = haslen?gFileLines[linecount].len:-1;
1030 ucol_setText(iter, str, strlen, &error);
1031 linecount ++;
1032 }
1033 count ++;
1034 }
1035 elapsedTime -= (timeGetTime() - startTime);
1036
1037 printf("elapsedTime %ld\n", elapsedTime);
1038 ucol_closeElements(iter);
1039
1040 int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1041 printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1042 opt_loopCount);
1043 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1044
1045 printf("performance test on skipped-5 concatenated strings from file -----------\n");
1046
1047 UChar *str;
1048 int strlen = 0;
1049 // appending all the strings
1050 int linecount = 0;
1051 while (linecount < gNumFileLines) {
1052 strlen += haslen?gFileLines[linecount].len:
1053 u_strlen(gFileLines[linecount].name);
1054 linecount ++;
1055 }
1056 str = (UChar *)malloc(sizeof(UChar) * strlen);
1057 int strindex = 0;
1058 linecount = 0;
1059 while (strindex < strlen) {
1060 int len = 0;
1061 len += haslen?gFileLines[linecount].len:
1062 u_strlen(gFileLines[linecount].name);
1063 memcpy(str + strindex, gFileLines[linecount].name,
1064 sizeof(UChar) * len);
1065 strindex += len;
1066 linecount ++;
1067 }
1068
1069 printf("Total size of strings %d\n", strlen);
1070
1071 gCount = 0;
1072 count = 0;
1073
1074 if (!haslen) {
1075 strlen = -1;
1076 }
1077
1078 iter = ucol_openElements(gCol, str, strlen, &error);
1079 if (!haslen) {
1080 strlen = u_strlen(str);
1081 }
1082
1083 startTime = timeGetTime();
1084 while (count < opt_loopCount) {
1085 int count5 = 5;
1086 strindex = 5;
1087 ucol_setOffset(iter, strindex, &error);
1088 while (TRUE) {
1089 if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1090 break;
1091 }
1092 gCount ++;
1093 count5 --;
1094 if (count5 == 0) {
1095 strindex += 10;
1096 if (strindex > strlen) {
1097 break;
1098 }
1099 ucol_setOffset(iter, strindex, &error);
1100 count5 = 5;
1101 }
1102 }
1103 count ++;
1104 }
1105
1106 elapsedTime = timeGetTime() - startTime;
1107 printf("elapsedTime %ld\n", elapsedTime);
1108
1109 // empty loop recalculation
1110 count = 0;
1111 int tempgCount = 0;
1112 startTime = timeGetTime();
1113 while (count < opt_loopCount) {
1114 int count5 = 5;
1115 strindex = 5;
1116 ucol_setOffset(iter, strindex, &error);
1117 while (TRUE) {
1118 tempgCount ++;
1119 count5 --;
1120 if (count5 == 0) {
1121 strindex += 10;
1122 if (strindex > strlen) {
1123 break;
1124 }
1125 ucol_setOffset(iter, strindex, &error);
1126 count5 = 5;
1127 }
1128 }
1129 count ++;
1130 }
1131 elapsedTime -= (timeGetTime() - startTime);
1132 printf("elapsedTime %ld\n", elapsedTime);
1133 ucol_closeElements(iter);
1134
1135 printf("gCount %d\n", gCount);
1136 ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1137 printf("Average time per ucol_previous() nano seconds %d\n", ns);
1138 }
1139
1140 //---------------------------------------------------------------------------------------
1141 //
1142 // doIterTest() Iteration test
1143 //
1144 //---------------------------------------------------------------------------------------
doIterTest()1145 void doIterTest() {
1146 doForwardIterTest(opt_uselen);
1147 doBackwardIterTest(opt_uselen);
1148 }
1149
1150
1151 //----------------------------------------------------------------------------------------
1152 //
1153 // UnixConvert -- Convert the lines of the file to the encoding for UNIX
1154 // Since it appears that Unicode support is going in the general
1155 // direction of the use of UTF-8 locales, that is the approach
1156 // that is used here.
1157 //
1158 //----------------------------------------------------------------------------------------
UnixConvert()1159 void UnixConvert() {
1160 int line;
1161
1162 UConverter *cvrtr; // An ICU code page converter.
1163 UErrorCode status = U_ZERO_ERROR;
1164
1165
1166 cvrtr = ucnv_open("utf-8", &status); // we are just doing UTF-8 locales for now.
1167 if (U_FAILURE(status)) {
1168 fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1169 exit(-1);
1170 }
1171
1172 for (line=0; line < gNumFileLines; line++) {
1173 int sizeNeeded = ucnv_fromUChars(cvrtr,
1174 0, // ptr to target buffer.
1175 0, // length of target buffer.
1176 gFileLines[line].name,
1177 -1, // source is null terminated
1178 &status);
1179 if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1180 //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1181 //exit(-1);
1182 }
1183 status = U_ZERO_ERROR;
1184 gFileLines[line].unixName = new char[sizeNeeded+1];
1185 sizeNeeded = ucnv_fromUChars(cvrtr,
1186 gFileLines[line].unixName, // ptr to target buffer.
1187 sizeNeeded+1, // length of target buffer.
1188 gFileLines[line].name,
1189 -1, // source is null terminated
1190 &status);
1191 if (U_FAILURE(status)) {
1192 fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1193 exit(-1);
1194 }
1195 gFileLines[line].unixName[sizeNeeded] = 0;
1196 };
1197 ucnv_close(cvrtr);
1198 }
1199
1200
1201 //----------------------------------------------------------------------------------------
1202 //
1203 // class UCharFile Class to hide all the gorp to read a file in
1204 // and produce a stream of UChars.
1205 //
1206 //----------------------------------------------------------------------------------------
1207 class UCharFile {
1208 public:
1209 UCharFile(const char *fileName);
1210 ~UCharFile();
1211 UChar get();
eof()1212 UBool eof() {return fEof;};
error()1213 UBool error() {return fError;};
1214
1215 private:
UCharFile(const UCharFile &)1216 UCharFile (const UCharFile & /*other*/) {}; // No copy constructor.
operator =(const UCharFile &)1217 UCharFile & operator = (const UCharFile &/*other*/) {return *this;}; // No assignment op
1218
1219 FILE *fFile;
1220 const char *fName;
1221 UBool fEof;
1222 UBool fError;
1223 UChar fPending2ndSurrogate;
1224
1225 enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1226 };
1227
UCharFile(const char * fileName)1228 UCharFile::UCharFile(const char * fileName) {
1229 fEof = FALSE;
1230 fError = FALSE;
1231 fName = fileName;
1232 fFile = fopen(fName, "rb");
1233 fPending2ndSurrogate = 0;
1234 if (fFile == NULL) {
1235 fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1236 fError = TRUE;
1237 return;
1238 }
1239 //
1240 // Look for the byte order mark at the start of the file.
1241 //
1242 int BOMC1, BOMC2, BOMC3;
1243 BOMC1 = fgetc(fFile);
1244 BOMC2 = fgetc(fFile);
1245
1246 if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1247 fEncoding = UTF16LE; }
1248 else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1249 fEncoding = UTF16BE; }
1250 else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1251 fEncoding = UTF8; }
1252 else
1253 {
1254 fprintf(stderr, "collperf: file \"%s\" encoding must be UTF-8 or UTF-16, and "
1255 "must include a BOM.\n", fileName);
1256 fError = true;
1257 return;
1258 }
1259 }
1260
1261
~UCharFile()1262 UCharFile::~UCharFile() {
1263 fclose(fFile);
1264 }
1265
1266
1267
get()1268 UChar UCharFile::get() {
1269 UChar c;
1270 switch (fEncoding) {
1271 case UTF16LE:
1272 {
1273 int cL, cH;
1274 cL = fgetc(fFile);
1275 cH = fgetc(fFile);
1276 c = cL | (cH << 8);
1277 if (cH == EOF) {
1278 c = 0;
1279 fEof = TRUE;
1280 }
1281 break;
1282 }
1283 case UTF16BE:
1284 {
1285 int cL, cH;
1286 cH = fgetc(fFile);
1287 cL = fgetc(fFile);
1288 c = cL | (cH << 8);
1289 if (cL == EOF) {
1290 c = 0;
1291 fEof = TRUE;
1292 }
1293 break;
1294 }
1295 case UTF8:
1296 {
1297 if (fPending2ndSurrogate != 0) {
1298 c = fPending2ndSurrogate;
1299 fPending2ndSurrogate = 0;
1300 break;
1301 }
1302
1303 int ch = fgetc(fFile); // Note: c and ch are separate cause eof test doesn't work on UChar type.
1304 if (ch == EOF) {
1305 c = 0;
1306 fEof = TRUE;
1307 break;
1308 }
1309
1310 if (ch <= 0x7f) {
1311 // It's ascii. No further utf-8 conversion.
1312 c = ch;
1313 break;
1314 }
1315
1316 // Figure out the lenght of the char and read the rest of the bytes
1317 // into a temp array.
1318 int nBytes;
1319 if (ch >= 0xF0) {nBytes=4;}
1320 else if (ch >= 0xE0) {nBytes=3;}
1321 else if (ch >= 0xC0) {nBytes=2;}
1322 else {
1323 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1324 fError = TRUE;
1325 return 0;
1326 }
1327
1328 unsigned char bytes[10];
1329 bytes[0] = (unsigned char)ch;
1330 int i;
1331 for (i=1; i<nBytes; i++) {
1332 bytes[i] = fgetc(fFile);
1333 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1334 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1335 fError = TRUE;
1336 return 0;
1337 }
1338 }
1339
1340 // Convert the bytes from the temp array to a Unicode char.
1341 i = 0;
1342 uint32_t cp;
1343 U8_NEXT_UNSAFE(bytes, i, cp);
1344 c = (UChar)cp;
1345
1346 if (cp >= 0x10000) {
1347 // The code point needs to be broken up into a utf-16 surrogate pair.
1348 // Process first half this time through the main loop, and
1349 // remember the other half for the next time through.
1350 UChar utf16Buf[3];
1351 i = 0;
1352 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1353 fPending2ndSurrogate = utf16Buf[1];
1354 c = utf16Buf[0];
1355 }
1356 break;
1357 };
1358 default:
1359 c = 0xFFFD; /* Error, unspecified codepage*/
1360 fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1361 exit(1);
1362 }
1363 return c;
1364 }
1365
1366 //----------------------------------------------------------------------------------------
1367 //
1368 // openRulesCollator - Command line specified a rules file. Read it in
1369 // and open a collator with it.
1370 //
1371 //----------------------------------------------------------------------------------------
openRulesCollator()1372 UCollator *openRulesCollator() {
1373 UCharFile f(opt_rules);
1374 if (f.error()) {
1375 return 0;
1376 }
1377
1378 int bufLen = 10000;
1379 UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1380 UChar *tmp;
1381 int i = 0;
1382
1383 for(;;) {
1384 buf[i] = f.get();
1385 if (f.eof()) {
1386 break;
1387 }
1388 if (f.error()) {
1389 return 0;
1390 }
1391 i++;
1392 if (i >= bufLen) {
1393 tmp = buf;
1394 bufLen += 10000;
1395 buf = (UChar *)realloc(buf, bufLen);
1396 if (buf == NULL) {
1397 free(tmp);
1398 return 0;
1399 }
1400 }
1401 }
1402 buf[i] = 0;
1403
1404 UErrorCode status = U_ZERO_ERROR;
1405 UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1406 UCOL_DEFAULT_STRENGTH, NULL, &status);
1407 if (U_FAILURE(status)) {
1408 fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1409 return 0;
1410 }
1411 free(buf);
1412 return coll;
1413 }
1414
1415
1416
1417
1418
1419 //----------------------------------------------------------------------------------------
1420 //
1421 // Main -- process command line, read in and pre-process the test file,
1422 // call other functions to do the actual tests.
1423 //
1424 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)1425 int main(int argc, const char** argv) {
1426 if (ProcessOptions(argc, argv, opts) != TRUE || opt_help || opt_fName == 0) {
1427 printf(gUsageString);
1428 exit (1);
1429 }
1430
1431 // Make sure that we've only got one API selected.
1432 if (opt_unix || opt_win) opt_icu = FALSE;
1433 if (opt_unix) opt_win = FALSE;
1434
1435 //
1436 // Set up an ICU collator
1437 //
1438 UErrorCode status = U_ZERO_ERROR;
1439
1440 if (opt_rules != 0) {
1441 gCol = openRulesCollator();
1442 if (gCol == 0) {return -1;}
1443 }
1444 else {
1445 gCol = ucol_open(opt_locale, &status);
1446 if (U_FAILURE(status)) {
1447 fprintf(stderr, "Collator creation failed.: %d\n", status);
1448 return -1;
1449 }
1450 }
1451 if (status==U_USING_DEFAULT_WARNING && opt_terse==FALSE) {
1452 fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1453 }
1454 if (status==U_USING_FALLBACK_WARNING && opt_terse==FALSE) {
1455 fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1456 }
1457
1458 if (opt_norm) {
1459 ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1460 }
1461 if (opt_french && opt_frenchoff) {
1462 fprintf(stderr, "collperf: Error, specified both -french and -frenchoff options.");
1463 exit(-1);
1464 }
1465 if (opt_french) {
1466 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1467 }
1468 if (opt_frenchoff) {
1469 ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1470 }
1471 if (opt_lower) {
1472 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1473 }
1474 if (opt_upper) {
1475 ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1476 }
1477 if (opt_case) {
1478 ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1479 }
1480 if (opt_shifted) {
1481 ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1482 }
1483 if (opt_level != 0) {
1484 switch (opt_level) {
1485 case 1:
1486 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1487 break;
1488 case 2:
1489 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1490 break;
1491 case 3:
1492 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1493 break;
1494 case 4:
1495 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1496 break;
1497 case 5:
1498 ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1499 break;
1500 default:
1501 fprintf(stderr, "-level param must be between 1 and 5\n");
1502 exit(-1);
1503 }
1504 }
1505
1506 if (U_FAILURE(status)) {
1507 fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1508 return -1;
1509 }
1510
1511
1512 //
1513 // Set up a Windows LCID
1514 //
1515 if (opt_langid != 0) {
1516 gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1517 }
1518 else {
1519 gWinLCID = uloc_getLCID(opt_locale);
1520 }
1521
1522
1523 //
1524 // Set the UNIX locale
1525 //
1526 if (opt_unix) {
1527 if (setlocale(LC_ALL, opt_locale) == 0) {
1528 fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1529 exit(-1);
1530 }
1531 }
1532
1533 // Read in the input file.
1534 // File assumed to be utf-16.
1535 // Lines go onto heap buffers. Global index array to line starts is created.
1536 // Lines themselves are null terminated.
1537 //
1538
1539 UCharFile f(opt_fName);
1540 if (f.error()) {
1541 exit(-1);
1542 }
1543
1544 const int MAXLINES = 100000;
1545 gFileLines = new Line[MAXLINES];
1546 UChar buf[1024];
1547 int column = 0;
1548
1549 // Read the file, split into lines, and save in memory.
1550 // Loop runs once per utf-16 value from the input file,
1551 // (The number of bytes read from file per loop iteration depends on external encoding.)
1552 for (;;) {
1553
1554 UChar c = f.get();
1555 if (f.error()){
1556 exit(-1);
1557 }
1558
1559
1560 // We now have a good UTF-16 value in c.
1561
1562 // Watch for CR, LF, EOF; these finish off a line.
1563 if (c == 0xd) {
1564 continue;
1565 }
1566
1567 if (f.eof() || c == 0x0a || c==0x2028) { // Unipad inserts 2028 line separators!
1568 buf[column++] = 0;
1569 if (column > 1) {
1570 gFileLines[gNumFileLines].name = new UChar[column];
1571 gFileLines[gNumFileLines].len = column-1;
1572 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1573 gNumFileLines++;
1574 column = 0;
1575 if (gNumFileLines >= MAXLINES) {
1576 fprintf(stderr, "File too big. Max number of lines is %d\n", MAXLINES);
1577 exit(-1);
1578 }
1579
1580 }
1581 if (c == 0xa || c == 0x2028)
1582 continue;
1583 else
1584 break; // EOF
1585 }
1586 buf[column++] = c;
1587 if (column >= 1023)
1588 {
1589 static UBool warnFlag = TRUE;
1590 if (warnFlag) {
1591 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1592 warnFlag = FALSE;
1593 }
1594 column--;
1595 }
1596 }
1597
1598 if (opt_terse == FALSE) {
1599 printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1600 }
1601
1602
1603 // Convert the lines to the UNIX encoding.
1604 if (opt_unix) {
1605 UnixConvert();
1606 }
1607
1608 //
1609 // Pre-compute ICU sort keys for the lines of the file.
1610 //
1611 int line;
1612 int32_t t;
1613
1614 for (line=0; line<gNumFileLines; line++) {
1615 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1616 gFileLines[line].icuSortKey = new char[t];
1617
1618 if (t > (int32_t)sizeof(buf)) {
1619 t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1620 }
1621 else
1622 {
1623 memcpy(gFileLines[line].icuSortKey, buf, t);
1624 }
1625 }
1626
1627
1628
1629 //
1630 // Pre-compute Windows sort keys for the lines of the file.
1631 //
1632 for (line=0; line<gNumFileLines; line++) {
1633 t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1634 gFileLines[line].winSortKey = new char[t];
1635 if (t > (int32_t)sizeof(buf)) {
1636 t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (unsigned short *)(gFileLines[line].winSortKey), t);
1637 }
1638 else
1639 {
1640 memcpy(gFileLines[line].winSortKey, buf, t);
1641 }
1642 }
1643
1644 //
1645 // Pre-compute UNIX sort keys for the lines of the file.
1646 //
1647 if (opt_unix) {
1648 for (line=0; line<gNumFileLines; line++) {
1649 t=strxfrm((char *)buf, gFileLines[line].unixName, sizeof(buf));
1650 gFileLines[line].unixSortKey = new char[t];
1651 if (t > (int32_t)sizeof(buf)) {
1652 t = strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, sizeof(buf));
1653 }
1654 else
1655 {
1656 memcpy(gFileLines[line].unixSortKey, buf, t);
1657 }
1658 }
1659 }
1660
1661
1662 //
1663 // Dump file lines, CEs, Sort Keys if requested.
1664 //
1665 if (opt_dump) {
1666 int i;
1667 for (line=0; line<gNumFileLines; line++) {
1668 for (i=0;;i++) {
1669 UChar c = gFileLines[line].name[i];
1670 if (c == 0)
1671 break;
1672 if (c < 0x20 || c > 0x7e) {
1673 printf("\\u%.4x", c);
1674 }
1675 else {
1676 printf("%c", c);
1677 }
1678 }
1679 printf("\n");
1680
1681 printf(" CEs: ");
1682 UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1683 int32_t ce;
1684 i = 0;
1685 for (;;) {
1686 ce = ucol_next(CEiter, &status);
1687 if (ce == UCOL_NULLORDER) {
1688 break;
1689 }
1690 printf(" %.8x", ce);
1691 if (++i > 8) {
1692 printf("\n ");
1693 i = 0;
1694 }
1695 }
1696 printf("\n");
1697 ucol_closeElements(CEiter);
1698
1699
1700 printf(" ICU Sort Key: ");
1701 for (i=0; ; i++) {
1702 unsigned char c = gFileLines[line].icuSortKey[i];
1703 printf("%02x ", c);
1704 if (c == 0) {
1705 break;
1706 }
1707 if (i > 0 && i % 20 == 0) {
1708 printf("\n ");
1709 }
1710 }
1711 printf("\n");
1712 }
1713 }
1714
1715
1716 //
1717 // Pre-sort the lines.
1718 //
1719 int i;
1720 gSortedLines = new Line *[gNumFileLines];
1721 for (i=0; i<gNumFileLines; i++) {
1722 gSortedLines[i] = &gFileLines[i];
1723 }
1724
1725 if (opt_win) {
1726 qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1727 }
1728 else if (opt_unix) {
1729 qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1730 }
1731 else /* ICU */
1732 {
1733 qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1734 }
1735
1736
1737 //
1738 // Make up a randomized order, will be used for sorting tests.
1739 //
1740 gRandomLines = new Line *[gNumFileLines];
1741 for (i=0; i<gNumFileLines; i++) {
1742 gRandomLines[i] = &gFileLines[i];
1743 }
1744 qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1745
1746
1747
1748
1749 //
1750 // We've got the file read into memory. Go do something with it.
1751 //
1752
1753 if (opt_qsort) doQSort();
1754 if (opt_binsearch) doBinarySearch();
1755 if (opt_keygen) doKeyGen();
1756 if (opt_keyhist) doKeyHist();
1757 if (opt_itertest) doIterTest();
1758
1759 return 0;
1760
1761 }
1762