1 /***********************************************************************
2  * © 2016 and later: Unicode, Inc. and others.
3  * License & terms of use: http://www.unicode.org/copyright.html
4  ***********************************************************************
5  ***********************************************************************
6  * COPYRIGHT:
7  * Copyright (C) 2001-2012 IBM, Inc.   All Rights Reserved.
8  *
9  ***********************************************************************/
10 /********************************************************************************
11 *
12 * File CALLCOLL.C
13 *
14 * Modification History:
15 *        Name                     Description
16 *     Andy Heninger             First Version
17 *
18 *********************************************************************************
19 */
20 
21 //
22 //  This program tests string collation and sort key generation performance.
//      Three APIs can be tested: ICU C, Unix strcoll/strxfrm, and Windows LCMapString.
24 //      A file of names is required as input, one per line.  It must be in utf-8 or utf-16 format,
25 //      and include a byte order mark.  Either LE or BE format is OK.
26 //
27 
// Help text listing every command line option the tool documents.
// Each entry is one line: option spelling, then a short description.
const char gUsageString[] =
    "usage:  collperf options...\n"
    "-help                      Display this message.\n"
    "-file file_name            utf-16 format file of names.\n"
    "-locale name               ICU locale to use.  Default is en_US\n"
    "-rules file_name           Collation rules file (overrides locale)\n"
    "-langid 0x1234             Windows Language ID number.  Default to value for -locale option\n"
    "                              see http://msdn.microsoft.com/library/psdk/winbase/nls_8xo3.htm\n"
    "-win                       Run test using Windows native services.  (ICU is default)\n"
    "-unix                      Run test using Unix strxfrm, strcoll services.\n"
    "-uselen                    Use API with string lengths.  Default is null-terminated strings\n"
    "-usekeys                   Run tests using sortkeys rather than strcoll\n"
    "-strcmp                    Run tests using u_strcmp rather than strcoll\n"
    "-strcmpCPO                 Run tests using u_strcmpCodePointOrder rather than strcoll\n"
    "-loop nnnn                 Loopcount for test.  Adjust for reasonable total running time.\n"
    "-iloop n                   Inner Loop Count.  Default = 1.  Number of calls to function\n"
    "                               under test at each call point.  For measuring test overhead.\n"
    "-terse                     Terse numbers-only output.  Intended for use by scripts.\n"
    "-french                    French accent ordering\n"
    "-frenchoff                 No French accent ordering (for use with French locales.)\n"
    "-norm                      Normalizing mode on\n"
    "-shifted                   Shifted mode\n"
    "-lower                     Lower case first\n"
    "-upper                     Upper case first\n"
    "-case                      Enable separate case level\n"
    "-level n                   Sort level, 1 to 5, for Primary, Secondary, Tertiary, Quaternary, Identical\n"
    "-keyhist                   Produce a table sort key size vs. string length\n"
    "-binsearch                 Binary Search timing test\n"
    "-keygen                    Sort Key Generation timing test\n"
    "-qsort                     Quicksort timing test\n"
    "-iter                      Iteration Performance Test\n"
    "-dump                      Display strings, sort keys and CEs.\n"
    ;
61 
62 
63 
64 #include <stdio.h>
65 #include <string.h>
66 #include <stdlib.h>
67 #include <math.h>
68 #include <locale.h>
69 #include <errno.h>
70 
71 #include <unicode/utypes.h>
72 #include <unicode/ucol.h>
73 #include <unicode/ucoleitr.h>
74 #include <unicode/uloc.h>
75 #include <unicode/ustring.h>
76 #include <unicode/ures.h>
77 #include <unicode/uchar.h>
78 #include <unicode/ucnv.h>
79 #include <unicode/utf8.h>
80 
81 #ifdef WIN32
82 #include <windows.h>
83 #else
84 //
85 //  Stubs for Windows API functions when building on UNIXes.
86 //
87 typedef int DWORD;
CompareStringW(DWORD,DWORD,UChar *,int,UChar *,int)88 inline int CompareStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
89 #include <sys/time.h>
// Non-Windows stand-in for timeGetTime(): returns the gettimeofday() clock
// converted to milliseconds.  The value is only meaningful as a difference
// between two calls; it is allowed to wrap around.
unsigned long timeGetTime() {
    struct timeval now;
    gettimeofday(&now, NULL);
    // Do the conversion in unsigned arithmetic: wrap-around is intended here,
    // and only unsigned overflow is well defined (signed overflow is not).
    unsigned long millis = (unsigned long)now.tv_sec * 1000UL;
    millis += (unsigned long)now.tv_usec / 1000UL;
    return millis;
}
LCMapStringW(DWORD,DWORD,UChar *,int,UChar *,int)97 inline int LCMapStringW(DWORD, DWORD, UChar *, int, UChar *, int) {return 0;}
98 const int LCMAP_SORTKEY = 0;
99 #define MAKELCID(a,b) 0
100 const int SORT_DEFAULT = 0;
101 #endif
102 
103 
104 
105 //
106 //  Command line option variables
107 //     These global variables are set according to the options specified
108 //     on the command line by the user.
109 char * opt_fName      = 0;
110 const char * opt_locale     = "en_US";
111 int    opt_langid     = 0;         // Defaults to value corresponding to opt_locale.
112 char * opt_rules      = 0;
113 UBool  opt_help       = false;
114 int    opt_loopCount  = 1;
115 int    opt_iLoopCount = 1;
116 UBool  opt_terse      = false;
117 UBool  opt_qsort      = false;
118 UBool  opt_binsearch  = false;
119 UBool  opt_icu        = true;
120 UBool  opt_win        = false;      // Run with Windows native functions.
121 UBool  opt_unix       = false;      // Run with UNIX strcoll, strxfrm functions.
122 UBool  opt_uselen     = false;
123 UBool  opt_usekeys    = false;
124 UBool  opt_strcmp     = false;
125 UBool  opt_strcmpCPO  = false;
126 UBool  opt_norm       = false;
127 UBool  opt_keygen     = false;
128 UBool  opt_french     = false;
129 UBool  opt_frenchoff  = false;
130 UBool  opt_shifted    = false;
131 UBool  opt_lower      = false;
132 UBool  opt_upper      = false;
133 UBool  opt_case       = false;
134 int    opt_level      = 0;
135 UBool  opt_keyhist    = false;
136 UBool  opt_itertest   = false;
137 UBool  opt_dump       = false;
138 
139 
140 
141 //
142 //   Definitions for the command line options
143 //
// One row of the command line option table.
struct OptSpec {
    // Exact option spelling as typed by the user, e.g. "-loop".
    // A row with name == 0 terminates the table.
    const char *name;

    // How the option is stored through pVar:
    //   FLAG   - takes no value; a UBool is set to true
    //   NUM    - the following argument is parsed as an integer into an int
    //   STRING - the following argument is stored as a const char *
    enum {
        FLAG,
        NUM,
        STRING
    } type;

    // Address of the variable that receives the option's value.
    void *pVar;
};
149 
150 OptSpec opts[] = {
151     {"-file",        OptSpec::STRING, &opt_fName},
152     {"-locale",      OptSpec::STRING, &opt_locale},
153     {"-langid",      OptSpec::NUM,    &opt_langid},
154     {"-rules",       OptSpec::STRING, &opt_rules},
155     {"-qsort",       OptSpec::FLAG,   &opt_qsort},
156     {"-binsearch",   OptSpec::FLAG,   &opt_binsearch},
157     {"-iter",        OptSpec::FLAG,   &opt_itertest},
158     {"-win",         OptSpec::FLAG,   &opt_win},
159     {"-unix",        OptSpec::FLAG,   &opt_unix},
160     {"-uselen",      OptSpec::FLAG,   &opt_uselen},
161     {"-usekeys",     OptSpec::FLAG,   &opt_usekeys},
162     {"-strcmp",      OptSpec::FLAG,   &opt_strcmp},
163     {"-strcmpCPO",   OptSpec::FLAG,   &opt_strcmpCPO},
164     {"-norm",        OptSpec::FLAG,   &opt_norm},
165     {"-french",      OptSpec::FLAG,   &opt_french},
166     {"-frenchoff",   OptSpec::FLAG,   &opt_frenchoff},
167     {"-shifted",     OptSpec::FLAG,   &opt_shifted},
168     {"-lower",       OptSpec::FLAG,   &opt_lower},
169     {"-upper",       OptSpec::FLAG,   &opt_upper},
170     {"-case",        OptSpec::FLAG,   &opt_case},
171     {"-level",       OptSpec::NUM,    &opt_level},
172     {"-keyhist",     OptSpec::FLAG,   &opt_keyhist},
173     {"-keygen",      OptSpec::FLAG,   &opt_keygen},
174     {"-loop",        OptSpec::NUM,    &opt_loopCount},
175     {"-iloop",       OptSpec::NUM,    &opt_iLoopCount},
176     {"-terse",       OptSpec::FLAG,   &opt_terse},
177     {"-dump",        OptSpec::FLAG,   &opt_dump},
178     {"-help",        OptSpec::FLAG,   &opt_help},
179     {"-?",           OptSpec::FLAG,   &opt_help},
180     {0, OptSpec::FLAG, 0}
181 };
182 
183 
184 //---------------------------------------------------------------------------
185 //
186 //  Global variables pointing to and describing the test file
187 //
188 //---------------------------------------------------------------------------
189 
190 //
191 //   struct Line
192 //
193 //      Each line from the source file (containing a name, presumably) gets
194 //      one of these structs.
195 //
196 struct  Line {
197     UChar     *name;
198     int        len;
199     char      *winSortKey;
200     char      *icuSortKey;
201     char      *unixSortKey;
202     char      *unixName;
203 };
204 
205 
206 
207 Line          *gFileLines;           // Ptr to array of Line structs, one per line in the file.
208 int            gNumFileLines;
209 UCollator     *gCol;
210 DWORD          gWinLCID;
211 
212 Line          **gSortedLines;
213 Line          **gRandomLines;
214 int            gCount;
215 
216 
217 
218 //---------------------------------------------------------------------------
219 //
220 //  ProcessOptions()    Function to read the command line options.
221 //
222 //---------------------------------------------------------------------------
ProcessOptions(int argc,const char ** argv,OptSpec opts[])223 UBool ProcessOptions(int argc, const char **argv, OptSpec opts[])
224 {
225     int         i;
226     int         argNum;
227     const char  *pArgName;
228     OptSpec    *pOpt;
229 
230     for (argNum=1; argNum<argc; argNum++) {
231         pArgName = argv[argNum];
232         for (pOpt = opts;  pOpt->name != 0; pOpt++) {
233             if (strcmp(pOpt->name, pArgName) == 0) {
234                 switch (pOpt->type) {
235                 case OptSpec::FLAG:
236                     *(UBool *)(pOpt->pVar) = true;
237                     break;
238                 case OptSpec::STRING:
239                     argNum ++;
240                     if (argNum >= argc) {
241                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
242                         return false;
243                     }
244                     *(const char **)(pOpt->pVar)  = argv[argNum];
245                     break;
246                 case OptSpec::NUM:
247                     argNum ++;
248                     if (argNum >= argc) {
249                         fprintf(stderr, "value expected for \"%s\" option.\n", pOpt->name);
250                         return false;
251                     }
252                     char *endp;
253                     i = strtol(argv[argNum], &endp, 0);
254                     if (endp == argv[argNum]) {
255                         fprintf(stderr, "integer value expected for \"%s\" option.\n", pOpt->name);
256                         return false;
257                     }
258                     *(int *)(pOpt->pVar) = i;
259                 }
260                 break;
261             }
262         }
263         if (pOpt->name == 0)
264         {
265             fprintf(stderr, "Unrecognized option \"%s\"\n", pArgName);
266             return false;
267         }
268     }
269 return true;
270 }
271 
272 //---------------------------------------------------------------------------------------
273 //
274 //   Comparison functions for use by qsort.
275 //
//       Seven flavors: ICU or Windows, each comparing sort keys, strings with
//           length, or null-terminated strings; plus Unix strcoll().
278 //
279 //---------------------------------------------------------------------------------------
ICUstrcmpK(const void * a,const void * b)280 int ICUstrcmpK(const void *a, const void *b) {
281     gCount++;
282     int t = strcmp((*(Line **)a)->icuSortKey, (*(Line **)b)->icuSortKey);
283     return t;
284 }
285 
286 
ICUstrcmpL(const void * a,const void * b)287 int ICUstrcmpL(const void *a, const void *b) {
288     gCount++;
289     UCollationResult t;
290     t = ucol_strcoll(gCol, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
291     if (t == UCOL_LESS) return -1;
292     if (t == UCOL_GREATER) return +1;
293     return 0;
294 }
295 
296 
ICUstrcmp(const void * a,const void * b)297 int ICUstrcmp(const void *a, const void *b) {
298     gCount++;
299     UCollationResult t;
300     t = ucol_strcoll(gCol, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
301     if (t == UCOL_LESS) return -1;
302     if (t == UCOL_GREATER) return +1;
303     return 0;
304 }
305 
306 
Winstrcmp(const void * a,const void * b)307 int Winstrcmp(const void *a, const void *b) {
308     gCount++;
309     int t;
310     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, -1, (*(Line **)b)->name, -1);
311     return t-2;
312 }
313 
314 
UNIXstrcmp(const void * a,const void * b)315 int UNIXstrcmp(const void *a, const void *b) {
316     gCount++;
317     int t;
318     t = strcoll((*(Line **)a)->unixName, (*(Line **)b)->unixName);
319     return t;
320 }
321 
322 
WinstrcmpL(const void * a,const void * b)323 int WinstrcmpL(const void *a, const void *b) {
324     gCount++;
325     int t;
326     t = CompareStringW(gWinLCID, 0, (*(Line **)a)->name, (*(Line **)a)->len, (*(Line **)b)->name, (*(Line **)b)->len);
327     return t-2;
328 }
329 
330 
WinstrcmpK(const void * a,const void * b)331 int WinstrcmpK(const void *a, const void *b) {
332     gCount++;
333     int t = strcmp((*(Line **)a)->winSortKey, (*(Line **)b)->winSortKey);
334     return t;
335 }
336 
337 
338 //---------------------------------------------------------------------------------------
339 //
340 //   Function for sorting the names (lines) into a random order.
341 //      Order is based on a hash of the  ICU Sort key for the lines
342 //      The randomized order is used as input for the sorting timing tests.
343 //
344 //---------------------------------------------------------------------------------------
ICURandomCmp(const void * a,const void * b)345 int ICURandomCmp(const void *a, const void *b) {
346     char  *ask = (*(Line **)a)->icuSortKey;
347     char  *bsk = (*(Line **)b)->icuSortKey;
348     int   aVal = 0;
349     int   bVal = 0;
350     int   retVal;
351     while (*ask != 0) {
352         aVal += aVal*37 + *ask++;
353     }
354     while (*bsk != 0) {
355         bVal += bVal*37 + *bsk++;
356     }
357     retVal = -1;
358     if (aVal == bVal) {
359         retVal = 0;
360     }
361     else if (aVal > bVal) {
362         retVal = 1;
363     }
364     return retVal;
365 }
366 
367 //---------------------------------------------------------------------------------------
368 //
369 //   doKeyGen()     Key Generation Timing Test
370 //
371 //---------------------------------------------------------------------------------------
doKeyGen()372 void doKeyGen()
373 {
374     int  line;
375     int  loops = 0;
376     int  iLoop;
377     int  len=-1;
378 
379     // Adjust loop count to compensate for file size.   Should be order n
380     double dLoopCount = double(opt_loopCount) * (1000. /  double(gNumFileLines));
381     int adj_loopCount = int(dLoopCount);
382     if (adj_loopCount < 1) adj_loopCount = 1;
383 
384 
385     unsigned long startTime = timeGetTime();
386 
387     if (opt_win) {
388         for (loops=0; loops<adj_loopCount; loops++) {
389             for (line=0; line < gNumFileLines; line++) {
390                 if (opt_uselen) {
391                     len = gFileLines[line].len;
392                 }
393                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
394                     LCMapStringW(gWinLCID, LCMAP_SORTKEY,
395                         gFileLines[line].name, len,
396                         (UChar *)gFileLines[line].winSortKey, 5000);    // TODO  something with length.
397                 }
398             }
399         }
400     }
401     else if (opt_icu)
402     {
403         for (loops=0; loops<adj_loopCount; loops++) {
404             for (line=0; line < gNumFileLines; line++) {
405                 if (opt_uselen) {
406                     len = gFileLines[line].len;
407                 }
408                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
409                     ucol_getSortKey(gCol, gFileLines[line].name, len, (unsigned char *)gFileLines[line].icuSortKey, 5000);
410                 }
411             }
412         }
413     }
414     else if (opt_unix)
415     {
416         for (loops=0; loops<adj_loopCount; loops++) {
417             for (line=0; line < gNumFileLines; line++) {
418                 for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
419                     strxfrm(gFileLines[line].unixSortKey, gFileLines[line].unixName, 5000);
420                 }
421             }
422         }
423     }
424 
425     unsigned long elapsedTime = timeGetTime() - startTime;
426     int ns = (int)(float(1000000) * (float)elapsedTime / (float)(adj_loopCount*gNumFileLines));
427 
428     if (opt_terse == false) {
429         printf("Sort Key Generation:  total # of keys = %d\n", loops*gNumFileLines);
430         printf("Sort Key Generation:  time per key = %d ns\n", ns);
431     }
432     else {
433         printf("%d,  ", ns);
434     }
435 
436     int   totalKeyLen = 0;
437     int   totalChars  = 0;
438     for (line=0; line<gNumFileLines; line++) {
439         totalChars += u_strlen(gFileLines[line].name);
440         if (opt_win) {
441             totalKeyLen += strlen(gFileLines[line].winSortKey);
442         }
443         else if (opt_icu) {
444             totalKeyLen += strlen(gFileLines[line].icuSortKey);
445         }
446         else if (opt_unix) {
447             totalKeyLen += strlen(gFileLines[line].unixSortKey);
448         }
449 
450     }
451     if (opt_terse == false) {
452         printf("Key Length / character = %f\n", (float)totalKeyLen / (float)totalChars);
453     } else {
454         printf("%f, ", (float)totalKeyLen / (float)totalChars);
455     }
456 }
457 
458 
459 
460 //---------------------------------------------------------------------------------------
461 //
462 //    doBinarySearch()    Binary Search timing test.  Each name from the list
463 //                        is looked up in the full sorted list of names.
464 //
465 //---------------------------------------------------------------------------------------
doBinarySearch()466 void doBinarySearch()
467 {
468 
469     gCount = 0;
470     int  line;
471     int  loops = 0;
472     int  iLoop = 0;
473     unsigned long elapsedTime = 0;
474 
475     // Adjust loop count to compensate for file size.   Should be order n (lookups) * log n  (compares/lookup)
476     // Accurate timings do not depend on this being perfect.  The correction is just to try to
477     //   get total running times of about the right order, so the that user doesn't need to
478     //   manually adjust the loop count for every different file size.
479     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
480     if (opt_usekeys) dLoopCount *= 5;
481     int adj_loopCount = int(dLoopCount);
482     if (adj_loopCount < 1) adj_loopCount = 1;
483 
484 
485     for (;;) {  // not really a loop, just allows "break" to work, to simplify
486                 //   inadvertantly running more than one test through here.
487         if (opt_strcmp || opt_strcmpCPO)
488         {
489             unsigned long startTime = timeGetTime();
490             typedef int32_t (U_EXPORT2 *PF)(const UChar *, const UChar *);
491             PF pf = u_strcmp;
492             if (opt_strcmpCPO) {pf = u_strcmpCodePointOrder;}
493             //if (opt_strcmp && opt_win) {pf = (PF)wcscmp;}   // Damn the difference between int32_t and int
494                                                             //   which forces the use of a cast here.
495 
496             int r = 0;
497             for (loops=0; loops<adj_loopCount; loops++) {
498 
499                 for (line=0; line < gNumFileLines; line++) {
500                     int hi      = gNumFileLines-1;
501                     int lo      = 0;
502                     int  guess = -1;
503                     for (;;) {
504                         int newGuess = (hi + lo) / 2;
505                         if (newGuess == guess)
506                             break;
507                         guess = newGuess;
508                         for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
509                             r = (*pf)((gSortedLines[line])->name, (gSortedLines[guess])->name);
510                         }
511                         gCount++;
512                         if (r== 0)
513                             break;
514                         if (r < 0)
515                             hi = guess;
516                         else
517                             lo   = guess;
518                     }
519                 }
520             }
521             elapsedTime = timeGetTime() - startTime;
522             break;
523         }
524 
525 
526         if (opt_icu)
527         {
528             unsigned long startTime = timeGetTime();
529             UCollationResult  r = UCOL_EQUAL;
530             for (loops=0; loops<adj_loopCount; loops++) {
531 
532                 for (line=0; line < gNumFileLines; line++) {
533                     int lineLen  = -1;
534                     int guessLen = -1;
535                     if (opt_uselen) {
536                         lineLen = (gSortedLines[line])->len;
537                     }
538                     int hi      = gNumFileLines-1;
539                     int lo      = 0;
540                     int  guess = -1;
541                     for (;;) {
542                         int newGuess = (hi + lo) / 2;
543                         if (newGuess == guess)
544                             break;
545                         guess = newGuess;
546                         int ri = 0;
547                         if (opt_usekeys) {
548                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
549                                 ri = strcmp((gSortedLines[line])->icuSortKey, (gSortedLines[guess])->icuSortKey);
550                             }
551                             gCount++;
552                             r=UCOL_GREATER; if(ri<0) {r=UCOL_LESS;} else if (ri==0) {r=UCOL_EQUAL;}
553                         }
554                         else
555                         {
556                             if (opt_uselen) {
557                                 guessLen = (gSortedLines[guess])->len;
558                             }
559                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
560                                 r = ucol_strcoll(gCol, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
561                             }
562                             gCount++;
563                         }
564                         if (r== UCOL_EQUAL)
565                             break;
566                         if (r == UCOL_LESS)
567                             hi = guess;
568                         else
569                             lo   = guess;
570                     }
571                 }
572             }
573             elapsedTime = timeGetTime() - startTime;
574             break;
575         }
576 
577         if (opt_win)
578         {
579             unsigned long startTime = timeGetTime();
580             int r = 0;
581             for (loops=0; loops<adj_loopCount; loops++) {
582 
583                 for (line=0; line < gNumFileLines; line++) {
584                     int lineLen  = -1;
585                     int guessLen = -1;
586                     if (opt_uselen) {
587                         lineLen = (gSortedLines[line])->len;
588                     }
589                     int hi   = gNumFileLines-1;
590                     int lo   = 0;
591                     int  guess = -1;
592                     for (;;) {
593                         int newGuess = (hi + lo) / 2;
594                         if (newGuess == guess)
595                             break;
596                         guess = newGuess;
597                         if (opt_usekeys) {
598                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
599                                 r = strcmp((gSortedLines[line])->winSortKey, (gSortedLines[guess])->winSortKey);
600                             }
601                             gCount++;
602                             r+=2;
603                         }
604                         else
605                         {
606                             if (opt_uselen) {
607                                 guessLen = (gSortedLines[guess])->len;
608                             }
609                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
610                                 r = CompareStringW(gWinLCID, 0, (gSortedLines[line])->name, lineLen, (gSortedLines[guess])->name, guessLen);
611                             }
612                             if (r == 0) {
613                                 if (opt_terse == false) {
614                                     fprintf(stderr, "Error returned from Windows CompareStringW.\n");
615                                 }
616                                 exit(-1);
617                             }
618                             gCount++;
619                         }
620                         if (r== 2)   //  strings ==
621                             break;
622                         if (r == 1)  //  line < guess
623                             hi = guess;
624                         else         //  line > guess
625                             lo   = guess;
626                     }
627                 }
628             }
629             elapsedTime = timeGetTime() - startTime;
630             break;
631         }
632 
633         if (opt_unix)
634         {
635             unsigned long startTime = timeGetTime();
636             int r = 0;
637             for (loops=0; loops<adj_loopCount; loops++) {
638 
639                 for (line=0; line < gNumFileLines; line++) {
640                     int hi   = gNumFileLines-1;
641                     int lo   = 0;
642                     int  guess = -1;
643                     for (;;) {
644                         int newGuess = (hi + lo) / 2;
645                         if (newGuess == guess)
646                             break;
647                         guess = newGuess;
648                         if (opt_usekeys) {
649                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
650                                  r = strcmp((gSortedLines[line])->unixSortKey, (gSortedLines[guess])->unixSortKey);
651                             }
652                             gCount++;
653                         }
654                         else
655                         {
656                             for (iLoop=0; iLoop < opt_iLoopCount; iLoop++) {
657                                 r = strcoll((gSortedLines[line])->unixName, (gSortedLines[guess])->unixName);
658                             }
659                             errno = 0;
660                             if (errno != 0) {
661                                 fprintf(stderr, "Error %d returned from strcoll.\n", errno);
662                                 exit(-1);
663                             }
664                             gCount++;
665                         }
666                         if (r == 0)   //  strings ==
667                             break;
668                         if (r < 0)  //  line < guess
669                             hi = guess;
670                         else         //  line > guess
671                             lo   = guess;
672                     }
673                 }
674             }
675             elapsedTime = timeGetTime() - startTime;
676             break;
677         }
678         break;
679     }
680 
681     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
682     if (opt_terse == false) {
683         printf("binary search:  total # of string compares = %d\n", gCount);
684         printf("binary search:  compares per loop = %d\n", gCount / loops);
685         printf("binary search:  time per compare = %d ns\n", ns);
686     } else {
687         printf("%d, ", ns);
688     }
689 
690 }
691 
692 
693 
694 
695 //---------------------------------------------------------------------------------------
696 //
697 //   doQSort()    The quick sort timing test.  Uses the C library qsort function.
698 //
699 //---------------------------------------------------------------------------------------
doQSort()700 void doQSort() {
701     int i;
702     Line **sortBuf = new Line *[gNumFileLines];
703 
704     // Adjust loop count to compensate for file size.   QSort should be n log(n)
705     double dLoopCount = double(opt_loopCount) * 3000. / (log10((double)gNumFileLines) * double(gNumFileLines));
706     if (opt_usekeys) dLoopCount *= 5;
707     int adj_loopCount = int(dLoopCount);
708     if (adj_loopCount < 1) adj_loopCount = 1;
709 
710 
711     gCount = 0;
712     unsigned long startTime = timeGetTime();
713     if (opt_win && opt_usekeys) {
714         for (i=0; i<opt_loopCount; i++) {
715             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
716             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpK);
717         }
718     }
719 
720     else if (opt_win && opt_uselen) {
721         for (i=0; i<adj_loopCount; i++) {
722             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
723             qsort(sortBuf, gNumFileLines, sizeof(Line *), WinstrcmpL);
724         }
725     }
726 
727 
728     else if (opt_win && !opt_uselen) {
729         for (i=0; i<adj_loopCount; i++) {
730             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
731             qsort(sortBuf, gNumFileLines, sizeof(Line *), Winstrcmp);
732         }
733     }
734 
735     else if (opt_icu && opt_usekeys) {
736         for (i=0; i<adj_loopCount; i++) {
737             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
738             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpK);
739         }
740     }
741 
742     else if (opt_icu && opt_uselen) {
743         for (i=0; i<adj_loopCount; i++) {
744             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
745             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmpL);
746         }
747     }
748 
749 
750     else if (opt_icu && !opt_uselen) {
751         for (i=0; i<adj_loopCount; i++) {
752             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
753             qsort(sortBuf, gNumFileLines, sizeof(Line *), ICUstrcmp);
754         }
755     }
756 
757     else if (opt_unix && !opt_usekeys) {
758         for (i=0; i<adj_loopCount; i++) {
759             memcpy(sortBuf, gRandomLines, gNumFileLines * sizeof(Line *));
760             qsort(sortBuf, gNumFileLines, sizeof(Line *), UNIXstrcmp);
761         }
762     }
763 
764     unsigned long elapsedTime = timeGetTime() - startTime;
765     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
766     if (opt_terse == false) {
767         printf("qsort:  total # of string compares = %d\n", gCount);
768         printf("qsort:  time per compare = %d ns\n", ns);
769     } else {
770         printf("%d, ", ns);
771     }
772 }
773 
774 
775 
776 //---------------------------------------------------------------------------------------
777 //
778 //    doKeyHist()       Output a table of data for
779 //                        average sort key size vs. string length.
780 //
781 //---------------------------------------------------------------------------------------
doKeyHist()782 void doKeyHist() {
783     int     i;
784     int     maxLen = 0;
785 
786     // Find the maximum string length
787     for (i=0; i<gNumFileLines; i++) {
788         if (gFileLines[i].len > maxLen) maxLen = gFileLines[i].len;
789     }
790 
791     // Allocate arrays to hold the histogram data
792     int *accumulatedLen  = new int[maxLen+1];
793     int *numKeysOfSize   = new int[maxLen+1];
794     for (i=0; i<=maxLen; i++) {
795         accumulatedLen[i] = 0;
796         numKeysOfSize[i] = 0;
797     }
798 
799     // Fill the arrays...
800     for (i=0; i<gNumFileLines; i++) {
801         int len = gFileLines[i].len;
802         accumulatedLen[len] += strlen(gFileLines[i].icuSortKey);
803         numKeysOfSize[len] += 1;
804     }
805 
806     // And write out averages
807     printf("String Length,  Avg Key Length,  Avg Key Len per char\n");
808     for (i=1; i<=maxLen; i++) {
809         if (numKeysOfSize[i] > 0) {
810             printf("%d, %f, %f\n", i, (float)accumulatedLen[i] / (float)numKeysOfSize[i],
811                 (float)accumulatedLen[i] / (float)(numKeysOfSize[i] * i));
812         }
813     }
814     delete []accumulatedLen;
815     delete []numKeysOfSize ;
816 }
817 
818 //---------------------------------------------------------------------------------------
819 //
//    doForwardIterTest(UBool)       Forward iteration test.
//                                   Argument: true  = pass explicit string lengths,
//                                             false = pass null-terminated strings.
822 //
823 //---------------------------------------------------------------------------------------
doForwardIterTest(UBool haslen)824 void doForwardIterTest(UBool haslen) {
825     int count = 0;
826 
827     UErrorCode error = U_ZERO_ERROR;
828     printf("\n\nPerforming forward iteration performance test with ");
829 
830     if (haslen) {
831         printf("non-null terminated data -----------\n");
832     }
833     else {
834         printf("null terminated data -----------\n");
835     }
836     printf("performance test on strings from file -----------\n");
837 
838     UChar dummytext[] = {0, 0};
839     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
840     ucol_setText(iter, dummytext, 1, &error);
841 
842     gCount = 0;
843     unsigned long startTime = timeGetTime();
844     while (count < opt_loopCount) {
845         int linecount = 0;
846         while (linecount < gNumFileLines) {
847             UChar *str = gFileLines[linecount].name;
848             int strlen = haslen?gFileLines[linecount].len:-1;
849             ucol_setText(iter, str, strlen, &error);
850             while (ucol_next(iter, &error) != UCOL_NULLORDER) {
851                 gCount++;
852             }
853 
854             linecount ++;
855         }
856         count ++;
857     }
858     unsigned long elapsedTime = timeGetTime() - startTime;
859     printf("elapsedTime %ld\n", elapsedTime);
860 
861     // empty loop recalculation
862     count = 0;
863     startTime = timeGetTime();
864     while (count < opt_loopCount) {
865         int linecount = 0;
866         while (linecount < gNumFileLines) {
867             UChar *str = gFileLines[linecount].name;
868             int strlen = haslen?gFileLines[linecount].len:-1;
869             ucol_setText(iter, str, strlen, &error);
870             linecount ++;
871         }
872         count ++;
873     }
874     elapsedTime -= (timeGetTime() - startTime);
875     printf("elapsedTime %ld\n", elapsedTime);
876 
877     ucol_closeElements(iter);
878 
879     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
880     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
881                                                                 opt_loopCount);
882     printf("Average time per ucol_next() nano seconds %d\n", ns);
883 
884     printf("performance test on skipped-5 concatenated strings from file -----------\n");
885 
886     UChar *str;
887     int    strlen = 0;
888     // appending all the strings
889     int linecount = 0;
890     while (linecount < gNumFileLines) {
891         strlen += haslen?gFileLines[linecount].len:
892                                       u_strlen(gFileLines[linecount].name);
893         linecount ++;
894     }
895     str = (UChar *)malloc(sizeof(UChar) * strlen);
896     int strindex = 0;
897     linecount = 0;
898     while (strindex < strlen) {
899         int len = 0;
900         len += haslen?gFileLines[linecount].len:
901                                       u_strlen(gFileLines[linecount].name);
902         memcpy(str + strindex, gFileLines[linecount].name,
903                sizeof(UChar) * len);
904         strindex += len;
905         linecount ++;
906     }
907 
908     printf("Total size of strings %d\n", strlen);
909 
910     gCount = 0;
911     count  = 0;
912 
913     if (!haslen) {
914         strlen = -1;
915     }
916     iter = ucol_openElements(gCol, str, strlen, &error);
917     if (!haslen) {
918         strlen = u_strlen(str);
919     }
920     strlen -= 5; // any left over characters are not iterated,
921                  // this is to ensure the backwards and forwards iterators
922                  // gets the same position
923     startTime = timeGetTime();
924     while (count < opt_loopCount) {
925         int count5 = 5;
926         strindex = 0;
927         ucol_setOffset(iter, strindex, &error);
928         while (true) {
929             if (ucol_next(iter, &error) == UCOL_NULLORDER) {
930                 break;
931             }
932             gCount++;
933             count5 --;
934             if (count5 == 0) {
935                 strindex += 10;
936                 if (strindex > strlen) {
937                     break;
938                 }
939                 ucol_setOffset(iter, strindex, &error);
940                 count5 = 5;
941             }
942         }
943         count ++;
944     }
945 
946     elapsedTime = timeGetTime() - startTime;
947     printf("elapsedTime %ld\n", elapsedTime);
948 
949     // empty loop recalculation
950     int tempgCount = 0;
951     count = 0;
952     startTime = timeGetTime();
953     while (count < opt_loopCount) {
954         int count5 = 5;
955         strindex = 0;
956         ucol_setOffset(iter, strindex, &error);
957         while (true) {
958             tempgCount ++;
959             count5 --;
960             if (count5 == 0) {
961                 strindex += 10;
962                 if (strindex > strlen) {
963                     break;
964                 }
965                 ucol_setOffset(iter, strindex, &error);
966                 count5 = 5;
967             }
968         }
969         count ++;
970     }
971     elapsedTime -= (timeGetTime() - startTime);
972     printf("elapsedTime %ld\n", elapsedTime);
973 
974     ucol_closeElements(iter);
975 
976     printf("gCount %d\n", gCount);
977     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
978     printf("Average time per ucol_next() nano seconds %d\n", ns);
979 }
980 
981 //---------------------------------------------------------------------------------------
982 //
//    doBackwardIterTest(UBool)      Backwards iteration test.
//                                   Argument: true  = pass explicit string lengths,
//                                             false = pass null-terminated strings.
985 //
986 //---------------------------------------------------------------------------------------
doBackwardIterTest(UBool haslen)987 void doBackwardIterTest(UBool haslen) {
988     int count = 0;
989     UErrorCode error = U_ZERO_ERROR;
990     printf("\n\nPerforming backward iteration performance test with ");
991 
992     if (haslen) {
993         printf("non-null terminated data -----------\n");
994     }
995     else {
996         printf("null terminated data -----------\n");
997     }
998 
999     printf("performance test on strings from file -----------\n");
1000 
1001     UCollationElements *iter = ucol_openElements(gCol, NULL, 0, &error);
1002     UChar dummytext[] = {0, 0};
1003     ucol_setText(iter, dummytext, 1, &error);
1004 
1005     gCount = 0;
1006     unsigned long startTime = timeGetTime();
1007     while (count < opt_loopCount) {
1008         int linecount = 0;
1009         while (linecount < gNumFileLines) {
1010             UChar *str = gFileLines[linecount].name;
1011             int strlen = haslen?gFileLines[linecount].len:-1;
1012             ucol_setText(iter, str, strlen, &error);
1013             while (ucol_previous(iter, &error) != UCOL_NULLORDER) {
1014                 gCount ++;
1015             }
1016 
1017             linecount ++;
1018         }
1019         count ++;
1020     }
1021     unsigned long elapsedTime = timeGetTime() - startTime;
1022 
1023     printf("elapsedTime %ld\n", elapsedTime);
1024 
1025     // empty loop recalculation
1026     count = 0;
1027     startTime = timeGetTime();
1028     while (count < opt_loopCount) {
1029         int linecount = 0;
1030         while (linecount < gNumFileLines) {
1031             UChar *str = gFileLines[linecount].name;
1032             int strlen = haslen?gFileLines[linecount].len:-1;
1033             ucol_setText(iter, str, strlen, &error);
1034             linecount ++;
1035         }
1036         count ++;
1037     }
1038     elapsedTime -= (timeGetTime() - startTime);
1039 
1040     printf("elapsedTime %ld\n", elapsedTime);
1041     ucol_closeElements(iter);
1042 
1043     int ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1044     printf("Total number of strings compared %d in %d loops\n", gNumFileLines,
1045                                                                 opt_loopCount);
1046     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1047 
1048     printf("performance test on skipped-5 concatenated strings from file -----------\n");
1049 
1050     UChar *str;
1051     int    strlen = 0;
1052     // appending all the strings
1053     int linecount = 0;
1054     while (linecount < gNumFileLines) {
1055         strlen += haslen?gFileLines[linecount].len:
1056                                       u_strlen(gFileLines[linecount].name);
1057         linecount ++;
1058     }
1059     str = (UChar *)malloc(sizeof(UChar) * strlen);
1060     int strindex = 0;
1061     linecount = 0;
1062     while (strindex < strlen) {
1063         int len = 0;
1064         len += haslen?gFileLines[linecount].len:
1065                                       u_strlen(gFileLines[linecount].name);
1066         memcpy(str + strindex, gFileLines[linecount].name,
1067                sizeof(UChar) * len);
1068         strindex += len;
1069         linecount ++;
1070     }
1071 
1072     printf("Total size of strings %d\n", strlen);
1073 
1074     gCount = 0;
1075     count  = 0;
1076 
1077     if (!haslen) {
1078         strlen = -1;
1079     }
1080 
1081     iter = ucol_openElements(gCol, str, strlen, &error);
1082     if (!haslen) {
1083         strlen = u_strlen(str);
1084     }
1085 
1086     startTime = timeGetTime();
1087     while (count < opt_loopCount) {
1088         int count5 = 5;
1089         strindex = 5;
1090         ucol_setOffset(iter, strindex, &error);
1091         while (true) {
1092             if (ucol_previous(iter, &error) == UCOL_NULLORDER) {
1093                 break;
1094             }
1095              gCount ++;
1096              count5 --;
1097              if (count5 == 0) {
1098                  strindex += 10;
1099                  if (strindex > strlen) {
1100                     break;
1101                  }
1102                  ucol_setOffset(iter, strindex, &error);
1103                  count5 = 5;
1104              }
1105         }
1106         count ++;
1107     }
1108 
1109     elapsedTime = timeGetTime() - startTime;
1110     printf("elapsedTime %ld\n", elapsedTime);
1111 
1112     // empty loop recalculation
1113     count = 0;
1114     int tempgCount = 0;
1115     startTime = timeGetTime();
1116     while (count < opt_loopCount) {
1117         int count5 = 5;
1118         strindex = 5;
1119         ucol_setOffset(iter, strindex, &error);
1120         while (true) {
1121              tempgCount ++;
1122              count5 --;
1123              if (count5 == 0) {
1124                  strindex += 10;
1125                  if (strindex > strlen) {
1126                     break;
1127                  }
1128                  ucol_setOffset(iter, strindex, &error);
1129                  count5 = 5;
1130              }
1131         }
1132         count ++;
1133     }
1134     elapsedTime -= (timeGetTime() - startTime);
1135     printf("elapsedTime %ld\n", elapsedTime);
1136     ucol_closeElements(iter);
1137 
1138     printf("gCount %d\n", gCount);
1139     ns = (int)(float(1000000) * (float)elapsedTime / (float)gCount);
1140     printf("Average time per ucol_previous() nano seconds %d\n", ns);
1141 }
1142 
1143 //---------------------------------------------------------------------------------------
1144 //
1145 //    doIterTest()       Iteration test
1146 //
1147 //---------------------------------------------------------------------------------------
doIterTest()1148 void doIterTest() {
1149     doForwardIterTest(opt_uselen);
1150     doBackwardIterTest(opt_uselen);
1151 }
1152 
1153 
1154 //----------------------------------------------------------------------------------------
1155 //
1156 //   UnixConvert   -- Convert the lines of the file to the encoding for UNIX
1157 //                    Since it appears that Unicode support is going in the general
1158 //                    direction of the use of UTF-8 locales, that is the approach
1159 //                    that is used here.
1160 //
1161 //----------------------------------------------------------------------------------------
UnixConvert()1162 void  UnixConvert() {
1163     int    line;
1164 
1165     UConverter   *cvrtr;    // An ICU code page converter.
1166     UErrorCode    status = U_ZERO_ERROR;
1167 
1168 
1169     cvrtr = ucnv_open("utf-8", &status);    // we are just doing UTF-8 locales for now.
1170     if (U_FAILURE(status)) {
1171         fprintf(stderr, "ICU Converter open failed.: %s\n", u_errorName(status));
1172         exit(-1);
1173     }
1174 
1175     for (line=0; line < gNumFileLines; line++) {
1176         int sizeNeeded = ucnv_fromUChars(cvrtr,
1177                                          0,            // ptr to target buffer.
1178                                          0,            // length of target buffer.
1179                                          gFileLines[line].name,
1180                                          -1,           //  source is null terminated
1181                                          &status);
1182         if (status != U_BUFFER_OVERFLOW_ERROR && status != U_ZERO_ERROR) {
1183             //fprintf(stderr, "Conversion from Unicode, something is wrong.\n");
1184             //exit(-1);
1185         }
1186         status = U_ZERO_ERROR;
1187         gFileLines[line].unixName = new char[sizeNeeded+1];
1188         sizeNeeded = ucnv_fromUChars(cvrtr,
1189                                          gFileLines[line].unixName, // ptr to target buffer.
1190                                          sizeNeeded+1, // length of target buffer.
1191                                          gFileLines[line].name,
1192                                          -1,           //  source is null terminated
1193                                          &status);
1194         if (U_FAILURE(status)) {
1195             fprintf(stderr, "ICU Conversion Failed.: %d\n", status);
1196             exit(-1);
1197         }
1198         gFileLines[line].unixName[sizeNeeded] = 0;
1199     };
1200     ucnv_close(cvrtr);
1201 }
1202 
1203 
1204 //----------------------------------------------------------------------------------------
1205 //
1206 //  class UCharFile   Class to hide all the gorp to read a file in
1207 //                    and produce a stream of UChars.
1208 //
1209 //----------------------------------------------------------------------------------------
1210 class UCharFile {
1211 public:
1212     UCharFile(const char *fileName);
1213     ~UCharFile();
1214     UChar   get();
eof()1215     UBool   eof() {return fEof;};
error()1216     UBool   error() {return fError;};
1217 
1218 private:
UCharFile(const UCharFile &)1219     UCharFile (const UCharFile & /*other*/) {};                         // No copy constructor.
operator =(const UCharFile &)1220     UCharFile & operator = (const UCharFile &/*other*/) {return *this;};   // No assignment op
1221 
1222     FILE         *fFile;
1223     const char   *fName;
1224     UBool        fEof;
1225     UBool        fError;
1226     UChar        fPending2ndSurrogate;
1227 
1228     enum {UTF16LE, UTF16BE, UTF8} fEncoding;
1229 };
1230 
UCharFile(const char * fileName)1231 UCharFile::UCharFile(const char * fileName) {
1232     fEof                 = false;
1233     fError               = false;
1234     fName                = fileName;
1235     fFile                = fopen(fName, "rb");
1236     fPending2ndSurrogate = 0;
1237     if (fFile == NULL) {
1238         fprintf(stderr, "Can not open file \"%s\"\n", opt_fName);
1239         fError = true;
1240         return;
1241     }
1242     //
1243     //  Look for the byte order mark at the start of the file.
1244     //
1245     int BOMC1, BOMC2, BOMC3;
1246     BOMC1 = fgetc(fFile);
1247     BOMC2 = fgetc(fFile);
1248 
1249     if (BOMC1 == 0xff && BOMC2 == 0xfe) {
1250         fEncoding = UTF16LE; }
1251     else if (BOMC1 == 0xfe && BOMC2 == 0xff) {
1252         fEncoding = UTF16BE; }
1253     else if (BOMC1 == 0xEF && BOMC2 == 0xBB && (BOMC3 = fgetc(fFile)) == 0xBF ) {
1254         fEncoding = UTF8; }
1255     else
1256     {
1257         fprintf(stderr, "collperf:  file \"%s\" encoding must be UTF-8 or UTF-16, and "
1258             "must include a BOM.\n", fileName);
1259         fError = true;
1260         return;
1261     }
1262 }
1263 
1264 
~UCharFile()1265 UCharFile::~UCharFile() {
1266     fclose(fFile);
1267 }
1268 
1269 
1270 
get()1271 UChar UCharFile::get() {
1272     UChar   c;
1273     switch (fEncoding) {
1274     case UTF16LE:
1275         {
1276             int  cL, cH;
1277             cL = fgetc(fFile);
1278             cH = fgetc(fFile);
1279             c  = cL  | (cH << 8);
1280             if (cH == EOF) {
1281                 c   = 0;
1282                 fEof = true;
1283             }
1284             break;
1285         }
1286     case UTF16BE:
1287         {
1288             int  cL, cH;
1289             cH = fgetc(fFile);
1290             cL = fgetc(fFile);
1291             c  = cL  | (cH << 8);
1292             if (cL == EOF) {
1293                 c   = 0;
1294                 fEof = true;
1295             }
1296             break;
1297         }
1298     case UTF8:
1299         {
1300             if (fPending2ndSurrogate != 0) {
1301                 c = fPending2ndSurrogate;
1302                 fPending2ndSurrogate = 0;
1303                 break;
1304             }
1305 
1306             int ch = fgetc(fFile);   // Note:  c and ch are separate cause eof test doesn't work on UChar type.
1307             if (ch == EOF) {
1308                 c = 0;
1309                 fEof = true;
1310                 break;
1311             }
1312 
1313             if (ch <= 0x7f) {
1314                 // It's ascii.  No further utf-8 conversion.
1315                 c = ch;
1316                 break;
1317             }
1318 
1319             // Figure out the lenght of the char and read the rest of the bytes
1320             //   into a temp array.
1321             int nBytes;
1322             if (ch >= 0xF0) {nBytes=4;}
1323             else if (ch >= 0xE0) {nBytes=3;}
1324             else if (ch >= 0xC0) {nBytes=2;}
1325             else {
1326                 fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1327                 fError = true;
1328                 return 0;
1329             }
1330 
1331             unsigned char  bytes[10];
1332             bytes[0] = (unsigned char)ch;
1333             int i;
1334             for (i=1; i<nBytes; i++) {
1335                 bytes[i] = fgetc(fFile);
1336                 if (bytes[i] < 0x80 || bytes[i] >= 0xc0) {
1337                     fprintf(stderr, "utf-8 encoded file contains corrupt data.\n");
1338                     fError = true;
1339                     return 0;
1340                 }
1341             }
1342 
1343             // Convert the bytes from the temp array to a Unicode char.
1344             i = 0;
1345             uint32_t  cp;
1346             U8_NEXT_UNSAFE(bytes, i, cp);
1347             c = (UChar)cp;
1348 
1349             if (cp >= 0x10000) {
1350                 // The code point needs to be broken up into a utf-16 surrogate pair.
1351                 //  Process first half this time through the main loop, and
1352                 //   remember the other half for the next time through.
1353                 UChar utf16Buf[3];
1354                 i = 0;
1355                 UTF16_APPEND_CHAR_UNSAFE(utf16Buf, i, cp);
1356                 fPending2ndSurrogate = utf16Buf[1];
1357                 c = utf16Buf[0];
1358             }
1359             break;
1360         };
1361     default:
1362         c = 0xFFFD; /* Error, unspecified codepage*/
1363         fprintf(stderr, "UCharFile: Error: unknown fEncoding\n");
1364         exit(1);
1365     }
1366     return c;
1367 }
1368 
1369 //----------------------------------------------------------------------------------------
1370 //
1371 //   openRulesCollator  - Command line specified a rules file.  Read it in
1372 //                        and open a collator with it.
1373 //
1374 //----------------------------------------------------------------------------------------
openRulesCollator()1375 UCollator *openRulesCollator() {
1376     UCharFile f(opt_rules);
1377     if (f.error()) {
1378         return 0;
1379     }
1380 
1381     int  bufLen = 10000;
1382     UChar *buf = (UChar *)malloc(bufLen * sizeof(UChar));
1383     UChar *tmp;
1384     int i = 0;
1385 
1386     for(;;) {
1387         buf[i] = f.get();
1388         if (f.eof()) {
1389             break;
1390         }
1391         if (f.error()) {
1392             return 0;
1393         }
1394         i++;
1395         if (i >= bufLen) {
1396             tmp = buf;
1397             bufLen += 10000;
1398             buf = (UChar *)realloc(buf, bufLen);
1399             if (buf == NULL) {
1400                 free(tmp);
1401                 return 0;
1402             }
1403         }
1404     }
1405     buf[i] = 0;
1406 
1407     UErrorCode    status = U_ZERO_ERROR;
1408     UCollator *coll = ucol_openRules(buf, u_strlen(buf), UCOL_OFF,
1409                                          UCOL_DEFAULT_STRENGTH, NULL, &status);
1410     if (U_FAILURE(status)) {
1411         fprintf(stderr, "ICU ucol_openRules() open failed.: %d\n", status);
1412         return 0;
1413     }
1414     free(buf);
1415     return coll;
1416 }
1417 
1418 
1419 
1420 
1421 
1422 //----------------------------------------------------------------------------------------
1423 //
1424 //    Main   --  process command line, read in and pre-process the test file,
1425 //                 call other functions to do the actual tests.
1426 //
1427 //----------------------------------------------------------------------------------------
main(int argc,const char ** argv)1428 int main(int argc, const char** argv) {
1429     if (ProcessOptions(argc, argv, opts) != true || opt_help || opt_fName == 0) {
1430         printf(gUsageString);
1431         exit (1);
1432     }
1433 
1434     // Make sure that we've only got one API selected.
1435     if (opt_unix || opt_win) opt_icu = false;
1436     if (opt_unix) opt_win = false;
1437 
1438     //
1439     //  Set up an ICU collator
1440     //
1441     UErrorCode          status = U_ZERO_ERROR;
1442 
1443     if (opt_rules != 0) {
1444         gCol = openRulesCollator();
1445         if (gCol == 0) {return -1;}
1446     }
1447     else {
1448         gCol = ucol_open(opt_locale, &status);
1449         if (U_FAILURE(status)) {
1450             fprintf(stderr, "Collator creation failed.: %d\n", status);
1451             return -1;
1452         }
1453     }
1454     if (status==U_USING_DEFAULT_WARNING && opt_terse==false) {
1455         fprintf(stderr, "Warning, U_USING_DEFAULT_WARNING for %s\n", opt_locale);
1456     }
1457     if (status==U_USING_FALLBACK_WARNING && opt_terse==false) {
1458         fprintf(stderr, "Warning, U_USING_FALLBACK_ERROR for %s\n", opt_locale);
1459     }
1460 
1461     if (opt_norm) {
1462         ucol_setAttribute(gCol, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
1463     }
1464     if (opt_french && opt_frenchoff) {
1465         fprintf(stderr, "collperf:  Error, specified both -french and -frenchoff options.");
1466         exit(-1);
1467     }
1468     if (opt_french) {
1469         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_ON, &status);
1470     }
1471     if (opt_frenchoff) {
1472         ucol_setAttribute(gCol, UCOL_FRENCH_COLLATION, UCOL_OFF, &status);
1473     }
1474     if (opt_lower) {
1475         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_LOWER_FIRST, &status);
1476     }
1477     if (opt_upper) {
1478         ucol_setAttribute(gCol, UCOL_CASE_FIRST, UCOL_UPPER_FIRST, &status);
1479     }
1480     if (opt_case) {
1481         ucol_setAttribute(gCol, UCOL_CASE_LEVEL, UCOL_ON, &status);
1482     }
1483     if (opt_shifted) {
1484         ucol_setAttribute(gCol, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, &status);
1485     }
1486     if (opt_level != 0) {
1487         switch (opt_level) {
1488         case 1:
1489             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_PRIMARY, &status);
1490             break;
1491         case 2:
1492             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_SECONDARY, &status);
1493             break;
1494         case 3:
1495             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_TERTIARY, &status);
1496             break;
1497         case 4:
1498             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_QUATERNARY, &status);
1499             break;
1500         case 5:
1501             ucol_setAttribute(gCol, UCOL_STRENGTH, UCOL_IDENTICAL, &status);
1502             break;
1503         default:
1504             fprintf(stderr, "-level param must be between 1 and 5\n");
1505             exit(-1);
1506         }
1507     }
1508 
1509     if (U_FAILURE(status)) {
1510         fprintf(stderr, "Collator attribute setting failed.: %d\n", status);
1511         return -1;
1512     }
1513 
1514 
1515     //
1516     //  Set up a Windows LCID
1517     //
1518     if (opt_langid != 0) {
1519         gWinLCID = MAKELCID(opt_langid, SORT_DEFAULT);
1520     }
1521     else {
1522         gWinLCID = uloc_getLCID(opt_locale);
1523     }
1524 
1525 
1526     //
1527     //  Set the UNIX locale
1528     //
1529     if (opt_unix) {
1530         if (setlocale(LC_ALL, opt_locale) == 0) {
1531             fprintf(stderr, "setlocale(LC_ALL, %s) failed.\n", opt_locale);
1532             exit(-1);
1533         }
1534     }
1535 
1536     // Read in  the input file.
1537     //   File assumed to be utf-16.
1538     //   Lines go onto heap buffers.  Global index array to line starts is created.
1539     //   Lines themselves are null terminated.
1540     //
1541 
1542     UCharFile f(opt_fName);
1543     if (f.error()) {
1544         exit(-1);
1545     }
1546 
1547     const int MAXLINES = 100000;
1548     gFileLines = new Line[MAXLINES];
1549     UChar buf[1024];
1550     int   column = 0;
1551 
1552     //  Read the file, split into lines, and save in memory.
1553     //  Loop runs once per utf-16 value from the input file,
1554     //    (The number of bytes read from file per loop iteration depends on external encoding.)
1555     for (;;) {
1556 
1557         UChar c = f.get();
1558         if (f.error()){
1559             exit(-1);
1560         }
1561 
1562 
1563         // We now have a good UTF-16 value in c.
1564 
1565         // Watch for CR, LF, EOF; these finish off a line.
1566         if (c == 0xd) {
1567             continue;
1568         }
1569 
1570         if (f.eof() || c == 0x0a || c==0x2028) {  // Unipad inserts 2028 line separators!
1571             buf[column++] = 0;
1572             if (column > 1) {
1573                 gFileLines[gNumFileLines].name  = new UChar[column];
1574                 gFileLines[gNumFileLines].len   = column-1;
1575                 memcpy(gFileLines[gNumFileLines].name, buf, column * sizeof(UChar));
1576                 gNumFileLines++;
1577                 column = 0;
1578                 if (gNumFileLines >= MAXLINES) {
1579                     fprintf(stderr, "File too big.  Max number of lines is %d\n", MAXLINES);
1580                     exit(-1);
1581                 }
1582 
1583             }
1584             if (c == 0xa || c == 0x2028)
1585                 continue;
1586             else
1587                 break;  // EOF
1588         }
1589         buf[column++] = c;
1590         if (column >= 1023)
1591         {
1592             static UBool warnFlag = true;
1593             if (warnFlag) {
1594                 fprintf(stderr, "Warning - file line longer than 1023 chars truncated.\n");
1595                 warnFlag = false;
1596             }
1597             column--;
1598         }
1599     }
1600 
1601     if (opt_terse == false) {
1602         printf("file \"%s\", %d lines.\n", opt_fName, gNumFileLines);
1603     }
1604 
1605 
1606     // Convert the lines to the UNIX encoding.
1607     if (opt_unix) {
1608         UnixConvert();
1609     }
1610 
1611     //
1612     //  Pre-compute ICU sort keys for the lines of the file.
1613     //
1614     int line;
1615     int32_t t;
1616 
1617     for (line=0; line<gNumFileLines; line++) {
1618          t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)buf, sizeof(buf));
1619          gFileLines[line].icuSortKey  = new char[t];
1620 
1621          if (t > (int32_t)sizeof(buf)) {
1622              t = ucol_getSortKey(gCol, gFileLines[line].name, -1, (unsigned char *)gFileLines[line].icuSortKey , t);
1623          }
1624          else
1625          {
1626              memcpy(gFileLines[line].icuSortKey, buf, t);
1627          }
1628     }
1629 
1630 
1631 
1632     //
1633     //  Pre-compute Windows sort keys for the lines of the file.
1634     //
1635     for (line=0; line<gNumFileLines; line++) {
1636          t=LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, buf, sizeof(buf));
1637          gFileLines[line].winSortKey  = new char[t];
1638          if (t > (int32_t)sizeof(buf)) {
1639              t = LCMapStringW(gWinLCID, LCMAP_SORTKEY, gFileLines[line].name, -1, (UChar *)(gFileLines[line].winSortKey), t);
1640          }
1641          else
1642          {
1643              memcpy(gFileLines[line].winSortKey, buf, t);
1644          }
1645     }
1646 
1647     //
1648     //  Pre-compute UNIX sort keys for the lines of the file.
1649     //
1650     if (opt_unix) {
1651         for (line=0; line<gNumFileLines; line++) {
1652             t=strxfrm((char *)buf,  gFileLines[line].unixName,  sizeof(buf));
1653             gFileLines[line].unixSortKey  = new char[t];
1654             if (t > (int32_t)sizeof(buf)) {
1655                 t = strxfrm(gFileLines[line].unixSortKey,  gFileLines[line].unixName,  sizeof(buf));
1656             }
1657             else
1658             {
1659                 memcpy(gFileLines[line].unixSortKey, buf, t);
1660             }
1661         }
1662     }
1663 
1664 
1665     //
1666     //  Dump file lines, CEs, Sort Keys if requested.
1667     //
1668     if (opt_dump) {
1669         int  i;
1670         for (line=0; line<gNumFileLines; line++) {
1671             for (i=0;;i++) {
1672                 UChar  c = gFileLines[line].name[i];
1673                 if (c == 0)
1674                     break;
1675                 if (c < 0x20 || c > 0x7e) {
1676                     printf("\\u%.4x", c);
1677                 }
1678                 else {
1679                     printf("%c", c);
1680                 }
1681             }
1682             printf("\n");
1683 
1684             printf("   CEs: ");
1685             UCollationElements *CEiter = ucol_openElements(gCol, gFileLines[line].name, -1, &status);
1686             int32_t ce;
1687             i = 0;
1688             for (;;) {
1689                 ce = ucol_next(CEiter, &status);
1690                 if (ce == UCOL_NULLORDER) {
1691                     break;
1692                 }
1693                 printf(" %.8x", ce);
1694                 if (++i > 8) {
1695                     printf("\n        ");
1696                     i = 0;
1697                 }
1698             }
1699             printf("\n");
1700             ucol_closeElements(CEiter);
1701 
1702 
1703             printf("   ICU Sort Key: ");
1704             for (i=0; ; i++) {
1705                 unsigned char c = gFileLines[line].icuSortKey[i];
1706                 printf("%02x ", c);
1707                 if (c == 0) {
1708                     break;
1709                 }
1710                 if (i > 0 && i % 20 == 0) {
1711                     printf("\n                 ");
1712                 }
1713            }
1714             printf("\n");
1715         }
1716     }
1717 
1718 
1719     //
1720     //  Pre-sort the lines.
1721     //
1722     int i;
1723     gSortedLines = new Line *[gNumFileLines];
1724     for (i=0; i<gNumFileLines; i++) {
1725         gSortedLines[i] = &gFileLines[i];
1726     }
1727 
1728     if (opt_win) {
1729         qsort(gSortedLines, gNumFileLines, sizeof(Line *), Winstrcmp);
1730     }
1731     else if (opt_unix) {
1732         qsort(gSortedLines, gNumFileLines, sizeof(Line *), UNIXstrcmp);
1733     }
1734     else   /* ICU */
1735     {
1736         qsort(gSortedLines, gNumFileLines, sizeof(Line *), ICUstrcmp);
1737     }
1738 
1739 
1740     //
1741     //  Make up a randomized order, will be used for sorting tests.
1742     //
1743     gRandomLines = new Line *[gNumFileLines];
1744     for (i=0; i<gNumFileLines; i++) {
1745         gRandomLines[i] = &gFileLines[i];
1746     }
1747     qsort(gRandomLines, gNumFileLines, sizeof(Line *), ICURandomCmp);
1748 
1749 
1750 
1751 
1752     //
1753     //  We've got the file read into memory.  Go do something with it.
1754     //
1755 
1756     if (opt_qsort)     doQSort();
1757     if (opt_binsearch) doBinarySearch();
1758     if (opt_keygen)    doKeyGen();
1759     if (opt_keyhist)   doKeyHist();
1760     if (opt_itertest)  doIterTest();
1761 
1762     return 0;
1763 
1764 }
1765