1 /*
2 **********************************************************************
3 *   Copyright (C) 2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  unisetperf.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2007jan31
12 *   created by: Markus Scherer
13 */
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include "unicode/uperf.h"
19 #include "unicode/uniset.h"
20 #include "unicode/unistr.h"
21 #include "uoptions.h"
22 #include "cmemory.h" // for UPRV_LENGTHOF
23 
24 // Command-line options specific to unisetperf.
25 // Options do not have abbreviations: Force readable command lines.
26 // (Using U+0001 for abbreviation characters.)
27 enum {
28     SET_PATTERN,
29     FAST_TYPE,
30     UNISETPERF_OPTIONS_COUNT
31 };
32 
33 static UOption options[UNISETPERF_OPTIONS_COUNT]={
34     UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG),
35     UOPTION_DEF("type",    '\x01', UOPT_REQUIRES_ARG)
36 };
37 
// Help text for the unisetperf-specific options; handed to the UPerfTest
// constructor. One string literal per option.
static const char *const unisetperf_usage =
    "\t--pattern   UnicodeSet pattern for instantiation.\n\t            Default: [:ID_Continue:]\n"
    "\t--type      Type of UnicodeSet: slow fast\n\t            Default: slow\n";
43 
44 // Test object with setup data.
45 class UnicodeSetPerformanceTest : public UPerfTest {
46 public:
UnicodeSetPerformanceTest(int32_t argc,const char * argv[],UErrorCode & status)47     UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
48             : UPerfTest(argc, argv, options, UPRV_LENGTHOF(options), unisetperf_usage, status),
49               utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
50         if (U_SUCCESS(status)) {
51             UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
52             set.applyPattern(pattern, status);
53             prefrozen=set;
54             if(0==strcmp(options[FAST_TYPE].value, "fast")) {
55                 set.freeze();
56             }
57 
58             int32_t inputLength;
59             UPerfTest::getBuffer(inputLength, status);
60             if(U_SUCCESS(status) && inputLength>0) {
61                 countInputCodePoints = u_countChar32(buffer, bufferLen);
62 
63                 countSpans();
64 
65                 // Preflight the UTF-8 length and allocate utf8.
66                 u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
67                 if(status==U_BUFFER_OVERFLOW_ERROR) {
68                     utf8=(char *)malloc(utf8Length);
69                     if(utf8!=NULL) {
70                         status=U_ZERO_ERROR;
71                         u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
72                     } else {
73                         status=U_MEMORY_ALLOCATION_ERROR;
74                     }
75                 }
76 
77                 if(verbose) {
78                     printf("code points:%ld  len16:%ld  len8:%ld  spans:%ld  "
79                            "cp/span:%.3g  UChar/span:%.3g  B/span:%.3g  B/cp:%.3g\n",
80                            (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
81                            (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
82                            (double)utf8Length/countInputCodePoints);
83                 }
84             }
85         }
86     }
87 
88     virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
89 
90     // Count spans of characters that are in the set,
91     // and spans of characters that are not in the set.
92     // If the very first character is in the set, then one additional
93     // not-span is counted.
countSpans()94     void countSpans() {
95         const UChar *s=getBuffer();
96         int32_t length=getBufferLen();
97         int32_t i=0;
98         UBool tf=FALSE;
99         while(i<length) {
100             i=span(s, length, i, tf);
101             tf=(UBool)(!tf);
102             ++spanCount;
103         }
104     }
span(const UChar * s,int32_t length,int32_t start,UBool tf) const105     int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const {
106         UChar32 c;
107         int32_t prev;
108         while((prev=start)<length) {
109             U16_NEXT(s, start, length, c);
110             if(tf!=set.contains(c)) {
111                 break;
112             }
113         }
114         return prev;
115     }
116 
getBuffer() const117     const UChar *getBuffer() const { return buffer; }
getBufferLen() const118     int32_t getBufferLen() const { return bufferLen; }
119 
120     char *utf8;
121     int32_t utf8Length;
122 
123     // Number of code points in the input text.
124     int32_t countInputCodePoints;
125     int32_t spanCount;
126 
127     UnicodeSet set;
128     UnicodeSet prefrozen;
129 };
130 
131 // Performance test function object.
132 class Command : public UPerfFunction {
133 protected:
Command(const UnicodeSetPerformanceTest & testcase)134     Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {}
135 
136 public:
~Command()137     virtual ~Command() {}
138 
139     // virtual void call(UErrorCode* pErrorCode) { ... }
140 
getOperationsPerIteration()141     virtual long getOperationsPerIteration() {
142         // Number of code points tested:
143         // Input code points, plus one for the end of each span except the last span.
144         return testcase.countInputCodePoints+testcase.spanCount-1;
145     }
146 
getEventsPerIteration()147     virtual long getEventsPerIteration() {
148         return testcase.spanCount;
149     }
150 
151     const UnicodeSetPerformanceTest &testcase;
152 };
153 
154 class Contains : public Command {
155 protected:
Contains(const UnicodeSetPerformanceTest & testcase)156     Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
157         // Verify that the frozen set is equal to the unfrozen one.
158         UnicodeSet set;
159         UChar32 c;
160 
161         for(c=0; c<=0x10ffff; ++c) {
162             if(testcase.set.contains(c)) {
163                 set.add(c);
164             }
165         }
166         if(set!=testcase.set) {
167             fprintf(stderr, "error: frozen set != original!\n");
168         }
169     }
170 public:
get(const UnicodeSetPerformanceTest & testcase)171     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
172         return new Contains(testcase);
173     }
call(UErrorCode * pErrorCode)174     virtual void call(UErrorCode* pErrorCode) {
175         const UnicodeSet &set=testcase.set;
176         const UChar *s=testcase.getBuffer();
177         int32_t length=testcase.getBufferLen();
178         int32_t count=0;
179         int32_t i=0;
180         UBool tf=FALSE;
181         while(i<length) {
182             i+=span(set, s+i, length-i, tf);
183             tf=(UBool)(!tf);
184             ++count;
185         }
186         if(count!=testcase.spanCount) {
187             fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
188                     (long)count, (long)testcase.spanCount);
189         }
190     }
span(const UnicodeSet & set,const UChar * s,int32_t length,UBool tf)191     static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
192         UChar32 c;
193         int32_t start=0, prev;
194         while((prev=start)<length) {
195             U16_NEXT(s, start, length, c);
196             if(tf!=set.contains(c)) {
197                 break;
198             }
199         }
200         return prev;
201     }
202 };
203 
204 class SpanUTF16 : public Command {
205 protected:
SpanUTF16(const UnicodeSetPerformanceTest & testcase)206     SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
207         // Verify that the frozen set is equal to the unfrozen one.
208         UnicodeSet set;
209         UChar utf16[2];
210         UChar32 c, c2;
211 
212         for(c=0; c<=0xffff; ++c) {
213             utf16[0]=(UChar)c;
214             if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) {
215                 set.add(c);
216             }
217         }
218         for(c=0xd800; c<=0xdbff; ++c) {
219             utf16[0]=(UChar)c;
220             for(c2=0xdc00; c2<=0xdfff; ++c2) {
221                 utf16[1]=(UChar)c2;
222                 if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) {
223                     set.add(U16_GET_SUPPLEMENTARY(c, c2));
224                 }
225             }
226         }
227 
228         if(set!=testcase.set) {
229             fprintf(stderr, "error: frozen set != original!\n");
230         }
231     }
232 public:
get(const UnicodeSetPerformanceTest & testcase)233     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
234         return new SpanUTF16(testcase);
235     }
call(UErrorCode * pErrorCode)236     virtual void call(UErrorCode* pErrorCode) {
237         const UnicodeSet &set=testcase.set;
238         const UChar *s=testcase.getBuffer();
239         int32_t length=testcase.getBufferLen();
240         int32_t count=0;
241         int32_t i=0;
242         UBool tf=FALSE;
243         while(i<length) {
244             i+=set.span(s+i, length-i, (USetSpanCondition)tf);
245             tf=(UBool)(!tf);
246             ++count;
247         }
248         if(count!=testcase.spanCount) {
249             fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
250                     (long)count, (long)testcase.spanCount);
251         }
252     }
253 };
254 
255 class SpanBackUTF16 : public Command {
256 protected:
SpanBackUTF16(const UnicodeSetPerformanceTest & testcase)257     SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
258         // Verify that the frozen set is equal to the unfrozen one.
259         UnicodeSet set;
260         UChar utf16[2];
261         UChar32 c, c2;
262 
263         for(c=0; c<=0xffff; ++c) {
264             utf16[0]=(UChar)c;
265             if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) {
266                 set.add(c);
267             }
268         }
269         for(c=0xd800; c<=0xdbff; ++c) {
270             utf16[0]=(UChar)c;
271             for(c2=0xdc00; c2<=0xdfff; ++c2) {
272                 utf16[1]=(UChar)c2;
273                 if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) {
274                     set.add(U16_GET_SUPPLEMENTARY(c, c2));
275                 }
276             }
277         }
278 
279         if(set!=testcase.set) {
280             fprintf(stderr, "error: frozen set != original!\n");
281         }
282     }
283 public:
get(const UnicodeSetPerformanceTest & testcase)284     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
285         return new SpanBackUTF16(testcase);
286     }
call(UErrorCode * pErrorCode)287     virtual void call(UErrorCode* pErrorCode) {
288         const UnicodeSet &set=testcase.set;
289         const UChar *s=testcase.getBuffer();
290         int32_t length=testcase.getBufferLen();
291         int32_t count=0;
292         /*
293          * Get the same spans as with span() where we always start with a not-contained span.
294          * If testcase.spanCount is an odd number, then the last span() was not-contained.
295          * The last spanBack() must be not-contained to match the first span().
296          */
297         UBool tf=(UBool)((testcase.spanCount&1)==0);
298         while(length>0 || !tf) {
299             length=set.spanBack(s, length, (USetSpanCondition)tf);
300             tf=(UBool)(!tf);
301             ++count;
302         }
303         if(count!=testcase.spanCount) {
304             fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
305                     (long)count, (long)testcase.spanCount);
306         }
307     }
308 };
309 
310 class SpanUTF8 : public Command {
311 protected:
SpanUTF8(const UnicodeSetPerformanceTest & testcase)312     SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
313         // Verify that the frozen set is equal to the unfrozen one.
314         UnicodeSet set;
315         char utf8[4];
316         UChar32 c;
317         int32_t length;
318 
319         for(c=0; c<=0x10ffff; ++c) {
320             if(c==0xd800) {
321                 c=0xe000;
322             }
323             length=0;
324             U8_APPEND_UNSAFE(utf8, length, c);
325             if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) {
326                 set.add(c);
327             }
328         }
329         if(set!=testcase.set) {
330             fprintf(stderr, "error: frozen set != original!\n");
331         }
332     }
333 public:
get(const UnicodeSetPerformanceTest & testcase)334     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
335         return new SpanUTF8(testcase);
336     }
call(UErrorCode * pErrorCode)337     virtual void call(UErrorCode* pErrorCode) {
338         const UnicodeSet &set=testcase.set;
339         const char *s=testcase.utf8;
340         int32_t length=testcase.utf8Length;
341         int32_t count=0;
342         int32_t i=0;
343         UBool tf=FALSE;
344         while(i<length) {
345             i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf);
346             tf=(UBool)(!tf);
347             ++count;
348         }
349         if(count!=testcase.spanCount) {
350             fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
351                     (long)count, (long)testcase.spanCount);
352         }
353     }
354 };
355 
356 class SpanBackUTF8 : public Command {
357 protected:
SpanBackUTF8(const UnicodeSetPerformanceTest & testcase)358     SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
359         // Verify that the frozen set is equal to the unfrozen one.
360         UnicodeSet set;
361         char utf8[4];
362         UChar32 c;
363         int32_t length;
364 
365         for(c=0; c<=0x10ffff; ++c) {
366             if(c==0xd800) {
367                 c=0xe000;
368             }
369             length=0;
370             U8_APPEND_UNSAFE(utf8, length, c);
371             if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
372                 set.add(c);
373             }
374         }
375         if(set!=testcase.set) {
376             fprintf(stderr, "error: frozen set != original!\n");
377         }
378     }
379 public:
get(const UnicodeSetPerformanceTest & testcase)380     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
381         return new SpanBackUTF8(testcase);
382     }
call(UErrorCode * pErrorCode)383     virtual void call(UErrorCode* pErrorCode) {
384         const UnicodeSet &set=testcase.set;
385         const char *s=testcase.utf8;
386         int32_t length=testcase.utf8Length;
387         int32_t count=0;
388         /*
389          * Get the same spans as with span() where we always start with a not-contained span.
390          * If testcase.spanCount is an odd number, then the last span() was not-contained.
391          * The last spanBack() must be not-contained to match the first span().
392          */
393         UBool tf=(UBool)((testcase.spanCount&1)==0);
394         while(length>0 || !tf) {
395             length=set.spanBackUTF8(s, length, (USetSpanCondition)tf);
396             tf=(UBool)(!tf);
397             ++count;
398         }
399         if(count!=testcase.spanCount) {
400             fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
401                     (long)count, (long)testcase.spanCount);
402         }
403     }
404 };
405 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * par)406 UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
407     switch (index) {
408         case 0: name = "Contains";     if (exec) return Contains::get(*this); break;
409         case 1: name = "SpanUTF16";    if (exec) return SpanUTF16::get(*this); break;
410         case 2: name = "SpanBackUTF16";if (exec) return SpanBackUTF16::get(*this); break;
411         case 3: name = "SpanUTF8";     if (exec) return SpanUTF8::get(*this); break;
412         case 4: name = "SpanBackUTF8"; if (exec) return SpanBackUTF8::get(*this); break;
413         default: name = ""; break;
414     }
415     return NULL;
416 }
417 
main(int argc,const char * argv[])418 int main(int argc, const char *argv[])
419 {
420     // Default values for command-line options.
421     options[SET_PATTERN].value = "[:ID_Continue:]";
422     options[FAST_TYPE].value = "slow";
423 
424     UErrorCode status = U_ZERO_ERROR;
425     UnicodeSetPerformanceTest test(argc, argv, status);
426 
427 	if (U_FAILURE(status)){
428         printf("The error is %s\n", u_errorName(status));
429         test.usage();
430         return status;
431     }
432 
433     if (test.run() == FALSE){
434         fprintf(stderr, "FAILED: Tests could not be run, please check the "
435 			            "arguments.\n");
436         return 1;
437     }
438 
439     return 0;
440 }
441