1 /*
2 **************************************************************************
3 *    © 2016 and later: Unicode, Inc. and others.
4 *    License & terms of use: http://www.unicode.org/copyright.html#License
5 **************************************************************************
6 **************************************************************************
7 *   Copyright (C) 2014, International Business Machines
8 *   Corporation and others.  All Rights Reserved.
9 **************************************************************************
10 *   file name:  unisetperf.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007jan31
16 *   created by: Markus Scherer
17 */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 #include "unicode/uperf.h"
23 #include "unicode/uniset.h"
24 #include "unicode/unistr.h"
25 #include "uoptions.h"
26 #include "cmemory.h" // for UPRV_LENGTHOF
27 
// Indexes of the unisetperf-specific command-line options in options[].
// The options have no one-letter abbreviations, which forces readable
// command lines. (U+0001 is used as the abbreviation character.)
enum {
    SET_PATTERN = 0,            // --pattern
    FAST_TYPE,                  // --type
    UNISETPERF_OPTIONS_COUNT    // number of options; must stay last
};
36 
// Option table for the unisetperf-specific options.
// Entries are addressed with the SET_PATTERN and FAST_TYPE indexes.
// main() stores the default values into the .value fields before the
// table is passed to the UPerfTest constructor.
// Both options take an argument and use '\x01' as their abbreviation character.
static UOption options[UNISETPERF_OPTIONS_COUNT]={
    UOPTION_DEF("pattern", '\x01', UOPT_REQUIRES_ARG),
    UOPTION_DEF("type",    '\x01', UOPT_REQUIRES_ARG)
};
41 
// Usage text for the unisetperf-specific options.
// It is passed to the UPerfTest constructor together with the option table.
// The defaults named here must match the values that main() assigns.
static const char *const unisetperf_usage =
    "\t--pattern   UnicodeSet pattern for instantiation.\n"
    "\t            Default: [:ID_Continue:]\n"
    "\t--type      Type of UnicodeSet: slow fast\n"
    "\t            Default: slow\n";
47 
48 // Test object with setup data.
49 class UnicodeSetPerformanceTest : public UPerfTest {
50 public:
UnicodeSetPerformanceTest(int32_t argc,const char * argv[],UErrorCode & status)51     UnicodeSetPerformanceTest(int32_t argc, const char *argv[], UErrorCode &status)
52             : UPerfTest(argc, argv, options, UPRV_LENGTHOF(options), unisetperf_usage, status),
53               utf8(NULL), utf8Length(0), countInputCodePoints(0), spanCount(0) {
54         if (U_SUCCESS(status)) {
55             UnicodeString pattern=UnicodeString(options[SET_PATTERN].value, -1, US_INV).unescape();
56             set.applyPattern(pattern, status);
57             prefrozen=set;
58             if(0==strcmp(options[FAST_TYPE].value, "fast")) {
59                 set.freeze();
60             }
61 
62             int32_t inputLength;
63             UPerfTest::getBuffer(inputLength, status);
64             if(U_SUCCESS(status) && inputLength>0) {
65                 countInputCodePoints = u_countChar32(buffer, bufferLen);
66 
67                 countSpans();
68 
69                 // Preflight the UTF-8 length and allocate utf8.
70                 u_strToUTF8(NULL, 0, &utf8Length, buffer, bufferLen, &status);
71                 if(status==U_BUFFER_OVERFLOW_ERROR) {
72                     utf8=(char *)malloc(utf8Length);
73                     if(utf8!=NULL) {
74                         status=U_ZERO_ERROR;
75                         u_strToUTF8(utf8, utf8Length, NULL, buffer, bufferLen, &status);
76                     } else {
77                         status=U_MEMORY_ALLOCATION_ERROR;
78                     }
79                 }
80 
81                 if(verbose) {
82                     printf("code points:%ld  len16:%ld  len8:%ld  spans:%ld  "
83                            "cp/span:%.3g  UChar/span:%.3g  B/span:%.3g  B/cp:%.3g\n",
84                            (long)countInputCodePoints, (long)bufferLen, (long)utf8Length, (long)spanCount,
85                            (double)countInputCodePoints/spanCount, (double)bufferLen/spanCount, (double)utf8Length/spanCount,
86                            (double)utf8Length/countInputCodePoints);
87                 }
88             }
89         }
90     }
91 
92     virtual UPerfFunction* runIndexedTest(int32_t index, UBool exec, const char* &name, char* par = NULL);
93 
94     // Count spans of characters that are in the set,
95     // and spans of characters that are not in the set.
96     // If the very first character is in the set, then one additional
97     // not-span is counted.
countSpans()98     void countSpans() {
99         const UChar *s=getBuffer();
100         int32_t length=getBufferLen();
101         int32_t i=0;
102         UBool tf=FALSE;
103         while(i<length) {
104             i=span(s, length, i, tf);
105             tf=(UBool)(!tf);
106             ++spanCount;
107         }
108     }
span(const UChar * s,int32_t length,int32_t start,UBool tf) const109     int32_t span(const UChar *s, int32_t length, int32_t start, UBool tf) const {
110         UChar32 c;
111         int32_t prev;
112         while((prev=start)<length) {
113             U16_NEXT(s, start, length, c);
114             if(tf!=set.contains(c)) {
115                 break;
116             }
117         }
118         return prev;
119     }
120 
getBuffer() const121     const UChar *getBuffer() const { return buffer; }
getBufferLen() const122     int32_t getBufferLen() const { return bufferLen; }
123 
124     char *utf8;
125     int32_t utf8Length;
126 
127     // Number of code points in the input text.
128     int32_t countInputCodePoints;
129     int32_t spanCount;
130 
131     UnicodeSet set;
132     UnicodeSet prefrozen;
133 };
134 
135 // Performance test function object.
136 class Command : public UPerfFunction {
137 protected:
Command(const UnicodeSetPerformanceTest & testcase)138     Command(const UnicodeSetPerformanceTest &testcase) : testcase(testcase) {}
139 
140 public:
~Command()141     virtual ~Command() {}
142 
143     // virtual void call(UErrorCode* pErrorCode) { ... }
144 
getOperationsPerIteration()145     virtual long getOperationsPerIteration() {
146         // Number of code points tested:
147         // Input code points, plus one for the end of each span except the last span.
148         return testcase.countInputCodePoints+testcase.spanCount-1;
149     }
150 
getEventsPerIteration()151     virtual long getEventsPerIteration() {
152         return testcase.spanCount;
153     }
154 
155     const UnicodeSetPerformanceTest &testcase;
156 };
157 
158 class Contains : public Command {
159 protected:
Contains(const UnicodeSetPerformanceTest & testcase)160     Contains(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
161         // Verify that the frozen set is equal to the unfrozen one.
162         UnicodeSet set;
163         UChar32 c;
164 
165         for(c=0; c<=0x10ffff; ++c) {
166             if(testcase.set.contains(c)) {
167                 set.add(c);
168             }
169         }
170         if(set!=testcase.set) {
171             fprintf(stderr, "error: frozen set != original!\n");
172         }
173     }
174 public:
get(const UnicodeSetPerformanceTest & testcase)175     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
176         return new Contains(testcase);
177     }
call(UErrorCode * pErrorCode)178     virtual void call(UErrorCode* pErrorCode) {
179         const UnicodeSet &set=testcase.set;
180         const UChar *s=testcase.getBuffer();
181         int32_t length=testcase.getBufferLen();
182         int32_t count=0;
183         int32_t i=0;
184         UBool tf=FALSE;
185         while(i<length) {
186             i+=span(set, s+i, length-i, tf);
187             tf=(UBool)(!tf);
188             ++count;
189         }
190         if(count!=testcase.spanCount) {
191             fprintf(stderr, "error: Contains() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
192                     (long)count, (long)testcase.spanCount);
193         }
194     }
span(const UnicodeSet & set,const UChar * s,int32_t length,UBool tf)195     static int32_t span(const UnicodeSet &set, const UChar *s, int32_t length, UBool tf) {
196         UChar32 c;
197         int32_t start=0, prev;
198         while((prev=start)<length) {
199             U16_NEXT(s, start, length, c);
200             if(tf!=set.contains(c)) {
201                 break;
202             }
203         }
204         return prev;
205     }
206 };
207 
208 class SpanUTF16 : public Command {
209 protected:
SpanUTF16(const UnicodeSetPerformanceTest & testcase)210     SpanUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
211         // Verify that the frozen set is equal to the unfrozen one.
212         UnicodeSet set;
213         UChar utf16[2];
214         UChar32 c, c2;
215 
216         for(c=0; c<=0xffff; ++c) {
217             utf16[0]=(UChar)c;
218             if(testcase.set.span(utf16, 1, USET_SPAN_CONTAINED)>0) {
219                 set.add(c);
220             }
221         }
222         for(c=0xd800; c<=0xdbff; ++c) {
223             utf16[0]=(UChar)c;
224             for(c2=0xdc00; c2<=0xdfff; ++c2) {
225                 utf16[1]=(UChar)c2;
226                 if(testcase.set.span(utf16, 2, USET_SPAN_CONTAINED)>0) {
227                     set.add(U16_GET_SUPPLEMENTARY(c, c2));
228                 }
229             }
230         }
231 
232         if(set!=testcase.set) {
233             fprintf(stderr, "error: frozen set != original!\n");
234         }
235     }
236 public:
get(const UnicodeSetPerformanceTest & testcase)237     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
238         return new SpanUTF16(testcase);
239     }
call(UErrorCode * pErrorCode)240     virtual void call(UErrorCode* pErrorCode) {
241         const UnicodeSet &set=testcase.set;
242         const UChar *s=testcase.getBuffer();
243         int32_t length=testcase.getBufferLen();
244         int32_t count=0;
245         int32_t i=0;
246         UBool tf=FALSE;
247         while(i<length) {
248             i+=set.span(s+i, length-i, (USetSpanCondition)tf);
249             tf=(UBool)(!tf);
250             ++count;
251         }
252         if(count!=testcase.spanCount) {
253             fprintf(stderr, "error: SpanUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
254                     (long)count, (long)testcase.spanCount);
255         }
256     }
257 };
258 
259 class SpanBackUTF16 : public Command {
260 protected:
SpanBackUTF16(const UnicodeSetPerformanceTest & testcase)261     SpanBackUTF16(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
262         // Verify that the frozen set is equal to the unfrozen one.
263         UnicodeSet set;
264         UChar utf16[2];
265         UChar32 c, c2;
266 
267         for(c=0; c<=0xffff; ++c) {
268             utf16[0]=(UChar)c;
269             if(testcase.set.spanBack(utf16, 1, USET_SPAN_CONTAINED)==0) {
270                 set.add(c);
271             }
272         }
273         for(c=0xd800; c<=0xdbff; ++c) {
274             utf16[0]=(UChar)c;
275             for(c2=0xdc00; c2<=0xdfff; ++c2) {
276                 utf16[1]=(UChar)c2;
277                 if(testcase.set.spanBack(utf16, 2, USET_SPAN_CONTAINED)==0) {
278                     set.add(U16_GET_SUPPLEMENTARY(c, c2));
279                 }
280             }
281         }
282 
283         if(set!=testcase.set) {
284             fprintf(stderr, "error: frozen set != original!\n");
285         }
286     }
287 public:
get(const UnicodeSetPerformanceTest & testcase)288     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
289         return new SpanBackUTF16(testcase);
290     }
call(UErrorCode * pErrorCode)291     virtual void call(UErrorCode* pErrorCode) {
292         const UnicodeSet &set=testcase.set;
293         const UChar *s=testcase.getBuffer();
294         int32_t length=testcase.getBufferLen();
295         int32_t count=0;
296         /*
297          * Get the same spans as with span() where we always start with a not-contained span.
298          * If testcase.spanCount is an odd number, then the last span() was not-contained.
299          * The last spanBack() must be not-contained to match the first span().
300          */
301         UBool tf=(UBool)((testcase.spanCount&1)==0);
302         while(length>0 || !tf) {
303             length=set.spanBack(s, length, (USetSpanCondition)tf);
304             tf=(UBool)(!tf);
305             ++count;
306         }
307         if(count!=testcase.spanCount) {
308             fprintf(stderr, "error: SpanBackUTF16() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
309                     (long)count, (long)testcase.spanCount);
310         }
311     }
312 };
313 
314 class SpanUTF8 : public Command {
315 protected:
SpanUTF8(const UnicodeSetPerformanceTest & testcase)316     SpanUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
317         // Verify that the frozen set is equal to the unfrozen one.
318         UnicodeSet set;
319         char utf8[4];
320         UChar32 c;
321         int32_t length;
322 
323         for(c=0; c<=0x10ffff; ++c) {
324             if(c==0xd800) {
325                 c=0xe000;
326             }
327             length=0;
328             U8_APPEND_UNSAFE(utf8, length, c);
329             if(testcase.set.spanUTF8(utf8, length, USET_SPAN_CONTAINED)>0) {
330                 set.add(c);
331             }
332         }
333         if(set!=testcase.set) {
334             fprintf(stderr, "error: frozen set != original!\n");
335         }
336     }
337 public:
get(const UnicodeSetPerformanceTest & testcase)338     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
339         return new SpanUTF8(testcase);
340     }
call(UErrorCode * pErrorCode)341     virtual void call(UErrorCode* pErrorCode) {
342         const UnicodeSet &set=testcase.set;
343         const char *s=testcase.utf8;
344         int32_t length=testcase.utf8Length;
345         int32_t count=0;
346         int32_t i=0;
347         UBool tf=FALSE;
348         while(i<length) {
349             i+=set.spanUTF8(s+i, length-i, (USetSpanCondition)tf);
350             tf=(UBool)(!tf);
351             ++count;
352         }
353         if(count!=testcase.spanCount) {
354             fprintf(stderr, "error: SpanUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
355                     (long)count, (long)testcase.spanCount);
356         }
357     }
358 };
359 
360 class SpanBackUTF8 : public Command {
361 protected:
SpanBackUTF8(const UnicodeSetPerformanceTest & testcase)362     SpanBackUTF8(const UnicodeSetPerformanceTest &testcase) : Command(testcase) {
363         // Verify that the frozen set is equal to the unfrozen one.
364         UnicodeSet set;
365         char utf8[4];
366         UChar32 c;
367         int32_t length;
368 
369         for(c=0; c<=0x10ffff; ++c) {
370             if(c==0xd800) {
371                 c=0xe000;
372             }
373             length=0;
374             U8_APPEND_UNSAFE(utf8, length, c);
375             if(testcase.set.spanBackUTF8(utf8, length, USET_SPAN_CONTAINED)==0) {
376                 set.add(c);
377             }
378         }
379         if(set!=testcase.set) {
380             fprintf(stderr, "error: frozen set != original!\n");
381         }
382     }
383 public:
get(const UnicodeSetPerformanceTest & testcase)384     static UPerfFunction* get(const UnicodeSetPerformanceTest &testcase) {
385         return new SpanBackUTF8(testcase);
386     }
call(UErrorCode * pErrorCode)387     virtual void call(UErrorCode* pErrorCode) {
388         const UnicodeSet &set=testcase.set;
389         const char *s=testcase.utf8;
390         int32_t length=testcase.utf8Length;
391         int32_t count=0;
392         /*
393          * Get the same spans as with span() where we always start with a not-contained span.
394          * If testcase.spanCount is an odd number, then the last span() was not-contained.
395          * The last spanBack() must be not-contained to match the first span().
396          */
397         UBool tf=(UBool)((testcase.spanCount&1)==0);
398         while(length>0 || !tf) {
399             length=set.spanBackUTF8(s, length, (USetSpanCondition)tf);
400             tf=(UBool)(!tf);
401             ++count;
402         }
403         if(count!=testcase.spanCount) {
404             fprintf(stderr, "error: SpanBackUTF8() count=%ld != %ld=UnicodeSetPerformanceTest.spanCount\n",
405                     (long)count, (long)testcase.spanCount);
406         }
407     }
408 };
409 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * par)410 UPerfFunction* UnicodeSetPerformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* par) {
411     switch (index) {
412         case 0: name = "Contains";     if (exec) return Contains::get(*this); break;
413         case 1: name = "SpanUTF16";    if (exec) return SpanUTF16::get(*this); break;
414         case 2: name = "SpanBackUTF16";if (exec) return SpanBackUTF16::get(*this); break;
415         case 3: name = "SpanUTF8";     if (exec) return SpanUTF8::get(*this); break;
416         case 4: name = "SpanBackUTF8"; if (exec) return SpanBackUTF8::get(*this); break;
417         default: name = ""; break;
418     }
419     return NULL;
420 }
421 
main(int argc,const char * argv[])422 int main(int argc, const char *argv[])
423 {
424     // Default values for command-line options.
425     options[SET_PATTERN].value = "[:ID_Continue:]";
426     options[FAST_TYPE].value = "slow";
427 
428     UErrorCode status = U_ZERO_ERROR;
429     UnicodeSetPerformanceTest test(argc, argv, status);
430 
431 	if (U_FAILURE(status)){
432         printf("The error is %s\n", u_errorName(status));
433         test.usage();
434         return status;
435     }
436 
437     if (test.run() == FALSE){
438         fprintf(stderr, "FAILED: Tests could not be run, please check the "
439 			            "arguments.\n");
440         return 1;
441     }
442 
443     return 0;
444 }
445