1 /*
2 **********************************************************************
3 *   Copyright (C) 2007, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   file name:  trieset.cpp
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2007jan15
12 *   created by: Markus Scherer
13 *
14 *   Idea for a "compiled", fast, read-only (immutable) version of a UnicodeSet
15 *   using a UTrie with 8-bit (byte) results per code point.
16 *   Modifies the trie index to make the BMP linear, and uses the original set
17 *   for supplementary code points.
18 */
19 
20 #include "unicode/utypes.h"
21 #include "unicont.h"
22 
23 #define UTRIE_GET8_LATIN1(trie) ((const uint8_t *)(trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
24 
25 #define UTRIE_GET8_FROM_LEAD(trie, c16) \
26     ((const uint8_t *)(trie)->data32)[ \
27         ((int32_t)((trie)->index[(c16)>>UTRIE_SHIFT])<<UTRIE_INDEX_SHIFT)+ \
28         ((c16)&UTRIE_MASK) \
29     ]
30 
31 class TrieSet : public UObject, public UnicodeContainable {
32 public:
TrieSet(const UnicodeSet & set,UErrorCode & errorCode)33     TrieSet(const UnicodeSet &set, UErrorCode &errorCode)
34             : trieData(NULL), latin1(NULL), restSet(set.clone()) {
35         if(U_FAILURE(errorCode)) {
36             return;
37         }
38         if(restSet==NULL) {
39             errorCode=U_MEMORY_ALLOCATION_ERROR;
40             return;
41         }
42 
43         UNewTrie *newTrie=utrie_open(NULL, NULL, 0x11000, 0, 0, TRUE);
44         UChar32 start, end;
45 
46         UnicodeSetIterator iter(set);
47 
48         while(iter.nextRange() && !iter.isString()) {
49             start=iter.getCodepoint();
50             end=iter.getCodepointEnd();
51             if(start>0xffff) {
52                 break;
53             }
54             if(end>0xffff) {
55                 end=0xffff;
56             }
57             if(!utrie_setRange32(newTrie, start, end+1, TRUE, TRUE)) {
58                 errorCode=U_INTERNAL_PROGRAM_ERROR;
59                 return;
60             }
61         }
62 
63         // Preflight the trie length.
64         int32_t length=utrie_serialize(newTrie, NULL, 0, NULL, 8, &errorCode);
65         if(errorCode!=U_BUFFER_OVERFLOW_ERROR) {
66             return;
67         }
68 
69         trieData=(uint32_t *)uprv_malloc(length);
70         if(trieData==NULL) {
71             errorCode=U_MEMORY_ALLOCATION_ERROR;
72             return;
73         }
74 
75         errorCode=U_ZERO_ERROR;
76         utrie_serialize(newTrie, trieData, length, NULL, 8, &errorCode);
77         utrie_unserialize(&trie, trieData, length, &errorCode);  // TODO: Implement for 8-bit UTrie!
78 
79         if(U_SUCCESS(errorCode)) {
80             // Copy the indexes for surrogate code points into the BMP range
81             // for simple access across the entire BMP.
82             uprv_memcpy((uint16_t *)trie.index+(0xd800>>UTRIE_SHIFT),
83                         trie.index+UTRIE_BMP_INDEX_LENGTH,
84                         (0x800>>UTRIE_SHIFT)*2);
85             latin1=UTRIE_GET8_LATIN1(&trie);
86         }
87 
88         restSet.remove(0, 0xffff);
89     }
90 
~TrieSet()91     ~TrieSet() {
92         uprv_free(trieData);
93         delete restSet;
94     }
95 
contains(UChar32 c) const96     UBool contains(UChar32 c) const {
97         if((uint32_t)c<=0xff) {
98             return (UBool)latin1[c];
99         } else if((uint32_t)c<0xffff) {
100             return (UBool)UTRIE_GET8_FROM_LEAD(&trie, c);
101         } else {
102             return restSet->contains(c);
103         }
104     }
105 
106 private:
107     uint32_t *trieData;
108     const uint8_t *latin1;
109     UTrie trie;
110     UnicodeSet *restSet;
111 };
112