1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ******************************************************************************
5  *
6  *   Copyright (C) 2009-2015, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  ******************************************************************************
10  */
11 
12 package com.ibm.icu.impl;
13 
14 import com.ibm.icu.text.UnicodeSet.SpanCondition;
15 import com.ibm.icu.util.OutputInt;
16 
17 /**
18  * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
19  *
20  * Latin-1: Look up bytes.
21  * 2-byte characters: Bits organized vertically.
22  * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
23  * Supplementary characters: Binary search over
24  * the supplementary part of the parent set's inversion list.
25  */
26 public final class BMPSet {
27     public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000);
28 
29     /**
30      * One boolean ('true' or 'false') per Latin-1 character.
31      */
32     private boolean[] latin1Contains;
33 
34     /**
35      * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
36      * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
37      * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
38      *
39      * Bits for 0..FF are unused (0).
40      */
41     private int[] table7FF;
42 
43     /**
44      * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
45      * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
46      * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
47      * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
48      * and set.contains(c) must be called.
49      *
50      * Bits for 0..7FF are unused (0).
51      */
52     private int[] bmpBlockBits;
53 
54     /**
55      * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
56      * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
57      * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
58      */
59     private int[] list4kStarts;
60 
61     /**
62      * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
63      * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
64      */
65     private final int[] list;
66     private final int listLength; // length used; list may be longer to minimize reallocs
67 
BMPSet(final int[] parentList, int parentListLength)68     public BMPSet(final int[] parentList, int parentListLength) {
69         list = parentList;
70         listLength = parentListLength;
71         latin1Contains = new boolean[0x100];
72         table7FF = new int[64];
73         bmpBlockBits = new int[64];
74         list4kStarts = new int[18];
75 
76         /*
77          * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
78          * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
79          * indexes is for finding supplementary code points.
80          */
81         list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
82         int i;
83         for (i = 1; i <= 0x10; ++i) {
84             list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
85         }
86         list4kStarts[0x11] = listLength - 1;
87 
88         initBits();
89     }
90 
BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength)91     public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) {
92         list = newParentList;
93         listLength = newParentListLength;
94         latin1Contains = otherBMPSet.latin1Contains.clone();
95         table7FF = otherBMPSet.table7FF.clone();
96         bmpBlockBits = otherBMPSet.bmpBlockBits.clone();
97         list4kStarts = otherBMPSet.list4kStarts.clone();
98     }
99 
contains(int c)100     public boolean contains(int c) {
101         if (c <= 0xff) {
102             return (latin1Contains[c]);
103         } else if (c <= 0x7ff) {
104             return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
105         } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
106             int lead = c >> 12;
107             int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
108             if (twoBits <= 1) {
109                 // All 64 code points with the same bits 15..6
110                 // are either in the set or not.
111                 return (0 != twoBits);
112             } else {
113                 // Look up the code point in its 4k block of code points.
114                 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
115             }
116         } else if (c <= 0x10ffff) {
117             // surrogate or supplementary code point
118             return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
119         } else {
120             // Out-of-range code points get false, consistent with long-standing
121             // behavior of UnicodeSet.contains(c).
122             return false;
123         }
124     }
125 
126     /**
127      * Span the initial substring for which each character c has spanCondition==contains(c). It must be
128      * spanCondition==0 or 1.
129      *
130      * @param start The start index
131      * @param outCount If not null: Receives the number of code points in the span.
132      * @return the limit (exclusive end) of the span
133      *
134      * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
135      * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
136      * as usual in ICU.
137      */
span(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)138     public final int span(CharSequence s, int start, SpanCondition spanCondition,
139             OutputInt outCount) {
140         char c, c2;
141         int i = start;
142         int limit = s.length();
143         int numSupplementary = 0;
144         if (SpanCondition.NOT_CONTAINED != spanCondition) {
145             // span
146             while (i < limit) {
147                 c = s.charAt(i);
148                 if (c <= 0xff) {
149                     if (!latin1Contains[c]) {
150                         break;
151                     }
152                 } else if (c <= 0x7ff) {
153                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
154                         break;
155                     }
156                 } else if (c < 0xd800 ||
157                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
158                     int lead = c >> 12;
159                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
160                     if (twoBits <= 1) {
161                         // All 64 code points with the same bits 15..6
162                         // are either in the set or not.
163                         if (twoBits == 0) {
164                             break;
165                         }
166                     } else {
167                         // Look up the code point in its 4k block of code points.
168                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
169                             break;
170                         }
171                     }
172                 } else {
173                     // surrogate pair
174                     int supplementary = Character.toCodePoint(c, c2);
175                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
176                         break;
177                     }
178                     ++numSupplementary;
179                     ++i;
180                 }
181                 ++i;
182             }
183         } else {
184             // span not
185             while (i < limit) {
186                 c = s.charAt(i);
187                 if (c <= 0xff) {
188                     if (latin1Contains[c]) {
189                         break;
190                     }
191                 } else if (c <= 0x7ff) {
192                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
193                         break;
194                     }
195                 } else if (c < 0xd800 ||
196                            c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
197                     int lead = c >> 12;
198                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
199                     if (twoBits <= 1) {
200                         // All 64 code points with the same bits 15..6
201                         // are either in the set or not.
202                         if (twoBits != 0) {
203                             break;
204                         }
205                     } else {
206                         // Look up the code point in its 4k block of code points.
207                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
208                             break;
209                         }
210                     }
211                 } else {
212                     // surrogate pair
213                     int supplementary = Character.toCodePoint(c, c2);
214                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
215                         break;
216                     }
217                     ++numSupplementary;
218                     ++i;
219                 }
220                 ++i;
221             }
222         }
223         if (outCount != null) {
224             int spanLength = i - start;
225             outCount.value = spanLength - numSupplementary;  // number of code points
226         }
227         return i;
228     }
229 
230     /**
231      * Symmetrical with span().
232      * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
233      * limit and spanCondition==0 or 1.
234      *
235      * @return The string index which starts the span (i.e. inclusive).
236      */
spanBack(CharSequence s, int limit, SpanCondition spanCondition)237     public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
238         char c, c2;
239 
240         if (SpanCondition.NOT_CONTAINED != spanCondition) {
241             // span
242             for (;;) {
243                 c = s.charAt(--limit);
244                 if (c <= 0xff) {
245                     if (!latin1Contains[c]) {
246                         break;
247                     }
248                 } else if (c <= 0x7ff) {
249                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
250                         break;
251                     }
252                 } else if (c < 0xd800 ||
253                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
254                     int lead = c >> 12;
255                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
256                     if (twoBits <= 1) {
257                         // All 64 code points with the same bits 15..6
258                         // are either in the set or not.
259                         if (twoBits == 0) {
260                             break;
261                         }
262                     } else {
263                         // Look up the code point in its 4k block of code points.
264                         if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
265                             break;
266                         }
267                     }
268                 } else {
269                     // surrogate pair
270                     int supplementary = Character.toCodePoint(c2, c);
271                     if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
272                         break;
273                     }
274                     --limit;
275                 }
276                 if (0 == limit) {
277                     return 0;
278                 }
279             }
280         } else {
281             // span not
282             for (;;) {
283                 c = s.charAt(--limit);
284                 if (c <= 0xff) {
285                     if (latin1Contains[c]) {
286                         break;
287                     }
288                 } else if (c <= 0x7ff) {
289                     if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
290                         break;
291                     }
292                 } else if (c < 0xd800 ||
293                            c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
294                     int lead = c >> 12;
295                     int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
296                     if (twoBits <= 1) {
297                         // All 64 code points with the same bits 15..6
298                         // are either in the set or not.
299                         if (twoBits != 0) {
300                             break;
301                         }
302                     } else {
303                         // Look up the code point in its 4k block of code points.
304                         if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
305                             break;
306                         }
307                     }
308                 } else {
309                     // surrogate pair
310                     int supplementary = Character.toCodePoint(c2, c);
311                     if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
312                         break;
313                     }
314                     --limit;
315                 }
316                 if (0 == limit) {
317                     return 0;
318                 }
319             }
320         }
321         return limit + 1;
322     }
323 
324     /**
325      * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
326      */
set32x64Bits(int[] table, int start, int limit)327     private static void set32x64Bits(int[] table, int start, int limit) {
328         assert (64 == table.length);
329         int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
330         int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
331 
332         // Set one bit indicating an all-one block.
333         int bits = 1 << lead;
334         if ((start + 1) == limit) { // Single-character shortcut.
335             table[trail] |= bits;
336             return;
337         }
338 
339         int limitLead = limit >> 6;
340         int limitTrail = limit & 0x3f;
341 
342         if (lead == limitLead) {
343             // Partial vertical bit column.
344             while (trail < limitTrail) {
345                 table[trail++] |= bits;
346             }
347         } else {
348             // Partial vertical bit column,
349             // followed by a bit rectangle,
350             // followed by another partial vertical bit column.
351             if (trail > 0) {
352                 do {
353                     table[trail++] |= bits;
354                 } while (trail < 64);
355                 ++lead;
356             }
357             if (lead < limitLead) {
358                 bits = ~((1 << lead) - 1);
359                 if (limitLead < 0x20) {
360                     bits &= (1 << limitLead) - 1;
361                 }
362                 for (trail = 0; trail < 64; ++trail) {
363                     table[trail] |= bits;
364                 }
365             }
366             // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
367             // In that case, bits=1<<limitLead == 1<<0 == 1
368             // (because Java << uses only the lower 5 bits of the shift operand)
369             // but the bits value is not used because trail<limitTrail is already false.
370             bits = 1 << limitLead;
371             for (trail = 0; trail < limitTrail; ++trail) {
372                 table[trail] |= bits;
373             }
374         }
375     }
376 
initBits()377     private void initBits() {
378         int start, limit;
379         int listIndex = 0;
380 
381         // Set latin1Contains[].
382         do {
383             start = list[listIndex++];
384             if (listIndex < listLength) {
385                 limit = list[listIndex++];
386             } else {
387                 limit = 0x110000;
388             }
389             if (start >= 0x100) {
390                 break;
391             }
392             do {
393                 latin1Contains[start++] = true;
394             } while (start < limit && start < 0x100);
395         } while (limit <= 0x100);
396 
397         // Set table7FF[].
398         while (start < 0x800) {
399             set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
400             if (limit > 0x800) {
401                 start = 0x800;
402                 break;
403             }
404 
405             start = list[listIndex++];
406             if (listIndex < listLength) {
407                 limit = list[listIndex++];
408             } else {
409                 limit = 0x110000;
410             }
411         }
412 
413         // Set bmpBlockBits[].
414         int minStart = 0x800;
415         while (start < 0x10000) {
416             if (limit > 0x10000) {
417                 limit = 0x10000;
418             }
419 
420             if (start < minStart) {
421                 start = minStart;
422             }
423             if (start < limit) { // Else: Another range entirely in a known mixed-value block.
424                 if (0 != (start & 0x3f)) {
425                     // Mixed-value block of 64 code points.
426                     start >>= 6;
427                     bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
428                     start = (start + 1) << 6; // Round up to the next block boundary.
429                     minStart = start; // Ignore further ranges in this block.
430                 }
431                 if (start < limit) {
432                     if (start < (limit & ~0x3f)) {
433                         // Multiple all-ones blocks of 64 code points each.
434                         set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
435                     }
436 
437                     if (0 != (limit & 0x3f)) {
438                         // Mixed-value block of 64 code points.
439                         limit >>= 6;
440                         bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
441                         limit = (limit + 1) << 6; // Round up to the next block boundary.
442                         minStart = limit; // Ignore further ranges in this block.
443                     }
444                 }
445             }
446 
447             if (limit == 0x10000) {
448                 break;
449             }
450 
451             start = list[listIndex++];
452             if (listIndex < listLength) {
453                 limit = list[listIndex++];
454             } else {
455                 limit = 0x110000;
456             }
457         }
458     }
459 
460 
461     /**
462      * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
463      * points in a certain range.
464      *
465      * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
466      * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
467      *
468      * @param c
469      *            a character in a subrange of MIN_VALUE..MAX_VALUE
470      * @param lo
471      *            The lowest index to be returned.
472      * @param hi
473      *            The highest index to be returned.
474      * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
475      */
findCodePoint(int c, int lo, int hi)476     private int findCodePoint(int c, int lo, int hi) {
477         /* Examples:
478                                            findCodePoint(c)
479            set              list[]         c=0 1 3 4 7 8
480            ===              ==============   ===========
481            []               [110000]         0 0 0 0 0 0
482            [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
483            [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
484            [:Any:]          [0, 110000]      1 1 1 1 1 1
485          */
486 
487         // Return the smallest i such that c < list[i]. Assume
488         // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
489         if (c < list[lo])
490             return lo;
491         // High runner test. c is often after the last range, so an
492         // initial check for this condition pays off.
493         if (lo >= hi || c >= list[hi - 1])
494             return hi;
495         // invariant: c >= list[lo]
496         // invariant: c < list[hi]
497         for (;;) {
498             int i = (lo + hi) >>> 1;
499             if (i == lo) {
500                 break; // Found!
501             } else if (c < list[i]) {
502                 hi = i;
503             } else {
504                 lo = i;
505             }
506         }
507         return hi;
508     }
509 
containsSlow(int c, int lo, int hi)510     private final boolean containsSlow(int c, int lo, int hi) {
511         return (0 != (findCodePoint(c, lo, hi) & 1));
512     }
513 }
514