1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2007-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  bmpset.cpp
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007jan29
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/utf8.h"
22 #include "unicode/utf16.h"
23 #include "cmemory.h"
24 #include "bmpset.h"
25 #include "uassert.h"
26 
27 U_NAMESPACE_BEGIN
28 
BMPSet(const int32_t * parentList,int32_t parentListLength)29 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30         list(parentList), listLength(parentListLength) {
31     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
32     uprv_memset(table7FF, 0, sizeof(table7FF));
33     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34 
35     /*
36      * Set the list indexes for binary searches for
37      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39      * looked up in the bit tables.
40      * The last pair of indexes is for finding supplementary code points.
41      */
42     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43     int32_t i;
44     for(i=1; i<=0x10; ++i) {
45         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46     }
47     list4kStarts[0x11]=listLength-1;
48 
49     initBits();
50     overrideIllegal();
51 }
52 
BMPSet(const BMPSet & otherBMPSet,const int32_t * newParentList,int32_t newParentListLength)53 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
54         list(newParentList), listLength(newParentListLength) {
55     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
56     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
57     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
58     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
59 }
60 
~BMPSet()61 BMPSet::~BMPSet() {
62 }
63 
64 /*
65  * Set bits in a bit rectangle in "vertical" bit organization.
66  * start<limit<=0x800
67  */
set32x64Bits(uint32_t table[64],int32_t start,int32_t limit)68 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
69     U_ASSERT(start<limit);
70     U_ASSERT(limit<=0x800);
71 
72     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
73     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
74 
75     // Set one bit indicating an all-one block.
76     uint32_t bits=(uint32_t)1<<lead;
77     if((start+1)==limit) {  // Single-character shortcut.
78         table[trail]|=bits;
79         return;
80     }
81 
82     int32_t limitLead=limit>>6;
83     int32_t limitTrail=limit&0x3f;
84 
85     if(lead==limitLead) {
86         // Partial vertical bit column.
87         while(trail<limitTrail) {
88             table[trail++]|=bits;
89         }
90     } else {
91         // Partial vertical bit column,
92         // followed by a bit rectangle,
93         // followed by another partial vertical bit column.
94         if(trail>0) {
95             do {
96                 table[trail++]|=bits;
97             } while(trail<64);
98             ++lead;
99         }
100         if(lead<limitLead) {
101             bits=~((1<<lead)-1);
102             if(limitLead<0x20) {
103                 bits&=(1<<limitLead)-1;
104             }
105             for(trail=0; trail<64; ++trail) {
106                 table[trail]|=bits;
107             }
108         }
109         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
110         // In that case, bits=1<<limitLead is undefined but the bits value
111         // is not used because trail<limitTrail is already false.
112         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
113         for(trail=0; trail<limitTrail; ++trail) {
114             table[trail]|=bits;
115         }
116     }
117 }
118 
initBits()119 void BMPSet::initBits() {
120     UChar32 start, limit;
121     int32_t listIndex=0;
122 
123     // Set asciiBytes[].
124     do {
125         start=list[listIndex++];
126         if(listIndex<listLength) {
127             limit=list[listIndex++];
128         } else {
129             limit=0x110000;
130         }
131         if(start>=0x80) {
132             break;
133         }
134         do {
135             asciiBytes[start++]=1;
136         } while(start<limit && start<0x80);
137     } while(limit<=0x80);
138 
139     // Set table7FF[].
140     while(start<0x800) {
141         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
142         if(limit>0x800) {
143             start=0x800;
144             break;
145         }
146 
147         start=list[listIndex++];
148         if(listIndex<listLength) {
149             limit=list[listIndex++];
150         } else {
151             limit=0x110000;
152         }
153     }
154 
155     // Set bmpBlockBits[].
156     int32_t minStart=0x800;
157     while(start<0x10000) {
158         if(limit>0x10000) {
159             limit=0x10000;
160         }
161 
162         if(start<minStart) {
163             start=minStart;
164         }
165         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
166             if(start&0x3f) {
167                 // Mixed-value block of 64 code points.
168                 start>>=6;
169                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
170                 start=(start+1)<<6;  // Round up to the next block boundary.
171                 minStart=start;      // Ignore further ranges in this block.
172             }
173             if(start<limit) {
174                 if(start<(limit&~0x3f)) {
175                     // Multiple all-ones blocks of 64 code points each.
176                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
177                 }
178 
179                 if(limit&0x3f) {
180                     // Mixed-value block of 64 code points.
181                     limit>>=6;
182                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
183                     limit=(limit+1)<<6;  // Round up to the next block boundary.
184                     minStart=limit;      // Ignore further ranges in this block.
185                 }
186             }
187         }
188 
189         if(limit==0x10000) {
190             break;
191         }
192 
193         start=list[listIndex++];
194         if(listIndex<listLength) {
195             limit=list[listIndex++];
196         } else {
197             limit=0x110000;
198         }
199     }
200 }
201 
202 /*
203  * Override some bits and bytes to the result of contains(FFFD)
204  * for faster validity checking at runtime.
205  * No need to set 0 values where they were reset to 0 in the constructor
206  * and not modified by initBits().
207  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
208  * Need to set 0 values for surrogates D800..DFFF.
209  */
overrideIllegal()210 void BMPSet::overrideIllegal() {
211     uint32_t bits, mask;
212     int32_t i;
213 
214     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
215         // contains(FFFD)==TRUE
216         for(i=0x80; i<0xc0; ++i) {
217             asciiBytes[i]=1;
218         }
219 
220         bits=3;                 // Lead bytes 0xC0 and 0xC1.
221         for(i=0; i<64; ++i) {
222             table7FF[i]|=bits;
223         }
224 
225         bits=1;                 // Lead byte 0xE0.
226         for(i=0; i<32; ++i) {   // First half of 4k block.
227             bmpBlockBits[i]|=bits;
228         }
229 
230         mask=~(0x10001<<0xd);   // Lead byte 0xED.
231         bits=1<<0xd;
232         for(i=32; i<64; ++i) {  // Second half of 4k block.
233             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
234         }
235     } else {
236         // contains(FFFD)==FALSE
237         mask=~(0x10001<<0xd);   // Lead byte 0xED.
238         for(i=32; i<64; ++i) {  // Second half of 4k block.
239             bmpBlockBits[i]&=mask;
240         }
241     }
242 }
243 
findCodePoint(UChar32 c,int32_t lo,int32_t hi) const244 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
245     /* Examples:
246                                        findCodePoint(c)
247        set              list[]         c=0 1 3 4 7 8
248        ===              ==============   ===========
249        []               [110000]         0 0 0 0 0 0
250        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
251        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
252        [:Any:]          [0, 110000]      1 1 1 1 1 1
253      */
254 
255     // Return the smallest i such that c < list[i].  Assume
256     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
257     if (c < list[lo])
258         return lo;
259     // High runner test.  c is often after the last range, so an
260     // initial check for this condition pays off.
261     if (lo >= hi || c >= list[hi-1])
262         return hi;
263     // invariant: c >= list[lo]
264     // invariant: c < list[hi]
265     for (;;) {
266         int32_t i = (lo + hi) >> 1;
267         if (i == lo) {
268             break; // Found!
269         } else if (c < list[i]) {
270             hi = i;
271         } else {
272             lo = i;
273         }
274     }
275     return hi;
276 }
277 
278 UBool
contains(UChar32 c) const279 BMPSet::contains(UChar32 c) const {
280     if((uint32_t)c<=0x7f) {
281         return (UBool)asciiBytes[c];
282     } else if((uint32_t)c<=0x7ff) {
283         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
284     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
285         int lead=c>>12;
286         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
287         if(twoBits<=1) {
288             // All 64 code points with the same bits 15..6
289             // are either in the set or not.
290             return (UBool)twoBits;
291         } else {
292             // Look up the code point in its 4k block of code points.
293             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
294         }
295     } else if((uint32_t)c<=0x10ffff) {
296         // surrogate or supplementary code point
297         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
298     } else {
299         // Out-of-range code points get FALSE, consistent with long-standing
300         // behavior of UnicodeSet::contains(c).
301         return FALSE;
302     }
303 }
304 
305 /*
306  * Check for sufficient length for trail unit for each surrogate pair.
307  * Handle single surrogates as surrogate code points as usual in ICU.
308  */
309 const UChar *
span(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const310 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
311     UChar c, c2;
312 
313     if(spanCondition) {
314         // span
315         do {
316             c=*s;
317             if(c<=0x7f) {
318                 if(!asciiBytes[c]) {
319                     break;
320                 }
321             } else if(c<=0x7ff) {
322                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
323                     break;
324                 }
325             } else if(c<0xd800 || c>=0xe000) {
326                 int lead=c>>12;
327                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
328                 if(twoBits<=1) {
329                     // All 64 code points with the same bits 15..6
330                     // are either in the set or not.
331                     if(twoBits==0) {
332                         break;
333                     }
334                 } else {
335                     // Look up the code point in its 4k block of code points.
336                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
337                         break;
338                     }
339                 }
340             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
341                 // surrogate code point
342                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
343                     break;
344                 }
345             } else {
346                 // surrogate pair
347                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
348                     break;
349                 }
350                 ++s;
351             }
352         } while(++s<limit);
353     } else {
354         // span not
355         do {
356             c=*s;
357             if(c<=0x7f) {
358                 if(asciiBytes[c]) {
359                     break;
360                 }
361             } else if(c<=0x7ff) {
362                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
363                     break;
364                 }
365             } else if(c<0xd800 || c>=0xe000) {
366                 int lead=c>>12;
367                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
368                 if(twoBits<=1) {
369                     // All 64 code points with the same bits 15..6
370                     // are either in the set or not.
371                     if(twoBits!=0) {
372                         break;
373                     }
374                 } else {
375                     // Look up the code point in its 4k block of code points.
376                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
377                         break;
378                     }
379                 }
380             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
381                 // surrogate code point
382                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
383                     break;
384                 }
385             } else {
386                 // surrogate pair
387                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
388                     break;
389                 }
390                 ++s;
391             }
392         } while(++s<limit);
393     }
394     return s;
395 }
396 
397 /* Symmetrical with span(). */
398 const UChar *
spanBack(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const399 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
400     UChar c, c2;
401 
402     if(spanCondition) {
403         // span
404         for(;;) {
405             c=*(--limit);
406             if(c<=0x7f) {
407                 if(!asciiBytes[c]) {
408                     break;
409                 }
410             } else if(c<=0x7ff) {
411                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
412                     break;
413                 }
414             } else if(c<0xd800 || c>=0xe000) {
415                 int lead=c>>12;
416                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
417                 if(twoBits<=1) {
418                     // All 64 code points with the same bits 15..6
419                     // are either in the set or not.
420                     if(twoBits==0) {
421                         break;
422                     }
423                 } else {
424                     // Look up the code point in its 4k block of code points.
425                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
426                         break;
427                     }
428                 }
429             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
430                 // surrogate code point
431                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
432                     break;
433                 }
434             } else {
435                 // surrogate pair
436                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
437                     break;
438                 }
439                 --limit;
440             }
441             if(s==limit) {
442                 return s;
443             }
444         }
445     } else {
446         // span not
447         for(;;) {
448             c=*(--limit);
449             if(c<=0x7f) {
450                 if(asciiBytes[c]) {
451                     break;
452                 }
453             } else if(c<=0x7ff) {
454                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
455                     break;
456                 }
457             } else if(c<0xd800 || c>=0xe000) {
458                 int lead=c>>12;
459                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
460                 if(twoBits<=1) {
461                     // All 64 code points with the same bits 15..6
462                     // are either in the set or not.
463                     if(twoBits!=0) {
464                         break;
465                     }
466                 } else {
467                     // Look up the code point in its 4k block of code points.
468                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
469                         break;
470                     }
471                 }
472             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
473                 // surrogate code point
474                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
475                     break;
476                 }
477             } else {
478                 // surrogate pair
479                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
480                     break;
481                 }
482                 --limit;
483             }
484             if(s==limit) {
485                 return s;
486             }
487         }
488     }
489     return limit+1;
490 }
491 
492 /*
493  * Precheck for sufficient trail bytes at end of string only once per span.
494  * Check validity.
495  */
496 const uint8_t *
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const497 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
498     const uint8_t *limit=s+length;
499     uint8_t b=*s;
500     if((int8_t)b>=0) {
501         // Initial all-ASCII span.
502         if(spanCondition) {
503             do {
504                 if(!asciiBytes[b] || ++s==limit) {
505                     return s;
506                 }
507                 b=*s;
508             } while((int8_t)b>=0);
509         } else {
510             do {
511                 if(asciiBytes[b] || ++s==limit) {
512                     return s;
513                 }
514                 b=*s;
515             } while((int8_t)b>=0);
516         }
517         length=(int32_t)(limit-s);
518     }
519 
520     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
521         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
522     }
523 
524     const uint8_t *limit0=limit;
525 
526     /*
527      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
528      * or runs into a lead byte.
529      * In the span loop compare s with limit only once
530      * per multi-byte character.
531      *
532      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
533      * including it if that is part of the span, otherwise set limit0 to before
534      * the truncated sequence.
535      */
536     b=*(limit-1);
537     if((int8_t)b<0) {
538         // b>=0x80: lead or trail byte
539         if(b<0xc0) {
540             // single trail byte, check for preceding 3- or 4-byte lead byte
541             if(length>=2 && (b=*(limit-2))>=0xe0) {
542                 limit-=2;
543                 if(asciiBytes[0x80]!=spanCondition) {
544                     limit0=limit;
545                 }
546             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
547                 // 4-byte lead byte with only two trail bytes
548                 limit-=3;
549                 if(asciiBytes[0x80]!=spanCondition) {
550                     limit0=limit;
551                 }
552             }
553         } else {
554             // lead byte with no trail bytes
555             --limit;
556             if(asciiBytes[0x80]!=spanCondition) {
557                 limit0=limit;
558             }
559         }
560     }
561 
562     uint8_t t1, t2, t3;
563 
564     while(s<limit) {
565         b=*s;
566         if(b<0xc0) {
567             // ASCII; or trail bytes with the result of contains(FFFD).
568             if(spanCondition) {
569                 do {
570                     if(!asciiBytes[b]) {
571                         return s;
572                     } else if(++s==limit) {
573                         return limit0;
574                     }
575                     b=*s;
576                 } while(b<0xc0);
577             } else {
578                 do {
579                     if(asciiBytes[b]) {
580                         return s;
581                     } else if(++s==limit) {
582                         return limit0;
583                     }
584                     b=*s;
585                 } while(b<0xc0);
586             }
587         }
588         ++s;  // Advance past the lead byte.
589         if(b>=0xe0) {
590             if(b<0xf0) {
591                 if( /* handle U+0000..U+FFFF inline */
592                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
593                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
594                 ) {
595                     b&=0xf;
596                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
597                     if(twoBits<=1) {
598                         // All 64 code points with this lead byte and middle trail byte
599                         // are either in the set or not.
600                         if(twoBits!=(uint32_t)spanCondition) {
601                             return s-1;
602                         }
603                     } else {
604                         // Look up the code point in its 4k block of code points.
605                         UChar32 c=(b<<12)|(t1<<6)|t2;
606                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
607                             return s-1;
608                         }
609                     }
610                     s+=2;
611                     continue;
612                 }
613             } else if( /* handle U+10000..U+10FFFF inline */
614                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
615                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
616                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
617             ) {
618                 // Give an illegal sequence the same value as the result of contains(FFFD).
619                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
620                 if( (   (0x10000<=c && c<=0x10ffff) ?
621                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
622                             asciiBytes[0x80]
623                     ) != spanCondition
624                 ) {
625                     return s-1;
626                 }
627                 s+=3;
628                 continue;
629             }
630         } else /* 0xc0<=b<0xe0 */ {
631             if( /* handle U+0000..U+07FF inline */
632                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
633             ) {
634                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
635                     return s-1;
636                 }
637                 ++s;
638                 continue;
639             }
640         }
641 
642         // Give an illegal sequence the same value as the result of contains(FFFD).
643         // Handle each byte of an illegal sequence separately to simplify the code;
644         // no need to optimize error handling.
645         if(asciiBytes[0x80]!=spanCondition) {
646             return s-1;
647         }
648     }
649 
650     return limit0;
651 }
652 
653 /*
654  * While going backwards through UTF-8 optimize only for ASCII.
655  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
656  * possible to tell from the last byte in a multi-byte sequence how many
657  * preceding bytes there should be. Therefore, going backwards through UTF-8
658  * is much harder than going forward.
659  */
660 int32_t
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const661 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
662     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
663         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
664     }
665 
666     uint8_t b;
667 
668     do {
669         b=s[--length];
670         if((int8_t)b>=0) {
671             // ASCII sub-span
672             if(spanCondition) {
673                 do {
674                     if(!asciiBytes[b]) {
675                         return length+1;
676                     } else if(length==0) {
677                         return 0;
678                     }
679                     b=s[--length];
680                 } while((int8_t)b>=0);
681             } else {
682                 do {
683                     if(asciiBytes[b]) {
684                         return length+1;
685                     } else if(length==0) {
686                         return 0;
687                     }
688                     b=s[--length];
689                 } while((int8_t)b>=0);
690             }
691         }
692 
693         int32_t prev=length;
694         UChar32 c;
695         // trail byte: collect a multi-byte character
696         // (or  lead byte in last-trail position)
697         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
698         // c is a valid code point, not ASCII, not a surrogate
699         if(c<=0x7ff) {
700             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
701                 return prev+1;
702             }
703         } else if(c<=0xffff) {
704             int lead=c>>12;
705             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
706             if(twoBits<=1) {
707                 // All 64 code points with the same bits 15..6
708                 // are either in the set or not.
709                 if(twoBits!=(uint32_t)spanCondition) {
710                     return prev+1;
711                 }
712             } else {
713                 // Look up the code point in its 4k block of code points.
714                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
715                     return prev+1;
716                 }
717             }
718         } else {
719             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
720                 return prev+1;
721             }
722         }
723     } while(length>0);
724     return 0;
725 }
726 
727 U_NAMESPACE_END
728