1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2007-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  bmpset.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2007jan29
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/utf8.h"
20 #include "unicode/utf16.h"
21 #include "cmemory.h"
22 #include "bmpset.h"
23 #include "uassert.h"
24 
25 U_NAMESPACE_BEGIN
26 
BMPSet(const int32_t * parentList,int32_t parentListLength)27 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
28         list(parentList), listLength(parentListLength) {
29     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
30     uprv_memset(table7FF, 0, sizeof(table7FF));
31     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
32 
33     /*
34      * Set the list indexes for binary searches for
35      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
36      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
37      * looked up in the bit tables.
38      * The last pair of indexes is for finding supplementary code points.
39      */
40     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
41     int32_t i;
42     for(i=1; i<=0x10; ++i) {
43         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
44     }
45     list4kStarts[0x11]=listLength-1;
46 
47     initBits();
48     overrideIllegal();
49 }
50 
BMPSet(const BMPSet & otherBMPSet,const int32_t * newParentList,int32_t newParentListLength)51 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
52         list(newParentList), listLength(newParentListLength) {
53     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
54     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
55     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
56     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
57 }
58 
~BMPSet()59 BMPSet::~BMPSet() {
60 }
61 
62 /*
63  * Set bits in a bit rectangle in "vertical" bit organization.
64  * start<limit<=0x800
65  */
set32x64Bits(uint32_t table[64],int32_t start,int32_t limit)66 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
67     U_ASSERT(start<limit);
68     U_ASSERT(limit<=0x800);
69 
70     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
71     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
72 
73     // Set one bit indicating an all-one block.
74     uint32_t bits=(uint32_t)1<<lead;
75     if((start+1)==limit) {  // Single-character shortcut.
76         table[trail]|=bits;
77         return;
78     }
79 
80     int32_t limitLead=limit>>6;
81     int32_t limitTrail=limit&0x3f;
82 
83     if(lead==limitLead) {
84         // Partial vertical bit column.
85         while(trail<limitTrail) {
86             table[trail++]|=bits;
87         }
88     } else {
89         // Partial vertical bit column,
90         // followed by a bit rectangle,
91         // followed by another partial vertical bit column.
92         if(trail>0) {
93             do {
94                 table[trail++]|=bits;
95             } while(trail<64);
96             ++lead;
97         }
98         if(lead<limitLead) {
99             bits=~((1<<lead)-1);
100             if(limitLead<0x20) {
101                 bits&=(1<<limitLead)-1;
102             }
103             for(trail=0; trail<64; ++trail) {
104                 table[trail]|=bits;
105             }
106         }
107         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
108         // In that case, bits=1<<limitLead is undefined but the bits value
109         // is not used because trail<limitTrail is already false.
110         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
111         for(trail=0; trail<limitTrail; ++trail) {
112             table[trail]|=bits;
113         }
114     }
115 }
116 
initBits()117 void BMPSet::initBits() {
118     UChar32 start, limit;
119     int32_t listIndex=0;
120 
121     // Set asciiBytes[].
122     do {
123         start=list[listIndex++];
124         if(listIndex<listLength) {
125             limit=list[listIndex++];
126         } else {
127             limit=0x110000;
128         }
129         if(start>=0x80) {
130             break;
131         }
132         do {
133             asciiBytes[start++]=1;
134         } while(start<limit && start<0x80);
135     } while(limit<=0x80);
136 
137     // Set table7FF[].
138     while(start<0x800) {
139         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
140         if(limit>0x800) {
141             start=0x800;
142             break;
143         }
144 
145         start=list[listIndex++];
146         if(listIndex<listLength) {
147             limit=list[listIndex++];
148         } else {
149             limit=0x110000;
150         }
151     }
152 
153     // Set bmpBlockBits[].
154     int32_t minStart=0x800;
155     while(start<0x10000) {
156         if(limit>0x10000) {
157             limit=0x10000;
158         }
159 
160         if(start<minStart) {
161             start=minStart;
162         }
163         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
164             if(start&0x3f) {
165                 // Mixed-value block of 64 code points.
166                 start>>=6;
167                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
168                 start=(start+1)<<6;  // Round up to the next block boundary.
169                 minStart=start;      // Ignore further ranges in this block.
170             }
171             if(start<limit) {
172                 if(start<(limit&~0x3f)) {
173                     // Multiple all-ones blocks of 64 code points each.
174                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
175                 }
176 
177                 if(limit&0x3f) {
178                     // Mixed-value block of 64 code points.
179                     limit>>=6;
180                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
181                     limit=(limit+1)<<6;  // Round up to the next block boundary.
182                     minStart=limit;      // Ignore further ranges in this block.
183                 }
184             }
185         }
186 
187         if(limit==0x10000) {
188             break;
189         }
190 
191         start=list[listIndex++];
192         if(listIndex<listLength) {
193             limit=list[listIndex++];
194         } else {
195             limit=0x110000;
196         }
197     }
198 }
199 
200 /*
201  * Override some bits and bytes to the result of contains(FFFD)
202  * for faster validity checking at runtime.
203  * No need to set 0 values where they were reset to 0 in the constructor
204  * and not modified by initBits().
205  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
206  * Need to set 0 values for surrogates D800..DFFF.
207  */
overrideIllegal()208 void BMPSet::overrideIllegal() {
209     uint32_t bits, mask;
210     int32_t i;
211 
212     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
213         // contains(FFFD)==TRUE
214         for(i=0x80; i<0xc0; ++i) {
215             asciiBytes[i]=1;
216         }
217 
218         bits=3;                 // Lead bytes 0xC0 and 0xC1.
219         for(i=0; i<64; ++i) {
220             table7FF[i]|=bits;
221         }
222 
223         bits=1;                 // Lead byte 0xE0.
224         for(i=0; i<32; ++i) {   // First half of 4k block.
225             bmpBlockBits[i]|=bits;
226         }
227 
228         mask=~(0x10001<<0xd);   // Lead byte 0xED.
229         bits=1<<0xd;
230         for(i=32; i<64; ++i) {  // Second half of 4k block.
231             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
232         }
233     } else {
234         // contains(FFFD)==FALSE
235         mask=~(0x10001<<0xd);   // Lead byte 0xED.
236         for(i=32; i<64; ++i) {  // Second half of 4k block.
237             bmpBlockBits[i]&=mask;
238         }
239     }
240 }
241 
findCodePoint(UChar32 c,int32_t lo,int32_t hi) const242 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
243     /* Examples:
244                                        findCodePoint(c)
245        set              list[]         c=0 1 3 4 7 8
246        ===              ==============   ===========
247        []               [110000]         0 0 0 0 0 0
248        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
249        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
250        [:Any:]          [0, 110000]      1 1 1 1 1 1
251      */
252 
253     // Return the smallest i such that c < list[i].  Assume
254     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
255     if (c < list[lo])
256         return lo;
257     // High runner test.  c is often after the last range, so an
258     // initial check for this condition pays off.
259     if (lo >= hi || c >= list[hi-1])
260         return hi;
261     // invariant: c >= list[lo]
262     // invariant: c < list[hi]
263     for (;;) {
264         int32_t i = (lo + hi) >> 1;
265         if (i == lo) {
266             break; // Found!
267         } else if (c < list[i]) {
268             hi = i;
269         } else {
270             lo = i;
271         }
272     }
273     return hi;
274 }
275 
276 UBool
contains(UChar32 c) const277 BMPSet::contains(UChar32 c) const {
278     if((uint32_t)c<=0x7f) {
279         return (UBool)asciiBytes[c];
280     } else if((uint32_t)c<=0x7ff) {
281         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
282     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
283         int lead=c>>12;
284         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
285         if(twoBits<=1) {
286             // All 64 code points with the same bits 15..6
287             // are either in the set or not.
288             return (UBool)twoBits;
289         } else {
290             // Look up the code point in its 4k block of code points.
291             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
292         }
293     } else if((uint32_t)c<=0x10ffff) {
294         // surrogate or supplementary code point
295         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
296     } else {
297         // Out-of-range code points get FALSE, consistent with long-standing
298         // behavior of UnicodeSet::contains(c).
299         return FALSE;
300     }
301 }
302 
303 /*
304  * Check for sufficient length for trail unit for each surrogate pair.
305  * Handle single surrogates as surrogate code points as usual in ICU.
306  */
307 const UChar *
span(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const308 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
309     UChar c, c2;
310 
311     if(spanCondition) {
312         // span
313         do {
314             c=*s;
315             if(c<=0x7f) {
316                 if(!asciiBytes[c]) {
317                     break;
318                 }
319             } else if(c<=0x7ff) {
320                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
321                     break;
322                 }
323             } else if(c<0xd800 || c>=0xe000) {
324                 int lead=c>>12;
325                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
326                 if(twoBits<=1) {
327                     // All 64 code points with the same bits 15..6
328                     // are either in the set or not.
329                     if(twoBits==0) {
330                         break;
331                     }
332                 } else {
333                     // Look up the code point in its 4k block of code points.
334                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
335                         break;
336                     }
337                 }
338             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
339                 // surrogate code point
340                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
341                     break;
342                 }
343             } else {
344                 // surrogate pair
345                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
346                     break;
347                 }
348                 ++s;
349             }
350         } while(++s<limit);
351     } else {
352         // span not
353         do {
354             c=*s;
355             if(c<=0x7f) {
356                 if(asciiBytes[c]) {
357                     break;
358                 }
359             } else if(c<=0x7ff) {
360                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
361                     break;
362                 }
363             } else if(c<0xd800 || c>=0xe000) {
364                 int lead=c>>12;
365                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
366                 if(twoBits<=1) {
367                     // All 64 code points with the same bits 15..6
368                     // are either in the set or not.
369                     if(twoBits!=0) {
370                         break;
371                     }
372                 } else {
373                     // Look up the code point in its 4k block of code points.
374                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
375                         break;
376                     }
377                 }
378             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
379                 // surrogate code point
380                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
381                     break;
382                 }
383             } else {
384                 // surrogate pair
385                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
386                     break;
387                 }
388                 ++s;
389             }
390         } while(++s<limit);
391     }
392     return s;
393 }
394 
395 /* Symmetrical with span(). */
396 const UChar *
spanBack(const UChar * s,const UChar * limit,USetSpanCondition spanCondition) const397 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
398     UChar c, c2;
399 
400     if(spanCondition) {
401         // span
402         for(;;) {
403             c=*(--limit);
404             if(c<=0x7f) {
405                 if(!asciiBytes[c]) {
406                     break;
407                 }
408             } else if(c<=0x7ff) {
409                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
410                     break;
411                 }
412             } else if(c<0xd800 || c>=0xe000) {
413                 int lead=c>>12;
414                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
415                 if(twoBits<=1) {
416                     // All 64 code points with the same bits 15..6
417                     // are either in the set or not.
418                     if(twoBits==0) {
419                         break;
420                     }
421                 } else {
422                     // Look up the code point in its 4k block of code points.
423                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
424                         break;
425                     }
426                 }
427             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
428                 // surrogate code point
429                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
430                     break;
431                 }
432             } else {
433                 // surrogate pair
434                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
435                     break;
436                 }
437                 --limit;
438             }
439             if(s==limit) {
440                 return s;
441             }
442         }
443     } else {
444         // span not
445         for(;;) {
446             c=*(--limit);
447             if(c<=0x7f) {
448                 if(asciiBytes[c]) {
449                     break;
450                 }
451             } else if(c<=0x7ff) {
452                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
453                     break;
454                 }
455             } else if(c<0xd800 || c>=0xe000) {
456                 int lead=c>>12;
457                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
458                 if(twoBits<=1) {
459                     // All 64 code points with the same bits 15..6
460                     // are either in the set or not.
461                     if(twoBits!=0) {
462                         break;
463                     }
464                 } else {
465                     // Look up the code point in its 4k block of code points.
466                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
467                         break;
468                     }
469                 }
470             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
471                 // surrogate code point
472                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
473                     break;
474                 }
475             } else {
476                 // surrogate pair
477                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
478                     break;
479                 }
480                 --limit;
481             }
482             if(s==limit) {
483                 return s;
484             }
485         }
486     }
487     return limit+1;
488 }
489 
490 /*
491  * Precheck for sufficient trail bytes at end of string only once per span.
492  * Check validity.
493  */
494 const uint8_t *
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const495 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
496     const uint8_t *limit=s+length;
497     uint8_t b=*s;
498     if((int8_t)b>=0) {
499         // Initial all-ASCII span.
500         if(spanCondition) {
501             do {
502                 if(!asciiBytes[b] || ++s==limit) {
503                     return s;
504                 }
505                 b=*s;
506             } while((int8_t)b>=0);
507         } else {
508             do {
509                 if(asciiBytes[b] || ++s==limit) {
510                     return s;
511                 }
512                 b=*s;
513             } while((int8_t)b>=0);
514         }
515         length=(int32_t)(limit-s);
516     }
517 
518     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
519         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
520     }
521 
522     const uint8_t *limit0=limit;
523 
524     /*
525      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
526      * or runs into a lead byte.
527      * In the span loop compare s with limit only once
528      * per multi-byte character.
529      *
530      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
531      * including it if that is part of the span, otherwise set limit0 to before
532      * the truncated sequence.
533      */
534     b=*(limit-1);
535     if((int8_t)b<0) {
536         // b>=0x80: lead or trail byte
537         if(b<0xc0) {
538             // single trail byte, check for preceding 3- or 4-byte lead byte
539             if(length>=2 && (b=*(limit-2))>=0xe0) {
540                 limit-=2;
541                 if(asciiBytes[0x80]!=spanCondition) {
542                     limit0=limit;
543                 }
544             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
545                 // 4-byte lead byte with only two trail bytes
546                 limit-=3;
547                 if(asciiBytes[0x80]!=spanCondition) {
548                     limit0=limit;
549                 }
550             }
551         } else {
552             // lead byte with no trail bytes
553             --limit;
554             if(asciiBytes[0x80]!=spanCondition) {
555                 limit0=limit;
556             }
557         }
558     }
559 
560     uint8_t t1, t2, t3;
561 
562     while(s<limit) {
563         b=*s;
564         if(b<0xc0) {
565             // ASCII; or trail bytes with the result of contains(FFFD).
566             if(spanCondition) {
567                 do {
568                     if(!asciiBytes[b]) {
569                         return s;
570                     } else if(++s==limit) {
571                         return limit0;
572                     }
573                     b=*s;
574                 } while(b<0xc0);
575             } else {
576                 do {
577                     if(asciiBytes[b]) {
578                         return s;
579                     } else if(++s==limit) {
580                         return limit0;
581                     }
582                     b=*s;
583                 } while(b<0xc0);
584             }
585         }
586         ++s;  // Advance past the lead byte.
587         if(b>=0xe0) {
588             if(b<0xf0) {
589                 if( /* handle U+0000..U+FFFF inline */
590                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
591                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
592                 ) {
593                     b&=0xf;
594                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
595                     if(twoBits<=1) {
596                         // All 64 code points with this lead byte and middle trail byte
597                         // are either in the set or not.
598                         if(twoBits!=(uint32_t)spanCondition) {
599                             return s-1;
600                         }
601                     } else {
602                         // Look up the code point in its 4k block of code points.
603                         UChar32 c=(b<<12)|(t1<<6)|t2;
604                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
605                             return s-1;
606                         }
607                     }
608                     s+=2;
609                     continue;
610                 }
611             } else if( /* handle U+10000..U+10FFFF inline */
612                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
613                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
614                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
615             ) {
616                 // Give an illegal sequence the same value as the result of contains(FFFD).
617                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
618                 if( (   (0x10000<=c && c<=0x10ffff) ?
619                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
620                             asciiBytes[0x80]
621                     ) != spanCondition
622                 ) {
623                     return s-1;
624                 }
625                 s+=3;
626                 continue;
627             }
628         } else /* 0xc0<=b<0xe0 */ {
629             if( /* handle U+0000..U+07FF inline */
630                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
631             ) {
632                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
633                     return s-1;
634                 }
635                 ++s;
636                 continue;
637             }
638         }
639 
640         // Give an illegal sequence the same value as the result of contains(FFFD).
641         // Handle each byte of an illegal sequence separately to simplify the code;
642         // no need to optimize error handling.
643         if(asciiBytes[0x80]!=spanCondition) {
644             return s-1;
645         }
646     }
647 
648     return limit0;
649 }
650 
651 /*
652  * While going backwards through UTF-8 optimize only for ASCII.
653  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
654  * possible to tell from the last byte in a multi-byte sequence how many
655  * preceding bytes there should be. Therefore, going backwards through UTF-8
656  * is much harder than going forward.
657  */
658 int32_t
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const659 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
660     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
661         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
662     }
663 
664     uint8_t b;
665 
666     do {
667         b=s[--length];
668         if((int8_t)b>=0) {
669             // ASCII sub-span
670             if(spanCondition) {
671                 do {
672                     if(!asciiBytes[b]) {
673                         return length+1;
674                     } else if(length==0) {
675                         return 0;
676                     }
677                     b=s[--length];
678                 } while((int8_t)b>=0);
679             } else {
680                 do {
681                     if(asciiBytes[b]) {
682                         return length+1;
683                     } else if(length==0) {
684                         return 0;
685                     }
686                     b=s[--length];
687                 } while((int8_t)b>=0);
688             }
689         }
690 
691         int32_t prev=length;
692         UChar32 c;
693         // trail byte: collect a multi-byte character
694         // (or  lead byte in last-trail position)
695         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
696         // c is a valid code point, not ASCII, not a surrogate
697         if(c<=0x7ff) {
698             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
699                 return prev+1;
700             }
701         } else if(c<=0xffff) {
702             int lead=c>>12;
703             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
704             if(twoBits<=1) {
705                 // All 64 code points with the same bits 15..6
706                 // are either in the set or not.
707                 if(twoBits!=(uint32_t)spanCondition) {
708                     return prev+1;
709                 }
710             } else {
711                 // Look up the code point in its 4k block of code points.
712                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
713                     return prev+1;
714                 }
715             }
716         } else {
717             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
718                 return prev+1;
719             }
720         }
721     } while(length>0);
722     return 0;
723 }
724 
725 U_NAMESPACE_END
726