1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html
4 /*
5  *******************************************************************************
6  * Copyright (C) 2003-2015, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 
11 package android.icu.text;
12 
13 import java.io.IOException;
14 import java.io.InputStream;
15 import java.lang.ref.WeakReference;
16 import java.nio.ByteBuffer;
17 
18 import android.icu.impl.CharTrie;
19 import android.icu.impl.ICUBinary;
20 import android.icu.impl.StringPrepDataReader;
21 import android.icu.impl.UBiDiProps;
22 import android.icu.lang.UCharacter;
23 import android.icu.lang.UCharacterDirection;
24 import android.icu.util.ICUUncheckedIOException;
25 import android.icu.util.VersionInfo;
26 
27 /**
28  * StringPrep API implements the StingPrep framework as described by
29  * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
30  * StringPrep prepares Unicode strings for use in network protocols.
31  * Profiles of StingPrep are set of rules and data according to which the
32  * Unicode Strings are prepared. Each profiles contains tables which describe
33  * how a code point should be treated. The tables are broadly classied into
34  * <ul>
35  *     <li> Unassigned Table: Contains code points that are unassigned
36  *          in the Unicode Version supported by StringPrep. Currently
37  *          RFC 3454 supports Unicode 3.2. </li>
38  *     <li> Prohibited Table: Contains code points that are prohibted from
39  *          the output of the StringPrep processing function. </li>
40  *     <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
41  * </ul>
42  *
43  * The procedure for preparing Unicode strings:
44  * <ol>
45  *      <li> Map: For each character in the input, check if it has a mapping
46  *           and, if so, replace it with its mapping. </li>
47  *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
48  *           normalization. </li>
49  *      <li> Prohibit: Check for any characters that are not allowed in the
50  *           output.  If any are found, return an error.</li>
51  *      <li> Check bidi: Possibly check for right-to-left characters, and if
52  *           any are found, make sure that the whole string satisfies the
53  *           requirements for bidirectional strings.  If the string does not
54  *           satisfy the requirements for bidirectional strings, return an
55  *           error.  </li>
56  * </ol>
57  * @author Ram Viswanadha
58  * @hide Only a subset of ICU is exposed in Android
59  */
60 @libcore.api.CorePlatformApi
61 public final class StringPrep {
62     /**
63      * Option to prohibit processing of unassigned code points in the input
64      *
65      * @see   #prepare
66      */
67     @libcore.api.CorePlatformApi
68     public static final int DEFAULT = 0x0000;
69 
70     /**
71      * Option to allow processing of unassigned code points in the input
72      *
73      * @see   #prepare
74      */
75     public static final int ALLOW_UNASSIGNED = 0x0001;
76 
77     /**
78      * Profile type: RFC3491 Nameprep
79      * @see #getInstance(int)
80      */
81     public static final int RFC3491_NAMEPREP = 0;
82 
83     /**
84      * Profile type: RFC3530 nfs4_cs_prep
85      * @see #getInstance(int)
86      */
87     public static final int RFC3530_NFS4_CS_PREP = 1;
88 
89     /**
90      * Profile type: RFC3530 nfs4_cs_prep with case insensitive option
91      * @see #getInstance(int)
92      */
93     public static final int RFC3530_NFS4_CS_PREP_CI = 2;
94 
95     /**
96      * Profile type: RFC3530 nfs4_cis_prep
97      * @see #getInstance(int)
98      */
99     public static final int RFC3530_NFS4_CIS_PREP = 3;
100 
101     /**
102      * Profile type: RFC3530 nfs4_mixed_prep for prefix
103      * @see #getInstance(int)
104      */
105     public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4;
106 
107     /**
108      * Profile type: RFC3530 nfs4_mixed_prep for suffix
109      * @see #getInstance(int)
110      */
111     public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5;
112 
113     /**
114      * Profile type: RFC3722 iSCSI
115      * @see #getInstance(int)
116      */
117     public static final int RFC3722_ISCSI = 6;
118 
119     /**
120      * Profile type: RFC3920 XMPP Nodeprep
121      * @see #getInstance(int)
122      */
123     public static final int RFC3920_NODEPREP = 7;
124 
125     /**
126      * Profile type: RFC3920 XMPP Resourceprep
127      * @see #getInstance(int)
128      */
129     @libcore.api.CorePlatformApi
130     public static final int RFC3920_RESOURCEPREP = 8;
131 
132     /**
133      * Profile type: RFC4011 Policy MIB Stringprep
134      * @see #getInstance(int)
135      */
136     public static final int RFC4011_MIB = 9;
137 
138     /**
139      * Profile type: RFC4013 SASLprep
140      * @see #getInstance(int)
141      */
142     public static final int RFC4013_SASLPREP = 10;
143 
144     /**
145      * Profile type: RFC4505 trace
146      * @see #getInstance(int)
147      */
148     public static final int RFC4505_TRACE = 11;
149 
150     /**
151      * Profile type: RFC4518 LDAP
152      * @see #getInstance(int)
153      */
154     public static final int RFC4518_LDAP = 12;
155 
156     /**
157      * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix
158      * matching rules
159      * @see #getInstance(int)
160      */
161     public static final int RFC4518_LDAP_CI = 13;
162 
163     // Last available profile
164     private static final int MAX_PROFILE = RFC4518_LDAP_CI;
165 
166     // Profile names must be aligned to profile type definitions
167     private static final String[] PROFILE_NAMES = {
168         "rfc3491",      /* RFC3491_NAMEPREP */
169         "rfc3530cs",    /* RFC3530_NFS4_CS_PREP */
170         "rfc3530csci",  /* RFC3530_NFS4_CS_PREP_CI */
171         "rfc3491",      /* RFC3530_NSF4_CIS_PREP */
172         "rfc3530mixp",  /* RFC3530_NSF4_MIXED_PREP_PREFIX */
173         "rfc3491",      /* RFC3530_NSF4_MIXED_PREP_SUFFIX */
174         "rfc3722",      /* RFC3722_ISCSI */
175         "rfc3920node",  /* RFC3920_NODEPREP */
176         "rfc3920res",   /* RFC3920_RESOURCEPREP */
177         "rfc4011",      /* RFC4011_MIB */
178         "rfc4013",      /* RFC4013_SASLPREP */
179         "rfc4505",      /* RFC4505_TRACE */
180         "rfc4518",      /* RFC4518_LDAP */
181         "rfc4518ci",    /* RFC4518_LDAP_CI */
182     };
183 
184     @SuppressWarnings({"unchecked", "rawtypes"})
185     private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1];
186 
187     private static final int UNASSIGNED        = 0x0000;
188     private static final int MAP               = 0x0001;
189     private static final int PROHIBITED        = 0x0002;
190     private static final int DELETE            = 0x0003;
191     private static final int TYPE_LIMIT        = 0x0004;
192 
193     private static final int NORMALIZATION_ON  = 0x0001;
194     private static final int CHECK_BIDI_ON     = 0x0002;
195 
196     private static final int TYPE_THRESHOLD       = 0xFFF0;
197     private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
198     //private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
199 
200     /* indexes[] value names */
201 //  private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
202     private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
203     private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
204     private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
205     private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
206     private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
207     private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
208     private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
209     private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
210 
211 
212     // CharTrie implmentation for reading the trie data
213     private CharTrie sprepTrie;
214     // Indexes read from the data file
215     private int[] indexes;
216     // mapping data read from the data file
217     private char[] mappingData;
218     // the version of Unicode supported by the data file
219     private VersionInfo sprepUniVer;
220     // the Unicode version of last entry in the
221     // NormalizationCorrections.txt file if normalization
222     // is turned on
223     private VersionInfo normCorrVer;
224     // Option to turn on Normalization
225     private boolean doNFKC;
226     // Option to turn on checking for BiDi rules
227     private boolean checkBiDi;
228     // bidi properties
229     private UBiDiProps bdp;
230 
getCodePointValue(int ch)231     private char getCodePointValue(int ch){
232         return sprepTrie.getCodePointValue(ch);
233     }
234 
getVersionInfo(int comp)235     private static VersionInfo getVersionInfo(int comp){
236         int micro = comp & 0xFF;
237         int milli =(comp >> 8)  & 0xFF;
238         int minor =(comp >> 16) & 0xFF;
239         int major =(comp >> 24) & 0xFF;
240         return VersionInfo.getInstance(major,minor,milli,micro);
241     }
242 
getVersionInfo(byte[] version)243     private static VersionInfo getVersionInfo(byte[] version){
244         if(version.length != 4){
245             return null;
246         }
247         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
248     }
249 
250     /**
251      * Creates an StringPrep object after reading the input stream.
252      * The object does not hold a reference to the input steam, so the stream can be
253      * closed after the method returns.
254      *
255      * @param inputStream The stream for reading the StringPrep profile binarySun
256      * @throws IOException An exception occurs when I/O of the inputstream is invalid
257      */
StringPrep(InputStream inputStream)258     public StringPrep(InputStream inputStream) throws IOException{
259         // TODO: Add a public constructor that takes ByteBuffer directly.
260         this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream));
261     }
262 
StringPrep(ByteBuffer bytes)263     private StringPrep(ByteBuffer bytes) throws IOException {
264         StringPrepDataReader reader = new StringPrepDataReader(bytes);
265 
266         // read the indexes
267         indexes = reader.readIndexes(INDEX_TOP);
268 
269         sprepTrie = new CharTrie(bytes, null);
270 
271         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
272         // load the rest of the data data and initialize the data members
273         mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2);
274 
275         // get the options
276         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
277         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
278         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
279         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
280         VersionInfo normUniVer = UCharacter.getUnicodeVersion();
281         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
282            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
283            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
284            ){
285             throw new IOException("Normalization Correction version not supported");
286         }
287 
288         if(checkBiDi) {
289             bdp=UBiDiProps.INSTANCE;
290         }
291     }
292 
293     /**
294      * Gets a StringPrep instance for the specified profile
295      *
296      * @param profile The profile passed to find the StringPrep instance.
297      */
298     @libcore.api.CorePlatformApi
getInstance(int profile)299     public static StringPrep getInstance(int profile) {
300         if (profile < 0 || profile > MAX_PROFILE) {
301             throw new IllegalArgumentException("Bad profile type");
302         }
303 
304         StringPrep instance = null;
305 
306         // A StringPrep instance is immutable.  We use a single instance
307         // per type and store it in the internal cache.
308         synchronized (CACHE) {
309             WeakReference<StringPrep> ref = CACHE[profile];
310             if (ref != null) {
311                 instance = ref.get();
312             }
313 
314             if (instance == null) {
315                 ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp");
316                 if (bytes != null) {
317                     try {
318                         instance = new StringPrep(bytes);
319                     } catch (IOException e) {
320                         throw new ICUUncheckedIOException(e);
321                     }
322                 }
323                 if (instance != null) {
324                     CACHE[profile] = new WeakReference<StringPrep>(instance);
325                 }
326             }
327         }
328         return instance;
329     }
330 
331     private static final class Values{
332         boolean isIndex;
333         int value;
334         int type;
reset()335         public void reset(){
336             isIndex = false;
337             value = 0;
338             type = -1;
339         }
340     }
341 
getValues(char trieWord,Values values)342     private static final void getValues(char trieWord,Values values){
343         values.reset();
344         if(trieWord == 0){
345             /*
346              * Initial value stored in the mapping table
347              * just return TYPE_LIMIT .. so that
348              * the source codepoint is copied to the destination
349              */
350             values.type = TYPE_LIMIT;
351         }else if(trieWord >= TYPE_THRESHOLD){
352             values.type = (trieWord - TYPE_THRESHOLD);
353         }else{
354             /* get the type */
355             values.type = MAP;
356             /* ascertain if the value is index or delta */
357             if((trieWord & 0x02)>0){
358                 values.isIndex = true;
359                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
360 
361             }else{
362                 values.isIndex = false;
363                 values.value = (trieWord<<16)>>16;
364                 values.value =  (values.value >> 2);
365 
366             }
367 
368             if((trieWord>>2) == MAX_INDEX_VALUE){
369                 values.type = DELETE;
370                 values.isIndex = false;
371                 values.value = 0;
372             }
373         }
374     }
375 
376 
377 
map( UCharacterIterator iter, int options)378     private StringBuffer map( UCharacterIterator iter, int options)
379                             throws StringPrepParseException{
380 
381         Values val = new Values();
382         char result = 0;
383         int ch  = UCharacterIterator.DONE;
384         StringBuffer dest = new StringBuffer();
385         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
386 
387         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
388 
389             result = getCodePointValue(ch);
390             getValues(result,val);
391 
392             // check if the source codepoint is unassigned
393             if(val.type == UNASSIGNED && allowUnassigned == false){
394                  throw new StringPrepParseException("An unassigned code point was found in the input",
395                                           StringPrepParseException.UNASSIGNED_ERROR,
396                                           iter.getText(),iter.getIndex());
397             }else if((val.type == MAP)){
398                 int index, length;
399 
400                 if(val.isIndex){
401                     index = val.value;
402                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
403                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
404                         length = 1;
405                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
406                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
407                         length = 2;
408                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
409                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
410                         length = 3;
411                     }else{
412                         length = mappingData[index++];
413                     }
414                     /* copy mapping to destination */
415                     dest.append(mappingData,index,length);
416                     continue;
417 
418                 }else{
419                     ch -= val.value;
420                 }
421             }else if(val.type == DELETE){
422                 // just consume the codepoint and contine
423                 continue;
424             }
425             //copy the source into destination
426             UTF16.append(dest,ch);
427         }
428 
429         return dest;
430     }
431 
432 
normalize(StringBuffer src)433     private StringBuffer normalize(StringBuffer src){
434         return new StringBuffer(
435             Normalizer.normalize(
436                 src.toString(),
437                 Normalizer.NFKC,
438                 Normalizer.UNICODE_3_2));
439     }
440     /*
441     boolean isLabelSeparator(int ch){
442         int result = getCodePointValue(ch);
443         if( (result & 0x07)  == LABEL_SEPARATOR){
444             return true;
445         }
446         return false;
447     }
448     */
449      /*
450        1) Map -- For each character in the input, check if it has a mapping
451           and, if so, replace it with its mapping.
452 
453        2) Normalize -- Possibly normalize the result of step 1 using Unicode
454           normalization.
455 
456        3) Prohibit -- Check for any characters that are not allowed in the
457           output.  If any are found, return an error.
458 
459        4) Check bidi -- Possibly check for right-to-left characters, and if
460           any are found, make sure that the whole string satisfies the
461           requirements for bidirectional strings.  If the string does not
462           satisfy the requirements for bidirectional strings, return an
463           error.
464           [Unicode3.2] defines several bidirectional categories; each character
465            has one bidirectional category assigned to it.  For the purposes of
466            the requirements below, an "RandALCat character" is a character that
467            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
468            is a character that has Unicode bidirectional category "L".  Note
469 
470 
471            that there are many characters which fall in neither of the above
472            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
473            this because they have bidirectional category "EN".
474 
475            In any profile that specifies bidirectional character handling, all
476            three of the following requirements MUST be met:
477 
478            1) The characters in section 5.8 MUST be prohibited.
479 
480            2) If a string contains any RandALCat character, the string MUST NOT
481               contain any LCat character.
482 
483            3) If a string contains any RandALCat character, a RandALCat
484               character MUST be the first character of the string, and a
485               RandALCat character MUST be the last character of the string.
486     */
487     /**
488      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
489      * checks for prohibited and BiDi characters in the order defined by RFC 3454
490      * depending on the options specified in the profile.
491      *
492      * @param src           A UCharacterIterator object containing the source string
493      * @param options       A bit set of options:
494      *   <ul>
495      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
496      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
497      *          as normal Unicode code points.</li>
498      *   </ul>
499      * @return StringBuffer A StringBuffer containing the output
500      * @throws StringPrepParseException An exception occurs when parsing a string is invalid.
501      */
prepare(UCharacterIterator src, int options)502     public StringBuffer prepare(UCharacterIterator src, int options)
503                         throws StringPrepParseException{
504 
505         // map
506         StringBuffer mapOut = map(src,options);
507         StringBuffer normOut = mapOut;// initialize
508 
509         if(doNFKC){
510             // normalize
511             normOut = normalize(mapOut);
512         }
513 
514         int ch;
515         char result;
516         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
517         Values val = new Values();
518         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
519             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
520         int rtlPos=-1, ltrPos=-1;
521         boolean rightToLeft=false, leftToRight=false;
522 
523         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
524             result = getCodePointValue(ch);
525             getValues(result,val);
526 
527             if(val.type == PROHIBITED ){
528                 throw new StringPrepParseException("A prohibited code point was found in the input",
529                                          StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value);
530             }
531 
532             if(checkBiDi) {
533                 direction = bdp.getClass(ch);
534                 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
535                     firstCharDir = direction;
536                 }
537                 if(direction == UCharacterDirection.LEFT_TO_RIGHT){
538                     leftToRight = true;
539                     ltrPos = iter.getIndex()-1;
540                 }
541                 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
542                     rightToLeft = true;
543                     rtlPos = iter.getIndex()-1;
544                 }
545             }
546         }
547         if(checkBiDi == true){
548             // satisfy 2
549             if( leftToRight == true && rightToLeft == true){
550                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
551                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
552                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
553              }
554 
555             //satisfy 3
556             if( rightToLeft == true &&
557                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
558                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
559               ){
560                 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.",
561                                          StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(),
562                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
563             }
564         }
565         return normOut;
566 
567       }
568 
569     /**
570      * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC),
571      * checks for prohibited and BiDi characters in the order defined by RFC 3454
572      * depending on the options specified in the profile.
573      *
574      * @param src           A string
575      * @param options       A bit set of options:
576      *   <ul>
577      *     <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li>
578      *     <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input
579      *          as normal Unicode code points.</li>
580      *   </ul>
581      * @return String A String containing the output
582      * @throws StringPrepParseException An exception when parsing or preparing a string is invalid.
583      */
584     @libcore.api.CorePlatformApi
prepare(String src, int options)585     public String prepare(String src, int options)
586         throws StringPrepParseException{
587         StringBuffer result = prepare(UCharacterIterator.getInstance(src), options);
588         return result.toString();
589     }
590 }
591