1 /* 2 ********************************************************************** 3 * Copyright (C) 2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * 7 * indentifier_info.h 8 * 9 * created on: 2013 Jan 7 10 * created by: Andy Heninger 11 */ 12 13 #ifndef __IDENTIFIER_INFO_H__ 14 #define __IDENTIFIER_INFO_H__ 15 16 #include "unicode/utypes.h" 17 18 #include "unicode/uniset.h" 19 #include "unicode/uspoof.h" 20 #include "uhash.h" 21 22 U_NAMESPACE_BEGIN 23 24 class ScriptSet; 25 26 // TODO(andy): review consistency of reference vs pointer arguments to the funcions. 27 28 /** 29 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile 30 * then setIdentifier. Available methods include: 31 * <ol> 32 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in 33 * each of these. 34 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be 35 * either Katakana or Hiragana. 36 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates. 37 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in 38 * the identifier. 39 * <li>call getRestrictionLevel to see what the UTS36 restriction level is. 40 * </ol> 41 * 42 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo 43 */ 44 class U_I18N_API IdentifierInfo : public UMemory { 45 46 public: 47 /** 48 * Create an identifier info object. Subsequently, call setIdentifier(), etc. 49 * @internal 50 */ 51 IdentifierInfo(UErrorCode &status); 52 53 /** 54 * Destructor 55 */ 56 virtual ~IdentifierInfo(); 57 58 private: 59 /* Disallow copying for now. Can be added if there's a need. */ 60 IdentifierInfo(const IdentifierInfo &other); 61 62 public: 63 64 /** 65 * Set the identifier profile: the characters that are to be allowed in the identifier. 66 * 67 * @param identifierProfile the characters that are to be allowed in the identifier 68 * @return this 69 * @internal 70 */ 71 IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile); 72 73 /** 74 * Get the identifier profile: the characters that are to be allowed in the identifier. 75 * 76 * @return The characters that are to be allowed in the identifier. 77 * @internal 78 */ 79 const UnicodeSet &getIdentifierProfile() const; 80 81 82 /** 83 * Set an identifier to analyze. Afterwards, call methods like getScripts() 84 * 85 * @param identifier the identifier to analyze 86 * @param status Errorcode, set if errors occur. 87 * @return this 88 * @internal 89 */ 90 IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status); 91 92 93 /** 94 * Get the identifier that was analyzed. The returned string is owned by the ICU library, 95 * and must not be deleted by the caller. 96 * 97 * @return the identifier that was analyzed. 98 * @internal 99 */ 100 const UnicodeString *getIdentifier() const; 101 102 103 /** 104 * Get the scripts found in the identifiers. 105 * 106 * @return the set of explicit scripts. 107 * @internal 108 */ 109 const ScriptSet *getScripts() const; 110 111 /** 112 * Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then 113 * the set consisting of those scripts will be returned. 114 * 115 * @return a uhash, with each key being of type (ScriptSet *). 116 * This is a set, not a map, so the value stored in the uhash is not relevant. 117 * (It is, in fact, 1). 118 * Ownership of the uhash and its contents remains with the IndetifierInfo object, 119 * and remains valid until a new identifer is set or until the object is deleted. 120 * @internal 121 */ 122 const UHashtable *getAlternates() const; 123 124 /** 125 * Get the representative characters (zeros) for the numerics found in the identifier. 126 * 127 * @return the set of explicit scripts. 128 * @internal 129 */ 130 const UnicodeSet *getNumerics() const; 131 132 /** 133 * Find out which scripts are in common among the alternates. 134 * 135 * @return the set of scripts that are in common among the alternates. 136 * @internal 137 */ 138 const ScriptSet *getCommonAmongAlternates() const; 139 140 /** 141 * Get the number of scripts appearing in the identifier. 142 * Note: Common and Inherited scripts are omitted from the count. 143 * Note: Result may be high when the identifier contains characters 144 * with alternate scripts. The distinction between 145 * 0, 1 and > 1 will remain valid, however. 146 * @return the number of scripts. 147 */ 148 int32_t getScriptCount() const; 149 150 #if !UCONFIG_NO_NORMALIZATION 151 152 /** 153 * Find the "tightest" restriction level that the identifier satisfies. 154 * 155 * @return the restriction level. 156 * @internal 157 */ 158 URestrictionLevel getRestrictionLevel(UErrorCode &status) const; 159 160 #endif /*!UCONFIG_NO_NORMALIZATION */ 161 162 UnicodeString toString() const; 163 164 /** 165 * Produce a readable string of alternates. 166 * 167 * @param alternates a UHashtable of UScriptSets. 168 * Keys only, no meaningful values in the UHash. 169 * @return display form 170 * @internal 171 */ 172 static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status); 173 174 private: 175 176 IdentifierInfo & clear(); 177 UBool containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const; 178 179 UnicodeString *fIdentifier; 180 ScriptSet *fRequiredScripts; 181 UHashtable *fScriptSetSet; 182 ScriptSet *fCommonAmongAlternates; 183 UnicodeSet *fNumerics; 184 UnicodeSet *fIdentifierProfile; 185 }; 186 187 U_NAMESPACE_END 188 189 #endif // __IDENTIFIER_INFO_H__ 190 191