1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucsdet.h 9 * encoding: US-ASCII 10 * indentation:4 11 * 12 * created on: 2005Aug04 13 * created by: Andy Heninger 14 * 15 * ICU Character Set Detection, API for C 16 * 17 * Draft version 18 Oct 2005 18 * 19 */ 20 21 #ifndef __UCSDET_H 22 #define __UCSDET_H 23 24 #include "unicode/utypes.h" 25 26 #if !UCONFIG_NO_CONVERSION 27 28 #include "unicode/localpointer.h" 29 #include "unicode/uenum.h" 30 31 /** 32 * \file 33 * \brief C API: Charset Detection API 34 * 35 * This API provides a facility for detecting the 36 * charset or encoding of character data in an unknown text format. 37 * The input data can be from an array of bytes. 38 * <p> 39 * Character set detection is at best an imprecise operation. The detection 40 * process will attempt to identify the charset that best matches the characteristics 41 * of the byte data, but the process is partly statistical in nature, and 42 * the results can not be guaranteed to always be correct. 43 * <p> 44 * For best accuracy in charset detection, the input data should be primarily 45 * in a single language, and a minimum of a few hundred bytes worth of plain text 46 * in the language are needed. The detection process will attempt to 47 * ignore html or xml style markup that could otherwise obscure the content. 48 */ 49 50 51 struct UCharsetDetector; 52 /** 53 * Structure representing a charset detector 54 * @stable ICU 3.6 55 */ 56 typedef struct UCharsetDetector UCharsetDetector; 57 58 struct UCharsetMatch; 59 /** 60 * Opaque structure representing a match that was identified 61 * from a charset detection operation. 62 * @stable ICU 3.6 63 */ 64 typedef struct UCharsetMatch UCharsetMatch; 65 66 /** 67 * Open a charset detector. 68 * 69 * @param status Any error conditions occurring during the open 70 * operation are reported back in this variable. 71 * @return the newly opened charset detector. 72 * @stable ICU 3.6 73 */ 74 U_STABLE UCharsetDetector * U_EXPORT2 75 ucsdet_open(UErrorCode *status); 76 77 /** 78 * Close a charset detector. All storage and any other resources 79 * owned by this charset detector will be released. Failure to 80 * close a charset detector when finished with it can result in 81 * memory leaks in the application. 82 * 83 * @param ucsd The charset detector to be closed. 84 * @stable ICU 3.6 85 */ 86 U_STABLE void U_EXPORT2 87 ucsdet_close(UCharsetDetector *ucsd); 88 89 #if U_SHOW_CPLUSPLUS_API 90 91 U_NAMESPACE_BEGIN 92 93 /** 94 * \class LocalUCharsetDetectorPointer 95 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 96 * For most methods see the LocalPointerBase base class. 97 * 98 * @see LocalPointerBase 99 * @see LocalPointer 100 * @stable ICU 4.4 101 */ 102 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 103 104 U_NAMESPACE_END 105 106 #endif 107 108 /** 109 * Set the input byte data whose charset is to detected. 110 * 111 * Ownership of the input text byte array remains with the caller. 112 * The input string must not be altered or deleted until the charset 113 * detector is either closed or reset to refer to different input text. 114 * 115 * @param ucsd the charset detector to be used. 116 * @param textIn the input text of unknown encoding. . 117 * @param len the length of the input text, or -1 if the text 118 * is NUL terminated. 119 * @param status any error conditions are reported back in this variable. 120 * 121 * @stable ICU 3.6 122 */ 123 U_STABLE void U_EXPORT2 124 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 125 126 127 /** Set the declared encoding for charset detection. 128 * The declared encoding of an input text is an encoding obtained 129 * by the user from an http header or xml declaration or similar source that 130 * can be provided as an additional hint to the charset detector. 131 * 132 * How and whether the declared encoding will be used during the 133 * detection process is TBD. 134 * 135 * @param ucsd the charset detector to be used. 136 * @param encoding an encoding for the current data obtained from 137 * a header or declaration or other source outside 138 * of the byte data itself. 139 * @param length the length of the encoding name, or -1 if the name string 140 * is NUL terminated. 141 * @param status any error conditions are reported back in this variable. 142 * 143 * @stable ICU 3.6 144 */ 145 U_STABLE void U_EXPORT2 146 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 147 148 149 /** 150 * Return the charset that best matches the supplied input data. 151 * 152 * Note though, that because the detection 153 * only looks at the start of the input data, 154 * there is a possibility that the returned charset will fail to handle 155 * the full set of input data. 156 * <p> 157 * The returned UCharsetMatch object is owned by the UCharsetDetector. 158 * It will remain valid until the detector input is reset, or until 159 * the detector is closed. 160 * <p> 161 * The function will fail if 162 * <ul> 163 * <li>no charset appears to match the data.</li> 164 * <li>no input text has been provided</li> 165 * </ul> 166 * 167 * @param ucsd the charset detector to be used. 168 * @param status any error conditions are reported back in this variable. 169 * @return a UCharsetMatch representing the best matching charset, 170 * or NULL if no charset matches the byte data. 171 * 172 * @stable ICU 3.6 173 */ 174 U_STABLE const UCharsetMatch * U_EXPORT2 175 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 176 177 178 /** 179 * Find all charset matches that appear to be consistent with the input, 180 * returning an array of results. The results are ordered with the 181 * best quality match first. 182 * 183 * Because the detection only looks at a limited amount of the 184 * input byte data, some of the returned charsets may fail to handle 185 * the all of input data. 186 * <p> 187 * The returned UCharsetMatch objects are owned by the UCharsetDetector. 188 * They will remain valid until the detector is closed or modified 189 * 190 * <p> 191 * Return an error if 192 * <ul> 193 * <li>no charsets appear to match the input data.</li> 194 * <li>no input text has been provided</li> 195 * </ul> 196 * 197 * @param ucsd the charset detector to be used. 198 * @param matchesFound pointer to a variable that will be set to the 199 * number of charsets identified that are consistent with 200 * the input data. Output only. 201 * @param status any error conditions are reported back in this variable. 202 * @return A pointer to an array of pointers to UCharSetMatch objects. 203 * This array, and the UCharSetMatch instances to which it refers, 204 * are owned by the UCharsetDetector, and will remain valid until 205 * the detector is closed or modified. 206 * @stable ICU 3.6 207 */ 208 U_STABLE const UCharsetMatch ** U_EXPORT2 209 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 210 211 212 213 /** 214 * Get the name of the charset represented by a UCharsetMatch. 215 * 216 * The storage for the returned name string is owned by the 217 * UCharsetMatch, and will remain valid while the UCharsetMatch 218 * is valid. 219 * 220 * The name returned is suitable for use with the ICU conversion APIs. 221 * 222 * @param ucsm The charset match object. 223 * @param status Any error conditions are reported back in this variable. 224 * @return The name of the matching charset. 225 * 226 * @stable ICU 3.6 227 */ 228 U_STABLE const char * U_EXPORT2 229 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 230 231 /** 232 * Get a confidence number for the quality of the match of the byte 233 * data with the charset. Confidence numbers range from zero to 100, 234 * with 100 representing complete confidence and zero representing 235 * no confidence. 236 * 237 * The confidence values are somewhat arbitrary. They define an 238 * an ordering within the results for any single detection operation 239 * but are not generally comparable between the results for different input. 240 * 241 * A confidence value of ten does have a general meaning - it is used 242 * for charsets that can represent the input data, but for which there 243 * is no other indication that suggests that the charset is the correct one. 244 * Pure 7 bit ASCII data, for example, is compatible with a 245 * great many charsets, most of which will appear as possible matches 246 * with a confidence of 10. 247 * 248 * @param ucsm The charset match object. 249 * @param status Any error conditions are reported back in this variable. 250 * @return A confidence number for the charset match. 251 * 252 * @stable ICU 3.6 253 */ 254 U_STABLE int32_t U_EXPORT2 255 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 256 257 /** 258 * Get the RFC 3066 code for the language of the input data. 259 * 260 * The Charset Detection service is intended primarily for detecting 261 * charsets, not language. For some, but not all, charsets, a language is 262 * identified as a byproduct of the detection process, and that is what 263 * is returned by this function. 264 * 265 * CAUTION: 266 * 1. Language information is not available for input data encoded in 267 * all charsets. In particular, no language is identified 268 * for UTF-8 input data. 269 * 270 * 2. Closely related languages may sometimes be confused. 271 * 272 * If more accurate language detection is required, a linguistic 273 * analysis package should be used. 274 * 275 * The storage for the returned name string is owned by the 276 * UCharsetMatch, and will remain valid while the UCharsetMatch 277 * is valid. 278 * 279 * @param ucsm The charset match object. 280 * @param status Any error conditions are reported back in this variable. 281 * @return The RFC 3066 code for the language of the input data, or 282 * an empty string if the language could not be determined. 283 * 284 * @stable ICU 3.6 285 */ 286 U_STABLE const char * U_EXPORT2 287 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 288 289 290 /** 291 * Get the entire input text as a UChar string, placing it into 292 * a caller-supplied buffer. A terminating 293 * NUL character will be appended to the buffer if space is available. 294 * 295 * The number of UChars in the output string, not including the terminating 296 * NUL, is returned. 297 * 298 * If the supplied buffer is smaller than required to hold the output, 299 * the contents of the buffer are undefined. The full output string length 300 * (in UChars) is returned as always, and can be used to allocate a buffer 301 * of the correct size. 302 * 303 * 304 * @param ucsm The charset match object. 305 * @param buf A UChar buffer to be filled with the converted text data. 306 * @param cap The capacity of the buffer in UChars. 307 * @param status Any error conditions are reported back in this variable. 308 * @return The number of UChars in the output string. 309 * 310 * @stable ICU 3.6 311 */ 312 U_STABLE int32_t U_EXPORT2 313 ucsdet_getUChars(const UCharsetMatch *ucsm, 314 UChar *buf, int32_t cap, UErrorCode *status); 315 316 317 318 /** 319 * Get an iterator over the set of all detectable charsets - 320 * over the charsets that are known to the charset detection 321 * service. 322 * 323 * The returned UEnumeration provides access to the names of 324 * the charsets. 325 * 326 * <p> 327 * The state of the Charset detector that is passed in does not 328 * affect the result of this function, but requiring a valid, open 329 * charset detector as a parameter insures that the charset detection 330 * service has been safely initialized and that the required detection 331 * data is available. 332 * 333 * <p> 334 * <b>Note:</b> Multiple different charset encodings in a same family may use 335 * a single shared name in this implementation. For example, this method returns 336 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 337 * (Windows Latin 1). However, actual detection result could be "windows-1252" 338 * when the input data matches Latin 1 code points with any points only available 339 * in "windows-1252". 340 * 341 * @param ucsd a Charset detector. 342 * @param status Any error conditions are reported back in this variable. 343 * @return an iterator providing access to the detectable charset names. 344 * @stable ICU 3.6 345 */ 346 U_STABLE UEnumeration * U_EXPORT2 347 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 348 349 /** 350 * Test whether input filtering is enabled for this charset detector. 351 * Input filtering removes text that appears to be HTML or xml 352 * markup from the input before applying the code page detection 353 * heuristics. 354 * 355 * @param ucsd The charset detector to check. 356 * @return TRUE if filtering is enabled. 357 * @stable ICU 3.6 358 */ 359 360 U_STABLE UBool U_EXPORT2 361 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 362 363 364 /** 365 * Enable filtering of input text. If filtering is enabled, 366 * text within angle brackets ("<" and ">") will be removed 367 * before detection, which will remove most HTML or xml markup. 368 * 369 * @param ucsd the charset detector to be modified. 370 * @param filter <code>true</code> to enable input text filtering. 371 * @return The previous setting. 372 * 373 * @stable ICU 3.6 374 */ 375 U_STABLE UBool U_EXPORT2 376 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 377 378 #ifndef U_HIDE_INTERNAL_API 379 /** 380 * Get an iterator over the set of detectable charsets - 381 * over the charsets that are enabled by the specified charset detector. 382 * 383 * The returned UEnumeration provides access to the names of 384 * the charsets. 385 * 386 * @param ucsd a Charset detector. 387 * @param status Any error conditions are reported back in this variable. 388 * @return an iterator providing access to the detectable charset names by 389 * the specified charset detector. 390 * @internal 391 */ 392 U_INTERNAL UEnumeration * U_EXPORT2 393 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 394 395 /** 396 * Enable or disable individual charset encoding. 397 * A name of charset encoding must be included in the names returned by 398 * {@link #getAllDetectableCharsets()}. 399 * 400 * @param ucsd a Charset detector. 401 * @param encoding encoding the name of charset encoding. 402 * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the 403 * charset encoding. 404 * @param status receives the return status. When the name of charset encoding 405 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 406 * @internal 407 */ 408 U_INTERNAL void U_EXPORT2 409 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 410 #endif /* U_HIDE_INTERNAL_API */ 411 412 #endif 413 #endif /* __UCSDET_H */ 414 415 416