1 /* Copyright 2013 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* Heuristics for deciding about the UTF8-ness of strings. */
8 
9 #include "./utf8_util.h"
10 
11 #include <brotli/types.h>
12 
13 #if defined(__cplusplus) || defined(c_plusplus)
14 extern "C" {
15 #endif
16 
/* Decodes a single symbol from the beginning of |input|.

   |input[0]| is read unconditionally, so at least one byte must be readable;
   |size| limits how many further bytes may be inspected.

   On success the decoded code point is stored in |*symbol| and the length of
   its encoding (1..4 bytes) is returned. Overlong encodings, values above
   0x10FFFF, truncated sequences, bad continuation bytes and a zero byte are
   not accepted: in those cases |*symbol| is set to 0x110000 | input[0], i.e.
   a value outside the Unicode code space, and 1 is returned so that the
   caller skips exactly one byte. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;

  if (lead < 0x80) {
    /* 0xxxxxxx: 1-byte UTF8 (ASCII). A zero byte is routed to the
       "not UTF8" path below. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead >> 5) == 0x06) {
    /* 110xxxxx 10xxxxxx: 2-byte UTF8. */
    if (size >= 2 && (input[1] >> 6) == 0x02) {
      code = ((lead & 0x1F) << 6) | (input[1] & 0x3F);
      /* Values below 0x80 would be overlong encodings. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead >> 4) == 0x0E) {
    /* 1110xxxx 10xxxxxx 10xxxxxx: 3-byte UTF8. */
    if (size >= 3 &&
        (input[1] >> 6) == 0x02 &&
        (input[2] >> 6) == 0x02) {
      code = ((lead & 0x0F) << 12) |
             ((input[1] & 0x3F) << 6) |
             (input[2] & 0x3F);
      /* Values below 0x800 would be overlong encodings. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead >> 3) == 0x1E) {
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: 4-byte UTF8. */
    if (size >= 4 &&
        (input[1] >> 6) == 0x02 &&
        (input[2] >> 6) == 0x02 &&
        (input[3] >> 6) == 0x02) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3F) << 12) |
             ((input[2] & 0x3F) << 6) |
             (input[3] & 0x3F);
      /* Reject overlong encodings and anything past the last code point. */
      if (code >= 0x10000 && code < 0x110000) {
        *symbol = code;
        return 4;
      }
    }
  }

  /* Not UTF8: emit a special symbol above the UTF8 code space. */
  *symbol = 0x110000 | lead;
  return 1;
}
66 
67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
BrotliIsMostlyUTF8(const uint8_t * data,const size_t pos,const size_t mask,const size_t length,const double min_fraction)68 BROTLI_BOOL BrotliIsMostlyUTF8(
69     const uint8_t* data, const size_t pos, const size_t mask,
70     const size_t length, const double min_fraction) {
71   size_t size_utf8 = 0;
72   size_t i = 0;
73   while (i < length) {
74     int symbol;
75     size_t bytes_read =
76         BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
77     i += bytes_read;
78     if (symbol < 0x110000) size_utf8 += bytes_read;
79   }
80   return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length);
81 }
82 
83 #if defined(__cplusplus) || defined(c_plusplus)
84 }  /* extern "C" */
85 #endif
86