1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_CONVERSION
13 
14 #include "inputext.h"
15 
16 #include "cmemory.h"
17 #include "cstring.h"
18 
19 #include <string.h>
20 
21 U_NAMESPACE_BEGIN
22 
// Capacity, in bytes, of the buffer that receives the (possibly tag-stripped) input text.
#define BUFFER_SIZE 8192

// Allocates uninitialized storage for `count` objects of `type` with uprv_malloc.
// The whole expansion is parenthesized so the cast applies to the allocation result
// even when the macro is used inside a larger expression (e.g. NEW_ARRAY(T, n)[i]).
#define NEW_ARRAY(type,count) ((type *) uprv_malloc((count) * sizeof(type)))

// Releases storage previously obtained from NEW_ARRAY.
#define DELETE_ARRAY(array) uprv_free((void *) (array))
27 
InputText(UErrorCode & status)28 InputText::InputText(UErrorCode &status)
29     : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
30                                                  //   removed if appropriate.
31       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
32                                                  //   Value is percent, not absolute.
33       fDeclaredEncoding(0),
34       fRawInput(0),
35       fRawLength(0)
36 {
37     if (fInputBytes == NULL || fByteStats == NULL) {
38         status = U_MEMORY_ALLOCATION_ERROR;
39     }
40 }
41 
~InputText()42 InputText::~InputText()
43 {
44     DELETE_ARRAY(fDeclaredEncoding);
45     DELETE_ARRAY(fByteStats);
46     DELETE_ARRAY(fInputBytes);
47 }
48 
setText(const char * in,int32_t len)49 void InputText::setText(const char *in, int32_t len)
50 {
51     fInputLen  = 0;
52     fC1Bytes   = FALSE;
53     fRawInput  = (const uint8_t *) in;
54     fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
55 }
56 
setDeclaredEncoding(const char * encoding,int32_t len)57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
58 {
59     if(encoding) {
60         if (len == -1) {
61             len = (int32_t)uprv_strlen(encoding);
62         }
63 
64         len += 1;     // to make place for the \0 at the end.
65         uprv_free(fDeclaredEncoding);
66         fDeclaredEncoding = NEW_ARRAY(char, len);
67         uprv_strncpy(fDeclaredEncoding, encoding, len);
68     }
69 }
70 
isSet() const71 UBool InputText::isSet() const
72 {
73     return fRawInput != NULL;
74 }
75 
76 /**
77 *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
78 *               it by removing what appears to be html markup.
79 *
80 * @internal
81 */
MungeInput(UBool fStripTags)82 void InputText::MungeInput(UBool fStripTags) {
83     int     srci = 0;
84     int     dsti = 0;
85     uint8_t b;
86     bool    inMarkup = FALSE;
87     int32_t openTags = 0;
88     int32_t badTags  = 0;
89 
90     //
91     //  html / xml markup stripping.
92     //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
93     //     discard everything within < brackets >
94     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
95     //     guess as to whether the input was actually marked up at all.
96     // TODO: Think about how this interacts with EBCDIC charsets that are detected.
97     if (fStripTags) {
98         for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
99             b = fRawInput[srci];
100 
101             if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
102                 if (inMarkup) {
103                     badTags += 1;
104                 }
105 
106                 inMarkup = TRUE;
107                 openTags += 1;
108             }
109 
110             if (! inMarkup) {
111                 fInputBytes[dsti++] = b;
112             }
113 
114             if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
115                 inMarkup = FALSE;
116             }
117         }
118 
119         fInputLen = dsti;
120     }
121 
122     //
123     //  If it looks like this input wasn't marked up, or if it looks like it's
124     //    essentially nothing but markup abandon the markup stripping.
125     //    Detection will have to work on the unstripped input.
126     //
127     if (openTags<5 || openTags/5 < badTags ||
128         (fInputLen < 100 && fRawLength>600))
129     {
130         int32_t limit = fRawLength;
131 
132         if (limit > BUFFER_SIZE) {
133             limit = BUFFER_SIZE;
134         }
135 
136         for (srci=0; srci<limit; srci++) {
137             fInputBytes[srci] = fRawInput[srci];
138         }
139 
140         fInputLen = srci;
141     }
142 
143     //
144     // Tally up the byte occurence statistics.
145     // These are available for use by the various detectors.
146     //
147 
148     uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
149 
150     for (srci = 0; srci < fInputLen; srci += 1) {
151         fByteStats[fInputBytes[srci]] += 1;
152     }
153 
154     for (int32_t i = 0x80; i <= 0x9F; i += 1) {
155         if (fByteStats[i] != 0) {
156             fC1Bytes = TRUE;
157             break;
158         }
159     }
160 }
161 
162 U_NAMESPACE_END
163 #endif
164 
165