1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_CONVERSION 13 14 #include "inputext.h" 15 16 #include "cmemory.h" 17 #include "cstring.h" 18 19 #include <string.h> 20 21 U_NAMESPACE_BEGIN 22 23 #define BUFFER_SIZE 8192 24 25 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 26 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 27 28 InputText::InputText(UErrorCode &status) 29 : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been 30 // removed if appropriate. 31 fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text. 32 // Value is percent, not absolute. 33 fDeclaredEncoding(0), 34 fRawInput(0), 35 fRawLength(0) 36 { 37 if (fInputBytes == NULL || fByteStats == NULL) { 38 status = U_MEMORY_ALLOCATION_ERROR; 39 } 40 } 41 42 InputText::~InputText() 43 { 44 DELETE_ARRAY(fDeclaredEncoding); 45 DELETE_ARRAY(fByteStats); 46 DELETE_ARRAY(fInputBytes); 47 } 48 49 void InputText::setText(const char *in, int32_t len) 50 { 51 fInputLen = 0; 52 fC1Bytes = FALSE; 53 fRawInput = (const uint8_t *) in; 54 fRawLength = len == -1? (int32_t)uprv_strlen(in) : len; 55 } 56 57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len) 58 { 59 if(encoding) { 60 if (len == -1) { 61 len = (int32_t)uprv_strlen(encoding); 62 } 63 64 len += 1; // to make place for the \0 at the end. 65 uprv_free(fDeclaredEncoding); 66 fDeclaredEncoding = NEW_ARRAY(char, len); 67 uprv_strncpy(fDeclaredEncoding, encoding, len); 68 } 69 } 70 71 UBool InputText::isSet() const 72 { 73 return fRawInput != NULL; 74 } 75 76 /** 77 * MungeInput - after getting a set of raw input data to be analyzed, preprocess 78 * it by removing what appears to be html markup. 79 * 80 * @internal 81 */ 82 void InputText::MungeInput(UBool fStripTags) { 83 int srci = 0; 84 int dsti = 0; 85 uint8_t b; 86 bool inMarkup = FALSE; 87 int32_t openTags = 0; 88 int32_t badTags = 0; 89 90 // 91 // html / xml markup stripping. 92 // quick and dirty, not 100% accurate, but hopefully good enough, statistically. 93 // discard everything within < brackets > 94 // Count how many total '<' and illegal (nested) '<' occur, so we can make some 95 // guess as to whether the input was actually marked up at all. 96 // TODO: Think about how this interacts with EBCDIC charsets that are detected. 97 if (fStripTags) { 98 for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) { 99 b = fRawInput[srci]; 100 101 if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */ 102 if (inMarkup) { 103 badTags += 1; 104 } 105 106 inMarkup = TRUE; 107 openTags += 1; 108 } 109 110 if (! inMarkup) { 111 fInputBytes[dsti++] = b; 112 } 113 114 if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */ 115 inMarkup = FALSE; 116 } 117 } 118 119 fInputLen = dsti; 120 } 121 122 // 123 // If it looks like this input wasn't marked up, or if it looks like it's 124 // essentially nothing but markup abandon the markup stripping. 125 // Detection will have to work on the unstripped input. 126 // 127 if (openTags<5 || openTags/5 < badTags || 128 (fInputLen < 100 && fRawLength>600)) 129 { 130 int32_t limit = fRawLength; 131 132 if (limit > BUFFER_SIZE) { 133 limit = BUFFER_SIZE; 134 } 135 136 for (srci=0; srci<limit; srci++) { 137 fInputBytes[srci] = fRawInput[srci]; 138 } 139 140 fInputLen = srci; 141 } 142 143 // 144 // Tally up the byte occurence statistics. 145 // These are available for use by the various detectors. 146 // 147 148 uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256); 149 150 for (srci = 0; srci < fInputLen; srci += 1) { 151 fByteStats[fInputBytes[srci]] += 1; 152 } 153 154 for (int32_t i = 0x80; i <= 0x9F; i += 1) { 155 if (fByteStats[i] != 0) { 156 fC1Bytes = TRUE; 157 break; 158 } 159 } 160 } 161 162 U_NAMESPACE_END 163 #endif 164 165