1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/old_utf_string_conversions.h"
6
7 #include <stdint.h>
8
9 #include "base/strings/string_piece.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversion_utils.h"
12 #include "build/build_config.h"
13
14 namespace base_old {
15
16 using base::IsStringASCII;
17 using base::ReadUnicodeCharacter;
18 using base::WriteUnicodeCharacter;
19
// Empties |output| and reserves room for the UTF-8 form of the |src_len| code
// units starting at |src|.
//
// The reservation is only a guess taken from the first code unit:
//   - below 0x80: the input is assumed to be all ASCII (one byte per unit);
//   - otherwise:  the input is assumed to need three bytes per unit.
// With empty input |output| is cleared and nothing is reserved; |src| is not
// dereferenced in that case.
template <typename CHAR>
void PrepareForUTF8Output(const CHAR* src,
                          size_t src_len,
                          std::string* output) {
  output->clear();
  if (src_len != 0) {
    const bool first_unit_is_ascii = src[0] < 0x80;
    output->reserve(first_unit_is_ascii ? src_len : src_len * 3);
  }
}
35
// Empties |output| and reserves room for the UTF-16/UTF-32 form of the
// |src_len| UTF-8 bytes starting at |src|.
//
// The reservation is only a guess taken from the first byte (read as an
// unsigned char):
//   - below 0x80: the input is assumed to be all ASCII, i.e. one output unit
//     per input byte;
//   - otherwise:  every character is assumed to occupy two input bytes, so
//     half as many output units are reserved.
// With empty input |output| is cleared and nothing is reserved; |src| is not
// dereferenced in that case.
template <typename STRING>
void PrepareForUTF16Or32Output(const char* src,
                               size_t src_len,
                               STRING* output) {
  output->clear();
  if (src_len != 0) {
    const unsigned char first_byte = static_cast<unsigned char>(src[0]);
    output->reserve(first_byte < 0x80 ? src_len : src_len / 2);
  }
}
52
53 namespace {
54
55 // Generalized Unicode converter -----------------------------------------------
56
// Transcodes the |src_len| code units at |src| into |output|, whose Unicode
// encoding is selected by DEST_STRING.
//
// Every position is decoded with ReadUnicodeCharacter() and the resulting code
// point is passed to WriteUnicodeCharacter(). A position that fails to decode
// is written as U+FFFD instead and makes the function return false; the
// conversion still continues with the rest of the input.
//
// This function never clears |output| itself; callers in this file clear it
// before calling.
//
// NOTE: |src_len| is narrowed to int32_t (the original comment attributes this
// to ICU requiring 32-bit numbers), so lengths above INT32_MAX are not handled
// here.
template <typename SRC_CHAR, typename DEST_STRING>
bool ConvertUnicode(const SRC_CHAR* src, size_t src_len, DEST_STRING* output) {
  const uint32_t kReplacementCharacter = 0xFFFD;
  const int32_t length = static_cast<int32_t>(src_len);

  bool all_valid = true;
  // ReadUnicodeCharacter() receives |index| by pointer and may move it within
  // a multi-unit sequence; the loop increment then steps to the next
  // character.
  for (int32_t index = 0; index < length; ++index) {
    uint32_t code_point;
    if (!ReadUnicodeCharacter(src, length, &index, &code_point)) {
      code_point = kReplacementCharacter;
      all_valid = false;
    }
    WriteUnicodeCharacter(code_point, output);
  }

  return all_valid;
}
78
79 } // namespace
80
81 // UTF-8 <-> Wide --------------------------------------------------------------
82
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)83 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
84 if (IsStringASCII(std::wstring(src, src_len))) {
85 output->assign(src, src + src_len);
86 return true;
87 } else {
88 PrepareForUTF8Output(src, src_len, output);
89 return ConvertUnicode(src, src_len, output);
90 }
91 }
92
WideToUTF8(const std::wstring & wide)93 std::string WideToUTF8(const std::wstring& wide) {
94 if (IsStringASCII(wide)) {
95 return std::string(wide.data(), wide.data() + wide.length());
96 }
97
98 std::string ret;
99 PrepareForUTF8Output(wide.data(), wide.length(), &ret);
100 ConvertUnicode(wide.data(), wide.length(), &ret);
101 return ret;
102 }
103
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)104 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
105 if (IsStringASCII(StringPiece(src, src_len))) {
106 output->assign(src, src + src_len);
107 return true;
108 } else {
109 PrepareForUTF16Or32Output(src, src_len, output);
110 return ConvertUnicode(src, src_len, output);
111 }
112 }
113
UTF8ToWide(StringPiece utf8)114 std::wstring UTF8ToWide(StringPiece utf8) {
115 if (IsStringASCII(utf8)) {
116 return std::wstring(utf8.begin(), utf8.end());
117 }
118
119 std::wstring ret;
120 PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
121 ConvertUnicode(utf8.data(), utf8.length(), &ret);
122 return ret;
123 }
124
125 // UTF-16 <-> Wide -------------------------------------------------------------
126
127 #if defined(WCHAR_T_IS_UTF16)
128
129 // When wide == UTF-16, then conversions are a NOP.
WideToUTF16(const wchar_t * src,size_t src_len,string16 * output)130 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
131 output->assign(src, src_len);
132 return true;
133 }
134
WideToUTF16(const std::wstring & wide)135 string16 WideToUTF16(const std::wstring& wide) {
136 return wide;
137 }
138
UTF16ToWide(const char16 * src,size_t src_len,std::wstring * output)139 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
140 output->assign(src, src_len);
141 return true;
142 }
143
UTF16ToWide(const string16 & utf16)144 std::wstring UTF16ToWide(const string16& utf16) {
145 return utf16;
146 }
147
148 #elif defined(WCHAR_T_IS_UTF32)
149
WideToUTF16(const wchar_t * src,size_t src_len,string16 * output)150 bool WideToUTF16(const wchar_t* src, size_t src_len, string16* output) {
151 output->clear();
152 // Assume that normally we won't have any non-BMP characters so the counts
153 // will be the same.
154 output->reserve(src_len);
155 return ConvertUnicode(src, src_len, output);
156 }
157
WideToUTF16(const std::wstring & wide)158 string16 WideToUTF16(const std::wstring& wide) {
159 string16 ret;
160 WideToUTF16(wide.data(), wide.length(), &ret);
161 return ret;
162 }
163
UTF16ToWide(const char16 * src,size_t src_len,std::wstring * output)164 bool UTF16ToWide(const char16* src, size_t src_len, std::wstring* output) {
165 output->clear();
166 // Assume that normally we won't have any non-BMP characters so the counts
167 // will be the same.
168 output->reserve(src_len);
169 return ConvertUnicode(src, src_len, output);
170 }
171
UTF16ToWide(const string16 & utf16)172 std::wstring UTF16ToWide(const string16& utf16) {
173 std::wstring ret;
174 UTF16ToWide(utf16.data(), utf16.length(), &ret);
175 return ret;
176 }
177
178 #endif // defined(WCHAR_T_IS_UTF32)
179
180 // UTF16 <-> UTF8 --------------------------------------------------------------
181
182 #if defined(WCHAR_T_IS_UTF32)
183
UTF8ToUTF16(const char * src,size_t src_len,string16 * output)184 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
185 if (IsStringASCII(StringPiece(src, src_len))) {
186 output->assign(src, src + src_len);
187 return true;
188 } else {
189 PrepareForUTF16Or32Output(src, src_len, output);
190 return ConvertUnicode(src, src_len, output);
191 }
192 }
193
UTF8ToUTF16(StringPiece utf8)194 string16 UTF8ToUTF16(StringPiece utf8) {
195 if (IsStringASCII(utf8)) {
196 return string16(utf8.begin(), utf8.end());
197 }
198
199 string16 ret;
200 PrepareForUTF16Or32Output(utf8.data(), utf8.length(), &ret);
201 // Ignore the success flag of this call, it will do the best it can for
202 // invalid input, which is what we want here.
203 ConvertUnicode(utf8.data(), utf8.length(), &ret);
204 return ret;
205 }
206
UTF16ToUTF8(const char16 * src,size_t src_len,std::string * output)207 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
208 if (IsStringASCII(StringPiece16(src, src_len))) {
209 output->assign(src, src + src_len);
210 return true;
211 } else {
212 PrepareForUTF8Output(src, src_len, output);
213 return ConvertUnicode(src, src_len, output);
214 }
215 }
216
UTF16ToUTF8(StringPiece16 utf16)217 std::string UTF16ToUTF8(StringPiece16 utf16) {
218 std::string ret;
219 // Ignore the success flag of this call, it will do the best it can for
220 // invalid input, which is what we want here.
221 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
222 return ret;
223 }
224
225 #elif defined(WCHAR_T_IS_UTF16)
226 // Easy case since we can use the "wide" versions we already wrote above.
227
UTF8ToUTF16(const char * src,size_t src_len,string16 * output)228 bool UTF8ToUTF16(const char* src, size_t src_len, string16* output) {
229 return UTF8ToWide(src, src_len, output);
230 }
231
UTF8ToUTF16(StringPiece utf8)232 string16 UTF8ToUTF16(StringPiece utf8) {
233 return UTF8ToWide(utf8);
234 }
235
UTF16ToUTF8(const char16 * src,size_t src_len,std::string * output)236 bool UTF16ToUTF8(const char16* src, size_t src_len, std::string* output) {
237 return WideToUTF8(src, src_len, output);
238 }
239
UTF16ToUTF8(StringPiece16 utf16)240 std::string UTF16ToUTF8(StringPiece16 utf16) {
241 if (IsStringASCII(utf16))
242 return std::string(utf16.data(), utf16.data() + utf16.length());
243
244 std::string ret;
245 PrepareForUTF8Output(utf16.data(), utf16.length(), &ret);
246 ConvertUnicode(utf16.data(), utf16.length(), &ret);
247 return ret;
248 }
249
250 #endif
251
ASCIIToUTF16(StringPiece ascii)252 string16 ASCIIToUTF16(StringPiece ascii) {
253 DCHECK(IsStringASCII(ascii)) << ascii;
254 return string16(ascii.begin(), ascii.end());
255 }
256
UTF16ToASCII(StringPiece16 utf16)257 std::string UTF16ToASCII(StringPiece16 utf16) {
258 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
259 return std::string(utf16.begin(), utf16.end());
260 }
261
262 } // namespace base_old
263