1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "compile/Pseudolocalizer.h"
18 #include "util/Util.h"
19
20 namespace aapt {
21
// String basis to generate expansion: a list of filler words from which
// pseudoGenerateExpansion() builds the text appended to accented messages.
// NOTE(review): the spelling "fiveteen" and the absence of "eighteen" are part
// of the literal; generated output depends on the exact text, so confirm before
// "correcting" it.
static const std::u16string k_expansion_string = u"one two three "
        "four five six seven eight nine ten eleven twelve thirteen "
        "fourteen fiveteen sixteen seventeen nineteen twenty";

// Special unicode characters to override directionality of the words
// U+200F RIGHT-TO-LEFT MARK
static const std::u16string k_rlm = u"\u200f";
// U+202E RIGHT-TO-LEFT OVERRIDE
static const std::u16string k_rlo = u"\u202e";
// U+202C POP DIRECTIONAL FORMATTING
static const std::u16string k_pdf = u"\u202c";

// Placeholder marks: U+00BB (right-pointing double angle quotation mark) is
// put before a placeholder and U+00AB (left-pointing) after it.
static const std::u16string k_placeholder_open = u"\u00bb";
static const std::u16string k_placeholder_close = u"\u00ab";

// Characters that increase / decrease the nesting depth tracked by
// Pseudolocalizer::text().
static const char16_t k_arg_start = u'{';
static const char16_t k_arg_end = u'}';
38
39 class PseudoMethodNone : public PseudoMethodImpl {
40 public:
text(const StringPiece16 & text)41 std::u16string text(const StringPiece16& text) override { return text.toString(); }
placeholder(const StringPiece16 & text)42 std::u16string placeholder(const StringPiece16& text) override { return text.toString(); }
43 };
44
// Pseudolocalization method that surrounds text with Unicode directionality
// control characters (see k_rlm, k_rlo and k_pdf).
class PseudoMethodBidi : public PseudoMethodImpl {
public:
    // Wraps every whitespace-delimited word of |text| in a
    // k_rlm + k_rlo ... k_pdf + k_rlm sequence.
    std::u16string text(const StringPiece16& text) override;
    // Wraps the whole placeholder |text| in the same sequence.
    std::u16string placeholder(const StringPiece16& text) override;
};
50
51 class PseudoMethodAccent : public PseudoMethodImpl {
52 public:
PseudoMethodAccent()53 PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
54 std::u16string start() override;
55 std::u16string end() override;
56 std::u16string text(const StringPiece16& text) override;
57 std::u16string placeholder(const StringPiece16& text) override;
58 private:
59 size_t mDepth;
60 size_t mWordCount;
61 size_t mLength;
62 };
63
Pseudolocalizer(Method method)64 Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
65 setMethod(method);
66 }
67
setMethod(Method method)68 void Pseudolocalizer::setMethod(Method method) {
69 switch (method) {
70 case Method::kNone:
71 mImpl = util::make_unique<PseudoMethodNone>();
72 break;
73 case Method::kAccent:
74 mImpl = util::make_unique<PseudoMethodAccent>();
75 break;
76 case Method::kBidi:
77 mImpl = util::make_unique<PseudoMethodBidi>();
78 break;
79 }
80 }
81
text(const StringPiece16 & text)82 std::u16string Pseudolocalizer::text(const StringPiece16& text) {
83 std::u16string out;
84 size_t depth = mLastDepth;
85 size_t lastpos, pos;
86 const size_t length = text.size();
87 const char16_t* str = text.data();
88 bool escaped = false;
89 for (lastpos = pos = 0; pos < length; pos++) {
90 char16_t c = str[pos];
91 if (escaped) {
92 escaped = false;
93 continue;
94 }
95 if (c == '\'') {
96 escaped = true;
97 continue;
98 }
99
100 if (c == k_arg_start) {
101 depth++;
102 } else if (c == k_arg_end && depth) {
103 depth--;
104 }
105
106 if (mLastDepth != depth || pos == length - 1) {
107 bool pseudo = ((mLastDepth % 2) == 0);
108 size_t nextpos = pos;
109 if (!pseudo || depth == mLastDepth) {
110 nextpos++;
111 }
112 size_t size = nextpos - lastpos;
113 if (size) {
114 std::u16string chunk = text.substr(lastpos, size).toString();
115 if (pseudo) {
116 chunk = mImpl->text(chunk);
117 } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
118 chunk = mImpl->placeholder(chunk);
119 }
120 out.append(chunk);
121 }
122 if (pseudo && depth < mLastDepth) { // End of message
123 out.append(mImpl->end());
124 } else if (!pseudo && depth > mLastDepth) { // Start of message
125 out.append(mImpl->start());
126 }
127 lastpos = nextpos;
128 mLastDepth = depth;
129 }
130 }
131 return out;
132 }
133
/**
 * Maps an ASCII letter or one of '!', '?', '$' to its pseudolocalized
 * replacement.
 *
 * Returns a pointer to a NUL-terminated UTF-16 literal, or nullptr when |c|
 * has no replacement. Note that 'F' has no replacement.
 */
static const char16_t* pseudolocalizeChar(const char16_t c) {
    // Replacements for 'a'..'z', indexed by (c - 'a').
    static const char16_t* const kLowercase[26] = {
        u"\u00e5",  // a
        u"\u0253",  // b
        u"\u00e7",  // c
        u"\u00f0",  // d
        u"\u00e9",  // e
        u"\u0192",  // f
        u"\u011d",  // g
        u"\u0125",  // h
        u"\u00ee",  // i
        u"\u0135",  // j
        u"\u0137",  // k
        u"\u013c",  // l
        u"\u1e3f",  // m
        u"\u00f1",  // n
        u"\u00f6",  // o
        u"\u00fe",  // p
        u"\u0051",  // q
        u"\u0155",  // r
        u"\u0161",  // s
        u"\u0163",  // t
        u"\u00fb",  // u
        u"\u0056",  // v
        u"\u0175",  // w
        u"\u0445",  // x
        u"\u00fd",  // y
        u"\u017e",  // z
    };
    // Replacements for 'A'..'Z', indexed by (c - 'A').
    static const char16_t* const kUppercase[26] = {
        u"\u00c5",  // A
        u"\u03b2",  // B
        u"\u00c7",  // C
        u"\u00d0",  // D
        u"\u00c9",  // E
        nullptr,    // F (no replacement)
        u"\u011c",  // G
        u"\u0124",  // H
        u"\u00ce",  // I
        u"\u0134",  // J
        u"\u0136",  // K
        u"\u013b",  // L
        u"\u1e3e",  // M
        u"\u00d1",  // N
        u"\u00d6",  // O
        u"\u00de",  // P
        u"\u0071",  // Q
        u"\u0154",  // R
        u"\u0160",  // S
        u"\u0162",  // T
        u"\u00db",  // U
        u"\u03bd",  // V
        u"\u0174",  // W
        u"\u00d7",  // X
        u"\u00dd",  // Y
        u"\u017d",  // Z
    };

    if (c >= u'a' && c <= u'z') {
        return kLowercase[c - u'a'];
    }
    if (c >= u'A' && c <= u'Z') {
        return kUppercase[c - u'A'];
    }
    if (c == u'!') {
        return u"\u00a1";
    }
    if (c == u'?') {
        return u"\u00bf";
    }
    if (c == u'$') {
        return u"\u20ac";
    }
    return nullptr;
}
193
/**
 * Returns true if |c| is one of the characters that ends a '%' placeholder
 * during the scan in PseudoMethodAccent::text(). 't' is deliberately absent:
 * the caller handles it separately because it is followed by one more
 * character.
 */
static bool isPossibleNormalPlaceholderEnd(const char16_t c) {
    switch (c) {
        case 's': case 'S':
        case 'c': case 'C':
        case 'd':
        case 'o':
        case 'x': case 'X':
        case 'f':
        case 'e': case 'E':
        case 'g': case 'G':
        case 'a': case 'A':
        case 'b': case 'B':
        case 'h': case 'H':
        case '%':
        case 'n':
            return true;
        default:
            return false;
    }
}
220
pseudoGenerateExpansion(const unsigned int length)221 static std::u16string pseudoGenerateExpansion(const unsigned int length) {
222 std::u16string result = k_expansion_string;
223 const char16_t* s = result.data();
224 if (result.size() < length) {
225 result += u" ";
226 result += pseudoGenerateExpansion(length - result.size());
227 } else {
228 int ext = 0;
229 // Should contain only whole words, so looking for a space
230 for (unsigned int i = length + 1; i < result.size(); ++i) {
231 ++ext;
232 if (s[i] == ' ') {
233 break;
234 }
235 }
236 result = result.substr(0, length + ext);
237 }
238 return result;
239 }
240
start()241 std::u16string PseudoMethodAccent::start() {
242 std::u16string result;
243 if (mDepth == 0) {
244 result = u"[";
245 }
246 mWordCount = mLength = 0;
247 mDepth++;
248 return result;
249 }
250
end()251 std::u16string PseudoMethodAccent::end() {
252 std::u16string result;
253 if (mLength) {
254 result += u" ";
255 result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
256 }
257 mWordCount = mLength = 0;
258 mDepth--;
259 if (mDepth == 0) {
260 result += u"]";
261 }
262 return result;
263 }
264
265 /**
266 * Converts characters so they look like they've been localized.
267 *
268 * Note: This leaves placeholder syntax untouched.
269 */
text(const StringPiece16 & source)270 std::u16string PseudoMethodAccent::text(const StringPiece16& source)
271 {
272 const char16_t* s = source.data();
273 std::u16string result;
274 const size_t I = source.size();
275 bool lastspace = true;
276 for (size_t i = 0; i < I; i++) {
277 char16_t c = s[i];
278 if (c == '%') {
279 // Placeholder syntax, no need to pseudolocalize
280 std::u16string chunk;
281 bool end = false;
282 chunk.append(&c, 1);
283 while (!end && i < I) {
284 ++i;
285 c = s[i];
286 chunk.append(&c, 1);
287 if (isPossibleNormalPlaceholderEnd(c)) {
288 end = true;
289 } else if (c == 't') {
290 ++i;
291 c = s[i];
292 chunk.append(&c, 1);
293 end = true;
294 }
295 }
296 // Treat chunk as a placeholder unless it ends with %.
297 result += ((c == '%') ? chunk : placeholder(chunk));
298 } else if (c == '<' || c == '&') {
299 // html syntax, no need to pseudolocalize
300 bool tag_closed = false;
301 while (!tag_closed && i < I) {
302 if (c == '&') {
303 std::u16string escapeText;
304 escapeText.append(&c, 1);
305 bool end = false;
306 size_t htmlCodePos = i;
307 while (!end && htmlCodePos < I) {
308 ++htmlCodePos;
309 c = s[htmlCodePos];
310 escapeText.append(&c, 1);
311 // Valid html code
312 if (c == ';') {
313 end = true;
314 i = htmlCodePos;
315 }
316 // Wrong html code
317 else if (!((c == '#' ||
318 (c >= 'a' && c <= 'z') ||
319 (c >= 'A' && c <= 'Z') ||
320 (c >= '0' && c <= '9')))) {
321 end = true;
322 }
323 }
324 result += escapeText;
325 if (escapeText != u"<") {
326 tag_closed = true;
327 }
328 continue;
329 }
330 if (c == '>') {
331 tag_closed = true;
332 result.append(&c, 1);
333 continue;
334 }
335 result.append(&c, 1);
336 i++;
337 c = s[i];
338 }
339 } else {
340 // This is a pure text that should be pseudolocalized
341 const char16_t* p = pseudolocalizeChar(c);
342 if (p != nullptr) {
343 result += p;
344 } else {
345 bool space = util::isspace16(c);
346 if (lastspace && !space) {
347 mWordCount++;
348 }
349 lastspace = space;
350 result.append(&c, 1);
351 }
352 // Count only pseudolocalizable chars and delimiters
353 mLength++;
354 }
355 }
356 return result;
357 }
358
placeholder(const StringPiece16 & source)359 std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) {
360 // Surround a placeholder with brackets
361 return k_placeholder_open + source.toString() + k_placeholder_close;
362 }
363
text(const StringPiece16 & source)364 std::u16string PseudoMethodBidi::text(const StringPiece16& source) {
365 const char16_t* s = source.data();
366 std::u16string result;
367 bool lastspace = true;
368 bool space = true;
369 for (size_t i = 0; i < source.size(); i++) {
370 char16_t c = s[i];
371 space = util::isspace16(c);
372 if (lastspace && !space) {
373 // Word start
374 result += k_rlm + k_rlo;
375 } else if (!lastspace && space) {
376 // Word end
377 result += k_pdf + k_rlm;
378 }
379 lastspace = space;
380 result.append(&c, 1);
381 }
382 if (!lastspace) {
383 // End of last word
384 result += k_pdf + k_rlm;
385 }
386 return result;
387 }
388
placeholder(const StringPiece16 & source)389 std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) {
390 // Surround a placeholder with directionality change sequence
391 return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
392 }
393
394 } // namespace aapt
395