1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "compile/Pseudolocalizer.h"
18 #include "util/Util.h"
19 
20 namespace aapt {
21 
// Filler words appended after pseudolocalized text to imitate the growth a
// translation usually causes. The text is emitted at runtime, so its exact
// spelling is part of the output and is kept unchanged.
static const std::u16string k_expansion_string{
        u"one two three four five six seven eight nine ten "
        u"eleven twelve thirteen fourteen fiveteen sixteen "
        u"seventeen nineteen twenty"};

// Unicode directional formatting characters used to force right-to-left
// rendering of individual words.
static const std::u16string k_rlm{u"\u200f"};  // RIGHT-TO-LEFT MARK
static const std::u16string k_rlo{u"\u202e"};  // RIGHT-TO-LEFT OVERRIDE
static const std::u16string k_pdf{u"\u202c"};  // POP DIRECTIONAL FORMATTING

// Markers wrapped around placeholders by the accent method.
static const std::u16string k_placeholder_open{u"\u00bb"};
static const std::u16string k_placeholder_close{u"\u00ab"};

// Delimiters of a message argument.
static constexpr char16_t k_arg_start = u'{';
static constexpr char16_t k_arg_end = u'}';
38 
39 class PseudoMethodNone : public PseudoMethodImpl {
40 public:
text(const StringPiece16 & text)41     std::u16string text(const StringPiece16& text) override { return text.toString(); }
placeholder(const StringPiece16 & text)42     std::u16string placeholder(const StringPiece16& text) override { return text.toString(); }
43 };
44 
// Bidi method: surrounds words and placeholders with Unicode directional
// formatting characters (see the out-of-class definitions in this file).
class PseudoMethodBidi : public PseudoMethodImpl {
public:
    // Returns `text` with every whitespace-delimited word wrapped in
    // direction-override characters.
    std::u16string text(const StringPiece16& text) override;
    // Returns `text` wrapped as a whole in direction-override characters.
    std::u16string placeholder(const StringPiece16& text) override;
};
50 
51 class PseudoMethodAccent : public PseudoMethodImpl {
52 public:
PseudoMethodAccent()53     PseudoMethodAccent() : mDepth(0), mWordCount(0), mLength(0) {}
54     std::u16string start() override;
55     std::u16string end() override;
56     std::u16string text(const StringPiece16& text) override;
57     std::u16string placeholder(const StringPiece16& text) override;
58 private:
59     size_t mDepth;
60     size_t mWordCount;
61     size_t mLength;
62 };
63 
Pseudolocalizer(Method method)64 Pseudolocalizer::Pseudolocalizer(Method method) : mLastDepth(0) {
65     setMethod(method);
66 }
67 
setMethod(Method method)68 void Pseudolocalizer::setMethod(Method method) {
69     switch (method) {
70     case Method::kNone:
71         mImpl = util::make_unique<PseudoMethodNone>();
72         break;
73     case Method::kAccent:
74         mImpl = util::make_unique<PseudoMethodAccent>();
75         break;
76     case Method::kBidi:
77         mImpl = util::make_unique<PseudoMethodBidi>();
78         break;
79     }
80 }
81 
text(const StringPiece16 & text)82 std::u16string Pseudolocalizer::text(const StringPiece16& text) {
83     std::u16string out;
84     size_t depth = mLastDepth;
85     size_t lastpos, pos;
86     const size_t length = text.size();
87     const char16_t* str = text.data();
88     bool escaped = false;
89     for (lastpos = pos = 0; pos < length; pos++) {
90         char16_t c = str[pos];
91         if (escaped) {
92             escaped = false;
93             continue;
94         }
95         if (c == '\'') {
96             escaped = true;
97             continue;
98         }
99 
100         if (c == k_arg_start) {
101             depth++;
102         } else if (c == k_arg_end && depth) {
103             depth--;
104         }
105 
106         if (mLastDepth != depth || pos == length - 1) {
107             bool pseudo = ((mLastDepth % 2) == 0);
108             size_t nextpos = pos;
109             if (!pseudo || depth == mLastDepth) {
110                 nextpos++;
111             }
112             size_t size = nextpos - lastpos;
113             if (size) {
114                 std::u16string chunk = text.substr(lastpos, size).toString();
115                 if (pseudo) {
116                     chunk = mImpl->text(chunk);
117                 } else if (str[lastpos] == k_arg_start && str[nextpos - 1] == k_arg_end) {
118                     chunk = mImpl->placeholder(chunk);
119                 }
120                 out.append(chunk);
121             }
122             if (pseudo && depth < mLastDepth) { // End of message
123                 out.append(mImpl->end());
124             } else if (!pseudo && depth > mLastDepth) { // Start of message
125                 out.append(mImpl->start());
126             }
127             lastpos = nextpos;
128             mLastDepth = depth;
129         }
130     }
131     return out;
132 }
133 
/**
 * Maps an ASCII letter or one of `!`, `?`, `$` to a look-alike replacement.
 *
 * @param c the character to look up.
 * @return a NUL-terminated string literal holding the replacement, or nullptr
 *         when `c` has no replacement and must be copied unchanged.
 */
static const char16_t* pseudolocalizeChar(const char16_t c) {
    switch (c) {
        case 'a':   return u"\u00e5";
        case 'b':   return u"\u0253";
        case 'c':   return u"\u00e7";
        case 'd':   return u"\u00f0";
        case 'e':   return u"\u00e9";
        case 'f':   return u"\u0192";
        case 'g':   return u"\u011d";
        case 'h':   return u"\u0125";
        case 'i':   return u"\u00ee";
        case 'j':   return u"\u0135";
        case 'k':   return u"\u0137";
        case 'l':   return u"\u013c";
        case 'm':   return u"\u1e3f";
        case 'n':   return u"\u00f1";
        case 'o':   return u"\u00f6";
        case 'p':   return u"\u00fe";
        case 'q':   return u"\u0051";
        case 'r':   return u"\u0155";
        case 's':   return u"\u0161";
        case 't':   return u"\u0163";
        case 'u':   return u"\u00fb";
        case 'v':   return u"\u0056";
        case 'w':   return u"\u0175";
        case 'x':   return u"\u0445";
        case 'y':   return u"\u00fd";
        case 'z':   return u"\u017e";
        case 'A':   return u"\u00c5";
        case 'B':   return u"\u03b2";
        case 'C':   return u"\u00c7";
        case 'D':   return u"\u00d0";
        case 'E':   return u"\u00c9";
        // 'F' was the only ASCII letter without a mapping; use the capital
        // counterpart of the hooked f chosen for 'f'.
        case 'F':   return u"\u0191";
        case 'G':   return u"\u011c";
        case 'H':   return u"\u0124";
        case 'I':   return u"\u00ce";
        case 'J':   return u"\u0134";
        case 'K':   return u"\u0136";
        case 'L':   return u"\u013b";
        case 'M':   return u"\u1e3e";
        case 'N':   return u"\u00d1";
        case 'O':   return u"\u00d6";
        case 'P':   return u"\u00de";
        case 'Q':   return u"\u0071";
        case 'R':   return u"\u0154";
        case 'S':   return u"\u0160";
        case 'T':   return u"\u0162";
        case 'U':   return u"\u00db";
        case 'V':   return u"\u03bd";
        case 'W':   return u"\u0174";
        case 'X':   return u"\u00d7";
        case 'Y':   return u"\u00dd";
        case 'Z':   return u"\u017d";
        case '!':   return u"\u00a1";
        case '?':   return u"\u00bf";
        case '$':   return u"\u20ac";
        default:    return nullptr;
    }
}
193 
// Reports whether `c` is one of the characters that can terminate a
// `%`-style placeholder. The two-character `%t…` form is not covered here.
static bool isPossibleNormalPlaceholderEnd(const char16_t c) {
    static const char16_t kTerminators[] = u"sScCdoxXfeEgGaAbBhH%n";
    // Stop at the literal's terminating NUL so that c == 0 never matches.
    for (const char16_t* candidate = kTerminators; *candidate != 0; ++candidate) {
        if (*candidate == c) {
            return true;
        }
    }
    return false;
}
220 
pseudoGenerateExpansion(const unsigned int length)221 static std::u16string pseudoGenerateExpansion(const unsigned int length) {
222     std::u16string result = k_expansion_string;
223     const char16_t* s = result.data();
224     if (result.size() < length) {
225         result += u" ";
226         result += pseudoGenerateExpansion(length - result.size());
227     } else {
228         int ext = 0;
229         // Should contain only whole words, so looking for a space
230         for (unsigned int i = length + 1; i < result.size(); ++i) {
231             ++ext;
232             if (s[i] == ' ') {
233                 break;
234             }
235         }
236         result = result.substr(0, length + ext);
237     }
238     return result;
239 }
240 
start()241 std::u16string PseudoMethodAccent::start() {
242     std::u16string result;
243     if (mDepth == 0) {
244         result = u"[";
245     }
246     mWordCount = mLength = 0;
247     mDepth++;
248     return result;
249 }
250 
end()251 std::u16string PseudoMethodAccent::end() {
252     std::u16string result;
253     if (mLength) {
254         result += u" ";
255         result += pseudoGenerateExpansion(mWordCount > 3 ? mLength : mLength / 2);
256     }
257     mWordCount = mLength = 0;
258     mDepth--;
259     if (mDepth == 0) {
260         result += u"]";
261     }
262     return result;
263 }
264 
265 /**
266  * Converts characters so they look like they've been localized.
267  *
268  * Note: This leaves placeholder syntax untouched.
269  */
text(const StringPiece16 & source)270 std::u16string PseudoMethodAccent::text(const StringPiece16& source)
271 {
272     const char16_t* s = source.data();
273     std::u16string result;
274     const size_t I = source.size();
275     bool lastspace = true;
276     for (size_t i = 0; i < I; i++) {
277         char16_t c = s[i];
278         if (c == '%') {
279             // Placeholder syntax, no need to pseudolocalize
280             std::u16string chunk;
281             bool end = false;
282             chunk.append(&c, 1);
283             while (!end && i < I) {
284                 ++i;
285                 c = s[i];
286                 chunk.append(&c, 1);
287                 if (isPossibleNormalPlaceholderEnd(c)) {
288                     end = true;
289                 } else if (c == 't') {
290                     ++i;
291                     c = s[i];
292                     chunk.append(&c, 1);
293                     end = true;
294                 }
295             }
296             // Treat chunk as a placeholder unless it ends with %.
297             result += ((c == '%') ? chunk : placeholder(chunk));
298         } else if (c == '<' || c == '&') {
299             // html syntax, no need to pseudolocalize
300             bool tag_closed = false;
301             while (!tag_closed && i < I) {
302                 if (c == '&') {
303                     std::u16string escapeText;
304                     escapeText.append(&c, 1);
305                     bool end = false;
306                     size_t htmlCodePos = i;
307                     while (!end && htmlCodePos < I) {
308                         ++htmlCodePos;
309                         c = s[htmlCodePos];
310                         escapeText.append(&c, 1);
311                         // Valid html code
312                         if (c == ';') {
313                             end = true;
314                             i = htmlCodePos;
315                         }
316                         // Wrong html code
317                         else if (!((c == '#' ||
318                                  (c >= 'a' && c <= 'z') ||
319                                  (c >= 'A' && c <= 'Z') ||
320                                  (c >= '0' && c <= '9')))) {
321                             end = true;
322                         }
323                     }
324                     result += escapeText;
325                     if (escapeText != u"&lt;") {
326                         tag_closed = true;
327                     }
328                     continue;
329                 }
330                 if (c == '>') {
331                     tag_closed = true;
332                     result.append(&c, 1);
333                     continue;
334                 }
335                 result.append(&c, 1);
336                 i++;
337                 c = s[i];
338             }
339         } else {
340             // This is a pure text that should be pseudolocalized
341             const char16_t* p = pseudolocalizeChar(c);
342             if (p != nullptr) {
343                 result += p;
344             } else {
345                 bool space = util::isspace16(c);
346                 if (lastspace && !space) {
347                     mWordCount++;
348                 }
349                 lastspace = space;
350                 result.append(&c, 1);
351             }
352             // Count only pseudolocalizable chars and delimiters
353             mLength++;
354         }
355     }
356     return result;
357 }
358 
placeholder(const StringPiece16 & source)359 std::u16string PseudoMethodAccent::placeholder(const StringPiece16& source) {
360     // Surround a placeholder with brackets
361     return k_placeholder_open + source.toString() + k_placeholder_close;
362 }
363 
text(const StringPiece16 & source)364 std::u16string PseudoMethodBidi::text(const StringPiece16& source) {
365     const char16_t* s = source.data();
366     std::u16string result;
367     bool lastspace = true;
368     bool space = true;
369     for (size_t i = 0; i < source.size(); i++) {
370         char16_t c = s[i];
371         space = util::isspace16(c);
372         if (lastspace && !space) {
373             // Word start
374             result += k_rlm + k_rlo;
375         } else if (!lastspace && space) {
376             // Word end
377             result += k_pdf + k_rlm;
378         }
379         lastspace = space;
380         result.append(&c, 1);
381     }
382     if (!lastspace) {
383         // End of last word
384         result += k_pdf + k_rlm;
385     }
386     return result;
387 }
388 
placeholder(const StringPiece16 & source)389 std::u16string PseudoMethodBidi::placeholder(const StringPiece16& source) {
390     // Surround a placeholder with directionality change sequence
391     return k_rlm + k_rlo + source.toString() + k_pdf + k_rlm;
392 }
393 
394 } // namespace aapt
395