1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "compile/Pseudolocalizer.h"
18 
19 #include "util/Util.h"
20 
21 using android::StringPiece;
22 
23 namespace aapt {
24 
// Filler words used to lengthen a pseudolocalized message, imitating the
// growth a real translation usually causes. PseudoGenerateExpansion() returns
// a whole-word prefix of this text. The wording (including "fiveteen" and the
// absence of "eighteen") is emitted verbatim at runtime and is kept as-is.
static const std::string kExpansionString{
    "one two three four five six seven "
    "eight nine ten eleven twelve thirteen "
    "fourteen fiveteen sixteen seventeen "
    "nineteen twenty"};

// Unicode directional formatting characters used to override the
// directionality of words.
static const std::string kRlm{"\u200f"};  // U+200F RIGHT-TO-LEFT MARK
static const std::string kRlo{"\u202e"};  // U+202E RIGHT-TO-LEFT OVERRIDE
static const std::string kPdf{"\u202c"};  // U+202C POP DIRECTIONAL FORMATTING

// Marks placed around a placeholder by the accent method.
static const std::string kPlaceholderOpen{"\u00bb"};   // U+00BB, opening mark
static const std::string kPlaceholderClose{"\u00ab"};  // U+00AB, closing mark

// Characters that open and close an argument inside a message.
static const char kArgStart{'{'};
static const char kArgEnd{'}'};
42 
43 class PseudoMethodNone : public PseudoMethodImpl {
44  public:
Text(const StringPiece & text)45   std::string Text(const StringPiece& text) override { return text.to_string(); }
Placeholder(const StringPiece & text)46   std::string Placeholder(const StringPiece& text) override { return text.to_string(); }
47 };
48 
// Strategy selected by Method::kBidi. Both methods are defined later in this
// file and wrap their input in the directionality characters kRlm/kRlo/kPdf.
class PseudoMethodBidi : public PseudoMethodImpl {
 public:
  // Wraps every word of |text| in a right-to-left override sequence.
  std::string Text(const StringPiece& text) override;
  // Wraps the whole placeholder |text| in a right-to-left override sequence.
  std::string Placeholder(const StringPiece& text) override;
};
54 
55 class PseudoMethodAccent : public PseudoMethodImpl {
56  public:
PseudoMethodAccent()57   PseudoMethodAccent() : depth_(0), word_count_(0), length_(0) {}
58   std::string Start() override;
59   std::string End() override;
60   std::string Text(const StringPiece& text) override;
61   std::string Placeholder(const StringPiece& text) override;
62 
63  private:
64   size_t depth_;
65   size_t word_count_;
66   size_t length_;
67 };
68 
// Creates a pseudolocalizer whose argument-nesting depth starts at 0 and
// installs the strategy matching |method| via SetMethod().
Pseudolocalizer::Pseudolocalizer(Method method) : last_depth_(0) {
  SetMethod(method);
}
72 
SetMethod(Method method)73 void Pseudolocalizer::SetMethod(Method method) {
74   switch (method) {
75     case Method::kNone:
76       impl_ = util::make_unique<PseudoMethodNone>();
77       break;
78     case Method::kAccent:
79       impl_ = util::make_unique<PseudoMethodAccent>();
80       break;
81     case Method::kBidi:
82       impl_ = util::make_unique<PseudoMethodBidi>();
83       break;
84   }
85 }
86 
Text(const StringPiece & text)87 std::string Pseudolocalizer::Text(const StringPiece& text) {
88   std::string out;
89   size_t depth = last_depth_;
90   size_t lastpos, pos;
91   const size_t length = text.size();
92   const char* str = text.data();
93   bool escaped = false;
94   for (lastpos = pos = 0; pos < length; pos++) {
95     char16_t c = str[pos];
96     if (escaped) {
97       escaped = false;
98       continue;
99     }
100     if (c == '\'') {
101       escaped = true;
102       continue;
103     }
104 
105     if (c == kArgStart) {
106       depth++;
107     } else if (c == kArgEnd && depth) {
108       depth--;
109     }
110 
111     if (last_depth_ != depth || pos == length - 1) {
112       bool pseudo = ((last_depth_ % 2) == 0);
113       size_t nextpos = pos;
114       if (!pseudo || depth == last_depth_) {
115         nextpos++;
116       }
117       size_t size = nextpos - lastpos;
118       if (size) {
119         std::string chunk = text.substr(lastpos, size).to_string();
120         if (pseudo) {
121           chunk = impl_->Text(chunk);
122         } else if (str[lastpos] == kArgStart && str[nextpos - 1] == kArgEnd) {
123           chunk = impl_->Placeholder(chunk);
124         }
125         out.append(chunk);
126       }
127       if (pseudo && depth < last_depth_) {  // End of message
128         out.append(impl_->End());
129       } else if (!pseudo && depth > last_depth_) {  // Start of message
130         out.append(impl_->Start());
131       }
132       lastpos = nextpos;
133       last_depth_ = depth;
134     }
135   }
136   return out;
137 }
138 
// Returns the accented replacement for |c| as a NUL-terminated string, or
// nullptr when |c| has no replacement. Replacements exist for the ASCII
// letters except 'F', and for '!', '?' and '$'.
// The letter lookup relies on 'a'..'z' and 'A'..'Z' being contiguous (ASCII).
static const char* PseudolocalizeChar(const char c) {
  // Indexed by (c - 'a').
  static const char* const kLowercase[26] = {
      "\u00e5",  // a
      "\u0253",  // b
      "\u00e7",  // c
      "\u00f0",  // d
      "\u00e9",  // e
      "\u0192",  // f
      "\u011d",  // g
      "\u0125",  // h
      "\u00ee",  // i
      "\u0135",  // j
      "\u0137",  // k
      "\u013c",  // l
      "\u1e3f",  // m
      "\u00f1",  // n
      "\u00f6",  // o
      "\u00fe",  // p
      "\u0051",  // q
      "\u0155",  // r
      "\u0161",  // s
      "\u0163",  // t
      "\u00fb",  // u
      "\u0056",  // v
      "\u0175",  // w
      "\u0445",  // x
      "\u00fd",  // y
      "\u017e",  // z
  };
  // Indexed by (c - 'A').
  static const char* const kUppercase[26] = {
      "\u00c5",  // A
      "\u03b2",  // B
      "\u00c7",  // C
      "\u00d0",  // D
      "\u00c9",  // E
      nullptr,   // F has no replacement
      "\u011c",  // G
      "\u0124",  // H
      "\u00ce",  // I
      "\u0134",  // J
      "\u0136",  // K
      "\u013b",  // L
      "\u1e3e",  // M
      "\u00d1",  // N
      "\u00d6",  // O
      "\u00de",  // P
      "\u0071",  // Q
      "\u0154",  // R
      "\u0160",  // S
      "\u0162",  // T
      "\u00db",  // U
      "\u03bd",  // V
      "\u0174",  // W
      "\u00d7",  // X
      "\u00dd",  // Y
      "\u017d",  // Z
  };

  if (c >= 'a' && c <= 'z') {
    return kLowercase[c - 'a'];
  }
  if (c >= 'A' && c <= 'Z') {
    return kUppercase[c - 'A'];
  }
  if (c == '!') {
    return "\u00a1";
  }
  if (c == '?') {
    return "\u00bf";
  }
  if (c == '$') {
    return "\u20ac";
  }
  return nullptr;
}
253 
// Returns true if |c| is one of the characters that can end a '%' format
// placeholder: s S c C d o x X f e E g G a A b B h H % n.
static bool IsPossibleNormalPlaceholderEnd(const char c) {
  static constexpr char kTerminators[] = {
      's', 'S', 'c', 'C', 'd', 'o', 'x', 'X', 'f', 'e', 'E',
      'g', 'G', 'a', 'A', 'b', 'B', 'h', 'H', '%', 'n',
  };
  for (const char terminator : kTerminators) {
    if (c == terminator) {
      return true;
    }
  }
  return false;
}
302 
PseudoGenerateExpansion(const unsigned int length)303 static std::string PseudoGenerateExpansion(const unsigned int length) {
304   std::string result = kExpansionString;
305   const char* s = result.data();
306   if (result.size() < length) {
307     result += " ";
308     result += PseudoGenerateExpansion(length - result.size());
309   } else {
310     int ext = 0;
311     // Should contain only whole words, so looking for a space
312     for (unsigned int i = length + 1; i < result.size(); ++i) {
313       ++ext;
314       if (s[i] == ' ') {
315         break;
316       }
317     }
318     result = result.substr(0, length + ext);
319   }
320   return result;
321 }
322 
Start()323 std::string PseudoMethodAccent::Start() {
324   std::string result;
325   if (depth_ == 0) {
326     result = "[";
327   }
328   word_count_ = length_ = 0;
329   depth_++;
330   return result;
331 }
332 
End()333 std::string PseudoMethodAccent::End() {
334   std::string result;
335   if (length_) {
336     result += " ";
337     result += PseudoGenerateExpansion(word_count_ > 3 ? length_ : length_ / 2);
338   }
339   word_count_ = length_ = 0;
340   depth_--;
341   if (depth_ == 0) {
342     result += "]";
343   }
344   return result;
345 }
346 
347 /**
348  * Converts characters so they look like they've been localized.
349  *
350  * Note: This leaves placeholder syntax untouched.
351  */
Text(const StringPiece & source)352 std::string PseudoMethodAccent::Text(const StringPiece& source) {
353   const char* s = source.data();
354   std::string result;
355   const size_t I = source.size();
356   bool lastspace = true;
357   for (size_t i = 0; i < I; i++) {
358     char c = s[i];
359     if (c == '%') {
360       // Placeholder syntax, no need to pseudolocalize
361       std::string chunk;
362       bool end = false;
363       chunk.append(&c, 1);
364       while (!end && i + 1 < I) {
365         ++i;
366         c = s[i];
367         chunk.append(&c, 1);
368         if (IsPossibleNormalPlaceholderEnd(c)) {
369           end = true;
370         } else if (i + 1 < I && c == 't') {
371           ++i;
372           c = s[i];
373           chunk.append(&c, 1);
374           end = true;
375         }
376       }
377       // Treat chunk as a placeholder unless it ends with %.
378       result += ((c == '%') ? chunk : Placeholder(chunk));
379     } else if (c == '<' || c == '&') {
380       // html syntax, no need to pseudolocalize
381       bool tag_closed = false;
382       while (!tag_closed && i < I) {
383         if (c == '&') {
384           std::string escape_text;
385           escape_text.append(&c, 1);
386           bool end = false;
387           size_t html_code_pos = i;
388           while (!end && html_code_pos < I) {
389             ++html_code_pos;
390             c = s[html_code_pos];
391             escape_text.append(&c, 1);
392             // Valid html code
393             if (c == ';') {
394               end = true;
395               i = html_code_pos;
396             }
397             // Wrong html code
398             else if (!((c == '#' || (c >= 'a' && c <= 'z') ||
399                         (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')))) {
400               end = true;
401             }
402           }
403           result += escape_text;
404           if (escape_text != "&lt;") {
405             tag_closed = true;
406           }
407           continue;
408         }
409         if (c == '>') {
410           tag_closed = true;
411           result.append(&c, 1);
412           continue;
413         }
414         result.append(&c, 1);
415         i++;
416         c = s[i];
417       }
418     } else {
419       // This is a pure text that should be pseudolocalized
420       const char* p = PseudolocalizeChar(c);
421       if (p != nullptr) {
422         result += p;
423       } else {
424         bool space = isspace(c);
425         if (lastspace && !space) {
426           word_count_++;
427         }
428         lastspace = space;
429         result.append(&c, 1);
430       }
431       // Count only pseudolocalizable chars and delimiters
432       length_++;
433     }
434   }
435   return result;
436 }
437 
Placeholder(const StringPiece & source)438 std::string PseudoMethodAccent::Placeholder(const StringPiece& source) {
439   // Surround a placeholder with brackets
440   return kPlaceholderOpen + source.to_string() + kPlaceholderClose;
441 }
442 
Text(const StringPiece & source)443 std::string PseudoMethodBidi::Text(const StringPiece& source) {
444   const char* s = source.data();
445   std::string result;
446   bool lastspace = true;
447   bool space = true;
448   bool escape = false;
449   const char ESCAPE_CHAR = '\\';
450   for (size_t i = 0; i < source.size(); i++) {
451     char c = s[i];
452     if (!escape && c == ESCAPE_CHAR) {
453       escape = true;
454       continue;
455     }
456     space = (!escape && isspace(c)) || (escape && (c == 'n' || c == 't'));
457     if (lastspace && !space) {
458       // Word start
459       result += kRlm + kRlo;
460     } else if (!lastspace && space) {
461       // Word end
462       result += kPdf + kRlm;
463     }
464     lastspace = space;
465     if (escape) {
466       result.append(&ESCAPE_CHAR, 1);
467       escape=false;
468     }
469     result.append(&c, 1);
470   }
471   if (!lastspace) {
472     // End of last word
473     result += kPdf + kRlm;
474   }
475   return result;
476 }
477 
Placeholder(const StringPiece & source)478 std::string PseudoMethodBidi::Placeholder(const StringPiece& source) {
479   // Surround a placeholder with directionality change sequence
480   return kRlm + kRlo + source.to_string() + kPdf + kRlm;
481 }
482 
483 }  // namespace aapt
484