1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import com.google.common.annotations.GwtCompatible;
20 import com.google.common.annotations.GwtIncompatible;
21 
22 import junit.framework.TestCase;
23 
24 import java.io.UnsupportedEncodingException;
25 import java.util.Arrays;
26 import java.util.HashMap;
27 import java.util.Random;
28 
29 /**
30  * Unit tests for {@link Utf8}.
31  *
32  * @author Jon Perlow
33  * @author Martin Buchholz
34  * @author Clément Roux
35  */
36 @GwtCompatible(emulated = true)
37 public class Utf8Test extends TestCase {
testEncodedLength_validStrings()38   public void testEncodedLength_validStrings() {
39     assertEquals(0, Utf8.encodedLength(""));
40     assertEquals(11, Utf8.encodedLength("Hello world"));
41     assertEquals(8, Utf8.encodedLength("Résumé"));
42     assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
43         + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
44         + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
45         + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
46         + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
47         + "哈都拕人翻譯做好多話。"));
48     // A surrogate pair
49     assertEquals(4, Utf8.encodedLength(
50         newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
51   }
52 
53   @GwtIncompatible("StringBuilder.appendCodePoint()")
testEncodedLength_validStrings2()54   public void testEncodedLength_validStrings2() {
55     HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
56     utf8Lengths.put(0x00, 1);
57     utf8Lengths.put(0x7f, 1);
58     utf8Lengths.put(0x80, 2);
59     utf8Lengths.put(0x7ff, 2);
60     utf8Lengths.put(0x800, 3);
61     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
62     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
63     utf8Lengths.put(Character.MAX_CODE_POINT, 4);
64 
65     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
66     StringBuilder sb = new StringBuilder();
67     Random rnd = new Random();
68     for (int trial = 0; trial < 100; trial++) {
69       sb.setLength(0);
70       int utf8Length = 0;
71       for (int i = 0; i < 6; i++) {
72         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
73         sb.appendCodePoint(randomCodePoint);
74         utf8Length += utf8Lengths.get(randomCodePoint);
75         if (utf8Length != Utf8.encodedLength(sb)) {
76           StringBuilder repro = new StringBuilder();
77           for (int j = 0; j < sb.length(); j++) {
78             repro.append(" " + (int) sb.charAt(j));  // GWT compatible
79           }
80           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
81         }
82       }
83     }
84   }
85 
testEncodedLength_invalidStrings()86   public void testEncodedLength_invalidStrings() {
87     testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
88     testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
89     testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
90     testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
91     testEncodedLengthFails(
92         newString(
93             Character.MIN_HIGH_SURROGATE,
94             Character.MIN_HIGH_SURROGATE), 0);
95   }
96 
testEncodedLengthFails(String invalidString, int invalidCodePointIndex)97   private static void testEncodedLengthFails(String invalidString,
98       int invalidCodePointIndex) {
99     try {
100       Utf8.encodedLength(invalidString);
101       fail();
102     } catch (IllegalArgumentException expected) {
103       assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
104           expected.getMessage());
105     }
106   }
107 
108   // 128 - [chars 0x0000 to 0x007f]
109   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
110       0x007f - 0x0000 + 1;
111 
112   // 128
113   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
114       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
115 
116   // 1920 [chars 0x0080 to 0x07FF]
117   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
118       0x07FF - 0x0080 + 1;
119 
120   // 18,304
121   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
122       // Both bytes are one byte characters
123       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
124       // The possible number of two byte characters
125       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
126 
127   // 2048
128   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
129 
130   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
131   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
132       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
133 
134   // 2,650,112
135   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
136       // All one byte characters
137       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
138       // One two byte character and a one byte character
139       2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
140           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
141        // Three byte characters
142       THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
143 
144   // 1,048,576 [chars 0x10000L to 0x10FFFF]
145   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
146       0x10FFFF - 0x10000L + 1;
147 
148   // 289,571,839
149   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
150       // All one byte characters
151       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
152       // One and three byte characters
153       2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
154           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
155       // Two two byte characters
156       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
157       // Permutations of one and two byte characters
158       3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
159           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
160           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
161       // Four byte characters
162       FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
163 
164   /** Tests that round tripping of all two byte permutations work. */
165   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_1Byte()166   public void testIsWellFormed_1Byte() {
167     testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
168   }
169 
170   /** Tests that round tripping of all two byte permutations work. */
171   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_2Bytes()172   public void testIsWellFormed_2Bytes() {
173     testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
174   }
175 
176   /** Tests that round tripping of all three byte permutations work. */
177   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_3Bytes()178   public void testIsWellFormed_3Bytes() {
179     testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
180   }
181 
182   /**
183    * Tests that round tripping of a sample of four byte permutations work.
184    * All permutations are prohibitively expensive to test for automated runs.
185    * This method tests specific four-byte cases.
186    */
testIsWellFormed_4BytesSamples()187   public void testIsWellFormed_4BytesSamples() {
188     // Valid 4 byte.
189     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
190     // Bad trailing bytes
191     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
192     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
193     // Special cases for byte2
194     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
195     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
196   }
197 
198   /** Tests some hard-coded test cases. */
testSomeSequences()199   public void testSomeSequences() {
200     // Empty
201     assertWellFormed();
202     // One-byte characters, including control characters
203     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
204     // Two-byte characters
205     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
206     // Three-byte characters
207     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
208     // Four-byte characters
209     // "\u024B62\u024B62"
210     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
211     // Mixed string
212     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
213     assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
214         0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
215         0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
216     // Not a valid string
217     assertNotWellFormed(-1, 0, -1, 0);
218   }
219 
testShardsHaveExpectedRoundTrippables()220   public void testShardsHaveExpectedRoundTrippables() {
221     // A sanity check.
222     long actual = 0;
223     for (long expected : generateFourByteShardsExpectedRunnables()) {
224       actual += expected;
225     }
226     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
227   }
228 
newString(char... chars)229   private String newString(char... chars) {
230     return new String(chars);
231   }
232 
toByteArray(int... bytes)233   private byte[] toByteArray(int... bytes) {
234     byte[] realBytes = new byte[bytes.length];
235     for (int i = 0; i < bytes.length; i++) {
236       realBytes[i] = (byte) bytes[i];
237     }
238     return realBytes;
239   }
240 
assertWellFormed(int... bytes)241   private void assertWellFormed(int... bytes) {
242     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
243   }
244 
assertNotWellFormed(int... bytes)245   private void assertNotWellFormed(int... bytes) {
246     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
247   }
248 
generateFourByteShardsExpectedRunnables()249   private static long[] generateFourByteShardsExpectedRunnables() {
250     long[] expected = new long[128];
251     // 0-63 are all 5300224
252     for (int i = 0; i <= 63; i++) {
253       expected[i] = 5300224;
254     }
255     // 97-111 are all 2342912
256     for (int i = 97; i <= 111; i++) {
257      expected[i] = 2342912;
258     }
259     // 113-117 are all 1048576
260     for (int i = 113; i <= 117; i++) {
261       expected[i] = 1048576;
262     }
263     // One offs
264     expected[112] = 786432;
265     expected[118] = 786432;
266     expected[119] = 1048576;
267     expected[120] = 458752;
268     expected[121] = 524288;
269     expected[122] = 65536;
270     // Anything not assigned was the default 0.
271     return expected;
272   }
273 
274   /**
275    * Helper to run the loop to test all the permutations for the number of bytes
276    * specified.
277    *
278    * @param numBytes the number of bytes in the byte array
279    * @param expectedCount the expected number of roundtrippable permutations
280    */
281   @GwtIncompatible("java.nio.charset.Charset")
testBytes(int numBytes, long expectedCount)282   private static void testBytes(int numBytes, long expectedCount) {
283     testBytes(numBytes, expectedCount, 0, -1);
284   }
285 
286   /**
287    * Helper to run the loop to test all the permutations for the number of bytes
288    * specified. This overload is useful for debugging to get the loop to start
289    * at a certain character.
290    *
291    * @param numBytes the number of bytes in the byte array
292    * @param expectedCount the expected number of roundtrippable permutations
293    * @param start the starting bytes encoded as a long as big-endian
294    * @param lim the limit of bytes to process encoded as a long as big-endian,
295    *     or -1 to mean the max limit for numBytes
296    */
297   @GwtIncompatible("java.nio.charset.Charset")
testBytes(int numBytes, long expectedCount, long start, long lim)298   private static void testBytes(int numBytes, long expectedCount, long start,
299       long lim) {
300     byte[] bytes = new byte[numBytes];
301     if (lim == -1) {
302       lim = 1L << (numBytes * 8);
303     }
304     long countRoundTripped = 0;
305     for (long byteChar = start; byteChar < lim; byteChar++) {
306       long tmpByteChar = byteChar;
307       for (int i = 0; i < numBytes; i++) {
308         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
309         tmpByteChar = tmpByteChar >> 8;
310       }
311       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
312       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
313       boolean bytesEqual;
314       try {
315         String s = new String(bytes, Charsets.UTF_8.name());
316         byte[] bytesReencoded = s.getBytes(Charsets.UTF_8.name());
317         bytesEqual = Arrays.equals(bytes, bytesReencoded);
318       } catch (UnsupportedEncodingException e) {
319         throw new AssertionError(e);
320       }
321 
322       if (bytesEqual != isRoundTrippable) {
323         fail();
324       }
325       if (isRoundTrippable) {
326         countRoundTripped++;
327       }
328     }
329     assertEquals(expectedCount, countRoundTripped);
330   }
331 }
332