1 /*
2  * Copyright (C) 2013 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.base;
18 
19 import com.google.common.annotations.GwtCompatible;
20 import com.google.common.annotations.GwtIncompatible;
21 
22 import junit.framework.TestCase;
23 
24 import java.util.Arrays;
25 import java.util.HashMap;
26 import java.util.Random;
27 
28 /**
29  * Unit tests for {@link Utf8}.
30  *
31  * @author Jon Perlow
32  * @author Martin Buchholz
33  * @author Clément Roux
34  */
35 @GwtCompatible(emulated = true)
36 public class Utf8Test extends TestCase {
testEncodedLength_validStrings()37   public void testEncodedLength_validStrings() {
38     assertEquals(0, Utf8.encodedLength(""));
39     assertEquals(11, Utf8.encodedLength("Hello world"));
40     assertEquals(8, Utf8.encodedLength("Résumé"));
41     assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare,"
42         + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
43         + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
44         + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
45         + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
46         + "哈都拕人翻譯做好多話。"));
47     // A surrogate pair
48     assertEquals(4, Utf8.encodedLength(
49         newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE)));
50   }
51 
52   @GwtIncompatible("StringBuilder.appendCodePoint()")
testEncodedLength_validStrings2()53   public void testEncodedLength_validStrings2() {
54     HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>();
55     utf8Lengths.put(0x00, 1);
56     utf8Lengths.put(0x7f, 1);
57     utf8Lengths.put(0x80, 2);
58     utf8Lengths.put(0x7ff, 2);
59     utf8Lengths.put(0x800, 3);
60     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
61     utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4);
62     utf8Lengths.put(Character.MAX_CODE_POINT, 4);
63 
64     Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{});
65     StringBuilder sb = new StringBuilder();
66     Random rnd = new Random();
67     for (int trial = 0; trial < 100; trial++) {
68       sb.setLength(0);
69       int utf8Length = 0;
70       for (int i = 0; i < 6; i++) {
71         Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
72         sb.appendCodePoint(randomCodePoint);
73         utf8Length += utf8Lengths.get(randomCodePoint);
74         if (utf8Length != Utf8.encodedLength(sb)) {
75           StringBuilder repro = new StringBuilder();
76           for (int j = 0; j < sb.length(); j++) {
77             repro.append(" " + (int) sb.charAt(j));  // GWT compatible
78           }
79           assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
80         }
81       }
82     }
83   }
84 
testEncodedLength_invalidStrings()85   public void testEncodedLength_invalidStrings() {
86     testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0);
87     testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6);
88     testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0);
89     testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6);
90     testEncodedLengthFails(
91         newString(
92             Character.MIN_HIGH_SURROGATE,
93             Character.MIN_HIGH_SURROGATE), 0);
94   }
95 
testEncodedLengthFails(String invalidString, int invalidCodePointIndex)96   private static void testEncodedLengthFails(String invalidString,
97       int invalidCodePointIndex) {
98     try {
99       Utf8.encodedLength(invalidString);
100       fail();
101     } catch (IllegalArgumentException expected) {
102       assertEquals("Unpaired surrogate at index " + invalidCodePointIndex,
103           expected.getMessage());
104     }
105   }
106 
107   // 128 - [chars 0x0000 to 0x007f]
108   private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
109       0x007f - 0x0000 + 1;
110 
111   // 128
112   private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
113       ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
114 
115   // 1920 [chars 0x0080 to 0x07FF]
116   private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS =
117       0x07FF - 0x0080 + 1;
118 
119   // 18,304
120   private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
121       // Both bytes are one byte characters
122       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
123       // The possible number of two byte characters
124       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
125 
126   // 2048
127   private static final long THREE_BYTE_SURROGATES = 2 * 1024;
128 
129   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
130   private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
131       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
132 
133   // 2,650,112
134   private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
135       // All one byte characters
136       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
137       // One two byte character and a one byte character
138       2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
139           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
140        // Three byte characters
141       THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
142 
143   // 1,048,576 [chars 0x10000L to 0x10FFFF]
144   private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS =
145       0x10FFFF - 0x10000L + 1;
146 
147   // 289,571,839
148   private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
149       // All one byte characters
150       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
151       // One and three byte characters
152       2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
153           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
154       // Two two byte characters
155       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
156       // Permutations of one and two byte characters
157       3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS *
158           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS *
159           ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
160       // Four byte characters
161       FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
162 
163   /** Tests that round tripping of all two byte permutations work. */
164   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_1Byte()165   public void testIsWellFormed_1Byte() {
166     testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
167   }
168 
169   /** Tests that round tripping of all two byte permutations work. */
170   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_2Bytes()171   public void testIsWellFormed_2Bytes() {
172     testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
173   }
174 
175   /** Tests that round tripping of all three byte permutations work. */
176   @GwtIncompatible("java.nio.charset.Charset")
testIsWellFormed_3Bytes()177   public void testIsWellFormed_3Bytes() {
178     testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
179   }
180 
181   /**
182    * Tests that round tripping of a sample of four byte permutations work.
183    * All permutations are prohibitively expensive to test for automated runs.
184    * This method tests specific four-byte cases.
185    */
testIsWellFormed_4BytesSamples()186   public void testIsWellFormed_4BytesSamples() {
187     // Valid 4 byte.
188     assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
189     // Bad trailing bytes
190     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
191     assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
192     // Special cases for byte2
193     assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
194     assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
195   }
196 
197   /** Tests some hard-coded test cases. */
testSomeSequences()198   public void testSomeSequences() {
199     // Empty
200     assertWellFormed();
201     // One-byte characters, including control characters
202     assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
203     // Two-byte characters
204     assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
205     // Three-byte characters
206     assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
207     // Four-byte characters
208     // "\u024B62\u024B62"
209     assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
210     // Mixed string
211     // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
212     assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30,
213         0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63,
214         0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
215     // Not a valid string
216     assertNotWellFormed(-1, 0, -1, 0);
217   }
218 
testShardsHaveExpectedRoundTrippables()219   public void testShardsHaveExpectedRoundTrippables() {
220     // A sanity check.
221     long actual = 0;
222     for (long expected : generateFourByteShardsExpectedRunnables()) {
223       actual += expected;
224     }
225     assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
226   }
227 
newString(char... chars)228   private String newString(char... chars) {
229     return new String(chars);
230   }
231 
toByteArray(int... bytes)232   private byte[] toByteArray(int... bytes) {
233     byte[] realBytes = new byte[bytes.length];
234     for (int i = 0; i < bytes.length; i++) {
235       realBytes[i] = (byte) bytes[i];
236     }
237     return realBytes;
238   }
239 
assertWellFormed(int... bytes)240   private void assertWellFormed(int... bytes) {
241     assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
242   }
243 
assertNotWellFormed(int... bytes)244   private void assertNotWellFormed(int... bytes) {
245     assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
246   }
247 
generateFourByteShardsExpectedRunnables()248   private static long[] generateFourByteShardsExpectedRunnables() {
249     long[] expected = new long[128];
250     // 0-63 are all 5300224
251     for (int i = 0; i <= 63; i++) {
252       expected[i] = 5300224;
253     }
254     // 97-111 are all 2342912
255     for (int i = 97; i <= 111; i++) {
256      expected[i] = 2342912;
257     }
258     // 113-117 are all 1048576
259     for (int i = 113; i <= 117; i++) {
260       expected[i] = 1048576;
261     }
262     // One offs
263     expected[112] = 786432;
264     expected[118] = 786432;
265     expected[119] = 1048576;
266     expected[120] = 458752;
267     expected[121] = 524288;
268     expected[122] = 65536;
269     // Anything not assigned was the default 0.
270     return expected;
271   }
272 
273   /**
274    * Helper to run the loop to test all the permutations for the number of bytes
275    * specified.
276    *
277    * @param numBytes the number of bytes in the byte array
278    * @param expectedCount the expected number of roundtrippable permutations
279    */
280   @GwtIncompatible("java.nio.charset.Charset")
testBytes(int numBytes, long expectedCount)281   private static void testBytes(int numBytes, long expectedCount) {
282     testBytes(numBytes, expectedCount, 0, -1);
283   }
284 
285   /**
286    * Helper to run the loop to test all the permutations for the number of bytes
287    * specified. This overload is useful for debugging to get the loop to start
288    * at a certain character.
289    *
290    * @param numBytes the number of bytes in the byte array
291    * @param expectedCount the expected number of roundtrippable permutations
292    * @param start the starting bytes encoded as a long as big-endian
293    * @param lim the limit of bytes to process encoded as a long as big-endian,
294    *     or -1 to mean the max limit for numBytes
295    */
296   @GwtIncompatible("java.nio.charset.Charset")
testBytes(int numBytes, long expectedCount, long start, long lim)297   private static void testBytes(int numBytes, long expectedCount, long start,
298       long lim) {
299     byte[] bytes = new byte[numBytes];
300     if (lim == -1) {
301       lim = 1L << (numBytes * 8);
302     }
303     long countRoundTripped = 0;
304     for (long byteChar = start; byteChar < lim; byteChar++) {
305       long tmpByteChar = byteChar;
306       for (int i = 0; i < numBytes; i++) {
307         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
308         tmpByteChar = tmpByteChar >> 8;
309       }
310       boolean isRoundTrippable = Utf8.isWellFormed(bytes);
311       assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
312       String s = new String(bytes, Charsets.UTF_8);
313       byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
314       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
315 
316       if (bytesEqual != isRoundTrippable) {
317         fail();
318       }
319       if (isRoundTrippable) {
320         countRoundTripped++;
321       }
322     }
323     assertEquals(expectedCount, countRoundTripped);
324   }
325 }
326