1 /* 2 * Copyright (C) 2013 The Guava Authors 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.common.base; 18 19 import com.google.common.annotations.GwtCompatible; 20 import com.google.common.annotations.GwtIncompatible; 21 22 import junit.framework.TestCase; 23 24 import java.io.UnsupportedEncodingException; 25 import java.util.Arrays; 26 import java.util.HashMap; 27 import java.util.Random; 28 29 /** 30 * Unit tests for {@link Utf8}. 31 * 32 * @author Jon Perlow 33 * @author Martin Buchholz 34 * @author Clément Roux 35 */ 36 @GwtCompatible(emulated = true) 37 public class Utf8Test extends TestCase { testEncodedLength_validStrings()38 public void testEncodedLength_validStrings() { 39 assertEquals(0, Utf8.encodedLength("")); 40 assertEquals(11, Utf8.encodedLength("Hello world")); 41 assertEquals(8, Utf8.encodedLength("Résumé")); 42 assertEquals(461, Utf8.encodedLength("威廉·莎士比亞(William Shakespeare," 43 + "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人," 44 + "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、" 45 + "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、" 46 + "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響," 47 + "哈都拕人翻譯做好多話。")); 48 // A surrogate pair 49 assertEquals(4, Utf8.encodedLength( 50 newString(Character.MIN_HIGH_SURROGATE, Character.MIN_LOW_SURROGATE))); 51 } 52 53 @GwtIncompatible("StringBuilder.appendCodePoint()") testEncodedLength_validStrings2()54 public void testEncodedLength_validStrings2() { 55 HashMap<Integer, Integer> utf8Lengths = new HashMap<Integer, Integer>(); 56 utf8Lengths.put(0x00, 1); 57 utf8Lengths.put(0x7f, 1); 58 utf8Lengths.put(0x80, 2); 59 utf8Lengths.put(0x7ff, 2); 60 utf8Lengths.put(0x800, 3); 61 utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT - 1, 3); 62 utf8Lengths.put(Character.MIN_SUPPLEMENTARY_CODE_POINT, 4); 63 utf8Lengths.put(Character.MAX_CODE_POINT, 4); 64 65 Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[]{}); 66 StringBuilder sb = new StringBuilder(); 67 Random rnd = new Random(); 68 for (int trial = 0; trial < 100; trial++) { 69 sb.setLength(0); 70 int utf8Length = 0; 71 for (int i = 0; i < 6; i++) { 72 Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)]; 73 sb.appendCodePoint(randomCodePoint); 74 utf8Length += utf8Lengths.get(randomCodePoint); 75 if (utf8Length != Utf8.encodedLength(sb)) { 76 StringBuilder repro = new StringBuilder(); 77 for (int j = 0; j < sb.length(); j++) { 78 repro.append(" " + (int) sb.charAt(j)); // GWT compatible 79 } 80 assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb)); 81 } 82 } 83 } 84 } 85 testEncodedLength_invalidStrings()86 public void testEncodedLength_invalidStrings() { 87 testEncodedLengthFails(newString(Character.MIN_HIGH_SURROGATE), 0); 88 testEncodedLengthFails("foobar" + newString(Character.MIN_HIGH_SURROGATE), 6); 89 testEncodedLengthFails(newString(Character.MIN_LOW_SURROGATE), 0); 90 testEncodedLengthFails("foobar" + newString(Character.MIN_LOW_SURROGATE), 6); 91 testEncodedLengthFails( 92 newString( 93 Character.MIN_HIGH_SURROGATE, 94 Character.MIN_HIGH_SURROGATE), 0); 95 } 96 testEncodedLengthFails(String invalidString, int invalidCodePointIndex)97 private static void testEncodedLengthFails(String invalidString, 98 int invalidCodePointIndex) { 99 try { 100 Utf8.encodedLength(invalidString); 101 fail(); 102 } catch (IllegalArgumentException expected) { 103 assertEquals("Unpaired surrogate at index " + invalidCodePointIndex, 104 expected.getMessage()); 105 } 106 } 107 108 // 128 - [chars 0x0000 to 0x007f] 109 private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 110 0x007f - 0x0000 + 1; 111 112 // 128 113 private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = 114 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 115 116 // 1920 [chars 0x0080 to 0x07FF] 117 private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 118 0x07FF - 0x0080 + 1; 119 120 // 18,304 121 private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT = 122 // Both bytes are one byte characters 123 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) + 124 // The possible number of two byte characters 125 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS; 126 127 // 2048 128 private static final long THREE_BYTE_SURROGATES = 2 * 1024; 129 130 // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates] 131 private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 132 0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES; 133 134 // 2,650,112 135 private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT = 136 // All one byte characters 137 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) + 138 // One two byte character and a one byte character 139 2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * 140 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 141 // Three byte characters 142 THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS; 143 144 // 1,048,576 [chars 0x10000L to 0x10FFFF] 145 private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 146 0x10FFFF - 0x10000L + 1; 147 148 // 289,571,839 149 private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT = 150 // All one byte characters 151 (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) + 152 // One and three byte characters 153 2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * 154 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 155 // Two two byte characters 156 TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS + 157 // Permutations of one and two byte characters 158 3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * 159 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS * 160 ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS + 161 // Four byte characters 162 FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS; 163 164 /** Tests that round tripping of all two byte permutations work. */ 165 @GwtIncompatible("java.nio.charset.Charset") testIsWellFormed_1Byte()166 public void testIsWellFormed_1Byte() { 167 testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT); 168 } 169 170 /** Tests that round tripping of all two byte permutations work. */ 171 @GwtIncompatible("java.nio.charset.Charset") testIsWellFormed_2Bytes()172 public void testIsWellFormed_2Bytes() { 173 testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT); 174 } 175 176 /** Tests that round tripping of all three byte permutations work. */ 177 @GwtIncompatible("java.nio.charset.Charset") testIsWellFormed_3Bytes()178 public void testIsWellFormed_3Bytes() { 179 testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT); 180 } 181 182 /** 183 * Tests that round tripping of a sample of four byte permutations work. 184 * All permutations are prohibitively expensive to test for automated runs. 185 * This method tests specific four-byte cases. 186 */ testIsWellFormed_4BytesSamples()187 public void testIsWellFormed_4BytesSamples() { 188 // Valid 4 byte. 189 assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2); 190 // Bad trailing bytes 191 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F); 192 assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0); 193 // Special cases for byte2 194 assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2); 195 assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2); 196 } 197 198 /** Tests some hard-coded test cases. */ testSomeSequences()199 public void testSomeSequences() { 200 // Empty 201 assertWellFormed(); 202 // One-byte characters, including control characters 203 assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f" 204 // Two-byte characters 205 assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2" 206 // Three-byte characters 207 assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac" 208 // Four-byte characters 209 // "\u024B62\u024B62" 210 assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32); 211 // Mixed string 212 // "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62" 213 assertWellFormed(0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 214 0x32, 0x34, 0x42, 0x36, 0x32, 0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 215 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32); 216 // Not a valid string 217 assertNotWellFormed(-1, 0, -1, 0); 218 } 219 testShardsHaveExpectedRoundTrippables()220 public void testShardsHaveExpectedRoundTrippables() { 221 // A sanity check. 222 long actual = 0; 223 for (long expected : generateFourByteShardsExpectedRunnables()) { 224 actual += expected; 225 } 226 assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual); 227 } 228 newString(char... chars)229 private String newString(char... chars) { 230 return new String(chars); 231 } 232 toByteArray(int... bytes)233 private byte[] toByteArray(int... bytes) { 234 byte[] realBytes = new byte[bytes.length]; 235 for (int i = 0; i < bytes.length; i++) { 236 realBytes[i] = (byte) bytes[i]; 237 } 238 return realBytes; 239 } 240 assertWellFormed(int... bytes)241 private void assertWellFormed(int... bytes) { 242 assertTrue(Utf8.isWellFormed(toByteArray(bytes))); 243 } 244 assertNotWellFormed(int... bytes)245 private void assertNotWellFormed(int... bytes) { 246 assertFalse(Utf8.isWellFormed(toByteArray(bytes))); 247 } 248 generateFourByteShardsExpectedRunnables()249 private static long[] generateFourByteShardsExpectedRunnables() { 250 long[] expected = new long[128]; 251 // 0-63 are all 5300224 252 for (int i = 0; i <= 63; i++) { 253 expected[i] = 5300224; 254 } 255 // 97-111 are all 2342912 256 for (int i = 97; i <= 111; i++) { 257 expected[i] = 2342912; 258 } 259 // 113-117 are all 1048576 260 for (int i = 113; i <= 117; i++) { 261 expected[i] = 1048576; 262 } 263 // One offs 264 expected[112] = 786432; 265 expected[118] = 786432; 266 expected[119] = 1048576; 267 expected[120] = 458752; 268 expected[121] = 524288; 269 expected[122] = 65536; 270 // Anything not assigned was the default 0. 271 return expected; 272 } 273 274 /** 275 * Helper to run the loop to test all the permutations for the number of bytes 276 * specified. 277 * 278 * @param numBytes the number of bytes in the byte array 279 * @param expectedCount the expected number of roundtrippable permutations 280 */ 281 @GwtIncompatible("java.nio.charset.Charset") testBytes(int numBytes, long expectedCount)282 private static void testBytes(int numBytes, long expectedCount) { 283 testBytes(numBytes, expectedCount, 0, -1); 284 } 285 286 /** 287 * Helper to run the loop to test all the permutations for the number of bytes 288 * specified. This overload is useful for debugging to get the loop to start 289 * at a certain character. 290 * 291 * @param numBytes the number of bytes in the byte array 292 * @param expectedCount the expected number of roundtrippable permutations 293 * @param start the starting bytes encoded as a long as big-endian 294 * @param lim the limit of bytes to process encoded as a long as big-endian, 295 * or -1 to mean the max limit for numBytes 296 */ 297 @GwtIncompatible("java.nio.charset.Charset") testBytes(int numBytes, long expectedCount, long start, long lim)298 private static void testBytes(int numBytes, long expectedCount, long start, 299 long lim) { 300 byte[] bytes = new byte[numBytes]; 301 if (lim == -1) { 302 lim = 1L << (numBytes * 8); 303 } 304 long countRoundTripped = 0; 305 for (long byteChar = start; byteChar < lim; byteChar++) { 306 long tmpByteChar = byteChar; 307 for (int i = 0; i < numBytes; i++) { 308 bytes[bytes.length - i - 1] = (byte) tmpByteChar; 309 tmpByteChar = tmpByteChar >> 8; 310 } 311 boolean isRoundTrippable = Utf8.isWellFormed(bytes); 312 assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes)); 313 boolean bytesEqual; 314 try { 315 String s = new String(bytes, Charsets.UTF_8.name()); 316 byte[] bytesReencoded = s.getBytes(Charsets.UTF_8.name()); 317 bytesEqual = Arrays.equals(bytes, bytesReencoded); 318 } catch (UnsupportedEncodingException e) { 319 throw new AssertionError(e); 320 } 321 322 if (bytesEqual != isRoundTrippable) { 323 fail(); 324 } 325 if (isRoundTrippable) { 326 countRoundTripped++; 327 } 328 } 329 assertEquals(expectedCount, countRoundTripped); 330 } 331 } 332