1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static org.junit.Assert.assertEquals;
34 import static org.junit.Assert.assertFalse;
35 import static org.junit.Assert.assertSame;
36 import static org.junit.Assert.assertTrue;
37 import static org.junit.Assert.fail;
38 
39 import java.lang.ref.SoftReference;
40 import java.nio.ByteBuffer;
41 import java.nio.CharBuffer;
42 import java.nio.charset.CharsetDecoder;
43 import java.nio.charset.CharsetEncoder;
44 import java.nio.charset.CoderResult;
45 import java.nio.charset.CodingErrorAction;
46 import java.util.ArrayList;
47 import java.util.Arrays;
48 import java.util.List;
49 import java.util.Random;
50 import java.util.logging.Logger;
51 
52 /**
53  * Shared testing code for {@link IsValidUtf8Test} and
54  * {@link IsValidUtf8FourByteTest}.
55  *
56  * @author jonp@google.com (Jon Perlow)
57  * @author martinrb@google.com (Martin Buchholz)
58  */
59 final class IsValidUtf8TestUtil {
60   private static Logger logger = Logger.getLogger(IsValidUtf8TestUtil.class.getName());
61 
IsValidUtf8TestUtil()62   private IsValidUtf8TestUtil() {}
63 
64   static interface ByteStringFactory {
newByteString(byte[] bytes)65     ByteString newByteString(byte[] bytes);
66   }
67 
68   static final ByteStringFactory LITERAL_FACTORY = new ByteStringFactory() {
69     @Override
70     public ByteString newByteString(byte[] bytes) {
71       return ByteString.wrap(bytes);
72     }
73   };
74 
75   static final ByteStringFactory HEAP_NIO_FACTORY = new ByteStringFactory() {
76     @Override
77     public ByteString newByteString(byte[] bytes) {
78       return new NioByteString(ByteBuffer.wrap(bytes));
79     }
80   };
81 
82   private static ThreadLocal<SoftReference<ByteBuffer>> directBuffer =
83       new ThreadLocal<SoftReference<ByteBuffer>>();
84 
85   /**
86    * Factory for direct {@link ByteBuffer} instances. To reduce direct memory usage, this
87    * uses a thread local direct buffer. This means that each call will overwrite the buffer's
88    * contents from the previous call, so the calling code must be careful not to continue using
89    * a buffer returned from a previous invocation.
90    */
91   static final ByteStringFactory DIRECT_NIO_FACTORY = new ByteStringFactory() {
92     @Override
93     public ByteString newByteString(byte[] bytes) {
94       SoftReference<ByteBuffer> ref = directBuffer.get();
95       ByteBuffer buffer = ref == null ? null : ref.get();
96       if (buffer == null || buffer.capacity() < bytes.length) {
97         buffer = ByteBuffer.allocateDirect(bytes.length);
98         directBuffer.set(new SoftReference<ByteBuffer>(buffer));
99       }
100       buffer.clear();
101       buffer.put(bytes);
102       buffer.flip();
103       return new NioByteString(buffer);
104     }
105   };
106 
107   // 128 - [chars 0x0000 to 0x007f]
108   static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
109 
110   // 128
111   static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT = ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
112 
113   // 1920 [chars 0x0080 to 0x07FF]
114   static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
115 
116   // 18,304
117   static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
118       // Both bytes are one byte characters
119       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2) +
120       // The possible number of two byte characters
121       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
122 
123   // 2048
124   static final long THREE_BYTE_SURROGATES = 2 * 1024;
125 
126   // 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
127   static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
128       0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
129 
130   // 2,650,112
131   static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
132       // All one byte characters
133       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3) +
134       // One two byte character and a one byte character
135       2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
136       // Three byte characters
137       THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
138 
139   // 1,048,576 [chars 0x10000L to 0x10FFFF]
140   static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
141 
142   // 289,571,839
143   static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
144       // All one byte characters
145       (long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4) +
146       // One and three byte characters
147       2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS +
148       // Two two byte characters
149       TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS +
150       // Permutations of one and two byte characters
151       3 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
152       * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
153       +
154       // Four byte characters
155       FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
156 
157   static final class Shard {
158     final long index;
159     final long start;
160     final long lim;
161     final long expected;
162 
163 
Shard(long index, long start, long lim, long expected)164     public Shard(long index, long start, long lim, long expected) {
165       assertTrue(start < lim);
166       this.index = index;
167       this.start = start;
168       this.lim = lim;
169       this.expected = expected;
170     }
171   }
172 
173   static final long[] FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES =
174       generateFourByteShardsExpectedRunnables();
175 
176   private static long[] generateFourByteShardsExpectedRunnables() {
177     long[] expected = new long[128];
178 
179     // 0-63 are all 5300224
180     for (int i = 0; i <= 63; i++) {
181       expected[i] = 5300224;
182     }
183 
184     // 97-111 are all 2342912
185     for (int i = 97; i <= 111; i++) {
186       expected[i] = 2342912;
187     }
188 
189     // 113-117 are all 1048576
190     for (int i = 113; i <= 117; i++) {
191       expected[i] = 1048576;
192     }
193 
194     // One offs
195     expected[112] = 786432;
196     expected[118] = 786432;
197     expected[119] = 1048576;
198     expected[120] = 458752;
199     expected[121] = 524288;
200     expected[122] = 65536;
201 
202     // Anything not assigned was the default 0.
203     return expected;
204   }
205 
206   static final List<Shard> FOUR_BYTE_SHARDS =
207       generateFourByteShards(128, FOUR_BYTE_SHARDS_EXPECTED_ROUNTRIPPABLES);
208 
209 
210   private static List<Shard> generateFourByteShards(int numShards, long[] expected) {
211     assertEquals(numShards, expected.length);
212     List<Shard> shards = new ArrayList<Shard>(numShards);
213     long LIM = 1L << 32;
214     long increment = LIM / numShards;
215     assertTrue(LIM % numShards == 0);
216     for (int i = 0; i < numShards; i++) {
217       shards.add(new Shard(i, increment * i, increment * (i + 1), expected[i]));
218     }
219     return shards;
220   }
221 
222   /**
223    * Helper to run the loop to test all the permutations for the number of bytes
224    * specified.
225    *
226    * @param factory the factory for {@link ByteString} instances.
227    * @param numBytes the number of bytes in the byte array
228    * @param expectedCount the expected number of roundtrippable permutations
229    */
230   static void testBytes(ByteStringFactory factory, int numBytes, long expectedCount) {
231     testBytes(factory, numBytes, expectedCount, 0, -1);
232   }
233 
234   /**
235    * Helper to run the loop to test all the permutations for the number of bytes
236    * specified. This overload is useful for debugging to get the loop to start
237    * at a certain character.
238    *
239    * @param factory the factory for {@link ByteString} instances.
240    * @param numBytes the number of bytes in the byte array
241    * @param expectedCount the expected number of roundtrippable permutations
242    * @param start the starting bytes encoded as a long as big-endian
243    * @param lim the limit of bytes to process encoded as a long as big-endian,
244    *     or -1 to mean the max limit for numBytes
245    */
246   static void testBytes(
247       ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
248     Random rnd = new Random();
249     byte[] bytes = new byte[numBytes];
250 
251     if (lim == -1) {
252       lim = 1L << (numBytes * 8);
253     }
254     long count = 0;
255     long countRoundTripped = 0;
256     for (long byteChar = start; byteChar < lim; byteChar++) {
257       long tmpByteChar = byteChar;
258       for (int i = 0; i < numBytes; i++) {
259         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
260         tmpByteChar = tmpByteChar >> 8;
261       }
262       ByteString bs = factory.newByteString(bytes);
263       boolean isRoundTrippable = bs.isValidUtf8();
264       String s = new String(bytes, Internal.UTF_8);
265       byte[] bytesReencoded = s.getBytes(Internal.UTF_8);
266       boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
267 
268       if (bytesEqual != isRoundTrippable) {
269         outputFailure(byteChar, bytes, bytesReencoded);
270       }
271 
272       // Check agreement with static Utf8 methods.
273       assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes));
274       assertEquals(isRoundTrippable, Utf8.isValidUtf8(bytes, 0, numBytes));
275 
276       // Test partial sequences.
277       // Partition numBytes into three segments (not necessarily non-empty).
278       int i = rnd.nextInt(numBytes);
279       int j = rnd.nextInt(numBytes);
280       if (j < i) {
281         int tmp = i;
282         i = j;
283         j = tmp;
284       }
285       int state1 = Utf8.partialIsValidUtf8(Utf8.COMPLETE, bytes, 0, i);
286       int state2 = Utf8.partialIsValidUtf8(state1, bytes, i, j);
287       int state3 = Utf8.partialIsValidUtf8(state2, bytes, j, numBytes);
288       if (isRoundTrippable != (state3 == Utf8.COMPLETE)) {
289         System.out.printf("state=%04x %04x %04x i=%d j=%d%n", state1, state2, state3, i, j);
290         outputFailure(byteChar, bytes, bytesReencoded);
291       }
292       assertEquals(isRoundTrippable, (state3 == Utf8.COMPLETE));
293 
294       // Test ropes built out of small partial sequences
295       ByteString rope = RopeByteString.newInstanceForTest(
296           bs.substring(0, i),
297           RopeByteString.newInstanceForTest(bs.substring(i, j), bs.substring(j, numBytes)));
298       assertSame(RopeByteString.class, rope.getClass());
299 
300       ByteString[] byteStrings = {bs, bs.substring(0, numBytes), rope};
301       for (ByteString x : byteStrings) {
302         assertEquals(isRoundTrippable, x.isValidUtf8());
303         assertEquals(state3, x.partialIsValidUtf8(Utf8.COMPLETE, 0, numBytes));
304 
305         assertEquals(state1, x.partialIsValidUtf8(Utf8.COMPLETE, 0, i));
306         assertEquals(state1, x.substring(0, i).partialIsValidUtf8(Utf8.COMPLETE, 0, i));
307         assertEquals(state2, x.partialIsValidUtf8(state1, i, j - i));
308         assertEquals(state2, x.substring(i, j).partialIsValidUtf8(state1, 0, j - i));
309         assertEquals(state3, x.partialIsValidUtf8(state2, j, numBytes - j));
310         assertEquals(state3, x.substring(j, numBytes).partialIsValidUtf8(state2, 0, numBytes - j));
311       }
312 
313       // ByteString reduplication should not affect its UTF-8 validity.
314       ByteString ropeADope = RopeByteString.newInstanceForTest(bs, bs.substring(0, numBytes));
315       assertEquals(isRoundTrippable, ropeADope.isValidUtf8());
316 
317       if (isRoundTrippable) {
318         countRoundTripped++;
319       }
320       count++;
321       if (byteChar != 0 && byteChar % 1000000L == 0) {
322         logger.info("Processed " + (byteChar / 1000000L) + " million characters");
323       }
324     }
325     logger.info("Round tripped " + countRoundTripped + " of " + count);
326     assertEquals(expectedCount, countRoundTripped);
327   }
328 
329   /**
330    * Variation of {@link #testBytes} that does less allocation using the
331    * low-level encoders/decoders directly. Checked in because it's useful for
332    * debugging when trying to process bytes faster, but since it doesn't use the
333    * actual String class, it's possible for incompatibilities to develop
334    * (although unlikely).
335    *
336    * @param factory the factory for {@link ByteString} instances.
337    * @param numBytes the number of bytes in the byte array
338    * @param expectedCount the expected number of roundtrippable permutations
339    * @param start the starting bytes encoded as a long as big-endian
340    * @param lim the limit of bytes to process encoded as a long as big-endian,
341    *     or -1 to mean the max limit for numBytes
342    */
343   static void testBytesUsingByteBuffers(
344       ByteStringFactory factory, int numBytes, long expectedCount, long start, long lim) {
345     CharsetDecoder decoder =
346         Internal.UTF_8.newDecoder()
347             .onMalformedInput(CodingErrorAction.REPLACE)
348             .onUnmappableCharacter(CodingErrorAction.REPLACE);
349     CharsetEncoder encoder =
350         Internal.UTF_8.newEncoder()
351             .onMalformedInput(CodingErrorAction.REPLACE)
352             .onUnmappableCharacter(CodingErrorAction.REPLACE);
353     byte[] bytes = new byte[numBytes];
354     int maxChars = (int) (decoder.maxCharsPerByte() * numBytes) + 1;
355     char[] charsDecoded = new char[(int) (decoder.maxCharsPerByte() * numBytes) + 1];
356     int maxBytes = (int) (encoder.maxBytesPerChar() * maxChars) + 1;
357     byte[] bytesReencoded = new byte[maxBytes];
358 
359     ByteBuffer bb = ByteBuffer.wrap(bytes);
360     CharBuffer cb = CharBuffer.wrap(charsDecoded);
361     ByteBuffer bbReencoded = ByteBuffer.wrap(bytesReencoded);
362     if (lim == -1) {
363       lim = 1L << (numBytes * 8);
364     }
365     long count = 0;
366     long countRoundTripped = 0;
367     for (long byteChar = start; byteChar < lim; byteChar++) {
368       bb.rewind();
369       bb.limit(bytes.length);
370       cb.rewind();
371       cb.limit(charsDecoded.length);
372       bbReencoded.rewind();
373       bbReencoded.limit(bytesReencoded.length);
374       encoder.reset();
375       decoder.reset();
376       long tmpByteChar = byteChar;
377       for (int i = 0; i < bytes.length; i++) {
378         bytes[bytes.length - i - 1] = (byte) tmpByteChar;
379         tmpByteChar = tmpByteChar >> 8;
380       }
381       boolean isRoundTrippable = factory.newByteString(bytes).isValidUtf8();
382       CoderResult result = decoder.decode(bb, cb, true);
383       assertFalse(result.isError());
384       result = decoder.flush(cb);
385       assertFalse(result.isError());
386 
387       int charLen = cb.position();
388       cb.rewind();
389       cb.limit(charLen);
390       result = encoder.encode(cb, bbReencoded, true);
391       assertFalse(result.isError());
392       result = encoder.flush(bbReencoded);
393       assertFalse(result.isError());
394 
395       boolean bytesEqual = true;
396       int bytesLen = bbReencoded.position();
397       if (bytesLen != numBytes) {
398         bytesEqual = false;
399       } else {
400         for (int i = 0; i < numBytes; i++) {
401           if (bytes[i] != bytesReencoded[i]) {
402             bytesEqual = false;
403             break;
404           }
405         }
406       }
407       if (bytesEqual != isRoundTrippable) {
408         outputFailure(byteChar, bytes, bytesReencoded, bytesLen);
409       }
410 
411       count++;
412       if (isRoundTrippable) {
413         countRoundTripped++;
414       }
415       if (byteChar != 0 && byteChar % 1000000 == 0) {
416         logger.info("Processed " + (byteChar / 1000000) + " million characters");
417       }
418     }
419     logger.info("Round tripped " + countRoundTripped + " of " + count);
420     assertEquals(expectedCount, countRoundTripped);
421   }
422 
423   private static void outputFailure(long byteChar, byte[] bytes, byte[] after) {
424     outputFailure(byteChar, bytes, after, after.length);
425   }
426 
427   private static void outputFailure(long byteChar, byte[] bytes, byte[] after, int len) {
428     fail("Failure: (" + Long.toHexString(byteChar) + ") " + toHexString(bytes) + " => "
429         + toHexString(after, len));
430   }
431 
432   private static String toHexString(byte[] b) {
433     return toHexString(b, b.length);
434   }
435 
436   private static String toHexString(byte[] b, int len) {
437     StringBuilder s = new StringBuilder();
438     s.append("\"");
439     for (int i = 0; i < len; i++) {
440       if (i > 0) {
441         s.append(" ");
442       }
443       s.append(String.format("%02x", b[i] & 0xFF));
444     }
445     s.append("\"");
446     return s.toString();
447   }
448 }
449