1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 package com.google.protobuf;
32 
33 import static com.google.protobuf.UnsafeUtil.addressOffset;
34 import static com.google.protobuf.UnsafeUtil.getArrayBaseOffset;
35 import static com.google.protobuf.UnsafeUtil.hasUnsafeArrayOperations;
36 import static com.google.protobuf.UnsafeUtil.hasUnsafeByteBufferOperations;
37 import static java.lang.Character.MAX_SURROGATE;
38 import static java.lang.Character.MIN_SURROGATE;
39 import static java.lang.Character.isSurrogatePair;
40 import static java.lang.Character.toCodePoint;
41 
42 import java.nio.ByteBuffer;
43 
44 /**
45  * A set of low-level, high-performance static utility methods related
46  * to the UTF-8 character encoding.  This class has no dependencies
47  * outside of the core JDK libraries.
48  *
49  * <p>There are several variants of UTF-8.  The one implemented by
50  * this class is the restricted definition of UTF-8 introduced in
51  * Unicode 3.1, which mandates the rejection of "overlong" byte
52  * sequences as well as rejection of 3-byte surrogate codepoint byte
53  * sequences.  Note that the UTF-8 decoder included in Oracle's JDK
54  * has been modified to also reject "overlong" byte sequences, but (as
55  * of 2011) still accepts 3-byte surrogate codepoint byte sequences.
56  *
57  * <p>The byte sequences considered valid by this class are exactly
58  * those that can be roundtrip converted to Strings and back to bytes
59  * using the UTF-8 charset, without loss: <pre> {@code
60  * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8))
61  * }</pre>
62  *
63  * <p>See the Unicode Standard,</br>
64  * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br>
65  * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>.
66  *
67  * <p>This class supports decoding of partial byte sequences, so that the
68  * bytes in a complete UTF-8 byte sequences can be stored in multiple
69  * segments.  Methods typically return {@link #MALFORMED} if the partial
70  * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is
71  * well-formed in the absence of additional input, or if the byte sequence
72  * apparently terminated in the middle of a character, an opaque integer
73  * "state" value containing enough information to decode the character when
74  * passed to a subsequent invocation of a partial decoding method.
75  *
76  * @author martinrb@google.com (Martin Buchholz)
77  */
78 // TODO(nathanmittler): Copy changes in this class back to Guava
79 final class Utf8 {
80 
81   /**
82    * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations
83    * depending on what is available on the platform. The processor is the platform-optimized
84    * delegate for which all methods are delegated directly to.
85    */
86   private static final Processor processor =
87       UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor();
88 
89   /**
90    * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII
91    * characters (i.e. any byte >= 0x80).
92    */
93   private static final long ASCII_MASK_LONG = 0x8080808080808080L;
94 
95   /**
96    * Maximum number of bytes per Java UTF-16 char in UTF-8.
97    * @see java.nio.charset.CharsetEncoder#maxBytesPerChar()
98    */
99   static final int MAX_BYTES_PER_CHAR = 3;
100 
101   /**
102    * State value indicating that the byte sequence is well-formed and
103    * complete (no further bytes are needed to complete a character).
104    */
105   public static final int COMPLETE = 0;
106 
107   /**
108    * State value indicating that the byte sequence is definitely not
109    * well-formed.
110    */
111   public static final int MALFORMED = -1;
112 
113   /**
114    * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length
115    * above which to employ an optimized algorithm for counting ASCII characters. The reason for this
116    * threshold is that for small strings, the optimization may not be beneficial or may even
117    * negatively impact performance since it requires additional logic to avoid unaligned reads
118    * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial
119    * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()}
120    * which provides a performance improvement that entirely subsumes the cost of the additional
121    * logic.
122    */
123   private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16;
124 
125   // Other state values include the partial bytes of the incomplete
126   // character to be decoded in the simplest way: we pack the bytes
127   // into the state int in little-endian order.  For example:
128   //
129   // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16);
130   //
131   // Such a state is unpacked thus (note the ~ operation for byte2 to
132   // undo byte1's sign-extension bits):
133   //
134   // int byte1 = (byte) state;
135   // int byte2 = (byte) ~(state >> 8);
136   // int byte3 = (byte) (state >> 16);
137   //
138   // We cannot store a zero byte in the state because it would be
139   // indistinguishable from the absence of a byte.  But we don't need
140   // to, because partial bytes must always be negative.  When building
141   // a state, we ensure that byte1 is negative and subsequent bytes
142   // are valid trailing bytes.
143 
144   /**
145    * Returns {@code true} if the given byte array is a well-formed
146    * UTF-8 byte sequence.
147    *
148    * <p>This is a convenience method, equivalent to a call to {@code
149    * isValidUtf8(bytes, 0, bytes.length)}.
150    */
isValidUtf8(byte[] bytes)151   public static boolean isValidUtf8(byte[] bytes) {
152     return processor.isValidUtf8(bytes, 0, bytes.length);
153   }
154 
155   /**
156    * Returns {@code true} if the given byte array slice is a
157    * well-formed UTF-8 byte sequence.  The range of bytes to be
158    * checked extends from index {@code index}, inclusive, to {@code
159    * limit}, exclusive.
160    *
161    * <p>This is a convenience method, equivalent to {@code
162    * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
163    */
isValidUtf8(byte[] bytes, int index, int limit)164   public static boolean isValidUtf8(byte[] bytes, int index, int limit) {
165     return processor.isValidUtf8(bytes, index, limit);
166   }
167 
168   /**
169    * Tells whether the given byte array slice is a well-formed,
170    * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
171    * to be checked extends from index {@code index}, inclusive, to
172    * {@code limit}, exclusive.
173    *
174    * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
175    * operation) or the value returned from a call to a partial decoding method
176    * for the previous bytes
177    *
178    * @return {@link #MALFORMED} if the partial byte sequence is
179    * definitely not well-formed, {@link #COMPLETE} if it is well-formed
180    * (no additional input needed), or if the byte sequence is
181    * "incomplete", i.e. apparently terminated in the middle of a character,
182    * an opaque integer "state" value containing enough information to
183    * decode the character when passed to a subsequent invocation of a
184    * partial decoding method.
185    */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)186   public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
187     return processor.partialIsValidUtf8(state, bytes, index, limit);
188   }
189 
incompleteStateFor(int byte1)190   private static int incompleteStateFor(int byte1) {
191     return (byte1 > (byte) 0xF4) ?
192         MALFORMED : byte1;
193   }
194 
incompleteStateFor(int byte1, int byte2)195   private static int incompleteStateFor(int byte1, int byte2) {
196     return (byte1 > (byte) 0xF4 ||
197             byte2 > (byte) 0xBF) ?
198         MALFORMED : byte1 ^ (byte2 << 8);
199   }
200 
incompleteStateFor(int byte1, int byte2, int byte3)201   private static int incompleteStateFor(int byte1, int byte2, int byte3) {
202     return (byte1 > (byte) 0xF4 ||
203             byte2 > (byte) 0xBF ||
204             byte3 > (byte) 0xBF) ?
205         MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16);
206   }
207 
incompleteStateFor(byte[] bytes, int index, int limit)208   private static int incompleteStateFor(byte[] bytes, int index, int limit) {
209     int byte1 = bytes[index - 1];
210     switch (limit - index) {
211       case 0: return incompleteStateFor(byte1);
212       case 1: return incompleteStateFor(byte1, bytes[index]);
213       case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]);
214       default: throw new AssertionError();
215     }
216   }
217 
incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)218   private static int incompleteStateFor(
219       final ByteBuffer buffer, final int byte1, final int index, final int remaining) {
220     switch (remaining) {
221       case 0:
222         return incompleteStateFor(byte1);
223       case 1:
224         return incompleteStateFor(byte1, buffer.get(index));
225       case 2:
226         return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1));
227       default:
228         throw new AssertionError();
229     }
230   }
231 
232   // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw
233   // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can
234   // fallback to more lenient behavior.
235 
236   static class UnpairedSurrogateException extends IllegalArgumentException {
UnpairedSurrogateException(int index, int length)237     UnpairedSurrogateException(int index, int length) {
238       super("Unpaired surrogate at index " + index + " of " + length);
239     }
240   }
241 
242   /**
243    * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string,
244    * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in
245    * both time and space.
246    *
247    * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired
248    *     surrogates)
249    */
encodedLength(CharSequence sequence)250   static int encodedLength(CharSequence sequence) {
251     // Warning to maintainers: this implementation is highly optimized.
252     int utf16Length = sequence.length();
253     int utf8Length = utf16Length;
254     int i = 0;
255 
256     // This loop optimizes for pure ASCII.
257     while (i < utf16Length && sequence.charAt(i) < 0x80) {
258       i++;
259     }
260 
261     // This loop optimizes for chars less than 0x800.
262     for (; i < utf16Length; i++) {
263       char c = sequence.charAt(i);
264       if (c < 0x800) {
265         utf8Length += ((0x7f - c) >>> 31);  // branch free!
266       } else {
267         utf8Length += encodedLengthGeneral(sequence, i);
268         break;
269       }
270     }
271 
272     if (utf8Length < utf16Length) {
273       // Necessary and sufficient condition for overflow because of maximum 3x expansion
274       throw new IllegalArgumentException("UTF-8 length does not fit in int: "
275               + (utf8Length + (1L << 32)));
276     }
277     return utf8Length;
278   }
279 
encodedLengthGeneral(CharSequence sequence, int start)280   private static int encodedLengthGeneral(CharSequence sequence, int start) {
281     int utf16Length = sequence.length();
282     int utf8Length = 0;
283     for (int i = start; i < utf16Length; i++) {
284       char c = sequence.charAt(i);
285       if (c < 0x800) {
286         utf8Length += (0x7f - c) >>> 31; // branch free!
287       } else {
288         utf8Length += 2;
289         // jdk7+: if (Character.isSurrogate(c)) {
290         if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) {
291           // Check that we have a well-formed surrogate pair.
292           int cp = Character.codePointAt(sequence, i);
293           if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
294             throw new UnpairedSurrogateException(i, utf16Length);
295           }
296           i++;
297         }
298       }
299     }
300     return utf8Length;
301   }
302 
encode(CharSequence in, byte[] out, int offset, int length)303   static int encode(CharSequence in, byte[] out, int offset, int length) {
304     return processor.encodeUtf8(in, out, offset, length);
305   }
306   // End Guava UTF-8 methods.
307 
308   /**
309    * Determines if the given {@link ByteBuffer} is a valid UTF-8 string.
310    *
311    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
312    * and the capabilities of the platform.
313    *
314    * @param buffer the buffer to check.
315    * @see Utf8#isValidUtf8(byte[], int, int)
316    */
isValidUtf8(ByteBuffer buffer)317   static boolean isValidUtf8(ByteBuffer buffer) {
318     return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining());
319   }
320 
321   /**
322    * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string.
323    *
324    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
325    * and the capabilities of the platform.
326    *
327    * @param buffer the buffer to check.
328    * @see Utf8#partialIsValidUtf8(int, byte[], int, int)
329    */
partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)330   static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) {
331     return processor.partialIsValidUtf8(state, buffer, index, limit);
332   }
333 
334   /**
335    * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding.
336    *
337    * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct)
338    * and the capabilities of the platform.
339    *
340    * @param in the source string to be encoded
341    * @param out the target buffer to receive the encoded string.
342    * @see Utf8#encode(CharSequence, byte[], int, int)
343    */
encodeUtf8(CharSequence in, ByteBuffer out)344   static void encodeUtf8(CharSequence in, ByteBuffer out) {
345     processor.encodeUtf8(in, out);
346   }
347 
348   /**
349    * Counts (approximately) the number of consecutive ASCII characters in the given buffer.
350    * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if
351    * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}).
352    *
353    * @param buffer the buffer to be scanned for ASCII chars
354    * @param index the starting index of the scan
355    * @param limit the limit within buffer for the scan
356    * @return the number of ASCII characters found. The stopping position will be at or
357    * before the first non-ASCII byte.
358    */
estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)359   private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) {
360     int i = index;
361     final int lim = limit - 7;
362     // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
363     // To speed things up further, we're reading longs instead of bytes so we use a mask to
364     // determine if any byte in the current long is non-ASCII.
365     for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {}
366     return i - index;
367   }
368 
369   /**
370    * A processor of UTF-8 strings, providing methods for checking validity and encoding.
371    */
372   // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android.
373   abstract static class Processor {
374     /**
375      * Returns {@code true} if the given byte array slice is a
376      * well-formed UTF-8 byte sequence.  The range of bytes to be
377      * checked extends from index {@code index}, inclusive, to {@code
378      * limit}, exclusive.
379      *
380      * <p>This is a convenience method, equivalent to {@code
381      * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
382      */
isValidUtf8(byte[] bytes, int index, int limit)383     final boolean isValidUtf8(byte[] bytes, int index, int limit) {
384       return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE;
385     }
386 
387     /**
388      * Tells whether the given byte array slice is a well-formed,
389      * malformed, or incomplete UTF-8 byte sequence.  The range of bytes
390      * to be checked extends from index {@code index}, inclusive, to
391      * {@code limit}, exclusive.
392      *
393      * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding
394      * operation) or the value returned from a call to a partial decoding method
395      * for the previous bytes
396      *
397      * @return {@link #MALFORMED} if the partial byte sequence is
398      * definitely not well-formed, {@link #COMPLETE} if it is well-formed
399      * (no additional input needed), or if the byte sequence is
400      * "incomplete", i.e. apparently terminated in the middle of a character,
401      * an opaque integer "state" value containing enough information to
402      * decode the character when passed to a subsequent invocation of a
403      * partial decoding method.
404      */
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)405     abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit);
406 
407     /**
408      * Returns {@code true} if the given portion of the {@link ByteBuffer} is a
409      * well-formed UTF-8 byte sequence.  The range of bytes to be
410      * checked extends from index {@code index}, inclusive, to {@code
411      * limit}, exclusive.
412      *
413      * <p>This is a convenience method, equivalent to {@code
414      * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}.
415      */
isValidUtf8(ByteBuffer buffer, int index, int limit)416     final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) {
417       return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE;
418     }
419 
420     /**
421      * Indicates whether or not the given buffer contains a valid UTF-8 string.
422      *
423      * @param buffer the buffer to check.
424      * @return {@code true} if the given buffer contains a valid UTF-8 string.
425      */
partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)426     final int partialIsValidUtf8(
427         final int state, final ByteBuffer buffer, int index, final int limit) {
428       if (buffer.hasArray()) {
429         final int offset = buffer.arrayOffset();
430         return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit);
431       } else if (buffer.isDirect()){
432         return partialIsValidUtf8Direct(state, buffer, index, limit);
433       }
434       return partialIsValidUtf8Default(state, buffer, index, limit);
435     }
436 
437     /**
438      * Performs validation for direct {@link ByteBuffer} instances.
439      */
partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)440     abstract int partialIsValidUtf8Direct(
441         final int state, final ByteBuffer buffer, int index, final int limit);
442 
443     /**
444      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
445      * than potentially faster approaches. This first completes validation for the current
446      * character (provided by {@code state}) and then finishes validation for the sequence.
447      */
partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)448     final int partialIsValidUtf8Default(
449         final int state, final ByteBuffer buffer, int index, final int limit) {
450       if (state != COMPLETE) {
451         // The previous decoding operation was incomplete (or malformed).
452         // We look for a well-formed sequence consisting of bytes from
453         // the previous decoding operation (stored in state) together
454         // with bytes from the array slice.
455         //
456         // We expect such "straddler characters" to be rare.
457 
458         if (index >= limit) { // No bytes? No progress.
459           return state;
460         }
461 
462         byte byte1 = (byte) state;
463         // byte1 is never ASCII.
464         if (byte1 < (byte) 0xE0) {
465           // two-byte form
466 
467           // Simultaneously checks for illegal trailing-byte in
468           // leading position and overlong 2-byte form.
469           if (byte1 < (byte) 0xC2
470               // byte2 trailing-byte test
471               || buffer.get(index++) > (byte) 0xBF) {
472             return MALFORMED;
473           }
474         } else if (byte1 < (byte) 0xF0) {
475           // three-byte form
476 
477           // Get byte2 from saved state or array
478           byte byte2 = (byte) ~(state >> 8);
479           if (byte2 == 0) {
480             byte2 = buffer.get(index++);
481             if (index >= limit) {
482               return incompleteStateFor(byte1, byte2);
483             }
484           }
485           if (byte2 > (byte) 0xBF
486               // overlong? 5 most significant bits must not all be zero
487               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
488               // illegal surrogate codepoint?
489               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
490               // byte3 trailing-byte test
491               || buffer.get(index++) > (byte) 0xBF) {
492             return MALFORMED;
493           }
494         } else {
495           // four-byte form
496 
497           // Get byte2 and byte3 from saved state or array
498           byte byte2 = (byte) ~(state >> 8);
499           byte byte3 = 0;
500           if (byte2 == 0) {
501             byte2 = buffer.get(index++);
502             if (index >= limit) {
503               return incompleteStateFor(byte1, byte2);
504             }
505           } else {
506             byte3 = (byte) (state >> 16);
507           }
508           if (byte3 == 0) {
509             byte3 = buffer.get(index++);
510             if (index >= limit) {
511               return incompleteStateFor(byte1, byte2, byte3);
512             }
513           }
514 
515           // If we were called with state == MALFORMED, then byte1 is 0xFF,
516           // which never occurs in well-formed UTF-8, and so we will return
517           // MALFORMED again below.
518 
519           if (byte2 > (byte) 0xBF
520               // Check that 1 <= plane <= 16.  Tricky optimized form of:
521               // if (byte1 > (byte) 0xF4 ||
522               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
523               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
524               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
525               // byte3 trailing-byte test
526               || byte3 > (byte) 0xBF
527               // byte4 trailing-byte test
528               || buffer.get(index++) > (byte) 0xBF) {
529             return MALFORMED;
530           }
531         }
532       }
533 
534       // Finish validation for the sequence.
535       return partialIsValidUtf8(buffer, index, limit);
536     }
537 
538     /**
539      * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather
540      * than potentially faster approaches.
541      */
partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)542     private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) {
543       index += estimateConsecutiveAscii(buffer, index, limit);
544 
545       for (;;) {
546         // Optimize for interior runs of ASCII bytes.
547         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
548         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
549         int byte1;
550         do {
551           if (index >= limit) {
552             return COMPLETE;
553           }
554         } while ((byte1 = buffer.get(index++)) >= 0);
555 
556         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
557         if (byte1 < (byte) 0xE0) {
558           // Two-byte form (110xxxxx 10xxxxxx)
559           if (index >= limit) {
560             // Incomplete sequence
561             return byte1;
562           }
563 
564           // Simultaneously checks for illegal trailing-byte in
565           // leading position and overlong 2-byte form.
566           if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) {
567             return MALFORMED;
568           }
569           index++;
570         } else if (byte1 < (byte) 0xF0) {
571           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
572           if (index >= limit - 1) {
573             // Incomplete sequence
574             return incompleteStateFor(buffer, byte1, index, limit - index);
575           }
576 
577           final byte byte2 = buffer.get(index++);
578           if (byte2 > (byte) 0xBF
579               // overlong? 5 most significant bits must not all be zero
580               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
581               // check for illegal surrogate codepoints
582               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
583               // byte3 trailing-byte test
584               || buffer.get(index) > (byte) 0xBF) {
585             return MALFORMED;
586           }
587           index++;
588         } else {
589           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
590           if (index >= limit - 2) {
591             // Incomplete sequence
592             return incompleteStateFor(buffer, byte1, index, limit - index);
593           }
594 
595           // TODO(nathanmittler): Consider using getInt() to improve performance.
596           final int byte2 = buffer.get(index++);
597           if (byte2 > (byte) 0xBF
598               // Check that 1 <= plane <= 16.  Tricky optimized form of:
599               // if (byte1 > (byte) 0xF4 ||
600               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
601               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
602               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
603               // byte3 trailing-byte test
604               || buffer.get(index++) > (byte) 0xBF
605               // byte4 trailing-byte test
606               || buffer.get(index++) > (byte) 0xBF) {
607             return MALFORMED;
608           }
609         }
610       }
611     }
612 
613     /**
614      * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}).
615      * For a string, this method is similar to
616      * <pre>{@code
617      * byte[] a = string.getBytes(UTF_8);
618      * System.arraycopy(a, 0, bytes, offset, a.length);
619      * return offset + a.length;
620      * }</pre>
621      *
622      * but is more efficient in both time and space. One key difference is that this method
623      * requires paired surrogates, and therefore does not support chunking.
624      * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default
625      * replacement character, this method throws {@link UnpairedSurrogateException}.
626      *
627      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
628      * compute the exact amount needed, or leave room for
629      * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number
630      * of bytes that any input can be encoded to.
631      *
632      * @param in the input character sequence to be encoded
633      * @param out the target array
634      * @param offset the starting offset in {@code bytes} to start writing at
635      * @param length the length of the {@code bytes}, starting from {@code offset}
636      * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired
637      *     surrogates)
638      * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than
639      *     {@code bytes.length - offset}
640      * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)}
641      */
encodeUtf8(CharSequence in, byte[] out, int offset, int length)642     abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length);
643 
644     /**
645      * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}).
646      * Upon returning from this method, the {@code out} position will point to the position after
647      * the last encoded byte. This method requires paired surrogates, and therefore does not
648      * support chunking.
649      *
650      * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to
651      * compute the exact amount needed, or leave room for
652      * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number
653      * of bytes that any input can be encoded to.
654      *
655      * @param in the source character sequence to be encoded
656      * @param out the target buffer
657      * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired
658      *     surrogates)
659      * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than
660      *     {@code out.remaining()}
661      */
encodeUtf8(CharSequence in, ByteBuffer out)662     final void encodeUtf8(CharSequence in, ByteBuffer out) {
663       if (out.hasArray()) {
664         final int offset = out.arrayOffset();
665         int endIndex =
666             Utf8.encode(in, out.array(), offset + out.position(), out.remaining());
667         out.position(endIndex - offset);
668       } else if (out.isDirect()) {
669         encodeUtf8Direct(in, out);
670       } else {
671         encodeUtf8Default(in, out);
672       }
673     }
674 
675     /**
676      * Encodes the input character sequence to a direct {@link ByteBuffer} instance.
677      */
encodeUtf8Direct(CharSequence in, ByteBuffer out)678     abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out);
679 
680     /**
681      * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link
682      * ByteBuffer} API, rather than potentially faster approaches.
683      */
encodeUtf8Default(CharSequence in, ByteBuffer out)684     final void encodeUtf8Default(CharSequence in, ByteBuffer out) {
685       final int inLength = in.length();
686       int outIx = out.position();
687       int inIx = 0;
688 
689       // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check
690       // access. Assume the buffer is big enough and let it handle the out of bounds exception
691       // if it occurs.
692       try {
693         // Designed to take advantage of
694         // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
695         for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) {
696           out.put(outIx + inIx, (byte) c);
697         }
698         if (inIx == inLength) {
699           // Successfully encoded the entire string.
700           out.position(outIx + inIx);
701           return;
702         }
703 
704         outIx += inIx;
705         for (char c; inIx < inLength; ++inIx, ++outIx) {
706           c = in.charAt(inIx);
707           if (c < 0x80) {
708             // One byte (0xxx xxxx)
709             out.put(outIx, (byte) c);
710           } else if (c < 0x800) {
711             // Two bytes (110x xxxx 10xx xxxx)
712 
713             // Benchmarks show put performs better than putShort here (for HotSpot).
714             out.put(outIx++, (byte) (0xC0 | (c >>> 6)));
715             out.put(outIx, (byte) (0x80 | (0x3F & c)));
716           } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) {
717             // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx)
718             // Maximum single-char code point is 0xFFFF, 16 bits.
719 
720             // Benchmarks show put performs better than putShort here (for HotSpot).
721             out.put(outIx++, (byte) (0xE0 | (c >>> 12)));
722             out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
723             out.put(outIx, (byte) (0x80 | (0x3F & c)));
724           } else {
725             // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx)
726 
727             // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
728             // bytes
729             final char low;
730             if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
731               throw new UnpairedSurrogateException(inIx, inLength);
732             }
733             // TODO(nathanmittler): Consider using putInt() to improve performance.
734             int codePoint = toCodePoint(c, low);
735             out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
736             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
737             out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
738             out.put(outIx, (byte) (0x80 | (0x3F & codePoint)));
739           }
740         }
741 
742         // Successfully encoded the entire string.
743         out.position(outIx);
744       } catch (IndexOutOfBoundsException e) {
745         // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead.
746 
747         // If we failed in the outer ASCII loop, outIx will not have been updated. In this case,
748         // use inIx to determine the bad write index.
749         int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1);
750         throw new ArrayIndexOutOfBoundsException(
751             "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex);
752       }
753     }
754   }
755 
756   /**
757    * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods.
758    */
759   static final class SafeProcessor extends Processor {
760     @Override
partialIsValidUtf8(int state, byte[] bytes, int index, int limit)761     int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) {
762       if (state != COMPLETE) {
763         // The previous decoding operation was incomplete (or malformed).
764         // We look for a well-formed sequence consisting of bytes from
765         // the previous decoding operation (stored in state) together
766         // with bytes from the array slice.
767         //
768         // We expect such "straddler characters" to be rare.
769 
770         if (index >= limit) {  // No bytes? No progress.
771           return state;
772         }
773         int byte1 = (byte) state;
774         // byte1 is never ASCII.
775         if (byte1 < (byte) 0xE0) {
776           // two-byte form
777 
778           // Simultaneously checks for illegal trailing-byte in
779           // leading position and overlong 2-byte form.
780           if (byte1 < (byte) 0xC2
781               // byte2 trailing-byte test
782               || bytes[index++] > (byte) 0xBF) {
783             return MALFORMED;
784           }
785         } else if (byte1 < (byte) 0xF0) {
786           // three-byte form
787 
788           // Get byte2 from saved state or array
789           int byte2 = (byte) ~(state >> 8);
790           if (byte2 == 0) {
791             byte2 = bytes[index++];
792             if (index >= limit) {
793               return incompleteStateFor(byte1, byte2);
794             }
795           }
796           if (byte2 > (byte) 0xBF
797               // overlong? 5 most significant bits must not all be zero
798               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
799               // illegal surrogate codepoint?
800               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
801               // byte3 trailing-byte test
802               || bytes[index++] > (byte) 0xBF) {
803             return MALFORMED;
804           }
805         } else {
806           // four-byte form
807 
808           // Get byte2 and byte3 from saved state or array
809           int byte2 = (byte) ~(state >> 8);
810           int byte3 = 0;
811           if (byte2 == 0) {
812             byte2 = bytes[index++];
813             if (index >= limit) {
814               return incompleteStateFor(byte1, byte2);
815             }
816           } else {
817             byte3 = (byte) (state >> 16);
818           }
819           if (byte3 == 0) {
820             byte3 = bytes[index++];
821             if (index >= limit) {
822               return incompleteStateFor(byte1, byte2, byte3);
823             }
824           }
825 
826           // If we were called with state == MALFORMED, then byte1 is 0xFF,
827           // which never occurs in well-formed UTF-8, and so we will return
828           // MALFORMED again below.
829 
830           if (byte2 > (byte) 0xBF
831               // Check that 1 <= plane <= 16.  Tricky optimized form of:
832               // if (byte1 > (byte) 0xF4 ||
833               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
834               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
835               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
836               // byte3 trailing-byte test
837               || byte3 > (byte) 0xBF
838               // byte4 trailing-byte test
839               || bytes[index++] > (byte) 0xBF) {
840             return MALFORMED;
841           }
842         }
843       }
844 
845       return partialIsValidUtf8(bytes, index, limit);
846     }
847 
848     @Override
partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)849     int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) {
850       // For safe processing, we have to use the ByteBuffer API.
851       return partialIsValidUtf8Default(state, buffer, index, limit);
852     }
853 
854     @Override
encodeUtf8(CharSequence in, byte[] out, int offset, int length)855     int encodeUtf8(CharSequence in, byte[] out, int offset, int length) {
856       int utf16Length = in.length();
857       int j = offset;
858       int i = 0;
859       int limit = offset + length;
860       // Designed to take advantage of
861       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
862       for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) {
863         out[j + i] = (byte) c;
864       }
865       if (i == utf16Length) {
866         return j + utf16Length;
867       }
868       j += i;
869       for (char c; i < utf16Length; i++) {
870         c = in.charAt(i);
871         if (c < 0x80 && j < limit) {
872           out[j++] = (byte) c;
873         } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes
874           out[j++] = (byte) ((0xF << 6) | (c >>> 6));
875           out[j++] = (byte) (0x80 | (0x3F & c));
876         } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) {
877           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
878           out[j++] = (byte) ((0xF << 5) | (c >>> 12));
879           out[j++] = (byte) (0x80 | (0x3F & (c >>> 6)));
880           out[j++] = (byte) (0x80 | (0x3F & c));
881         } else if (j <= limit - 4) {
882           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits,
883           // four UTF-8 bytes
884           final char low;
885           if (i + 1 == in.length()
886                   || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) {
887             throw new UnpairedSurrogateException((i - 1), utf16Length);
888           }
889           int codePoint = Character.toCodePoint(c, low);
890           out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18));
891           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12)));
892           out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6)));
893           out[j++] = (byte) (0x80 | (0x3F & codePoint));
894         } else {
895           // If we are surrogates and we're not a surrogate pair, always throw an
896           // UnpairedSurrogateException instead of an ArrayOutOfBoundsException.
897           if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE)
898               && (i + 1 == in.length()
899                   || !Character.isSurrogatePair(c, in.charAt(i + 1)))) {
900             throw new UnpairedSurrogateException(i, utf16Length);
901           }
902           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j);
903         }
904       }
905       return j;
906     }
907 
908     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)909     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
910       // For safe processing, we have to use the ByteBuffer API.
911       encodeUtf8Default(in, out);
912     }
913 
partialIsValidUtf8(byte[] bytes, int index, int limit)914     private static int partialIsValidUtf8(byte[] bytes, int index, int limit) {
915       // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
916       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
917       while (index < limit && bytes[index] >= 0) {
918         index++;
919       }
920 
921       return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit);
922     }
923 
partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)924     private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) {
925       for (;;) {
926         int byte1, byte2;
927 
928         // Optimize for interior runs of ASCII bytes.
929         do {
930           if (index >= limit) {
931             return COMPLETE;
932           }
933         } while ((byte1 = bytes[index++]) >= 0);
934 
935         if (byte1 < (byte) 0xE0) {
936           // two-byte form
937 
938           if (index >= limit) {
939             // Incomplete sequence
940             return byte1;
941           }
942 
943           // Simultaneously checks for illegal trailing-byte in
944           // leading position and overlong 2-byte form.
945           if (byte1 < (byte) 0xC2
946               || bytes[index++] > (byte) 0xBF) {
947             return MALFORMED;
948           }
949         } else if (byte1 < (byte) 0xF0) {
950           // three-byte form
951 
952           if (index >= limit - 1) { // incomplete sequence
953             return incompleteStateFor(bytes, index, limit);
954           }
955           if ((byte2 = bytes[index++]) > (byte) 0xBF
956               // overlong? 5 most significant bits must not all be zero
957               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
958               // check for illegal surrogate codepoints
959               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
960               // byte3 trailing-byte test
961               || bytes[index++] > (byte) 0xBF) {
962             return MALFORMED;
963           }
964         } else {
965           // four-byte form
966 
967           if (index >= limit - 2) {  // incomplete sequence
968             return incompleteStateFor(bytes, index, limit);
969           }
970           if ((byte2 = bytes[index++]) > (byte) 0xBF
971               // Check that 1 <= plane <= 16.  Tricky optimized form of:
972               // if (byte1 > (byte) 0xF4 ||
973               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
974               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
975               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
976               // byte3 trailing-byte test
977               || bytes[index++] > (byte) 0xBF
978               // byte4 trailing-byte test
979               || bytes[index++] > (byte) 0xBF) {
980             return MALFORMED;
981           }
982         }
983       }
984     }
985   }
986 
987   /**
988    * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance.
989    */
990   static final class UnsafeProcessor extends Processor {
991     /**
992      * Indicates whether or not all required unsafe operations are supported on this platform.
993      */
isAvailable()994     static boolean isAvailable() {
995       return hasUnsafeArrayOperations() && hasUnsafeByteBufferOperations();
996     }
997 
998     @Override
partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)999     int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) {
1000       if ((index | limit | bytes.length - limit) < 0) {
1001         throw new ArrayIndexOutOfBoundsException(
1002             String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit));
1003       }
1004       long offset = getArrayBaseOffset() + index;
1005       final long offsetLimit = getArrayBaseOffset() + limit;
1006       if (state != COMPLETE) {
1007         // The previous decoding operation was incomplete (or malformed).
1008         // We look for a well-formed sequence consisting of bytes from
1009         // the previous decoding operation (stored in state) together
1010         // with bytes from the array slice.
1011         //
1012         // We expect such "straddler characters" to be rare.
1013 
1014         if (offset >= offsetLimit) {  // No bytes? No progress.
1015           return state;
1016         }
1017         int byte1 = (byte) state;
1018         // byte1 is never ASCII.
1019         if (byte1 < (byte) 0xE0) {
1020           // two-byte form
1021 
1022           // Simultaneously checks for illegal trailing-byte in
1023           // leading position and overlong 2-byte form.
1024           if (byte1 < (byte) 0xC2
1025               // byte2 trailing-byte test
1026               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1027             return MALFORMED;
1028           }
1029         } else if (byte1 < (byte) 0xF0) {
1030           // three-byte form
1031 
1032           // Get byte2 from saved state or array
1033           int byte2 = (byte) ~(state >> 8);
1034           if (byte2 == 0) {
1035             byte2 = UnsafeUtil.getByte(bytes, offset++);
1036             if (offset >= offsetLimit) {
1037               return incompleteStateFor(byte1, byte2);
1038             }
1039           }
1040           if (byte2 > (byte) 0xBF
1041               // overlong? 5 most significant bits must not all be zero
1042               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1043               // illegal surrogate codepoint?
1044               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1045               // byte3 trailing-byte test
1046               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1047             return MALFORMED;
1048           }
1049         } else {
1050           // four-byte form
1051 
1052           // Get byte2 and byte3 from saved state or array
1053           int byte2 = (byte) ~(state >> 8);
1054           int byte3 = 0;
1055           if (byte2 == 0) {
1056             byte2 = UnsafeUtil.getByte(bytes, offset++);
1057             if (offset >= offsetLimit) {
1058               return incompleteStateFor(byte1, byte2);
1059             }
1060           } else {
1061             byte3 = (byte) (state >> 16);
1062           }
1063           if (byte3 == 0) {
1064             byte3 = UnsafeUtil.getByte(bytes, offset++);
1065             if (offset >= offsetLimit) {
1066               return incompleteStateFor(byte1, byte2, byte3);
1067             }
1068           }
1069 
1070           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1071           // which never occurs in well-formed UTF-8, and so we will return
1072           // MALFORMED again below.
1073 
1074           if (byte2 > (byte) 0xBF
1075               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1076               // if (byte1 > (byte) 0xF4 ||
1077               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1078               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1079               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1080               // byte3 trailing-byte test
1081               || byte3 > (byte) 0xBF
1082               // byte4 trailing-byte test
1083               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1084             return MALFORMED;
1085           }
1086         }
1087       }
1088 
1089       return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset));
1090     }
1091 
1092     @Override
partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1093     int partialIsValidUtf8Direct(
1094         final int state, ByteBuffer buffer, final int index, final int limit) {
1095       if ((index | limit | buffer.limit() - limit) < 0) {
1096         throw new ArrayIndexOutOfBoundsException(
1097             String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit));
1098       }
1099       long address = addressOffset(buffer) + index;
1100       final long addressLimit = address + (limit - index);
1101       if (state != COMPLETE) {
1102         // The previous decoding operation was incomplete (or malformed).
1103         // We look for a well-formed sequence consisting of bytes from
1104         // the previous decoding operation (stored in state) together
1105         // with bytes from the array slice.
1106         //
1107         // We expect such "straddler characters" to be rare.
1108 
1109         if (address >= addressLimit) { // No bytes? No progress.
1110           return state;
1111         }
1112 
1113         final int byte1 = (byte) state;
1114         // byte1 is never ASCII.
1115         if (byte1 < (byte) 0xE0) {
1116           // two-byte form
1117 
1118           // Simultaneously checks for illegal trailing-byte in
1119           // leading position and overlong 2-byte form.
1120           if (byte1 < (byte) 0xC2
1121               // byte2 trailing-byte test
1122               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1123             return MALFORMED;
1124           }
1125         } else if (byte1 < (byte) 0xF0) {
1126           // three-byte form
1127 
1128           // Get byte2 from saved state or array
1129           int byte2 = (byte) ~(state >> 8);
1130           if (byte2 == 0) {
1131             byte2 = UnsafeUtil.getByte(address++);
1132             if (address >= addressLimit) {
1133               return incompleteStateFor(byte1, byte2);
1134             }
1135           }
1136           if (byte2 > (byte) 0xBF
1137               // overlong? 5 most significant bits must not all be zero
1138               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1139               // illegal surrogate codepoint?
1140               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1141               // byte3 trailing-byte test
1142               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1143             return MALFORMED;
1144           }
1145         } else {
1146           // four-byte form
1147 
1148           // Get byte2 and byte3 from saved state or array
1149           int byte2 = (byte) ~(state >> 8);
1150           int byte3 = 0;
1151           if (byte2 == 0) {
1152             byte2 = UnsafeUtil.getByte(address++);
1153             if (address >= addressLimit) {
1154               return incompleteStateFor(byte1, byte2);
1155             }
1156           } else {
1157             byte3 = (byte) (state >> 16);
1158           }
1159           if (byte3 == 0) {
1160             byte3 = UnsafeUtil.getByte(address++);
1161             if (address >= addressLimit) {
1162               return incompleteStateFor(byte1, byte2, byte3);
1163             }
1164           }
1165 
1166           // If we were called with state == MALFORMED, then byte1 is 0xFF,
1167           // which never occurs in well-formed UTF-8, and so we will return
1168           // MALFORMED again below.
1169 
1170           if (byte2 > (byte) 0xBF
1171               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1172               // if (byte1 > (byte) 0xF4 ||
1173               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1174               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1175               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1176               // byte3 trailing-byte test
1177               || byte3 > (byte) 0xBF
1178               // byte4 trailing-byte test
1179               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1180             return MALFORMED;
1181           }
1182         }
1183       }
1184 
1185       return partialIsValidUtf8(address, (int) (addressLimit - address));
1186     }
1187 
1188     @Override
encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1189     int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) {
1190       long outIx = getArrayBaseOffset() + offset;
1191       final long outLimit = outIx + length;
1192       final int inLimit = in.length();
1193       if (inLimit > length || out.length - length < offset) {
1194         // Not even enough room for an ASCII-encoded string.
1195         throw new ArrayIndexOutOfBoundsException(
1196             "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length));
1197       }
1198 
1199       // Designed to take advantage of
1200       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1201       int inIx = 0;
1202       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1203         UnsafeUtil.putByte(out, outIx++, (byte) c);
1204       }
1205       if (inIx == inLimit) {
1206         // We're done, it was ASCII encoded.
1207         return (int) (outIx - getArrayBaseOffset());
1208       }
1209 
1210       for (char c; inIx < inLimit; ++inIx) {
1211         c = in.charAt(inIx);
1212         if (c < 0x80 && outIx < outLimit) {
1213           UnsafeUtil.putByte(out, outIx++, (byte) c);
1214         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1215           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1216           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1217         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1218           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1219           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1220           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1221           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & c)));
1222         } else if (outIx <= outLimit - 4L) {
1223           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1224           // bytes
1225           final char low;
1226           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1227             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1228           }
1229           int codePoint = toCodePoint(c, low);
1230           UnsafeUtil.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1231           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1232           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1233           UnsafeUtil.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint)));
1234         } else {
1235           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1236               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1237             // We are surrogates and we're not a surrogate pair.
1238             throw new UnpairedSurrogateException(inIx, inLimit);
1239           }
1240           // Not enough space in the output buffer.
1241           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1242         }
1243       }
1244 
1245       // All bytes have been encoded.
1246       return (int) (outIx - getArrayBaseOffset());
1247     }
1248 
1249     @Override
encodeUtf8Direct(CharSequence in, ByteBuffer out)1250     void encodeUtf8Direct(CharSequence in, ByteBuffer out) {
1251       final long address = addressOffset(out);
1252       long outIx = address + out.position();
1253       final long outLimit = address + out.limit();
1254       final int inLimit = in.length();
1255       if (inLimit > outLimit - outIx) {
1256         // Not even enough room for an ASCII-encoded string.
1257         throw new ArrayIndexOutOfBoundsException(
1258             "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit());
1259       }
1260 
1261       // Designed to take advantage of
1262       // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination
1263       int inIx = 0;
1264       for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) {
1265         UnsafeUtil.putByte(outIx++, (byte) c);
1266       }
1267       if (inIx == inLimit) {
1268         // We're done, it was ASCII encoded.
1269         out.position((int) (outIx - address));
1270         return;
1271       }
1272 
1273       for (char c; inIx < inLimit; ++inIx) {
1274         c = in.charAt(inIx);
1275         if (c < 0x80 && outIx < outLimit) {
1276           UnsafeUtil.putByte(outIx++, (byte) c);
1277         } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes
1278           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6)));
1279           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1280         } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) {
1281           // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes
1282           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12)));
1283           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6))));
1284           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & c)));
1285         } else if (outIx <= outLimit - 4L) {
1286           // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8
1287           // bytes
1288           final char low;
1289           if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) {
1290             throw new UnpairedSurrogateException((inIx - 1), inLimit);
1291           }
1292           int codePoint = toCodePoint(c, low);
1293           UnsafeUtil.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18)));
1294           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
1295           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
1296           UnsafeUtil.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint)));
1297         } else {
1298           if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE)
1299               && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) {
1300             // We are surrogates and we're not a surrogate pair.
1301             throw new UnpairedSurrogateException(inIx, inLimit);
1302           }
1303           // Not enough space in the output buffer.
1304           throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx);
1305         }
1306       }
1307 
1308       // All bytes have been encoded.
1309       out.position((int) (outIx - address));
1310     }
1311 
1312     /**
1313      * Counts (approximately) the number of consecutive ASCII characters starting from the given
1314      * position, using the most efficient method available to the platform.
1315      *
1316      * @param bytes the array containing the character sequence
1317      * @param offset the offset position of the index (same as index + arrayBaseOffset)
1318      * @param maxChars the maximum number of characters to count
1319      * @return the number of ASCII characters found. The stopping position will be at or
1320      * before the first non-ASCII byte.
1321      */
unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1322     private static int unsafeEstimateConsecutiveAscii(
1323         byte[] bytes, long offset, final int maxChars) {
1324       int remaining = maxChars;
1325       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1326         // Don't bother with small strings.
1327         return 0;
1328       }
1329 
1330       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1331       // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that
1332       // the index (relative to the start of the array) is also 8-byte aligned. We do this by
1333       // ANDing the index with 7 to determine the number of bytes that need to be read before
1334       // we're 8-byte aligned.
1335       final int unaligned = (int) offset & 7;
1336       for (int j = unaligned; j > 0; j--) {
1337         if (UnsafeUtil.getByte(bytes, offset++) < 0) {
1338           return unaligned - j;
1339         }
1340       }
1341 
1342       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1343       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1344       // determine if any byte in the current long is non-ASCII.
1345       remaining -= unaligned;
1346       for (; remaining >= 8 && (UnsafeUtil.getLong(bytes, offset) & ASCII_MASK_LONG) == 0;
1347           offset += 8, remaining -= 8) {}
1348       return maxChars - remaining;
1349     }
1350 
1351     /**
1352      * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the
1353      * most efficient method available to the platform.
1354      */
unsafeEstimateConsecutiveAscii(long address, final int maxChars)1355     private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) {
1356       int remaining = maxChars;
1357       if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) {
1358         // Don't bother with small strings.
1359         return 0;
1360       }
1361 
1362       // Read bytes until 8-byte aligned so that we can read longs in the loop below.
1363       // We do this by ANDing the address with 7 to determine the number of bytes that need to
1364       // be read before we're 8-byte aligned.
1365       final int unaligned = (int) address & 7;
1366       for (int j = unaligned; j > 0; j--) {
1367         if (UnsafeUtil.getByte(address++) < 0) {
1368           return unaligned - j;
1369         }
1370       }
1371 
1372       // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
1373       // To speed things up further, we're reading longs instead of bytes so we use a mask to
1374       // determine if any byte in the current long is non-ASCII.
1375       remaining -= unaligned;
1376       for (; remaining >= 8 && (UnsafeUtil.getLong(address) & ASCII_MASK_LONG) == 0;
1377           address += 8, remaining -= 8) {}
1378       return maxChars - remaining;
1379     }
1380 
partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1381     private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) {
1382       // Skip past ASCII characters as quickly as possible.
1383       final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining);
1384       remaining -= skipped;
1385       offset += skipped;
1386 
1387       for (;;) {
1388         // Optimize for interior runs of ASCII bytes.
1389         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1390         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1391         int byte1 = 0;
1392         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(bytes, offset++)) >= 0; --remaining) {
1393         }
1394         if (remaining == 0) {
1395           return COMPLETE;
1396         }
1397         remaining--;
1398 
1399         // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms.
1400         if (byte1 < (byte) 0xE0) {
1401           // Two-byte form (110xxxxx 10xxxxxx)
1402           if (remaining == 0) {
1403             // Incomplete sequence
1404             return byte1;
1405           }
1406           remaining--;
1407 
1408           // Simultaneously checks for illegal trailing-byte in
1409           // leading position and overlong 2-byte form.
1410           if (byte1 < (byte) 0xC2
1411               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1412             return MALFORMED;
1413           }
1414         } else if (byte1 < (byte) 0xF0) {
1415           // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx)
1416           if (remaining < 2) {
1417             // Incomplete sequence
1418             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1419           }
1420           remaining -= 2;
1421 
1422           final int byte2;
1423           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1424               // overlong? 5 most significant bits must not all be zero
1425               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1426               // check for illegal surrogate codepoints
1427               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1428               // byte3 trailing-byte test
1429               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1430             return MALFORMED;
1431           }
1432         } else {
1433           // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx)
1434           if (remaining < 3) {
1435             // Incomplete sequence
1436             return unsafeIncompleteStateFor(bytes, byte1, offset, remaining);
1437           }
1438           remaining -= 3;
1439 
1440           final int byte2;
1441           if ((byte2 = UnsafeUtil.getByte(bytes, offset++)) > (byte) 0xBF
1442               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1443               // if (byte1 > (byte) 0xF4 ||
1444               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1445               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1446               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1447               // byte3 trailing-byte test
1448               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF
1449               // byte4 trailing-byte test
1450               || UnsafeUtil.getByte(bytes, offset++) > (byte) 0xBF) {
1451             return MALFORMED;
1452           }
1453         }
1454       }
1455     }
1456 
partialIsValidUtf8(long address, int remaining)1457     private static int partialIsValidUtf8(long address, int remaining) {
1458       // Skip past ASCII characters as quickly as possible.
1459       final int skipped = unsafeEstimateConsecutiveAscii(address, remaining);
1460       address += skipped;
1461       remaining -= skipped;
1462 
1463       for (;;) {
1464         // Optimize for interior runs of ASCII bytes.
1465         // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold?
1466         // Maybe after seeing a few in a row that are ASCII, go back to fast mode?
1467         int byte1 = 0;
1468         for (; remaining > 0 && (byte1 = UnsafeUtil.getByte(address++)) >= 0; --remaining) {
1469         }
1470         if (remaining == 0) {
1471           return COMPLETE;
1472         }
1473         remaining--;
1474 
1475         if (byte1 < (byte) 0xE0) {
1476           // Two-byte form
1477 
1478           if (remaining == 0) {
1479             // Incomplete sequence
1480             return byte1;
1481           }
1482           remaining--;
1483 
1484           // Simultaneously checks for illegal trailing-byte in
1485           // leading position and overlong 2-byte form.
1486           if (byte1 < (byte) 0xC2 || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1487             return MALFORMED;
1488           }
1489         } else if (byte1 < (byte) 0xF0) {
1490           // Three-byte form
1491 
1492           if (remaining < 2) {
1493             // Incomplete sequence
1494             return unsafeIncompleteStateFor(address, byte1, remaining);
1495           }
1496           remaining -= 2;
1497 
1498           final byte byte2 = UnsafeUtil.getByte(address++);
1499           if (byte2 > (byte) 0xBF
1500               // overlong? 5 most significant bits must not all be zero
1501               || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0)
1502               // check for illegal surrogate codepoints
1503               || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0)
1504               // byte3 trailing-byte test
1505               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1506             return MALFORMED;
1507           }
1508         } else {
1509           // Four-byte form
1510 
1511           if (remaining < 3) {
1512             // Incomplete sequence
1513             return unsafeIncompleteStateFor(address, byte1, remaining);
1514           }
1515           remaining -= 3;
1516 
1517           final byte byte2 = UnsafeUtil.getByte(address++);
1518           if (byte2 > (byte) 0xBF
1519               // Check that 1 <= plane <= 16.  Tricky optimized form of:
1520               // if (byte1 > (byte) 0xF4 ||
1521               //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
1522               //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
1523               || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0
1524               // byte3 trailing-byte test
1525               || UnsafeUtil.getByte(address++) > (byte) 0xBF
1526               // byte4 trailing-byte test
1527               || UnsafeUtil.getByte(address++) > (byte) 0xBF) {
1528             return MALFORMED;
1529           }
1530         }
1531       }
1532     }
1533 
unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, int remaining)1534     private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset,
1535         int remaining) {
1536       switch (remaining) {
1537         case 0: {
1538           return incompleteStateFor(byte1);
1539         }
1540         case 1: {
1541           return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset));
1542         }
1543         case 2: {
1544           return incompleteStateFor(byte1, UnsafeUtil.getByte(bytes, offset),
1545               UnsafeUtil.getByte(bytes, offset + 1));
1546         }
1547         default: {
1548           throw new AssertionError();
1549         }
1550       }
1551     }
1552 
unsafeIncompleteStateFor(long address, final int byte1, int remaining)1553     private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) {
1554       switch (remaining) {
1555         case 0: {
1556           return incompleteStateFor(byte1);
1557         }
1558         case 1: {
1559           return incompleteStateFor(byte1, UnsafeUtil.getByte(address));
1560         }
1561         case 2: {
1562           return incompleteStateFor(byte1, UnsafeUtil.getByte(address),
1563               UnsafeUtil.getByte(address + 1));
1564         }
1565         default: {
1566           throw new AssertionError();
1567         }
1568       }
1569     }
1570   }
1571 
Utf8()1572   private Utf8() {}
1573 }
1574