// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 31 package com.google.protobuf; 32 33 import static java.lang.Character.MAX_SURROGATE; 34 import static java.lang.Character.MIN_SURROGATE; 35 import static java.lang.Character.isSurrogatePair; 36 import static java.lang.Character.toCodePoint; 37 38 import java.lang.reflect.Field; 39 import java.nio.Buffer; 40 import java.nio.ByteBuffer; 41 import java.security.AccessController; 42 import java.security.PrivilegedExceptionAction; 43 import java.util.logging.Level; 44 import java.util.logging.Logger; 45 46 /** 47 * A set of low-level, high-performance static utility methods related 48 * to the UTF-8 character encoding. This class has no dependencies 49 * outside of the core JDK libraries. 50 * 51 * <p>There are several variants of UTF-8. The one implemented by 52 * this class is the restricted definition of UTF-8 introduced in 53 * Unicode 3.1, which mandates the rejection of "overlong" byte 54 * sequences as well as rejection of 3-byte surrogate codepoint byte 55 * sequences. Note that the UTF-8 decoder included in Oracle's JDK 56 * has been modified to also reject "overlong" byte sequences, but (as 57 * of 2011) still accepts 3-byte surrogate codepoint byte sequences. 58 * 59 * <p>The byte sequences considered valid by this class are exactly 60 * those that can be roundtrip converted to Strings and back to bytes 61 * using the UTF-8 charset, without loss: <pre> {@code 62 * Arrays.equals(bytes, new String(bytes, Internal.UTF_8).getBytes(Internal.UTF_8)) 63 * }</pre> 64 * 65 * <p>See the Unicode Standard,</br> 66 * Table 3-6. <em>UTF-8 Bit Distribution</em>,</br> 67 * Table 3-7. <em>Well Formed UTF-8 Byte Sequences</em>. 68 * 69 * <p>This class supports decoding of partial byte sequences, so that the 70 * bytes in a complete UTF-8 byte sequences can be stored in multiple 71 * segments. 
Methods typically return {@link #MALFORMED} if the partial 72 * byte sequence is definitely not well-formed, {@link #COMPLETE} if it is 73 * well-formed in the absence of additional input, or if the byte sequence 74 * apparently terminated in the middle of a character, an opaque integer 75 * "state" value containing enough information to decode the character when 76 * passed to a subsequent invocation of a partial decoding method. 77 * 78 * @author martinrb@google.com (Martin Buchholz) 79 */ 80 // TODO(nathanmittler): Copy changes in this class back to Guava 81 final class Utf8 { 82 private static final Logger logger = Logger.getLogger(Utf8.class.getName()); 83 84 /** 85 * UTF-8 is a runtime hot spot so we attempt to provide heavily optimized implementations 86 * depending on what is available on the platform. The processor is the platform-optimized 87 * delegate for which all methods are delegated directly to. 88 */ 89 private static final Processor processor = 90 UnsafeProcessor.isAvailable() ? new UnsafeProcessor() : new SafeProcessor(); 91 92 /** 93 * A mask used when performing unsafe reads to determine if a long value contains any non-ASCII 94 * characters (i.e. any byte >= 0x80). 95 */ 96 private static final long ASCII_MASK_LONG = 0x8080808080808080L; 97 98 /** 99 * Maximum number of bytes per Java UTF-16 char in UTF-8. 100 * @see java.nio.charset.CharsetEncoder#maxBytesPerChar() 101 */ 102 static final int MAX_BYTES_PER_CHAR = 3; 103 104 /** 105 * State value indicating that the byte sequence is well-formed and 106 * complete (no further bytes are needed to complete a character). 107 */ 108 public static final int COMPLETE = 0; 109 110 /** 111 * State value indicating that the byte sequence is definitely not 112 * well-formed. 
113 */ 114 public static final int MALFORMED = -1; 115 116 /** 117 * Used by {@code Unsafe} UTF-8 string validation logic to determine the minimum string length 118 * above which to employ an optimized algorithm for counting ASCII characters. The reason for this 119 * threshold is that for small strings, the optimization may not be beneficial or may even 120 * negatively impact performance since it requires additional logic to avoid unaligned reads 121 * (when calling {@code Unsafe.getLong}). This threshold guarantees that even if the initial 122 * offset is unaligned, we're guaranteed to make at least one call to {@code Unsafe.getLong()} 123 * which provides a performance improvement that entirely subsumes the cost of the additional 124 * logic. 125 */ 126 private static final int UNSAFE_COUNT_ASCII_THRESHOLD = 16; 127 128 // Other state values include the partial bytes of the incomplete 129 // character to be decoded in the simplest way: we pack the bytes 130 // into the state int in little-endian order. For example: 131 // 132 // int state = byte1 ^ (byte2 << 8) ^ (byte3 << 16); 133 // 134 // Such a state is unpacked thus (note the ~ operation for byte2 to 135 // undo byte1's sign-extension bits): 136 // 137 // int byte1 = (byte) state; 138 // int byte2 = (byte) ~(state >> 8); 139 // int byte3 = (byte) (state >> 16); 140 // 141 // We cannot store a zero byte in the state because it would be 142 // indistinguishable from the absence of a byte. But we don't need 143 // to, because partial bytes must always be negative. When building 144 // a state, we ensure that byte1 is negative and subsequent bytes 145 // are valid trailing bytes. 146 147 /** 148 * Returns {@code true} if the given byte array is a well-formed 149 * UTF-8 byte sequence. 150 * 151 * <p>This is a convenience method, equivalent to a call to {@code 152 * isValidUtf8(bytes, 0, bytes.length)}. 
153 */ isValidUtf8(byte[] bytes)154 public static boolean isValidUtf8(byte[] bytes) { 155 return processor.isValidUtf8(bytes, 0, bytes.length); 156 } 157 158 /** 159 * Returns {@code true} if the given byte array slice is a 160 * well-formed UTF-8 byte sequence. The range of bytes to be 161 * checked extends from index {@code index}, inclusive, to {@code 162 * limit}, exclusive. 163 * 164 * <p>This is a convenience method, equivalent to {@code 165 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 166 */ isValidUtf8(byte[] bytes, int index, int limit)167 public static boolean isValidUtf8(byte[] bytes, int index, int limit) { 168 return processor.isValidUtf8(bytes, index, limit); 169 } 170 171 /** 172 * Tells whether the given byte array slice is a well-formed, 173 * malformed, or incomplete UTF-8 byte sequence. The range of bytes 174 * to be checked extends from index {@code index}, inclusive, to 175 * {@code limit}, exclusive. 176 * 177 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding 178 * operation) or the value returned from a call to a partial decoding method 179 * for the previous bytes 180 * 181 * @return {@link #MALFORMED} if the partial byte sequence is 182 * definitely not well-formed, {@link #COMPLETE} if it is well-formed 183 * (no additional input needed), or if the byte sequence is 184 * "incomplete", i.e. apparently terminated in the middle of a character, 185 * an opaque integer "state" value containing enough information to 186 * decode the character when passed to a subsequent invocation of a 187 * partial decoding method. 188 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)189 public static int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 190 return processor.partialIsValidUtf8(state, bytes, index, limit); 191 } 192 incompleteStateFor(int byte1)193 private static int incompleteStateFor(int byte1) { 194 return (byte1 > (byte) 0xF4) ? 
195 MALFORMED : byte1; 196 } 197 incompleteStateFor(int byte1, int byte2)198 private static int incompleteStateFor(int byte1, int byte2) { 199 return (byte1 > (byte) 0xF4 || 200 byte2 > (byte) 0xBF) ? 201 MALFORMED : byte1 ^ (byte2 << 8); 202 } 203 incompleteStateFor(int byte1, int byte2, int byte3)204 private static int incompleteStateFor(int byte1, int byte2, int byte3) { 205 return (byte1 > (byte) 0xF4 || 206 byte2 > (byte) 0xBF || 207 byte3 > (byte) 0xBF) ? 208 MALFORMED : byte1 ^ (byte2 << 8) ^ (byte3 << 16); 209 } 210 incompleteStateFor(byte[] bytes, int index, int limit)211 private static int incompleteStateFor(byte[] bytes, int index, int limit) { 212 int byte1 = bytes[index - 1]; 213 switch (limit - index) { 214 case 0: return incompleteStateFor(byte1); 215 case 1: return incompleteStateFor(byte1, bytes[index]); 216 case 2: return incompleteStateFor(byte1, bytes[index], bytes[index + 1]); 217 default: throw new AssertionError(); 218 } 219 } 220 incompleteStateFor( final ByteBuffer buffer, final int byte1, final int index, final int remaining)221 private static int incompleteStateFor( 222 final ByteBuffer buffer, final int byte1, final int index, final int remaining) { 223 switch (remaining) { 224 case 0: 225 return incompleteStateFor(byte1); 226 case 1: 227 return incompleteStateFor(byte1, buffer.get(index)); 228 case 2: 229 return incompleteStateFor(byte1, buffer.get(index), buffer.get(index + 1)); 230 default: 231 throw new AssertionError(); 232 } 233 } 234 235 // These UTF-8 handling methods are copied from Guava's Utf8 class with a modification to throw 236 // a protocol buffer local exception. This exception is then caught in CodedOutputStream so it can 237 // fallback to more lenient behavior. 
238 239 static class UnpairedSurrogateException extends IllegalArgumentException { UnpairedSurrogateException(int index, int length)240 private UnpairedSurrogateException(int index, int length) { 241 super("Unpaired surrogate at index " + index + " of " + length); 242 } 243 } 244 245 /** 246 * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, 247 * this method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in 248 * both time and space. 249 * 250 * @throws IllegalArgumentException if {@code sequence} contains ill-formed UTF-16 (unpaired 251 * surrogates) 252 */ encodedLength(CharSequence sequence)253 static int encodedLength(CharSequence sequence) { 254 // Warning to maintainers: this implementation is highly optimized. 255 int utf16Length = sequence.length(); 256 int utf8Length = utf16Length; 257 int i = 0; 258 259 // This loop optimizes for pure ASCII. 260 while (i < utf16Length && sequence.charAt(i) < 0x80) { 261 i++; 262 } 263 264 // This loop optimizes for chars less than 0x800. 265 for (; i < utf16Length; i++) { 266 char c = sequence.charAt(i); 267 if (c < 0x800) { 268 utf8Length += ((0x7f - c) >>> 31); // branch free! 269 } else { 270 utf8Length += encodedLengthGeneral(sequence, i); 271 break; 272 } 273 } 274 275 if (utf8Length < utf16Length) { 276 // Necessary and sufficient condition for overflow because of maximum 3x expansion 277 throw new IllegalArgumentException("UTF-8 length does not fit in int: " 278 + (utf8Length + (1L << 32))); 279 } 280 return utf8Length; 281 } 282 encodedLengthGeneral(CharSequence sequence, int start)283 private static int encodedLengthGeneral(CharSequence sequence, int start) { 284 int utf16Length = sequence.length(); 285 int utf8Length = 0; 286 for (int i = start; i < utf16Length; i++) { 287 char c = sequence.charAt(i); 288 if (c < 0x800) { 289 utf8Length += (0x7f - c) >>> 31; // branch free! 
290 } else { 291 utf8Length += 2; 292 // jdk7+: if (Character.isSurrogate(c)) { 293 if (Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) { 294 // Check that we have a well-formed surrogate pair. 295 int cp = Character.codePointAt(sequence, i); 296 if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { 297 throw new UnpairedSurrogateException(i, utf16Length); 298 } 299 i++; 300 } 301 } 302 } 303 return utf8Length; 304 } 305 encode(CharSequence in, byte[] out, int offset, int length)306 static int encode(CharSequence in, byte[] out, int offset, int length) { 307 return processor.encodeUtf8(in, out, offset, length); 308 } 309 // End Guava UTF-8 methods. 310 311 /** 312 * Determines if the given {@link ByteBuffer} is a valid UTF-8 string. 313 * 314 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 315 * and the capabilities of the platform. 316 * 317 * @param buffer the buffer to check. 318 * @see Utf8#isValidUtf8(byte[], int, int) 319 */ isValidUtf8(ByteBuffer buffer)320 static boolean isValidUtf8(ByteBuffer buffer) { 321 return processor.isValidUtf8(buffer, buffer.position(), buffer.remaining()); 322 } 323 324 /** 325 * Determines if the given {@link ByteBuffer} is a partially valid UTF-8 string. 326 * 327 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. heap or direct) 328 * and the capabilities of the platform. 329 * 330 * @param buffer the buffer to check. 331 * @see Utf8#partialIsValidUtf8(int, byte[], int, int) 332 */ partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit)333 static int partialIsValidUtf8(int state, ByteBuffer buffer, int index, int limit) { 334 return processor.partialIsValidUtf8(state, buffer, index, limit); 335 } 336 337 /** 338 * Encodes the given characters to the target {@link ByteBuffer} using UTF-8 encoding. 339 * 340 * <p>Selects an optimal algorithm based on the type of {@link ByteBuffer} (i.e. 
heap or direct) 341 * and the capabilities of the platform. 342 * 343 * @param in the source string to be encoded 344 * @param out the target buffer to receive the encoded string. 345 * @see Utf8#encode(CharSequence, byte[], int, int) 346 */ encodeUtf8(CharSequence in, ByteBuffer out)347 static void encodeUtf8(CharSequence in, ByteBuffer out) { 348 processor.encodeUtf8(in, out); 349 } 350 351 /** 352 * Counts (approximately) the number of consecutive ASCII characters in the given buffer. 353 * The byte order of the {@link ByteBuffer} does not matter, so performance can be improved if 354 * native byte order is used (i.e. no byte-swapping in {@link ByteBuffer#getLong(int)}). 355 * 356 * @param buffer the buffer to be scanned for ASCII chars 357 * @param index the starting index of the scan 358 * @param limit the limit within buffer for the scan 359 * @return the number of ASCII characters found. The stopping position will be at or 360 * before the first non-ASCII byte. 361 */ estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit)362 private static int estimateConsecutiveAscii(ByteBuffer buffer, int index, int limit) { 363 int i = index; 364 final int lim = limit - 7; 365 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 366 // To speed things up further, we're reading longs instead of bytes so we use a mask to 367 // determine if any byte in the current long is non-ASCII. 368 for (; i < lim && (buffer.getLong(i) & ASCII_MASK_LONG) == 0; i += 8) {} 369 return i - index; 370 } 371 372 /** 373 * A processor of UTF-8 strings, providing methods for checking validity and encoding. 374 */ 375 // TODO(nathanmittler): Add support for Memory/MemoryBlock on Android. 376 abstract static class Processor { 377 /** 378 * Returns {@code true} if the given byte array slice is a 379 * well-formed UTF-8 byte sequence. The range of bytes to be 380 * checked extends from index {@code index}, inclusive, to {@code 381 * limit}, exclusive. 
382 * 383 * <p>This is a convenience method, equivalent to {@code 384 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 385 */ isValidUtf8(byte[] bytes, int index, int limit)386 final boolean isValidUtf8(byte[] bytes, int index, int limit) { 387 return partialIsValidUtf8(COMPLETE, bytes, index, limit) == COMPLETE; 388 } 389 390 /** 391 * Tells whether the given byte array slice is a well-formed, 392 * malformed, or incomplete UTF-8 byte sequence. The range of bytes 393 * to be checked extends from index {@code index}, inclusive, to 394 * {@code limit}, exclusive. 395 * 396 * @param state either {@link Utf8#COMPLETE} (if this is the initial decoding 397 * operation) or the value returned from a call to a partial decoding method 398 * for the previous bytes 399 * 400 * @return {@link #MALFORMED} if the partial byte sequence is 401 * definitely not well-formed, {@link #COMPLETE} if it is well-formed 402 * (no additional input needed), or if the byte sequence is 403 * "incomplete", i.e. apparently terminated in the middle of a character, 404 * an opaque integer "state" value containing enough information to 405 * decode the character when passed to a subsequent invocation of a 406 * partial decoding method. 407 */ partialIsValidUtf8(int state, byte[] bytes, int index, int limit)408 abstract int partialIsValidUtf8(int state, byte[] bytes, int index, int limit); 409 410 /** 411 * Returns {@code true} if the given portion of the {@link ByteBuffer} is a 412 * well-formed UTF-8 byte sequence. The range of bytes to be 413 * checked extends from index {@code index}, inclusive, to {@code 414 * limit}, exclusive. 415 * 416 * <p>This is a convenience method, equivalent to {@code 417 * partialIsValidUtf8(bytes, index, limit) == Utf8.COMPLETE}. 
418 */ isValidUtf8(ByteBuffer buffer, int index, int limit)419 final boolean isValidUtf8(ByteBuffer buffer, int index, int limit) { 420 return partialIsValidUtf8(COMPLETE, buffer, index, limit) == COMPLETE; 421 } 422 423 /** 424 * Indicates whether or not the given buffer contains a valid UTF-8 string. 425 * 426 * @param buffer the buffer to check. 427 * @return {@code true} if the given buffer contains a valid UTF-8 string. 428 */ partialIsValidUtf8( final int state, final ByteBuffer buffer, int index, final int limit)429 final int partialIsValidUtf8( 430 final int state, final ByteBuffer buffer, int index, final int limit) { 431 if (buffer.hasArray()) { 432 final int offset = buffer.arrayOffset(); 433 return partialIsValidUtf8(state, buffer.array(), offset + index, offset + limit); 434 } else if (buffer.isDirect()){ 435 return partialIsValidUtf8Direct(state, buffer, index, limit); 436 } 437 return partialIsValidUtf8Default(state, buffer, index, limit); 438 } 439 440 /** 441 * Performs validation for direct {@link ByteBuffer} instances. 442 */ partialIsValidUtf8Direct( final int state, final ByteBuffer buffer, int index, final int limit)443 abstract int partialIsValidUtf8Direct( 444 final int state, final ByteBuffer buffer, int index, final int limit); 445 446 /** 447 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 448 * than potentially faster approaches. This first completes validation for the current 449 * character (provided by {@code state}) and then finishes validation for the sequence. 450 */ partialIsValidUtf8Default( final int state, final ByteBuffer buffer, int index, final int limit)451 final int partialIsValidUtf8Default( 452 final int state, final ByteBuffer buffer, int index, final int limit) { 453 if (state != COMPLETE) { 454 // The previous decoding operation was incomplete (or malformed). 
455 // We look for a well-formed sequence consisting of bytes from 456 // the previous decoding operation (stored in state) together 457 // with bytes from the array slice. 458 // 459 // We expect such "straddler characters" to be rare. 460 461 if (index >= limit) { // No bytes? No progress. 462 return state; 463 } 464 465 byte byte1 = (byte) state; 466 // byte1 is never ASCII. 467 if (byte1 < (byte) 0xE0) { 468 // two-byte form 469 470 // Simultaneously checks for illegal trailing-byte in 471 // leading position and overlong 2-byte form. 472 if (byte1 < (byte) 0xC2 473 // byte2 trailing-byte test 474 || buffer.get(index++) > (byte) 0xBF) { 475 return MALFORMED; 476 } 477 } else if (byte1 < (byte) 0xF0) { 478 // three-byte form 479 480 // Get byte2 from saved state or array 481 byte byte2 = (byte) ~(state >> 8); 482 if (byte2 == 0) { 483 byte2 = buffer.get(index++); 484 if (index >= limit) { 485 return incompleteStateFor(byte1, byte2); 486 } 487 } 488 if (byte2 > (byte) 0xBF 489 // overlong? 5 most significant bits must not all be zero 490 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 491 // illegal surrogate codepoint? 492 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 493 // byte3 trailing-byte test 494 || buffer.get(index++) > (byte) 0xBF) { 495 return MALFORMED; 496 } 497 } else { 498 // four-byte form 499 500 // Get byte2 and byte3 from saved state or array 501 byte byte2 = (byte) ~(state >> 8); 502 byte byte3 = 0; 503 if (byte2 == 0) { 504 byte2 = buffer.get(index++); 505 if (index >= limit) { 506 return incompleteStateFor(byte1, byte2); 507 } 508 } else { 509 byte3 = (byte) (state >> 16); 510 } 511 if (byte3 == 0) { 512 byte3 = buffer.get(index++); 513 if (index >= limit) { 514 return incompleteStateFor(byte1, byte2, byte3); 515 } 516 } 517 518 // If we were called with state == MALFORMED, then byte1 is 0xFF, 519 // which never occurs in well-formed UTF-8, and so we will return 520 // MALFORMED again below. 
521 522 if (byte2 > (byte) 0xBF 523 // Check that 1 <= plane <= 16. Tricky optimized form of: 524 // if (byte1 > (byte) 0xF4 || 525 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 526 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 527 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 528 // byte3 trailing-byte test 529 || byte3 > (byte) 0xBF 530 // byte4 trailing-byte test 531 || buffer.get(index++) > (byte) 0xBF) { 532 return MALFORMED; 533 } 534 } 535 } 536 537 // Finish validation for the sequence. 538 return partialIsValidUtf8(buffer, index, limit); 539 } 540 541 /** 542 * Performs validation for {@link ByteBuffer} instances using the {@link ByteBuffer} API rather 543 * than potentially faster approaches. 544 */ partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit)545 private static int partialIsValidUtf8(final ByteBuffer buffer, int index, final int limit) { 546 index += estimateConsecutiveAscii(buffer, index, limit); 547 548 for (;;) { 549 // Optimize for interior runs of ASCII bytes. 550 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 551 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 552 int byte1; 553 do { 554 if (index >= limit) { 555 return COMPLETE; 556 } 557 } while ((byte1 = buffer.get(index++)) >= 0); 558 559 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 560 if (byte1 < (byte) 0xE0) { 561 // Two-byte form (110xxxxx 10xxxxxx) 562 if (index >= limit) { 563 // Incomplete sequence 564 return byte1; 565 } 566 567 // Simultaneously checks for illegal trailing-byte in 568 // leading position and overlong 2-byte form. 
569 if (byte1 < (byte) 0xC2 || buffer.get(index) > (byte) 0xBF) { 570 return MALFORMED; 571 } 572 index++; 573 } else if (byte1 < (byte) 0xF0) { 574 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 575 if (index >= limit - 1) { 576 // Incomplete sequence 577 return incompleteStateFor(buffer, byte1, index, limit - index); 578 } 579 580 final byte byte2 = buffer.get(index++); 581 if (byte2 > (byte) 0xBF 582 // overlong? 5 most significant bits must not all be zero 583 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 584 // check for illegal surrogate codepoints 585 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 586 // byte3 trailing-byte test 587 || buffer.get(index) > (byte) 0xBF) { 588 return MALFORMED; 589 } 590 index++; 591 } else { 592 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 593 if (index >= limit - 2) { 594 // Incomplete sequence 595 return incompleteStateFor(buffer, byte1, index, limit - index); 596 } 597 598 // TODO(nathanmittler): Consider using getInt() to improve performance. 599 final int byte2 = buffer.get(index++); 600 if (byte2 > (byte) 0xBF 601 // Check that 1 <= plane <= 16. Tricky optimized form of: 602 // if (byte1 > (byte) 0xF4 || 603 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 604 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 605 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 606 // byte3 trailing-byte test 607 || buffer.get(index++) > (byte) 0xBF 608 // byte4 trailing-byte test 609 || buffer.get(index++) > (byte) 0xBF) { 610 return MALFORMED; 611 } 612 } 613 } 614 } 615 616 /** 617 * Encodes an input character sequence ({@code in}) to UTF-8 in the target array ({@code out}). 618 * For a string, this method is similar to 619 * <pre>{@code 620 * byte[] a = string.getBytes(UTF_8); 621 * System.arraycopy(a, 0, bytes, offset, a.length); 622 * return offset + a.length; 623 * }</pre> 624 * 625 * but is more efficient in both time and space. 
One key difference is that this method 626 * requires paired surrogates, and therefore does not support chunking. 627 * While {@code String.getBytes(UTF_8)} replaces unpaired surrogates with the default 628 * replacement character, this method throws {@link UnpairedSurrogateException}. 629 * 630 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 631 * compute the exact amount needed, or leave room for 632 * {@code Utf8.MAX_BYTES_PER_CHAR * sequence.length()}, which is the largest possible number 633 * of bytes that any input can be encoded to. 634 * 635 * @param in the input character sequence to be encoded 636 * @param out the target array 637 * @param offset the starting offset in {@code bytes} to start writing at 638 * @param length the length of the {@code bytes}, starting from {@code offset} 639 * @throws UnpairedSurrogateException if {@code sequence} contains ill-formed UTF-16 (unpaired 640 * surrogates) 641 * @throws ArrayIndexOutOfBoundsException if {@code sequence} encoded in UTF-8 is longer than 642 * {@code bytes.length - offset} 643 * @return the new offset, equivalent to {@code offset + Utf8.encodedLength(sequence)} 644 */ encodeUtf8(CharSequence in, byte[] out, int offset, int length)645 abstract int encodeUtf8(CharSequence in, byte[] out, int offset, int length); 646 647 /** 648 * Encodes an input character sequence ({@code in}) to UTF-8 in the target buffer ({@code out}). 649 * Upon returning from this method, the {@code out} position will point to the position after 650 * the last encoded byte. This method requires paired surrogates, and therefore does not 651 * support chunking. 652 * 653 * <p>To ensure sufficient space in the output buffer, either call {@link #encodedLength} to 654 * compute the exact amount needed, or leave room for 655 * {@code Utf8.MAX_BYTES_PER_CHAR * in.length()}, which is the largest possible number 656 * of bytes that any input can be encoded to. 
657 * 658 * @param in the source character sequence to be encoded 659 * @param out the target buffer 660 * @throws UnpairedSurrogateException if {@code in} contains ill-formed UTF-16 (unpaired 661 * surrogates) 662 * @throws ArrayIndexOutOfBoundsException if {@code in} encoded in UTF-8 is longer than 663 * {@code out.remaining()} 664 */ encodeUtf8(CharSequence in, ByteBuffer out)665 final void encodeUtf8(CharSequence in, ByteBuffer out) { 666 if (out.hasArray()) { 667 final int offset = out.arrayOffset(); 668 int endIndex = 669 Utf8.encode(in, out.array(), offset + out.position(), out.remaining()); 670 out.position(endIndex - offset); 671 } else if (out.isDirect()) { 672 encodeUtf8Direct(in, out); 673 } else { 674 encodeUtf8Default(in, out); 675 } 676 } 677 678 /** 679 * Encodes the input character sequence to a direct {@link ByteBuffer} instance. 680 */ encodeUtf8Direct(CharSequence in, ByteBuffer out)681 abstract void encodeUtf8Direct(CharSequence in, ByteBuffer out); 682 683 /** 684 * Encodes the input character sequence to a {@link ByteBuffer} instance using the {@link 685 * ByteBuffer} API, rather than potentially faster approaches. 686 */ encodeUtf8Default(CharSequence in, ByteBuffer out)687 final void encodeUtf8Default(CharSequence in, ByteBuffer out) { 688 final int inLength = in.length(); 689 int outIx = out.position(); 690 int inIx = 0; 691 692 // Since ByteBuffer.putXXX() already checks boundaries for us, no need to explicitly check 693 // access. Assume the buffer is big enough and let it handle the out of bounds exception 694 // if it occurs. 695 try { 696 // Designed to take advantage of 697 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 698 for (char c; inIx < inLength && (c = in.charAt(inIx)) < 0x80; ++inIx) { 699 out.put(outIx + inIx, (byte) c); 700 } 701 if (inIx == inLength) { 702 // Successfully encoded the entire string. 
703 out.position(outIx + inIx); 704 return; 705 } 706 707 outIx += inIx; 708 for (char c; inIx < inLength; ++inIx, ++outIx) { 709 c = in.charAt(inIx); 710 if (c < 0x80) { 711 // One byte (0xxx xxxx) 712 out.put(outIx, (byte) c); 713 } else if (c < 0x800) { 714 // Two bytes (110x xxxx 10xx xxxx) 715 716 // Benchmarks show put performs better than putShort here (for HotSpot). 717 out.put(outIx++, (byte) (0xC0 | (c >>> 6))); 718 out.put(outIx, (byte) (0x80 | (0x3F & c))); 719 } else if (c < MIN_SURROGATE || MAX_SURROGATE < c) { 720 // Three bytes (1110 xxxx 10xx xxxx 10xx xxxx) 721 // Maximum single-char code point is 0xFFFF, 16 bits. 722 723 // Benchmarks show put performs better than putShort here (for HotSpot). 724 out.put(outIx++, (byte) (0xE0 | (c >>> 12))); 725 out.put(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 726 out.put(outIx, (byte) (0x80 | (0x3F & c))); 727 } else { 728 // Four bytes (1111 xxxx 10xx xxxx 10xx xxxx 10xx xxxx) 729 730 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 731 // bytes 732 final char low; 733 if (inIx + 1 == inLength || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 734 throw new UnpairedSurrogateException(inIx, inLength); 735 } 736 // TODO(nathanmittler): Consider using putInt() to improve performance. 737 int codePoint = toCodePoint(c, low); 738 out.put(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 739 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 740 out.put(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 741 out.put(outIx, (byte) (0x80 | (0x3F & codePoint))); 742 } 743 } 744 745 // Successfully encoded the entire string. 746 out.position(outIx); 747 } catch (IndexOutOfBoundsException e) { 748 // TODO(nathanmittler): Consider making the API throw IndexOutOfBoundsException instead. 749 750 // If we failed in the outer ASCII loop, outIx will not have been updated. In this case, 751 // use inIx to determine the bad write index. 
752 int badWriteIndex = out.position() + Math.max(inIx, outIx - out.position() + 1); 753 throw new ArrayIndexOutOfBoundsException( 754 "Failed writing " + in.charAt(inIx) + " at index " + badWriteIndex); 755 } 756 } 757 } 758 759 /** 760 * {@link Processor} implementation that does not use any {@code sun.misc.Unsafe} methods. 761 */ 762 static final class SafeProcessor extends Processor { 763 @Override partialIsValidUtf8(int state, byte[] bytes, int index, int limit)764 int partialIsValidUtf8(int state, byte[] bytes, int index, int limit) { 765 if (state != COMPLETE) { 766 // The previous decoding operation was incomplete (or malformed). 767 // We look for a well-formed sequence consisting of bytes from 768 // the previous decoding operation (stored in state) together 769 // with bytes from the array slice. 770 // 771 // We expect such "straddler characters" to be rare. 772 773 if (index >= limit) { // No bytes? No progress. 774 return state; 775 } 776 int byte1 = (byte) state; 777 // byte1 is never ASCII. 778 if (byte1 < (byte) 0xE0) { 779 // two-byte form 780 781 // Simultaneously checks for illegal trailing-byte in 782 // leading position and overlong 2-byte form. 783 if (byte1 < (byte) 0xC2 784 // byte2 trailing-byte test 785 || bytes[index++] > (byte) 0xBF) { 786 return MALFORMED; 787 } 788 } else if (byte1 < (byte) 0xF0) { 789 // three-byte form 790 791 // Get byte2 from saved state or array 792 int byte2 = (byte) ~(state >> 8); 793 if (byte2 == 0) { 794 byte2 = bytes[index++]; 795 if (index >= limit) { 796 return incompleteStateFor(byte1, byte2); 797 } 798 } 799 if (byte2 > (byte) 0xBF 800 // overlong? 5 most significant bits must not all be zero 801 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 802 // illegal surrogate codepoint? 
803 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 804 // byte3 trailing-byte test 805 || bytes[index++] > (byte) 0xBF) { 806 return MALFORMED; 807 } 808 } else { 809 // four-byte form 810 811 // Get byte2 and byte3 from saved state or array 812 int byte2 = (byte) ~(state >> 8); 813 int byte3 = 0; 814 if (byte2 == 0) { 815 byte2 = bytes[index++]; 816 if (index >= limit) { 817 return incompleteStateFor(byte1, byte2); 818 } 819 } else { 820 byte3 = (byte) (state >> 16); 821 } 822 if (byte3 == 0) { 823 byte3 = bytes[index++]; 824 if (index >= limit) { 825 return incompleteStateFor(byte1, byte2, byte3); 826 } 827 } 828 829 // If we were called with state == MALFORMED, then byte1 is 0xFF, 830 // which never occurs in well-formed UTF-8, and so we will return 831 // MALFORMED again below. 832 833 if (byte2 > (byte) 0xBF 834 // Check that 1 <= plane <= 16. Tricky optimized form of: 835 // if (byte1 > (byte) 0xF4 || 836 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 837 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 838 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 839 // byte3 trailing-byte test 840 || byte3 > (byte) 0xBF 841 // byte4 trailing-byte test 842 || bytes[index++] > (byte) 0xBF) { 843 return MALFORMED; 844 } 845 } 846 } 847 848 return partialIsValidUtf8(bytes, index, limit); 849 } 850 851 @Override partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit)852 int partialIsValidUtf8Direct(int state, ByteBuffer buffer, int index, int limit) { 853 // For safe processing, we have to use the ByteBuffer API. 
854 return partialIsValidUtf8Default(state, buffer, index, limit); 855 } 856 857 @Override encodeUtf8(CharSequence in, byte[] out, int offset, int length)858 int encodeUtf8(CharSequence in, byte[] out, int offset, int length) { 859 int utf16Length = in.length(); 860 int j = offset; 861 int i = 0; 862 int limit = offset + length; 863 // Designed to take advantage of 864 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 865 for (char c; i < utf16Length && i + j < limit && (c = in.charAt(i)) < 0x80; i++) { 866 out[j + i] = (byte) c; 867 } 868 if (i == utf16Length) { 869 return j + utf16Length; 870 } 871 j += i; 872 for (char c; i < utf16Length; i++) { 873 c = in.charAt(i); 874 if (c < 0x80 && j < limit) { 875 out[j++] = (byte) c; 876 } else if (c < 0x800 && j <= limit - 2) { // 11 bits, two UTF-8 bytes 877 out[j++] = (byte) ((0xF << 6) | (c >>> 6)); 878 out[j++] = (byte) (0x80 | (0x3F & c)); 879 } else if ((c < Character.MIN_SURROGATE || Character.MAX_SURROGATE < c) && j <= limit - 3) { 880 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 881 out[j++] = (byte) ((0xF << 5) | (c >>> 12)); 882 out[j++] = (byte) (0x80 | (0x3F & (c >>> 6))); 883 out[j++] = (byte) (0x80 | (0x3F & c)); 884 } else if (j <= limit - 4) { 885 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, 886 // four UTF-8 bytes 887 final char low; 888 if (i + 1 == in.length() 889 || !Character.isSurrogatePair(c, (low = in.charAt(++i)))) { 890 throw new UnpairedSurrogateException((i - 1), utf16Length); 891 } 892 int codePoint = Character.toCodePoint(c, low); 893 out[j++] = (byte) ((0xF << 4) | (codePoint >>> 18)); 894 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 12))); 895 out[j++] = (byte) (0x80 | (0x3F & (codePoint >>> 6))); 896 out[j++] = (byte) (0x80 | (0x3F & codePoint)); 897 } else { 898 // If we are surrogates and we're not a surrogate pair, always throw an 899 // UnpairedSurrogateException instead of an 
ArrayOutOfBoundsException. 900 if ((Character.MIN_SURROGATE <= c && c <= Character.MAX_SURROGATE) 901 && (i + 1 == in.length() 902 || !Character.isSurrogatePair(c, in.charAt(i + 1)))) { 903 throw new UnpairedSurrogateException(i, utf16Length); 904 } 905 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + j); 906 } 907 } 908 return j; 909 } 910 911 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)912 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 913 // For safe processing, we have to use the ByteBuffer API. 914 encodeUtf8Default(in, out); 915 } 916 partialIsValidUtf8(byte[] bytes, int index, int limit)917 private static int partialIsValidUtf8(byte[] bytes, int index, int limit) { 918 // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). 919 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 920 while (index < limit && bytes[index] >= 0) { 921 index++; 922 } 923 924 return (index >= limit) ? COMPLETE : partialIsValidUtf8NonAscii(bytes, index, limit); 925 } 926 partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit)927 private static int partialIsValidUtf8NonAscii(byte[] bytes, int index, int limit) { 928 for (;;) { 929 int byte1, byte2; 930 931 // Optimize for interior runs of ASCII bytes. 932 do { 933 if (index >= limit) { 934 return COMPLETE; 935 } 936 } while ((byte1 = bytes[index++]) >= 0); 937 938 if (byte1 < (byte) 0xE0) { 939 // two-byte form 940 941 if (index >= limit) { 942 // Incomplete sequence 943 return byte1; 944 } 945 946 // Simultaneously checks for illegal trailing-byte in 947 // leading position and overlong 2-byte form. 
948 if (byte1 < (byte) 0xC2 949 || bytes[index++] > (byte) 0xBF) { 950 return MALFORMED; 951 } 952 } else if (byte1 < (byte) 0xF0) { 953 // three-byte form 954 955 if (index >= limit - 1) { // incomplete sequence 956 return incompleteStateFor(bytes, index, limit); 957 } 958 if ((byte2 = bytes[index++]) > (byte) 0xBF 959 // overlong? 5 most significant bits must not all be zero 960 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 961 // check for illegal surrogate codepoints 962 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 963 // byte3 trailing-byte test 964 || bytes[index++] > (byte) 0xBF) { 965 return MALFORMED; 966 } 967 } else { 968 // four-byte form 969 970 if (index >= limit - 2) { // incomplete sequence 971 return incompleteStateFor(bytes, index, limit); 972 } 973 if ((byte2 = bytes[index++]) > (byte) 0xBF 974 // Check that 1 <= plane <= 16. Tricky optimized form of: 975 // if (byte1 > (byte) 0xF4 || 976 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 977 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 978 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 979 // byte3 trailing-byte test 980 || bytes[index++] > (byte) 0xBF 981 // byte4 trailing-byte test 982 || bytes[index++] > (byte) 0xBF) { 983 return MALFORMED; 984 } 985 } 986 } 987 } 988 } 989 990 /** 991 * {@link Processor} that uses {@code sun.misc.Unsafe} where possible to improve performance. 992 */ 993 static final class UnsafeProcessor extends Processor { 994 private static final sun.misc.Unsafe UNSAFE = getUnsafe(); 995 private static final long BUFFER_ADDRESS_OFFSET = 996 fieldOffset(field(Buffer.class, "address")); 997 private static final int ARRAY_BASE_OFFSET = byteArrayBaseOffset(); 998 999 /** 1000 * We only use Unsafe operations if we have access to direct {@link ByteBuffer}'s address 1001 * and the array base offset is a multiple of 8 (needed by Unsafe.getLong()). 
1002 */ 1003 private static final boolean AVAILABLE = 1004 BUFFER_ADDRESS_OFFSET != -1 && ARRAY_BASE_OFFSET % 8 == 0; 1005 1006 /** 1007 * Indicates whether or not all required unsafe operations are supported on this platform. 1008 */ isAvailable()1009 static boolean isAvailable() { 1010 return AVAILABLE; 1011 } 1012 1013 @Override partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit)1014 int partialIsValidUtf8(int state, byte[] bytes, final int index, final int limit) { 1015 if ((index | limit | bytes.length - limit) < 0) { 1016 throw new ArrayIndexOutOfBoundsException( 1017 String.format("Array length=%d, index=%d, limit=%d", bytes.length, index, limit)); 1018 } 1019 long offset = ARRAY_BASE_OFFSET + index; 1020 final long offsetLimit = ARRAY_BASE_OFFSET + limit; 1021 if (state != COMPLETE) { 1022 // The previous decoding operation was incomplete (or malformed). 1023 // We look for a well-formed sequence consisting of bytes from 1024 // the previous decoding operation (stored in state) together 1025 // with bytes from the array slice. 1026 // 1027 // We expect such "straddler characters" to be rare. 1028 1029 if (offset >= offsetLimit) { // No bytes? No progress. 1030 return state; 1031 } 1032 int byte1 = (byte) state; 1033 // byte1 is never ASCII. 1034 if (byte1 < (byte) 0xE0) { 1035 // two-byte form 1036 1037 // Simultaneously checks for illegal trailing-byte in 1038 // leading position and overlong 2-byte form. 1039 if (byte1 < (byte) 0xC2 1040 // byte2 trailing-byte test 1041 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1042 return MALFORMED; 1043 } 1044 } else if (byte1 < (byte) 0xF0) { 1045 // three-byte form 1046 1047 // Get byte2 from saved state or array 1048 int byte2 = (byte) ~(state >> 8); 1049 if (byte2 == 0) { 1050 byte2 = UNSAFE.getByte(bytes, offset++); 1051 if (offset >= offsetLimit) { 1052 return incompleteStateFor(byte1, byte2); 1053 } 1054 } 1055 if (byte2 > (byte) 0xBF 1056 // overlong? 
5 most significant bits must not all be zero 1057 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1058 // illegal surrogate codepoint? 1059 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1060 // byte3 trailing-byte test 1061 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1062 return MALFORMED; 1063 } 1064 } else { 1065 // four-byte form 1066 1067 // Get byte2 and byte3 from saved state or array 1068 int byte2 = (byte) ~(state >> 8); 1069 int byte3 = 0; 1070 if (byte2 == 0) { 1071 byte2 = UNSAFE.getByte(bytes, offset++); 1072 if (offset >= offsetLimit) { 1073 return incompleteStateFor(byte1, byte2); 1074 } 1075 } else { 1076 byte3 = (byte) (state >> 16); 1077 } 1078 if (byte3 == 0) { 1079 byte3 = UNSAFE.getByte(bytes, offset++); 1080 if (offset >= offsetLimit) { 1081 return incompleteStateFor(byte1, byte2, byte3); 1082 } 1083 } 1084 1085 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1086 // which never occurs in well-formed UTF-8, and so we will return 1087 // MALFORMED again below. 1088 1089 if (byte2 > (byte) 0xBF 1090 // Check that 1 <= plane <= 16. 
Tricky optimized form of: 1091 // if (byte1 > (byte) 0xF4 || 1092 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1093 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1094 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1095 // byte3 trailing-byte test 1096 || byte3 > (byte) 0xBF 1097 // byte4 trailing-byte test 1098 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1099 return MALFORMED; 1100 } 1101 } 1102 } 1103 1104 return partialIsValidUtf8(bytes, offset, (int) (offsetLimit - offset)); 1105 } 1106 1107 @Override partialIsValidUtf8Direct( final int state, ByteBuffer buffer, final int index, final int limit)1108 int partialIsValidUtf8Direct( 1109 final int state, ByteBuffer buffer, final int index, final int limit) { 1110 if ((index | limit | buffer.limit() - limit) < 0) { 1111 throw new ArrayIndexOutOfBoundsException( 1112 String.format("buffer limit=%d, index=%d, limit=%d", buffer.limit(), index, limit)); 1113 } 1114 long address = addressOffset(buffer) + index; 1115 final long addressLimit = address + (limit - index); 1116 if (state != COMPLETE) { 1117 // The previous decoding operation was incomplete (or malformed). 1118 // We look for a well-formed sequence consisting of bytes from 1119 // the previous decoding operation (stored in state) together 1120 // with bytes from the array slice. 1121 // 1122 // We expect such "straddler characters" to be rare. 1123 1124 if (address >= addressLimit) { // No bytes? No progress. 1125 return state; 1126 } 1127 1128 final int byte1 = (byte) state; 1129 // byte1 is never ASCII. 1130 if (byte1 < (byte) 0xE0) { 1131 // two-byte form 1132 1133 // Simultaneously checks for illegal trailing-byte in 1134 // leading position and overlong 2-byte form. 
1135 if (byte1 < (byte) 0xC2 1136 // byte2 trailing-byte test 1137 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1138 return MALFORMED; 1139 } 1140 } else if (byte1 < (byte) 0xF0) { 1141 // three-byte form 1142 1143 // Get byte2 from saved state or array 1144 int byte2 = (byte) ~(state >> 8); 1145 if (byte2 == 0) { 1146 byte2 = UNSAFE.getByte(address++); 1147 if (address >= addressLimit) { 1148 return incompleteStateFor(byte1, byte2); 1149 } 1150 } 1151 if (byte2 > (byte) 0xBF 1152 // overlong? 5 most significant bits must not all be zero 1153 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1154 // illegal surrogate codepoint? 1155 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1156 // byte3 trailing-byte test 1157 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1158 return MALFORMED; 1159 } 1160 } else { 1161 // four-byte form 1162 1163 // Get byte2 and byte3 from saved state or array 1164 int byte2 = (byte) ~(state >> 8); 1165 int byte3 = 0; 1166 if (byte2 == 0) { 1167 byte2 = UNSAFE.getByte(address++); 1168 if (address >= addressLimit) { 1169 return incompleteStateFor(byte1, byte2); 1170 } 1171 } else { 1172 byte3 = (byte) (state >> 16); 1173 } 1174 if (byte3 == 0) { 1175 byte3 = UNSAFE.getByte(address++); 1176 if (address >= addressLimit) { 1177 return incompleteStateFor(byte1, byte2, byte3); 1178 } 1179 } 1180 1181 // If we were called with state == MALFORMED, then byte1 is 0xFF, 1182 // which never occurs in well-formed UTF-8, and so we will return 1183 // MALFORMED again below. 1184 1185 if (byte2 > (byte) 0xBF 1186 // Check that 1 <= plane <= 16. 
Tricky optimized form of: 1187 // if (byte1 > (byte) 0xF4 || 1188 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1189 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1190 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1191 // byte3 trailing-byte test 1192 || byte3 > (byte) 0xBF 1193 // byte4 trailing-byte test 1194 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1195 return MALFORMED; 1196 } 1197 } 1198 } 1199 1200 return partialIsValidUtf8(address, (int) (addressLimit - address)); 1201 } 1202 1203 @Override encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length)1204 int encodeUtf8(final CharSequence in, final byte[] out, final int offset, final int length) { 1205 long outIx = ARRAY_BASE_OFFSET + offset; 1206 final long outLimit = outIx + length; 1207 final int inLimit = in.length(); 1208 if (inLimit > length || out.length - length < offset) { 1209 // Not even enough room for an ASCII-encoded string. 1210 throw new ArrayIndexOutOfBoundsException( 1211 "Failed writing " + in.charAt(inLimit - 1) + " at index " + (offset + length)); 1212 } 1213 1214 // Designed to take advantage of 1215 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1216 int inIx = 0; 1217 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1218 UNSAFE.putByte(out, outIx++, (byte) c); 1219 } 1220 if (inIx == inLimit) { 1221 // We're done, it was ASCII encoded. 
1222 return (int) (outIx - ARRAY_BASE_OFFSET); 1223 } 1224 1225 for (char c; inIx < inLimit; ++inIx) { 1226 c = in.charAt(inIx); 1227 if (c < 0x80 && outIx < outLimit) { 1228 UNSAFE.putByte(out, outIx++, (byte) c); 1229 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1230 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1231 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1232 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1233 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1234 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1235 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1236 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & c))); 1237 } else if (outIx <= outLimit - 4L) { 1238 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1239 // bytes 1240 final char low; 1241 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1242 throw new UnpairedSurrogateException((inIx - 1), inLimit); 1243 } 1244 int codePoint = toCodePoint(c, low); 1245 UNSAFE.putByte(out, outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1246 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1247 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1248 UNSAFE.putByte(out, outIx++, (byte) (0x80 | (0x3F & codePoint))); 1249 } else { 1250 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1251 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1252 // We are surrogates and we're not a surrogate pair. 1253 throw new UnpairedSurrogateException(inIx, inLimit); 1254 } 1255 // Not enough space in the output buffer. 1256 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1257 } 1258 } 1259 1260 // All bytes have been encoded. 
1261 return (int) (outIx - ARRAY_BASE_OFFSET); 1262 } 1263 1264 @Override encodeUtf8Direct(CharSequence in, ByteBuffer out)1265 void encodeUtf8Direct(CharSequence in, ByteBuffer out) { 1266 final long address = addressOffset(out); 1267 long outIx = address + out.position(); 1268 final long outLimit = address + out.limit(); 1269 final int inLimit = in.length(); 1270 if (inLimit > outLimit - outIx) { 1271 // Not even enough room for an ASCII-encoded string. 1272 throw new ArrayIndexOutOfBoundsException( 1273 "Failed writing " + in.charAt(inLimit - 1) + " at index " + out.limit()); 1274 } 1275 1276 // Designed to take advantage of 1277 // https://wikis.oracle.com/display/HotSpotInternals/RangeCheckElimination 1278 int inIx = 0; 1279 for (char c; inIx < inLimit && (c = in.charAt(inIx)) < 0x80; ++inIx) { 1280 UNSAFE.putByte(outIx++, (byte) c); 1281 } 1282 if (inIx == inLimit) { 1283 // We're done, it was ASCII encoded. 1284 out.position((int) (outIx - address)); 1285 return; 1286 } 1287 1288 for (char c; inIx < inLimit; ++inIx) { 1289 c = in.charAt(inIx); 1290 if (c < 0x80 && outIx < outLimit) { 1291 UNSAFE.putByte(outIx++, (byte) c); 1292 } else if (c < 0x800 && outIx <= outLimit - 2L) { // 11 bits, two UTF-8 bytes 1293 UNSAFE.putByte(outIx++, (byte) ((0xF << 6) | (c >>> 6))); 1294 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1295 } else if ((c < MIN_SURROGATE || MAX_SURROGATE < c) && outIx <= outLimit - 3L) { 1296 // Maximum single-char code point is 0xFFFF, 16 bits, three UTF-8 bytes 1297 UNSAFE.putByte(outIx++, (byte) ((0xF << 5) | (c >>> 12))); 1298 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (c >>> 6)))); 1299 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & c))); 1300 } else if (outIx <= outLimit - 4L) { 1301 // Minimum code point represented by a surrogate pair is 0x10000, 17 bits, four UTF-8 1302 // bytes 1303 final char low; 1304 if (inIx + 1 == inLimit || !isSurrogatePair(c, (low = in.charAt(++inIx)))) { 1305 throw new 
UnpairedSurrogateException((inIx - 1), inLimit); 1306 } 1307 int codePoint = toCodePoint(c, low); 1308 UNSAFE.putByte(outIx++, (byte) ((0xF << 4) | (codePoint >>> 18))); 1309 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); 1310 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); 1311 UNSAFE.putByte(outIx++, (byte) (0x80 | (0x3F & codePoint))); 1312 } else { 1313 if ((MIN_SURROGATE <= c && c <= MAX_SURROGATE) 1314 && (inIx + 1 == inLimit || !isSurrogatePair(c, in.charAt(inIx + 1)))) { 1315 // We are surrogates and we're not a surrogate pair. 1316 throw new UnpairedSurrogateException(inIx, inLimit); 1317 } 1318 // Not enough space in the output buffer. 1319 throw new ArrayIndexOutOfBoundsException("Failed writing " + c + " at index " + outIx); 1320 } 1321 } 1322 1323 // All bytes have been encoded. 1324 out.position((int) (outIx - address)); 1325 } 1326 1327 /** 1328 * Counts (approximately) the number of consecutive ASCII characters starting from the given 1329 * position, using the most efficient method available to the platform. 1330 * 1331 * @param bytes the array containing the character sequence 1332 * @param offset the offset position of the index (same as index + arrayBaseOffset) 1333 * @param maxChars the maximum number of characters to count 1334 * @return the number of ASCII characters found. The stopping position will be at or 1335 * before the first non-ASCII byte. 1336 */ unsafeEstimateConsecutiveAscii( byte[] bytes, long offset, final int maxChars)1337 private static int unsafeEstimateConsecutiveAscii( 1338 byte[] bytes, long offset, final int maxChars) { 1339 int remaining = maxChars; 1340 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1341 // Don't bother with small strings. 1342 return 0; 1343 } 1344 1345 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 
1346 // Byte arrays are already either 8 or 16-byte aligned, so we just need to make sure that 1347 // the index (relative to the start of the array) is also 8-byte aligned. We do this by 1348 // ANDing the index with 7 to determine the number of bytes that need to be read before 1349 // we're 8-byte aligned. 1350 final int unaligned = (int) offset & 7; 1351 for (int j = unaligned; j > 0; j--) { 1352 if (UNSAFE.getByte(bytes, offset++) < 0) { 1353 return unaligned - j; 1354 } 1355 } 1356 1357 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 1358 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1359 // determine if any byte in the current long is non-ASCII. 1360 remaining -= unaligned; 1361 for (; remaining >= 8 && (UNSAFE.getLong(bytes, offset) & ASCII_MASK_LONG) == 0; 1362 offset += 8, remaining -= 8) {} 1363 return maxChars - remaining; 1364 } 1365 1366 /** 1367 * Same as {@link Utf8#estimateConsecutiveAscii(ByteBuffer, int, int)} except that it uses the 1368 * most efficient method available to the platform. 1369 */ unsafeEstimateConsecutiveAscii(long address, final int maxChars)1370 private static int unsafeEstimateConsecutiveAscii(long address, final int maxChars) { 1371 int remaining = maxChars; 1372 if (remaining < UNSAFE_COUNT_ASCII_THRESHOLD) { 1373 // Don't bother with small strings. 1374 return 0; 1375 } 1376 1377 // Read bytes until 8-byte aligned so that we can read longs in the loop below. 1378 // We do this by ANDing the address with 7 to determine the number of bytes that need to 1379 // be read before we're 8-byte aligned. 1380 final int unaligned = (int) address & 7; 1381 for (int j = unaligned; j > 0; j--) { 1382 if (UNSAFE.getByte(address++) < 0) { 1383 return unaligned - j; 1384 } 1385 } 1386 1387 // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). 
1388 // To speed things up further, we're reading longs instead of bytes so we use a mask to 1389 // determine if any byte in the current long is non-ASCII. 1390 remaining -= unaligned; 1391 for (; remaining >= 8 && (UNSAFE.getLong(address) & ASCII_MASK_LONG) == 0; 1392 address += 8, remaining -= 8) {} 1393 return maxChars - remaining; 1394 } 1395 partialIsValidUtf8(final byte[] bytes, long offset, int remaining)1396 private static int partialIsValidUtf8(final byte[] bytes, long offset, int remaining) { 1397 // Skip past ASCII characters as quickly as possible. 1398 final int skipped = unsafeEstimateConsecutiveAscii(bytes, offset, remaining); 1399 remaining -= skipped; 1400 offset += skipped; 1401 1402 for (;;) { 1403 // Optimize for interior runs of ASCII bytes. 1404 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1405 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 1406 int byte1 = 0; 1407 for (; remaining > 0 && (byte1 = UNSAFE.getByte(bytes, offset++)) >= 0; --remaining) { 1408 } 1409 if (remaining == 0) { 1410 return COMPLETE; 1411 } 1412 remaining--; 1413 1414 // If we're here byte1 is not ASCII. Only need to handle 2-4 byte forms. 1415 if (byte1 < (byte) 0xE0) { 1416 // Two-byte form (110xxxxx 10xxxxxx) 1417 if (remaining == 0) { 1418 // Incomplete sequence 1419 return byte1; 1420 } 1421 remaining--; 1422 1423 // Simultaneously checks for illegal trailing-byte in 1424 // leading position and overlong 2-byte form. 1425 if (byte1 < (byte) 0xC2 1426 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1427 return MALFORMED; 1428 } 1429 } else if (byte1 < (byte) 0xF0) { 1430 // Three-byte form (1110xxxx 10xxxxxx 10xxxxxx) 1431 if (remaining < 2) { 1432 // Incomplete sequence 1433 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1434 } 1435 remaining -= 2; 1436 1437 final int byte2; 1438 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF 1439 // overlong? 
5 most significant bits must not all be zero 1440 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1441 // check for illegal surrogate codepoints 1442 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1443 // byte3 trailing-byte test 1444 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1445 return MALFORMED; 1446 } 1447 } else { 1448 // Four-byte form (1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx) 1449 if (remaining < 3) { 1450 // Incomplete sequence 1451 return unsafeIncompleteStateFor(bytes, byte1, offset, remaining); 1452 } 1453 remaining -= 3; 1454 1455 final int byte2; 1456 if ((byte2 = UNSAFE.getByte(bytes, offset++)) > (byte) 0xBF 1457 // Check that 1 <= plane <= 16. Tricky optimized form of: 1458 // if (byte1 > (byte) 0xF4 || 1459 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1460 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1461 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1462 // byte3 trailing-byte test 1463 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF 1464 // byte4 trailing-byte test 1465 || UNSAFE.getByte(bytes, offset++) > (byte) 0xBF) { 1466 return MALFORMED; 1467 } 1468 } 1469 } 1470 } 1471 partialIsValidUtf8(long address, int remaining)1472 private static int partialIsValidUtf8(long address, int remaining) { 1473 // Skip past ASCII characters as quickly as possible. 1474 final int skipped = unsafeEstimateConsecutiveAscii(address, remaining); 1475 address += skipped; 1476 remaining -= skipped; 1477 1478 for (;;) { 1479 // Optimize for interior runs of ASCII bytes. 1480 // TODO(nathanmittler): Consider checking 8 bytes at a time after some threshold? 1481 // Maybe after seeing a few in a row that are ASCII, go back to fast mode? 
1482 int byte1 = 0; 1483 for (; remaining > 0 && (byte1 = UNSAFE.getByte(address++)) >= 0; --remaining) { 1484 } 1485 if (remaining == 0) { 1486 return COMPLETE; 1487 } 1488 remaining--; 1489 1490 if (byte1 < (byte) 0xE0) { 1491 // Two-byte form 1492 1493 if (remaining == 0) { 1494 // Incomplete sequence 1495 return byte1; 1496 } 1497 remaining--; 1498 1499 // Simultaneously checks for illegal trailing-byte in 1500 // leading position and overlong 2-byte form. 1501 if (byte1 < (byte) 0xC2 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1502 return MALFORMED; 1503 } 1504 } else if (byte1 < (byte) 0xF0) { 1505 // Three-byte form 1506 1507 if (remaining < 2) { 1508 // Incomplete sequence 1509 return unsafeIncompleteStateFor(address, byte1, remaining); 1510 } 1511 remaining -= 2; 1512 1513 final byte byte2 = UNSAFE.getByte(address++); 1514 if (byte2 > (byte) 0xBF 1515 // overlong? 5 most significant bits must not all be zero 1516 || (byte1 == (byte) 0xE0 && byte2 < (byte) 0xA0) 1517 // check for illegal surrogate codepoints 1518 || (byte1 == (byte) 0xED && byte2 >= (byte) 0xA0) 1519 // byte3 trailing-byte test 1520 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1521 return MALFORMED; 1522 } 1523 } else { 1524 // Four-byte form 1525 1526 if (remaining < 3) { 1527 // Incomplete sequence 1528 return unsafeIncompleteStateFor(address, byte1, remaining); 1529 } 1530 remaining -= 3; 1531 1532 final byte byte2 = UNSAFE.getByte(address++); 1533 if (byte2 > (byte) 0xBF 1534 // Check that 1 <= plane <= 16. 
Tricky optimized form of: 1535 // if (byte1 > (byte) 0xF4 || 1536 // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || 1537 // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) 1538 || (((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0 1539 // byte3 trailing-byte test 1540 || UNSAFE.getByte(address++) > (byte) 0xBF 1541 // byte4 trailing-byte test 1542 || UNSAFE.getByte(address++) > (byte) 0xBF) { 1543 return MALFORMED; 1544 } 1545 } 1546 } 1547 } 1548 unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, int remaining)1549 private static int unsafeIncompleteStateFor(byte[] bytes, int byte1, long offset, 1550 int remaining) { 1551 switch (remaining) { 1552 case 0: { 1553 return incompleteStateFor(byte1); 1554 } 1555 case 1: { 1556 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset)); 1557 } 1558 case 2: { 1559 return incompleteStateFor(byte1, UNSAFE.getByte(bytes, offset), 1560 UNSAFE.getByte(bytes, offset + 1)); 1561 } 1562 default: { 1563 throw new AssertionError(); 1564 } 1565 } 1566 } 1567 unsafeIncompleteStateFor(long address, final int byte1, int remaining)1568 private static int unsafeIncompleteStateFor(long address, final int byte1, int remaining) { 1569 switch (remaining) { 1570 case 0: { 1571 return incompleteStateFor(byte1); 1572 } 1573 case 1: { 1574 return incompleteStateFor(byte1, UNSAFE.getByte(address)); 1575 } 1576 case 2: { 1577 return incompleteStateFor(byte1, UNSAFE.getByte(address), UNSAFE.getByte(address + 1)); 1578 } 1579 default: { 1580 throw new AssertionError(); 1581 } 1582 } 1583 } 1584 1585 /** 1586 * Gets the field with the given name within the class, or {@code null} if not found. If 1587 * found, the field is made accessible. 1588 */ field(Class<?> clazz, String fieldName)1589 private static Field field(Class<?> clazz, String fieldName) { 1590 Field field; 1591 try { 1592 field = clazz.getDeclaredField(fieldName); 1593 field.setAccessible(true); 1594 } catch (Throwable t) { 1595 // Failed to access the fields. 
1596 field = null; 1597 } 1598 logger.log(Level.FINEST, "{0}.{1}: {2}", 1599 new Object[] {clazz.getName(), fieldName, (field != null ? "available" : "unavailable")}); 1600 return field; 1601 } 1602 1603 /** 1604 * Returns the offset of the provided field, or {@code -1} if {@code sun.misc.Unsafe} is not 1605 * available. 1606 */ fieldOffset(Field field)1607 private static long fieldOffset(Field field) { 1608 return field == null || UNSAFE == null ? -1 : UNSAFE.objectFieldOffset(field); 1609 } 1610 1611 /** 1612 * Get the base offset for byte arrays, or {@code -1} if {@code sun.misc.Unsafe} is not 1613 * available. 1614 */ byteArrayBaseOffset()1615 private static <T> int byteArrayBaseOffset() { 1616 return UNSAFE == null ? -1 : UNSAFE.arrayBaseOffset(byte[].class); 1617 } 1618 1619 /** 1620 * Gets the offset of the {@code address} field of the given direct {@link ByteBuffer}. 1621 */ addressOffset(ByteBuffer buffer)1622 private static long addressOffset(ByteBuffer buffer) { 1623 return UNSAFE.getLong(buffer, BUFFER_ADDRESS_OFFSET); 1624 } 1625 1626 /** 1627 * Gets the {@code sun.misc.Unsafe} instance, or {@code null} if not available on this 1628 * platform. 1629 */ getUnsafe()1630 private static sun.misc.Unsafe getUnsafe() { 1631 sun.misc.Unsafe unsafe = null; 1632 try { 1633 unsafe = AccessController.doPrivileged(new PrivilegedExceptionAction<sun.misc.Unsafe>() { 1634 @Override 1635 public sun.misc.Unsafe run() throws Exception { 1636 Class<sun.misc.Unsafe> k = sun.misc.Unsafe.class; 1637 1638 // Check that this platform supports all of the required unsafe methods. 1639 checkRequiredMethods(k); 1640 1641 for (Field f : k.getDeclaredFields()) { 1642 f.setAccessible(true); 1643 Object x = f.get(null); 1644 if (k.isInstance(x)) { 1645 return k.cast(x); 1646 } 1647 } 1648 // The sun.misc.Unsafe field does not exist. 
1649 return null; 1650 } 1651 }); 1652 } catch (Throwable e) { 1653 // Catching Throwable here due to the fact that Google AppEngine raises NoClassDefFoundError 1654 // for Unsafe. 1655 } 1656 1657 logger.log(Level.FINEST, "sun.misc.Unsafe: {}", 1658 unsafe != null ? "available" : "unavailable"); 1659 return unsafe; 1660 } 1661 1662 /** 1663 * Verifies that all required methods of {@code sun.misc.Unsafe} are available on this platform. 1664 */ checkRequiredMethods(Class<sun.misc.Unsafe> clazz)1665 private static void checkRequiredMethods(Class<sun.misc.Unsafe> clazz) 1666 throws NoSuchMethodException, SecurityException { 1667 // Needed for Unsafe byte[] access 1668 clazz.getMethod("arrayBaseOffset", Class.class); 1669 clazz.getMethod("getByte", Object.class, long.class); 1670 clazz.getMethod("putByte", Object.class, long.class, byte.class); 1671 clazz.getMethod("getLong", Object.class, long.class); 1672 1673 // Needed for Unsafe Direct ByteBuffer access 1674 clazz.getMethod("objectFieldOffset", Field.class); 1675 clazz.getMethod("getByte", long.class); 1676 clazz.getMethod("getLong", Object.class, long.class); 1677 clazz.getMethod("putByte", long.class, byte.class); 1678 clazz.getMethod("getLong", long.class); 1679 } 1680 } 1681 Utf8()1682 private Utf8() {} 1683 } 1684