1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4  *******************************************************************************
5  * Copyright (C) 2006-2013, International Business Machines Corporation and    *
6  * others. All Rights Reserved.                                                *
7  *******************************************************************************
8  *
9  *******************************************************************************
10  */
11 
12 package com.ibm.icu.charset;
13 
14 import java.nio.BufferOverflowException;
15 import java.nio.ByteBuffer;
16 import java.nio.CharBuffer;
17 import java.nio.IntBuffer;
18 import java.nio.charset.CharsetEncoder;
19 import java.nio.charset.CoderResult;
20 import java.nio.charset.CodingErrorAction;
21 
22 import com.ibm.icu.impl.Assert;
23 import com.ibm.icu.lang.UCharacter;
24 import com.ibm.icu.text.UTF16;
25 
26 /**
 * An abstract class that provides framework methods of encoding operations for concrete
 * subclasses.
29  * In the future this class will contain API that will implement converter semantics of ICU4C.
30  * @stable ICU 3.6
31  */
32 public abstract class CharsetEncoderICU extends CharsetEncoder {
33 
34     /* this is used in fromUnicode DBCS tables as an "unassigned" marker */
35     static final char MISSING_CHAR_MARKER = '\uFFFF';
36 
37     byte[] errorBuffer = new byte[30];
38 
39     int errorBufferLength = 0;
40 
41     /** these are for encodeLoopICU */
42     int fromUnicodeStatus;
43 
44     int fromUChar32;
45 
46     boolean useSubChar1;
47 
48     boolean useFallback;
49 
50     /* maximum number of indexed UChars */
51     static final int EXT_MAX_UCHARS = 19;
52 
53     /* store previous UChars/chars to continue partial matches */
54     int preFromUFirstCP; /* >=0: partial match */
55 
56     char[] preFromUArray = new char[EXT_MAX_UCHARS];
57 
58     int preFromUBegin;
59 
60     int preFromULength; /* negative: replay */
61 
62     char[] invalidUCharBuffer = new char[2];
63 
64     int invalidUCharLength;
65 
66     Object fromUContext;
67 
68     private CharsetCallback.Encoder onUnmappableInput = CharsetCallback.FROM_U_CALLBACK_STOP;
69 
70     private CharsetCallback.Encoder onMalformedInput = CharsetCallback.FROM_U_CALLBACK_STOP;
71 
72     CharsetCallback.Encoder fromCharErrorBehaviour = new CharsetCallback.Encoder() {
73         @Override
74         public CoderResult call(CharsetEncoderICU encoder, Object context,
75                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
76                 char[] buffer, int length, int cp, CoderResult cr) {
77             if (cr.isUnmappable()) {
78                 return onUnmappableInput.call(encoder, context, source, target,
79                         offsets, buffer, length, cp, cr);
80             } else /* if (cr.isMalformed()) */ {
81                 return onMalformedInput.call(encoder, context, source, target,
82                         offsets, buffer, length, cp, cr);
83             }
84             // return CharsetCallback.FROM_U_CALLBACK_STOP.call(encoder, context, source, target, offsets, buffer, length, cp, cr);
85 
86         }
87     };
88 
89     /*
90      * Construcs a new encoder for the given charset
91      *
92      * @param cs
93      *            for which the decoder is created
94      * @param replacement
95      *            the substitution bytes
96      */
CharsetEncoderICU(CharsetICU cs, byte[] replacement)97     CharsetEncoderICU(CharsetICU cs, byte[] replacement) {
98         super(cs, (cs.minBytesPerChar + cs.maxBytesPerChar) / 2,
99                 cs.maxBytesPerChar, replacement);
100     }
101 
102     /**
103      * Is this Encoder allowed to use fallbacks? A fallback mapping is a mapping
104      * that will convert a Unicode codepoint sequence to a byte sequence, but
105      * the encoded byte sequence will round trip convert to a different
106      * Unicode codepoint sequence.
107      * @return true if the converter uses fallback, false otherwise.
108      * @stable ICU 3.8
109      */
isFallbackUsed()110     public boolean isFallbackUsed() {
111         return useFallback;
112     }
113 
114     /**
115      * Sets whether this Encoder can use fallbacks?
116      * @param usesFallback true if the user wants the converter to take
117      *  advantage of the fallback mapping, false otherwise.
118      * @stable ICU 3.8
119      */
setFallbackUsed(boolean usesFallback)120     public void setFallbackUsed(boolean usesFallback) {
121         useFallback = usesFallback;
122     }
123 
124     /*
125      * Use fallbacks from Unicode to codepage when useFallback or for private-use code points
126      * @param c A codepoint
127      */
isFromUUseFallback(int c)128     final boolean isFromUUseFallback(int c) {
129         return (useFallback) || isUnicodePrivateUse(c);
130     }
131 
132     /**
133      * Use fallbacks from Unicode to codepage when useFallback or for private-use code points
134      */
isFromUUseFallback(boolean iUseFallback, int c)135     static final boolean isFromUUseFallback(boolean iUseFallback, int c) {
136         return (iUseFallback) || isUnicodePrivateUse(c);
137     }
138 
isUnicodePrivateUse(int c)139     private static final boolean isUnicodePrivateUse(int c) {
140         // First test for U+E000 to optimize for the most common characters.
141         return c >= 0xE000 && (c <= 0xF8FF ||
142                 c >= 0xF0000 && (c <= 0xFFFFD ||
143                 (c >= 0x100000 && c <= 0x10FFFD)));
144     }
145 
146     /**
147      * Sets the action to be taken if an illegal sequence is encountered
148      *
149      * @param newAction
150      *            action to be taken
151      * @exception IllegalArgumentException
152      * @stable ICU 3.6
153      */
154     @Override
implOnMalformedInput(CodingErrorAction newAction)155     protected void implOnMalformedInput(CodingErrorAction newAction) {
156         onMalformedInput = getCallback(newAction);
157     }
158 
159     /**
160      * Sets the action to be taken if an illegal sequence is encountered
161      *
162      * @param newAction
163      *            action to be taken
164      * @exception IllegalArgumentException
165      * @stable ICU 3.6
166      */
167     @Override
implOnUnmappableCharacter(CodingErrorAction newAction)168     protected void implOnUnmappableCharacter(CodingErrorAction newAction) {
169         onUnmappableInput = getCallback(newAction);
170     }
171 
172     /**
173      * Sets the callback encoder method and context to be used if an illegal sequence is encountered.
174      * You would normally call this twice to set both the malform and unmappable error. In this case,
175      * newContext should remain the same since using a different newContext each time will negate the last
176      * one used.
177      * @param err CoderResult
178      * @param newCallback CharsetCallback.Encoder
179      * @param newContext Object
180      * @stable ICU 4.0
181      */
setFromUCallback(CoderResult err, CharsetCallback.Encoder newCallback, Object newContext)182     public final void setFromUCallback(CoderResult err, CharsetCallback.Encoder newCallback, Object newContext) {
183         if (err.isMalformed()) {
184             onMalformedInput = newCallback;
185         } else if (err.isUnmappable()) {
186             onUnmappableInput = newCallback;
187         } else {
188             /* Error: Only malformed and unmappable are handled. */
189         }
190 
191         if (fromUContext == null || !fromUContext.equals(newContext)) {
192             setFromUContext(newContext);
193         }
194     }
195 
196     /**
197      * Sets fromUContext used in callbacks.
198      *
199      * @param newContext Object
200      * @exception IllegalArgumentException The object is an illegal argument for UContext.
201      * @stable ICU 4.0
202      */
setFromUContext(Object newContext)203     public final void setFromUContext(Object newContext) {
204         fromUContext = newContext;
205     }
206 
getCallback(CodingErrorAction action)207     private static CharsetCallback.Encoder getCallback(CodingErrorAction action) {
208         if (action == CodingErrorAction.REPLACE) {
209             return CharsetCallback.FROM_U_CALLBACK_SUBSTITUTE;
210         } else if (action == CodingErrorAction.IGNORE) {
211             return CharsetCallback.FROM_U_CALLBACK_SKIP;
212         } else /* if (action == CodingErrorAction.REPORT) */ {
213             return CharsetCallback.FROM_U_CALLBACK_STOP;
214         }
215     }
216 
217     private static final CharBuffer EMPTY = CharBuffer.allocate(0);
218 
219     /**
220      * Flushes any characters saved in the converter's internal buffer and
221      * resets the converter.
222      * @param out action to be taken
223      * @return result of flushing action and completes the decoding all input.
224      *         Returns CoderResult.UNDERFLOW if the action succeeds.
225      * @stable ICU 3.6
226      */
227     @Override
implFlush(ByteBuffer out)228     protected CoderResult implFlush(ByteBuffer out) {
229         return encode(EMPTY, out, null, true);
230     }
231 
232     /**
233      * Resets the from Unicode mode of converter
234      * @stable ICU 3.6
235      */
236     @Override
implReset()237     protected void implReset() {
238         errorBufferLength = 0;
239         fromUnicodeStatus = 0;
240         fromUChar32 = 0;
241         fromUnicodeReset();
242     }
243 
fromUnicodeReset()244     private void fromUnicodeReset() {
245         preFromUBegin = 0;
246         preFromUFirstCP = UConverterConstants.U_SENTINEL;
247         preFromULength = 0;
248     }
249 
250     /**
251      * Encodes one or more chars. The default behaviour of the
252      * converter is stop and report if an error in input stream is encountered.
253      * To set different behaviour use @see CharsetEncoder.onMalformedInput()
254      * @param in buffer to decode
255      * @param out buffer to populate with decoded result
256      * @return result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
257      *         action succeeds or more input is needed for completing the decoding action.
258      * @stable ICU 3.6
259      */
260     @Override
encodeLoop(CharBuffer in, ByteBuffer out)261     protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
262         if (!in.hasRemaining() && this.errorBufferLength == 0) { // make sure the errorBuffer is empty
263             // The Java framework should have already substituted what was left.
264             fromUChar32 = 0;
265             //fromUnicodeReset();
266             return CoderResult.UNDERFLOW;
267         }
268         in.position(in.position() + fromUCountPending());
269         /* do the conversion */
270         CoderResult ret = encode(in, out, null, false);
271         setSourcePosition(in);
272         /* No need to reset to keep the proper state of the encoder.
273          if (ret.isUnderflow() && in.hasRemaining()) {
274             // The Java framework is going to substitute what is left.
275             //fromUnicodeReset();
276         } */
277         return ret;
278     }
279 
280     /*
281      * Implements ICU semantics of buffer management
282      * @param source
283      * @param target
284      * @param offsets
285      * @return A CoderResult object that contains the error result when an error occurs.
286      */
encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)287     abstract CoderResult encodeLoop(CharBuffer source, ByteBuffer target,
288             IntBuffer offsets, boolean flush);
289 
290     /*
291      * Implements ICU semantics for encoding the buffer
292      * @param source The input character buffer
293      * @param target The output byte buffer
294      * @param offsets
295      * @param flush true if, and only if, the invoker can provide no
296      *  additional input bytes beyond those in the given buffer.
297      * @return A CoderResult object that contains the error result when an error occurs.
298      */
encode(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)299     final CoderResult encode(CharBuffer source, ByteBuffer target,
300             IntBuffer offsets, boolean flush) {
301 
302         /* check parameters */
303         if (target == null || source == null) {
304             throw new IllegalArgumentException();
305         }
306 
307         /*
308          * Make sure that the buffer sizes do not exceed the number range for
309          * int32_t because some functions use the size (in units or bytes)
310          * rather than comparing pointers, and because offsets are int32_t values.
311          *
312          * size_t is guaranteed to be unsigned and large enough for the job.
313          *
314          * Return with an error instead of adjusting the limits because we would
315          * not be able to maintain the semantics that either the source must be
316          * consumed or the target filled (unless an error occurs).
317          * An adjustment would be targetLimit=t+0x7fffffff; for example.
318          */
319 
320         /* flush the target overflow buffer */
321         if (errorBufferLength > 0) {
322             byte[] overflowArray;
323             int i, length;
324 
325             overflowArray = errorBuffer;
326             length = errorBufferLength;
327             i = 0;
328             do {
329                 if (target.remaining() == 0) {
330                     /* the overflow buffer contains too much, keep the rest */
331                     int j = 0;
332 
333                     do {
334                         overflowArray[j++] = overflowArray[i++];
335                     } while (i < length);
336 
337                     errorBufferLength = (byte) j;
338                     return CoderResult.OVERFLOW;
339                 }
340 
341                 /* copy the overflow contents to the target */
342                 target.put(overflowArray[i++]);
343                 if (offsets != null) {
344                     offsets.put(-1); /* no source index available for old output */
345                 }
346             } while (i < length);
347 
348             /* the overflow buffer is completely copied to the target */
349             errorBufferLength = 0;
350         }
351 
352         if (!flush && source.remaining() == 0 && preFromULength >= 0) {
353             /* the overflow buffer is emptied and there is no new input: we are done */
354             return CoderResult.UNDERFLOW;
355         }
356 
357         /*
358          * Do not simply return with a buffer overflow error if
359          * !flush && t==targetLimit
360          * because it is possible that the source will not generate any output.
361          * For example, the skip callback may be called;
362          * it does not output anything.
363          */
364 
365         return fromUnicodeWithCallback(source, target, offsets, flush);
366 
367     }
368 
369     /*
370      * Implementation note for m:n conversions
371      *
372      * While collecting source units to find the longest match for m:n conversion,
373      * some source units may need to be stored for a partial match.
374      * When a second buffer does not yield a match on all of the previously stored
375      * source units, then they must be "replayed", i.e., fed back into the converter.
376      *
377      * The code relies on the fact that replaying will not nest -
378      * converting a replay buffer will not result in a replay.
379      * This is because a replay is necessary only after the _continuation_ of a
380      * partial match failed, but a replay buffer is converted as a whole.
381      * It may result in some of its units being stored again for a partial match,
382      * but there will not be a continuation _during_ the replay which could fail.
383      *
384      * It is conceivable that a callback function could call the converter
385      * recursively in a way that causes another replay to be stored, but that
386      * would be an error in the callback function.
387      * Such violations will cause assertion failures in a debug build,
388      * and wrong output, but they will not cause a crash.
389      */
fromUnicodeWithCallback(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush)390     final CoderResult fromUnicodeWithCallback(CharBuffer source,
391             ByteBuffer target, IntBuffer offsets, boolean flush) {
392         int sBufferIndex;
393         int sourceIndex;
394         int errorInputLength;
395         boolean converterSawEndOfInput, calledCallback;
396 
397         /* variables for m:n conversion */
398         CharBuffer replayArray = CharBuffer.allocate(EXT_MAX_UCHARS);
399         int replayArrayIndex = 0;
400         CharBuffer realSource;
401         boolean realFlush;
402 
403         CoderResult cr = CoderResult.UNDERFLOW;
404 
405         /* get the converter implementation function */
406         sourceIndex = 0;
407 
408         if (preFromULength >= 0) {
409             /* normal mode */
410             realSource = null;
411             realFlush = false;
412         } else {
413             /*
414              * Previous m:n conversion stored source units from a partial match
415              * and failed to consume all of them.
416              * We need to "replay" them from a temporary buffer and convert them first.
417              */
418             realSource = source;
419             realFlush = flush;
420 
421             //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
422             replayArray.put(preFromUArray, 0, -preFromULength);
423             source = replayArray;
424             source.position(replayArrayIndex);
425             source.limit(replayArrayIndex - preFromULength); //preFromULength is negative, see declaration
426             flush = false;
427 
428             preFromULength = 0;
429         }
430 
431         /*
432          * loop for conversion and error handling
433          *
434          * loop {
435          *   convert
436          *   loop {
437          *     update offsets
438          *     handle end of input
439          *     handle errors/call callback
440          *   }
441          * }
442          */
443         for (;;) {
444             /* convert */
445             cr = encodeLoop(source, target, offsets, flush);
446             /*
447              * set a flag for whether the converter
448              * successfully processed the end of the input
449              *
450              * need not check cnv.preFromULength==0 because a replay (<0) will cause
451              * s<sourceLimit before converterSawEndOfInput is checked
452              */
453             converterSawEndOfInput = (cr.isUnderflow() && flush
454                     && source.remaining() == 0 && fromUChar32 == 0);
455 
456             /* no callback called yet for this iteration */
457             calledCallback = false;
458 
459             /* no sourceIndex adjustment for conversion, only for callback output */
460             errorInputLength = 0;
461 
462             /*
463              * loop for offsets and error handling
464              *
465              * iterates at most 3 times:
466              * 1. to clean up after the conversion function
467              * 2. after the callback
468              * 3. after the callback again if there was truncated input
469              */
470             for (;;) {
471                 /* update offsets if we write any */
472                 /* Currently offsets are not being used in ICU4J */
473                 /* if (offsets != null) {
474                     int length = target.remaining();
475                     if (length > 0) {
476 
477                         /*
478                          * if a converter handles offsets and updates the offsets
479                          * pointer at the end, then offset should not change
480                          * here;
481                          * however, some converters do not handle offsets at all
482                          * (sourceIndex<0) or may not update the offsets pointer
483                          */
484                  /*       offsets.position(offsets.position() + length);
485                     }
486 
487                     if (sourceIndex >= 0) {
488                         sourceIndex += (int) (source.position());
489                     }
490                 } */
491 
492                 if (preFromULength < 0) {
493                     /*
494                      * switch the source to new replay units (cannot occur while replaying)
495                      * after offset handling and before end-of-input and callback handling
496                      */
497                     if (realSource == null) {
498                         realSource = source;
499                         realFlush = flush;
500 
501                         //UConverterUtility.uprv_memcpy(replayArray, replayArrayIndex, preFromUArray, 0, -preFromULength*UMachine.U_SIZEOF_UCHAR);
502                         replayArray.put(preFromUArray, 0, -preFromULength);
503 
504                         source = replayArray;
505                         source.position(replayArrayIndex);
506                         source.limit(replayArrayIndex - preFromULength);
507                         flush = false;
508                         if ((sourceIndex += preFromULength) < 0) {
509                             sourceIndex = -1;
510                         }
511 
512                         preFromULength = 0;
513                     } else {
514                         /* see implementation note before _fromUnicodeWithCallback() */
515                         //agljport:todo U_ASSERT(realSource==NULL);
516                         Assert.assrt(realSource == null);
517                     }
518                 }
519 
520                 /* update pointers */
521                 sBufferIndex = source.position();
522                 if (cr.isUnderflow()) {
523                     if (sBufferIndex < source.limit()) {
524                         /*
525                          * continue with the conversion loop while there is still input left
526                          * (continue converting by breaking out of only the inner loop)
527                          */
528                         break;
529                     } else if (realSource != null) {
530                         /* switch back from replaying to the real source and continue */
531                         source = realSource;
532                         flush = realFlush;
533                         sourceIndex = source.position();
534                         realSource = null;
535                         break;
536                     } else if (flush && fromUChar32 != 0) {
537                         /*
538                          * the entire input stream is consumed
539                          * and there is a partial, truncated input sequence left
540                          */
541 
542                         /* inject an error and continue with callback handling */
543                         //err[0]=ErrorCode.U_TRUNCATED_CHAR_FOUND;
544                         cr = CoderResult.malformedForLength(1);
545                         calledCallback = false; /* new error condition */
546                     } else {
547                         /* input consumed */
548                         if (flush) {
549                             /*
550                              * return to the conversion loop once more if the flush
551                              * flag is set and the conversion function has not
552                              * successfully processed the end of the input yet
553                              *
554                              * (continue converting by breaking out of only the inner loop)
555                              */
556                             if (!converterSawEndOfInput) {
557                                 break;
558                             }
559 
560                             /* reset the converter without calling the callback function */
561                             implReset();
562                         }
563 
564                         /* done successfully */
565                         return cr;
566                     }
567                 }
568 
569                 /*U_FAILURE(*err) */
570                 {
571 
572                     if (calledCallback || cr.isOverflow()
573                             || (!cr.isMalformed() && !cr.isUnmappable())) {
574                         /*
575                          * the callback did not or cannot resolve the error:
576                          * set output pointers and return
577                          *
578                          * the check for buffer overflow is redundant but it is
579                          * a high-runner case and hopefully documents the intent
580                          * well
581                          *
582                          * if we were replaying, then the replay buffer must be
583                          * copied back into the UConverter
584                          * and the real arguments must be restored
585                          */
586                         if (realSource != null) {
587                             int length;
588 
589                             //agljport:todo U_ASSERT(cnv.preFromULength==0);
590 
591                             length = source.remaining();
592                             if (length > 0) {
593                                 //UConverterUtility.uprv_memcpy(preFromUArray, 0, sourceArray, pArgs.sourceBegin, length*UMachine.U_SIZEOF_UCHAR);
594                                 source.get(preFromUArray, 0, length);
595                                 preFromULength = (byte) -length;
596                             }
597                         }
598                         return cr;
599                     }
600                 }
601 
602                 /* callback handling */
603                 {
604                     int codePoint;
605 
606                     /* get and write the code point */
607                     codePoint = fromUChar32;
608                     errorInputLength = UTF16.append(invalidUCharBuffer, 0,
609                             fromUChar32);
610                     invalidUCharLength = errorInputLength;
611 
612                     /* set the converter state to deal with the next character */
613                     fromUChar32 = 0;
614 
615                     /* call the callback function */
616                     cr = fromCharErrorBehaviour.call(this, fromUContext,
617                             source, target, offsets, invalidUCharBuffer,
618                             invalidUCharLength, codePoint, cr);
619                 }
620 
621                 /*
622                  * loop back to the offset handling
623                  *
624                  * this flag will indicate after offset handling
625                  * that a callback was called;
626                  * if the callback did not resolve the error, then we return
627                  */
628                 calledCallback = true;
629             }
630         }
631     }
632 
633     /*
634      * Ascertains if a given Unicode code point (32bit value for handling surrogates)
635      * can be converted to the target encoding. If the caller wants to test if a
636      * surrogate pair can be converted to target encoding then the
637      * responsibility of assembling the int value lies with the caller.
638      * For assembling a code point the caller can use UTF16 class of ICU4J and do something like:
639      * <pre>
     *  int i = 0;
     *  while(i<mySource.length){
     *      if(UTF16.isLeadSurrogate(mySource[i]) && i+1<mySource.length
     *              && UTF16.isTrailSurrogate(mySource[i+1])){
     *          int temp = UTF16.charAt(mySource,i,i+2,0);
     *          if(!((CharsetEncoderICU) myConv).canEncode(temp)){
     *              passed=false;
     *          }
     *          i+=2;
     *      }else{
     *          i++; // not a surrogate pair: advance so that the loop terminates
     *      }
     *  }
652      * </pre>
653      * or
654      * <pre>
     *  String src = new String(mySource);
     *  int i = 0, codepoint;
     *  boolean passed = true;
     *  while(i<src.length()){
     *      codepoint = UTF16.charAt(src,i);
     *      i+= (codepoint>0xffff)? 2:1;
     *      if(!((CharsetEncoderICU) myConv).canEncode(codepoint)){
     *          passed = false;
     *      }
     *  }
665      * </pre>
666      *
667      * @param codepoint Unicode code point as int value
668      * @return true if a character can be converted
669      */
670     /* TODO This is different from Java's canEncode(char) API.
671      * ICU's API should implement getUnicodeSet,
672      * and override canEncode(char) which queries getUnicodeSet.
673      * The getUnicodeSet should return a frozen UnicodeSet or use a fillin parameter, like ICU4C.
674      */
675     /*public boolean canEncode(int codepoint) {
676         return true;
677     }*/
678     /**
679      * Overrides super class method
680      * @stable ICU 3.6
681      */
682     @Override
isLegalReplacement(byte[] repl)683     public boolean isLegalReplacement(byte[] repl) {
684         return true;
685     }
686 
687     /*
688      * Writes out the specified output bytes to the target byte buffer or to converter internal buffers.
689      * @param cnv
690      * @param bytesArray
691      * @param bytesBegin
692      * @param bytesLength
693      * @param out
694      * @param offsets
695      * @param sourceIndex
696      * @return A CoderResult object that contains the error result when an error occurs.
697      */
fromUWriteBytes(CharsetEncoderICU cnv, byte[] bytesArray, int bytesBegin, int bytesLength, ByteBuffer out, IntBuffer offsets, int sourceIndex)698     static final CoderResult fromUWriteBytes(CharsetEncoderICU cnv,
699             byte[] bytesArray, int bytesBegin, int bytesLength, ByteBuffer out,
700             IntBuffer offsets, int sourceIndex) {
701 
702         //write bytes
703         int obl = bytesLength;
704         CoderResult cr = CoderResult.UNDERFLOW;
705         int bytesLimit = bytesBegin + bytesLength;
706         try {
707             for (; bytesBegin < bytesLimit;) {
708                 out.put(bytesArray[bytesBegin]);
709                 bytesBegin++;
710             }
711             // success
712             bytesLength = 0;
713         } catch (BufferOverflowException ex) {
714             cr = CoderResult.OVERFLOW;
715         }
716 
717         if (offsets != null) {
718             while (obl > bytesLength) {
719                 offsets.put(sourceIndex);
720                 --obl;
721             }
722         }
723         //write overflow
724         cnv.errorBufferLength = bytesLimit - bytesBegin;
725         if (cnv.errorBufferLength > 0) {
726             int index = 0;
727             while (bytesBegin < bytesLimit) {
728                 cnv.errorBuffer[index++] = bytesArray[bytesBegin++];
729             }
730             cr = CoderResult.OVERFLOW;
731         }
732         return cr;
733     }
734 
735     /*
736      * Returns the number of chars held in the converter's internal state
737      * because more input is needed for completing the conversion. This function is
738      * useful for mapping semantics of ICU's converter interface to those of iconv,
739      * and this information is not needed for normal conversion.
740      * @return The number of chars in the state. -1 if an error is encountered.
741      */
fromUCountPending()742     /*public*/int fromUCountPending() {
743         if (preFromULength > 0) {
744             return UTF16.getCharCount(preFromUFirstCP) + preFromULength;
745         } else if (preFromULength < 0) {
746             return -preFromULength;
747         } else if (fromUChar32 > 0) {
748             return 1;
749         } else if (preFromUFirstCP > 0) {
750             return UTF16.getCharCount(preFromUFirstCP);
751         }
752         return 0;
753     }
754 
755     /**
756      *
757      * @param source
758      */
setSourcePosition(CharBuffer source)759     private final void setSourcePosition(CharBuffer source) {
760 
761         // ok was there input held in the previous invocation of encodeLoop
762         // that resulted in output in this invocation?
763         source.position(source.position() - fromUCountPending());
764     }
765 
766     /*
767      * Write the codepage substitution character.
768      * Subclasses to override this method.
769      * For stateful converters, it is typically necessary to handle this
     * specifically for the converter in order to properly maintain the state.
771      * @param source The input character buffer
772      * @param target The output byte buffer
773      * @param offsets
774      * @return A CoderResult object that contains the error result when an error occurs.
775      */
cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target, IntBuffer offsets)776     CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source,
777             ByteBuffer target, IntBuffer offsets) {
778         CharsetICU cs = (CharsetICU) encoder.charset();
779         byte[] sub = encoder.replacement();
780         if (cs.subChar1 != 0 && encoder.invalidUCharBuffer[0] <= 0xff) {
781             return CharsetEncoderICU.fromUWriteBytes(encoder,
782                     new byte[] { cs.subChar1 }, 0, 1, target, offsets, source
783                             .position());
784         } else {
785             return CharsetEncoderICU.fromUWriteBytes(encoder, sub, 0,
786                     sub.length, target, offsets, source.position());
787         }
788     }
789 
790     /*
791      * Write the characters to target.
792      * @param source The input character buffer
793      * @param target The output byte buffer
794      * @param offsets
795      * @return A CoderResult object that contains the error result when an error occurs.
796      */
cbFromUWriteUChars(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target, IntBuffer offsets)797     CoderResult cbFromUWriteUChars(CharsetEncoderICU encoder,
798             CharBuffer source, ByteBuffer target, IntBuffer offsets) {
799         CoderResult cr = CoderResult.UNDERFLOW;
800 
801         /* This is a fun one.  Recursion can occur - we're basically going to
802          * just retry shoving data through the same converter. Note, if you got
803          * here through some kind of invalid sequence, you maybe should emit a
804          * reset sequence of some kind. Since this IS an actual conversion,
805          * take care that you've changed the callback or the data, or you'll
806          * get an infinite loop.
807          */
808 
809         int oldTargetPosition = target.position();
810         int offsetIndex = source.position();
811 
812         cr = encoder.encode(source, target, null, false); /* no offsets and no flush */
813 
814         if (offsets != null) {
815             while (target.position() != oldTargetPosition) {
816                 offsets.put(offsetIndex);
817                 oldTargetPosition++;
818             }
819         }
820 
821         /* Note, if you did something like used a stop subcallback, things would get interesting.
822          * In fact, here's where we want to return the partially consumed in-source!
823          */
824         if (cr.isOverflow()) {
825             /* Overflowed target. Now, we'll write into the charErrorBuffer.
826              * It's a fixed size. If we overflow it...Hm
827              */
828 
829             /* start the new target at the first free slot in the error buffer */
830             int errBuffLen = encoder.errorBufferLength;
831             ByteBuffer newTarget = ByteBuffer.wrap(encoder.errorBuffer);
832             newTarget.position(errBuffLen); /* set the position at the end of the error buffer */
833             encoder.errorBufferLength = 0;
834 
835             encoder.encode(source, newTarget, null, false);
836 
837             encoder.errorBuffer = newTarget.array();
838             encoder.errorBufferLength = newTarget.position();
839         }
840 
841         return cr;
842     }
843 
844     /**
845      * <p>
846      * Handles a common situation where a character has been read and it may be
847      * a lead surrogate followed by a trail surrogate. This method can change
848      * the source position and will modify fromUChar32.
849      * </p>
850      *
851      * <p>
852      * If <code>null</code> is returned, then there was success in reading a
853      * surrogate pair, the codepoint is stored in <code>fromUChar32</code> and
854      * <code>fromUChar32</code> should be reset (to 0) after being read.
855      * </p>
856      *
857      * @param source
858      *            The encoding source.
859      * @param lead
860      *            A character that may be the first in a surrogate pair.
861      * @return <code>CoderResult.malformedForLength(1)</code> or
862      *         <code>CoderResult.UNDERFLOW</code> if there is a problem, or
863      *         <code>null</code> if there isn't.
864      * @see #handleSurrogates(CharBuffer, char)
865      * @see #handleSurrogates(char[], int, int, char)
866      */
handleSurrogates(CharBuffer source, char lead)867     final CoderResult handleSurrogates(CharBuffer source, char lead) {
868         if (!UTF16.isLeadSurrogate(lead)) {
869             fromUChar32 = lead;
870             return CoderResult.malformedForLength(1);
871         }
872 
873         if (!source.hasRemaining()) {
874             fromUChar32 = lead;
875             return CoderResult.UNDERFLOW;
876         }
877 
878         char trail = source.get();
879 
880         if (!UTF16.isTrailSurrogate(trail)) {
881             fromUChar32 = lead;
882             source.position(source.position() - 1);
883             return CoderResult.malformedForLength(1);
884         }
885 
886         fromUChar32 = UCharacter.getCodePoint(lead, trail);
887         return null;
888     }
889 
890     /**
891      * <p>
892      * Same as <code>handleSurrogates(CharBuffer, char)</code>, but with arrays. As an added
893      * requirement, the calling method must also increment the index if this method returns
894      * <code>null</code>.
895      * </p>
896      *
897      *
898      * @param source
899      *            The encoding source.
900      * @param lead
901      *            A character that may be the first in a surrogate pair.
902      * @return <code>CoderResult.malformedForLength(1)</code> or
903      *         <code>CoderResult.UNDERFLOW</code> if there is a problem, or <code>null</code> if
904      *         there isn't.
905      * @see #handleSurrogates(CharBuffer, char)
906      * @see #handleSurrogates(char[], int, int, char)
907      */
handleSurrogates(char[] sourceArray, int sourceIndex, int sourceLimit, char lead)908     final CoderResult handleSurrogates(char[] sourceArray, int sourceIndex,
909             int sourceLimit, char lead) {
910         if (!UTF16.isLeadSurrogate(lead)) {
911             fromUChar32 = lead;
912             return CoderResult.malformedForLength(1);
913         }
914 
915         if (sourceIndex >= sourceLimit) {
916             fromUChar32 = lead;
917             return CoderResult.UNDERFLOW;
918         }
919 
920         char trail = sourceArray[sourceIndex];
921 
922         if (!UTF16.isTrailSurrogate(trail)) {
923             fromUChar32 = lead;
924             return CoderResult.malformedForLength(1);
925         }
926 
927         fromUChar32 = UCharacter.getCodePoint(lead, trail);
928         return null;
929     }
930 
931     /**
932      * Returns the maxCharsPerByte value for the Charset that created this encoder.
933      * @return maxCharsPerByte
934      * @stable ICU 4.8
935      */
maxCharsPerByte()936     public final float maxCharsPerByte() {
937         return ((CharsetICU)(this.charset())).maxCharsPerByte;
938     }
939 
940     /**
941      * Calculates the size of a buffer for conversion from Unicode to a charset.
942      * The calculated size is guaranteed to be sufficient for this conversion.
943      *
944      * It takes into account initial and final non-character bytes that are output
945      * by some converters.
946      * It does not take into account callbacks which output more than one charset
947      * character sequence per call, like escape callbacks.
948      * The default (substitution) callback only outputs one charset character sequence.
949      *
950      * @param length Number of chars to be converted.
951      * @param maxCharSize Return value from maxBytesPerChar for the converter
952      *                    that will be used.
953      * @return Size of a buffer that will be large enough to hold the output of bytes
954      *
955      * @stable ICU 49
956      */
getMaxBytesForString(int length, int maxCharSize)957     public static int getMaxBytesForString(int length, int maxCharSize) {
958         return ((length + 10) * maxCharSize);
959     }
960 
961 }
962