1 /*
2 ********************************************************************
3 *
4 *   Copyright (C) 1997-2011, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ********************************************************************
8 */
9 
10 #ifndef CHARITER_H
11 #define CHARITER_H
12 
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/unistr.h"
16 /**
17  * \file
18  * \brief C++ API: Character Iterator
19  */
20 
21 U_NAMESPACE_BEGIN
22 /**
23  * Abstract class that defines an API for forward-only iteration
24  * on text objects.
25  * This is a minimal interface for iteration without random access
26  * or backwards iteration. It is especially useful for wrapping
27  * streams with converters into an object for collation or
28  * normalization.
29  *
30  * <p>Characters can be accessed in two ways: as code units or as
31  * code points.
32  * Unicode code points are 21-bit integers and are the scalar values
33  * of Unicode characters. ICU uses the type UChar32 for them.
34  * Unicode code units are the storage units of a given
35  * Unicode/UCS Transformation Format (a character encoding scheme).
36  * With UTF-16, all code points can be represented with either one
37  * or two code units ("surrogates").
38  * String storage is typically based on code units, while properties
39  * of characters are typically determined using code point values.
40  * Some processes may be designed to work with sequences of code units,
41  * or it may be known that all characters that are important to an
42  * algorithm can be represented with single code units.
43  * Other processes will need to use the code point access functions.</p>
44  *
45  * <p>ForwardCharacterIterator provides nextPostInc() to access
46  * a code unit and advance an internal position into the text object,
47  * similar to a <code>return text[position++]</code>.<br>
48  * It provides next32PostInc() to access a code point and advance an internal
49  * position.</p>
50  *
51  * <p>next32PostInc() assumes that the current position is that of
52  * the beginning of a code point, i.e., of its first code unit.
53  * After next32PostInc(), this will be true again.
54  * In general, access to code units and code points in the same
55  * iteration loop should not be mixed. In UTF-16, if the current position
56  * is on a second code unit (Low Surrogate), then only that code unit
57  * is returned even by next32PostInc().</p>
58  *
59  * <p>For iteration with either function, there are two ways to
60  * check for the end of the iteration. When there are no more
61  * characters in the text object:
62  * <ul>
63  * <li>The hasNext() function returns FALSE.</li>
64  * <li>nextPostInc() and next32PostInc() return DONE
65  *     when one attempts to read beyond the end of the text object.</li>
66  * </ul>
67  *
68  * Example:
69  * \code
70  * void function1(ForwardCharacterIterator &it) {
71  *     UChar32 c;
72  *     while(it.hasNext()) {
73  *         c=it.next32PostInc();
74  *         // use c
75  *     }
76  * }
77  *
78  * void function1(ForwardCharacterIterator &it) {
79  *     UChar c;
80  *     while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
81  *         // use c
82  *      }
83  *  }
84  * \endcode
85  * </p>
86  *
87  * @stable ICU 2.0
88  */
89 class U_COMMON_API ForwardCharacterIterator : public UObject {
90 public:
91     /**
92      * Value returned by most of ForwardCharacterIterator's functions
93      * when the iterator has reached the limits of its iteration.
94      * @stable ICU 2.0
95      */
96     enum { DONE = 0xffff };
97 
98     /**
99      * Destructor.
100      * @stable ICU 2.0
101      */
102     virtual ~ForwardCharacterIterator();
103 
104     /**
105      * Returns true when both iterators refer to the same
106      * character in the same character-storage object.
107      * @param that The ForwardCharacterIterator to be compared for equality
108      * @return true when both iterators refer to the same
109      * character in the same character-storage object
110      * @stable ICU 2.0
111      */
112     virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
113 
114     /**
115      * Returns true when the iterators refer to different
116      * text-storage objects, or to different characters in the
117      * same text-storage object.
118      * @param that The ForwardCharacterIterator to be compared for inequality
119      * @return true when the iterators refer to different
120      * text-storage objects, or to different characters in the
121      * same text-storage object
122      * @stable ICU 2.0
123      */
124     inline UBool operator!=(const ForwardCharacterIterator& that) const;
125 
126     /**
127      * Generates a hash code for this iterator.
128      * @return the hash code.
129      * @stable ICU 2.0
130      */
131     virtual int32_t hashCode(void) const = 0;
132 
133     /**
134      * Returns a UClassID for this ForwardCharacterIterator ("poor man's
135      * RTTI").<P> Despite the fact that this function is public,
136      * DO NOT CONSIDER IT PART OF CHARACTERITERATOR'S API!
137      * @return a UClassID for this ForwardCharacterIterator
138      * @stable ICU 2.0
139      */
140     virtual UClassID getDynamicClassID(void) const = 0;
141 
142     /**
143      * Gets the current code unit for returning and advances to the next code unit
144      * in the iteration range
145      * (toward endIndex()).  If there are
146      * no more code units to return, returns DONE.
147      * @return the current code unit.
148      * @stable ICU 2.0
149      */
150     virtual UChar         nextPostInc(void) = 0;
151 
152     /**
153      * Gets the current code point for returning and advances to the next code point
154      * in the iteration range
155      * (toward endIndex()).  If there are
156      * no more code points to return, returns DONE.
157      * @return the current code point.
158      * @stable ICU 2.0
159      */
160     virtual UChar32       next32PostInc(void) = 0;
161 
162     /**
163      * Returns FALSE if there are no more code units or code points
164      * at or after the current position in the iteration range.
165      * This is used with nextPostInc() or next32PostInc() in forward
166      * iteration.
167      * @returns FALSE if there are no more code units or code points
168      * at or after the current position in the iteration range.
169      * @stable ICU 2.0
170      */
171     virtual UBool        hasNext() = 0;
172 
173 protected:
174     /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
175     ForwardCharacterIterator();
176 
177     /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
178     ForwardCharacterIterator(const ForwardCharacterIterator &other);
179 
180     /**
181      * Assignment operator to be overridden in the implementing class.
182      * @stable ICU 2.0
183      */
184     ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
185 };
186 
187 /**
188  * Abstract class that defines an API for iteration
189  * on text objects.
190  * This is an interface for forward and backward iteration
191  * and random access into a text object.
192  *
193  * <p>The API provides backward compatibility to the Java and older ICU
194  * CharacterIterator classes but extends them significantly:
195  * <ol>
196  * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
197  * <li>While the old API functions provided forward iteration with
198  *     "pre-increment" semantics, the new one also provides functions
199  *     with "post-increment" semantics. They are more efficient and should
200  *     be the preferred iterator functions for new implementations.
201  *     The backward iteration always had "pre-decrement" semantics, which
202  *     are efficient.</li>
203  * <li>Just like ForwardCharacterIterator, it provides access to
204  *     both code units and code points. Code point access versions are available
205  *     for the old and the new iteration semantics.</li>
206  * <li>There are new functions for setting and moving the current position
207  *     without returning a character, for efficiency.</li>
208  * </ol>
209  *
210  * See ForwardCharacterIterator for examples for using the new forward iteration
211  * functions. For backward iteration, there is also a hasPrevious() function
212  * that can be used analogously to hasNext().
213  * The old functions work as before and are shown below.</p>
214  *
215  * <p>Examples for some of the new functions:</p>
216  *
217  * Forward iteration with hasNext():
218  * \code
219  * void forward1(CharacterIterator &it) {
220  *     UChar32 c;
221  *     for(it.setToStart(); it.hasNext();) {
222  *         c=it.next32PostInc();
223  *         // use c
224  *     }
225  *  }
226  * \endcode
227  * Forward iteration more similar to loops with the old forward iteration,
228  * showing a way to convert simple for() loops:
229  * \code
230  * void forward2(CharacterIterator &it) {
231  *     UChar c;
232  *     for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
233  *          // use c
234  *      }
235  * }
236  * \endcode
237  * Backward iteration with setToEnd() and hasPrevious():
238  * \code
239  *  void backward1(CharacterIterator &it) {
240  *      UChar32 c;
241  *      for(it.setToEnd(); it.hasPrevious();) {
242  *         c=it.previous32();
243  *          // use c
244  *      }
245  *  }
246  * \endcode
247  * Backward iteration with a more traditional for() loop:
248  * \code
249  * void backward2(CharacterIterator &it) {
250  *     UChar c;
251  *     for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
252  *         // use c
253  *      }
254  *  }
255  * \endcode
256  *
257  * Example for random access:
258  * \code
259  *  void random(CharacterIterator &it) {
260  *      // set to the third code point from the beginning
261  *      it.move32(3, CharacterIterator::kStart);
262  *      // get a code point from here without moving the position
263  *      UChar32 c=it.current32();
264  *      // get the position
265  *      int32_t pos=it.getIndex();
266  *      // get the previous code unit
267  *      UChar u=it.previous();
268  *      // move back one more code unit
269  *      it.move(-1, CharacterIterator::kCurrent);
270  *      // set the position back to where it was
271  *      // and read the same code point c and move beyond it
272  *      it.setIndex(pos);
273  *      if(c!=it.next32PostInc()) {
274  *          exit(1); // CharacterIterator inconsistent
275  *      }
276  *  }
277  * \endcode
278  *
279  * <p>Examples, especially for the old API:</p>
280  *
281  * Function processing characters, in this example simple output
282  * <pre>
283  * \code
284  *  void processChar( UChar c )
285  *  {
286  *      cout << " " << c;
287  *  }
288  * \endcode
289  * </pre>
290  * Traverse the text from start to finish
291  * <pre>
292  * \code
293  *  void traverseForward(CharacterIterator& iter)
294  *  {
295  *      for(UChar c = iter.first(); c != CharacterIterator.DONE; c = iter.next()) {
296  *          processChar(c);
297  *      }
298  *  }
299  * \endcode
300  * </pre>
301  * Traverse the text backwards, from end to start
302  * <pre>
303  * \code
304  *  void traverseBackward(CharacterIterator& iter)
305  *  {
306  *      for(UChar c = iter.last(); c != CharacterIterator.DONE; c = iter.previous()) {
307  *          processChar(c);
308  *      }
309  *  }
310  * \endcode
311  * </pre>
312  * Traverse both forward and backward from a given position in the text.
313  * Calls to notBoundary() in this example represents some additional stopping criteria.
314  * <pre>
315  * \code
316  * void traverseOut(CharacterIterator& iter, int32_t pos)
317  * {
318  *      UChar c;
319  *      for (c = iter.setIndex(pos);
320  *      c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321  *          c = iter.next()) {}
322  *      int32_t end = iter.getIndex();
323  *      for (c = iter.setIndex(pos);
324  *          c != CharacterIterator.DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
325  *          c = iter.previous()) {}
326  *      int32_t start = iter.getIndex() + 1;
327  *
328  *      cout << "start: " << start << " end: " << end << endl;
329  *      for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
330  *          processChar(c);
331  *     }
332  *  }
333  * \endcode
334  * </pre>
335  * Creating a StringCharacterIterator and calling the test functions
336  * <pre>
337  * \code
338  *  void CharacterIterator_Example( void )
339  *   {
340  *       cout << endl << "===== CharacterIterator_Example: =====" << endl;
341  *       UnicodeString text("Ein kleiner Satz.");
342  *       StringCharacterIterator iterator(text);
343  *       cout << "----- traverseForward: -----------" << endl;
344  *       traverseForward( iterator );
345  *       cout << endl << endl << "----- traverseBackward: ----------" << endl;
346  *       traverseBackward( iterator );
347  *       cout << endl << endl << "----- traverseOut: ---------------" << endl;
348  *       traverseOut( iterator, 7 );
349  *       cout << endl << endl << "-----" << endl;
350  *   }
351  * \endcode
352  * </pre>
353  *
354  * @stable ICU 2.0
355  */
356 class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
357 public:
358     /**
359      * Origin enumeration for the move() and move32() functions.
360      * @stable ICU 2.0
361      */
362     enum EOrigin { kStart, kCurrent, kEnd };
363 
364     /**
365      * Destructor.
366      * @stable ICU 2.0
367      */
368     virtual ~CharacterIterator();
369 
370     /**
371      * Returns a pointer to a new CharacterIterator of the same
372      * concrete class as this one, and referring to the same
373      * character in the same text-storage object as this one.  The
374      * caller is responsible for deleting the new clone.
375      * @return a pointer to a new CharacterIterator
376      * @stable ICU 2.0
377      */
378     virtual CharacterIterator* clone(void) const = 0;
379 
380     /**
381      * Sets the iterator to refer to the first code unit in its
382      * iteration range, and returns that code unit.
383      * This can be used to begin an iteration with next().
384      * @return the first code unit in its iteration range.
385      * @stable ICU 2.0
386      */
387     virtual UChar         first(void) = 0;
388 
389     /**
390      * Sets the iterator to refer to the first code unit in its
391      * iteration range, returns that code unit, and moves the position
392      * to the second code unit. This is an alternative to setToStart()
393      * for forward iteration with nextPostInc().
394      * @return the first code unit in its iteration range.
395      * @stable ICU 2.0
396      */
397     virtual UChar         firstPostInc(void);
398 
399     /**
400      * Sets the iterator to refer to the first code point in its
401      * iteration range, and returns that code unit,
402      * This can be used to begin an iteration with next32().
403      * Note that an iteration with next32PostInc(), beginning with,
404      * e.g., setToStart() or firstPostInc(), is more efficient.
405      * @return the first code point in its iteration range.
406      * @stable ICU 2.0
407      */
408     virtual UChar32       first32(void) = 0;
409 
410     /**
411      * Sets the iterator to refer to the first code point in its
412      * iteration range, returns that code point, and moves the position
413      * to the second code point. This is an alternative to setToStart()
414      * for forward iteration with next32PostInc().
415      * @return the first code point in its iteration range.
416      * @stable ICU 2.0
417      */
418     virtual UChar32       first32PostInc(void);
419 
420     /**
421      * Sets the iterator to refer to the first code unit or code point in its
422      * iteration range. This can be used to begin a forward
423      * iteration with nextPostInc() or next32PostInc().
424      * @return the start position of the iteration range
425      * @stable ICU 2.0
426      */
427     inline int32_t    setToStart();
428 
429     /**
430      * Sets the iterator to refer to the last code unit in its
431      * iteration range, and returns that code unit.
432      * This can be used to begin an iteration with previous().
433      * @return the last code unit.
434      * @stable ICU 2.0
435      */
436     virtual UChar         last(void) = 0;
437 
438     /**
439      * Sets the iterator to refer to the last code point in its
440      * iteration range, and returns that code unit.
441      * This can be used to begin an iteration with previous32().
442      * @return the last code point.
443      * @stable ICU 2.0
444      */
445     virtual UChar32       last32(void) = 0;
446 
447     /**
448      * Sets the iterator to the end of its iteration range, just behind
449      * the last code unit or code point. This can be used to begin a backward
450      * iteration with previous() or previous32().
451      * @return the end position of the iteration range
452      * @stable ICU 2.0
453      */
454     inline int32_t    setToEnd();
455 
456     /**
457      * Sets the iterator to refer to the "position"-th code unit
458      * in the text-storage object the iterator refers to, and
459      * returns that code unit.
460      * @param position the "position"-th code unit in the text-storage object
461      * @return the "position"-th code unit.
462      * @stable ICU 2.0
463      */
464     virtual UChar         setIndex(int32_t position) = 0;
465 
466     /**
467      * Sets the iterator to refer to the beginning of the code point
468      * that contains the "position"-th code unit
469      * in the text-storage object the iterator refers to, and
470      * returns that code point.
471      * The current position is adjusted to the beginning of the code point
472      * (its first code unit).
473      * @param position the "position"-th code unit in the text-storage object
474      * @return the "position"-th code point.
475      * @stable ICU 2.0
476      */
477     virtual UChar32       setIndex32(int32_t position) = 0;
478 
479     /**
480      * Returns the code unit the iterator currently refers to.
481      * @return the current code unit.
482      * @stable ICU 2.0
483      */
484     virtual UChar         current(void) const = 0;
485 
486     /**
487      * Returns the code point the iterator currently refers to.
488      * @return the current code point.
489      * @stable ICU 2.0
490      */
491     virtual UChar32       current32(void) const = 0;
492 
493     /**
494      * Advances to the next code unit in the iteration range
495      * (toward endIndex()), and returns that code unit.  If there are
496      * no more code units to return, returns DONE.
497      * @return the next code unit.
498      * @stable ICU 2.0
499      */
500     virtual UChar         next(void) = 0;
501 
502     /**
503      * Advances to the next code point in the iteration range
504      * (toward endIndex()), and returns that code point.  If there are
505      * no more code points to return, returns DONE.
506      * Note that iteration with "pre-increment" semantics is less
507      * efficient than iteration with "post-increment" semantics
508      * that is provided by next32PostInc().
509      * @return the next code point.
510      * @stable ICU 2.0
511      */
512     virtual UChar32       next32(void) = 0;
513 
514     /**
515      * Advances to the previous code unit in the iteration range
516      * (toward startIndex()), and returns that code unit.  If there are
517      * no more code units to return, returns DONE.
518      * @return the previous code unit.
519      * @stable ICU 2.0
520      */
521     virtual UChar         previous(void) = 0;
522 
523     /**
524      * Advances to the previous code point in the iteration range
525      * (toward startIndex()), and returns that code point.  If there are
526      * no more code points to return, returns DONE.
527      * @return the previous code point.
528      * @stable ICU 2.0
529      */
530     virtual UChar32       previous32(void) = 0;
531 
532     /**
533      * Returns FALSE if there are no more code units or code points
534      * before the current position in the iteration range.
535      * This is used with previous() or previous32() in backward
536      * iteration.
537      * @return FALSE if there are no more code units or code points
538      * before the current position in the iteration range, return TRUE otherwise.
539      * @stable ICU 2.0
540      */
541     virtual UBool        hasPrevious() = 0;
542 
543     /**
544      * Returns the numeric index in the underlying text-storage
545      * object of the character returned by first().  Since it's
546      * possible to create an iterator that iterates across only
547      * part of a text-storage object, this number isn't
548      * necessarily 0.
549      * @returns the numeric index in the underlying text-storage
550      * object of the character returned by first().
551      * @stable ICU 2.0
552      */
553     inline int32_t       startIndex(void) const;
554 
555     /**
556      * Returns the numeric index in the underlying text-storage
557      * object of the position immediately BEYOND the character
558      * returned by last().
559      * @return the numeric index in the underlying text-storage
560      * object of the position immediately BEYOND the character
561      * returned by last().
562      * @stable ICU 2.0
563      */
564     inline int32_t       endIndex(void) const;
565 
566     /**
567      * Returns the numeric index in the underlying text-storage
568      * object of the character the iterator currently refers to
569      * (i.e., the character returned by current()).
570      * @return the numberic index in the text-storage object of
571      * the character the iterator currently refers to
572      * @stable ICU 2.0
573      */
574     inline int32_t       getIndex(void) const;
575 
576     /**
577      * Returns the length of the entire text in the underlying
578      * text-storage object.
579      * @return the length of the entire text in the text-storage object
580      * @stable ICU 2.0
581      */
582     inline int32_t           getLength() const;
583 
584     /**
585      * Moves the current position relative to the start or end of the
586      * iteration range, or relative to the current position itself.
587      * The movement is expressed in numbers of code units forward
588      * or backward by specifying a positive or negative delta.
589      * @param delta the position relative to origin. A positive delta means forward;
590      * a negative delta means backward.
591      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
592      * @return the new position
593      * @stable ICU 2.0
594      */
595     virtual int32_t      move(int32_t delta, EOrigin origin) = 0;
596 
597     /**
598      * Moves the current position relative to the start or end of the
599      * iteration range, or relative to the current position itself.
600      * The movement is expressed in numbers of code points forward
601      * or backward by specifying a positive or negative delta.
602      * @param delta the position relative to origin. A positive delta means forward;
603      * a negative delta means backward.
604      * @param origin Origin enumeration {kStart, kCurrent, kEnd}
605      * @return the new position
606      * @stable ICU 2.0
607      */
608     virtual int32_t      move32(int32_t delta, EOrigin origin) = 0;
609 
610     /**
611      * Copies the text under iteration into the UnicodeString
612      * referred to by "result".
613      * @param result Receives a copy of the text under iteration.
614      * @stable ICU 2.0
615      */
616     virtual void            getText(UnicodeString&  result) = 0;
617 
618 protected:
619     /**
620      * Empty constructor.
621      * @stable ICU 2.0
622      */
623     CharacterIterator();
624 
625     /**
626      * Constructor, just setting the length field in this base class.
627      * @stable ICU 2.0
628      */
629     CharacterIterator(int32_t length);
630 
631     /**
632      * Constructor, just setting the length and position fields in this base class.
633      * @stable ICU 2.0
634      */
635     CharacterIterator(int32_t length, int32_t position);
636 
637     /**
638      * Constructor, just setting the length, start, end, and position fields in this base class.
639      * @stable ICU 2.0
640      */
641     CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
642 
643     /**
644      * Copy constructor.
645      *
646      * @param that The CharacterIterator to be copied
647      * @stable ICU 2.0
648      */
649     CharacterIterator(const CharacterIterator &that);
650 
651     /**
652      * Assignment operator.  Sets this CharacterIterator to have the same behavior,
653      * as the one passed in.
654      * @param that The CharacterIterator passed in.
655      * @return the newly set CharacterIterator.
656      * @stable ICU 2.0
657      */
658     CharacterIterator &operator=(const CharacterIterator &that);
659 
660     /**
661      * Base class text length field.
662      * Necessary this for correct getText() and hashCode().
663      * @stable ICU 2.0
664      */
665     int32_t textLength;
666 
667     /**
668      * Base class field for the current position.
669      * @stable ICU 2.0
670      */
671     int32_t  pos;
672 
673     /**
674      * Base class field for the start of the iteration range.
675      * @stable ICU 2.0
676      */
677     int32_t  begin;
678 
679     /**
680      * Base class field for the end of the iteration range.
681      * @stable ICU 2.0
682      */
683     int32_t  end;
684 };
685 
686 inline UBool
687 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
688     return !operator==(that);
689 }
690 
691 inline int32_t
setToStart()692 CharacterIterator::setToStart() {
693     return move(0, kStart);
694 }
695 
696 inline int32_t
setToEnd()697 CharacterIterator::setToEnd() {
698     return move(0, kEnd);
699 }
700 
701 inline int32_t
startIndex(void)702 CharacterIterator::startIndex(void) const {
703     return begin;
704 }
705 
706 inline int32_t
endIndex(void)707 CharacterIterator::endIndex(void) const {
708     return end;
709 }
710 
711 inline int32_t
getIndex(void)712 CharacterIterator::getIndex(void) const {
713     return pos;
714 }
715 
716 inline int32_t
getLength(void)717 CharacterIterator::getLength(void) const {
718     return textLength;
719 }
720 
721 U_NAMESPACE_END
722 #endif
723