1 /*
2 ********************************************************************
3 *
4 * Copyright (C) 1997-2011, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ********************************************************************
8 */
9
10 #ifndef CHARITER_H
11 #define CHARITER_H
12
13 #include "unicode/utypes.h"
14 #include "unicode/uobject.h"
15 #include "unicode/unistr.h"
16 /**
17 * \file
18 * \brief C++ API: Character Iterator
19 */
20
21 U_NAMESPACE_BEGIN
22 /**
23 * Abstract class that defines an API for forward-only iteration
24 * on text objects.
25 * This is a minimal interface for iteration without random access
26 * or backwards iteration. It is especially useful for wrapping
27 * streams with converters into an object for collation or
28 * normalization.
29 *
30 * <p>Characters can be accessed in two ways: as code units or as
31 * code points.
32 * Unicode code points are 21-bit integers and are the scalar values
33 * of Unicode characters. ICU uses the type UChar32 for them.
34 * Unicode code units are the storage units of a given
35 * Unicode/UCS Transformation Format (a character encoding scheme).
36 * With UTF-16, all code points can be represented with either one
37 * or two code units ("surrogates").
38 * String storage is typically based on code units, while properties
39 * of characters are typically determined using code point values.
40 * Some processes may be designed to work with sequences of code units,
41 * or it may be known that all characters that are important to an
42 * algorithm can be represented with single code units.
43 * Other processes will need to use the code point access functions.</p>
44 *
45 * <p>ForwardCharacterIterator provides nextPostInc() to access
46 * a code unit and advance an internal position into the text object,
47 * similar to a <code>return text[position++]</code>.<br>
48 * It provides next32PostInc() to access a code point and advance an internal
49 * position.</p>
50 *
51 * <p>next32PostInc() assumes that the current position is that of
52 * the beginning of a code point, i.e., of its first code unit.
53 * After next32PostInc(), this will be true again.
54 * In general, access to code units and code points in the same
55 * iteration loop should not be mixed. In UTF-16, if the current position
56 * is on a second code unit (Low Surrogate), then only that code unit
57 * is returned even by next32PostInc().</p>
58 *
59 * <p>For iteration with either function, there are two ways to
60 * check for the end of the iteration. When there are no more
61 * characters in the text object:
62 * <ul>
63 * <li>The hasNext() function returns FALSE.</li>
64 * <li>nextPostInc() and next32PostInc() return DONE
65 * when one attempts to read beyond the end of the text object.</li>
66 * </ul>
67 *
68 * Example:
69 * \code
70 * void function1(ForwardCharacterIterator &it) {
71 * UChar32 c;
72 * while(it.hasNext()) {
73 * c=it.next32PostInc();
74 * // use c
75 * }
76 * }
77 *
78 * void function2(ForwardCharacterIterator &it) {
79 * UChar c;
80 * while((c=it.nextPostInc())!=ForwardCharacterIterator::DONE) {
81 * // use c
82 * }
83 * }
84 * \endcode
85 * </p>
86 *
87 * @stable ICU 2.0
88 */
89 class U_COMMON_API ForwardCharacterIterator : public UObject {
90 public:
91 /**
92 * Value returned by most of ForwardCharacterIterator's functions
93 * when the iterator has reached the limits of its iteration.
94 * @stable ICU 2.0
95 */
96 enum { DONE = 0xffff };
97
98 /**
99 * Destructor.
100 * @stable ICU 2.0
101 */
102 virtual ~ForwardCharacterIterator();
103
104 /**
105 * Returns true when both iterators refer to the same
106 * character in the same character-storage object.
107 * @param that The ForwardCharacterIterator to be compared for equality
108 * @return true when both iterators refer to the same
109 * character in the same character-storage object
110 * @stable ICU 2.0
111 */
112 virtual UBool operator==(const ForwardCharacterIterator& that) const = 0;
113
114 /**
115 * Returns true when the iterators refer to different
116 * text-storage objects, or to different characters in the
117 * same text-storage object.
118 * @param that The ForwardCharacterIterator to be compared for inequality
119 * @return true when the iterators refer to different
120 * text-storage objects, or to different characters in the
121 * same text-storage object
122 * @stable ICU 2.0
123 */
124 inline UBool operator!=(const ForwardCharacterIterator& that) const;
125
126 /**
127 * Generates a hash code for this iterator.
128 * @return the hash code.
129 * @stable ICU 2.0
130 */
131 virtual int32_t hashCode(void) const = 0;
132
133 /**
134 * Returns a UClassID for this ForwardCharacterIterator ("poor man's
135 * RTTI").<P> Despite the fact that this function is public,
136 * DO NOT CONSIDER IT PART OF FORWARDCHARACTERITERATOR'S API!
137 * @return a UClassID for this ForwardCharacterIterator
138 * @stable ICU 2.0
139 */
140 virtual UClassID getDynamicClassID(void) const = 0;
141
142 /**
143 * Gets the current code unit for returning and advances to the next code unit
144 * in the iteration range
145 * (toward endIndex()). If there are
146 * no more code units to return, returns DONE.
147 * @return the current code unit.
148 * @stable ICU 2.0
149 */
150 virtual UChar nextPostInc(void) = 0;
151
152 /**
153 * Gets the current code point for returning and advances to the next code point
154 * in the iteration range
155 * (toward endIndex()). If there are
156 * no more code points to return, returns DONE.
157 * @return the current code point.
158 * @stable ICU 2.0
159 */
160 virtual UChar32 next32PostInc(void) = 0;
161
162 /**
163 * Returns FALSE if there are no more code units or code points
164 * at or after the current position in the iteration range.
165 * This is used with nextPostInc() or next32PostInc() in forward
166 * iteration.
167 * @return FALSE if there are no more code units or code points
168 * at or after the current position in the iteration range.
169 * @stable ICU 2.0
170 */
171 virtual UBool hasNext() = 0;
172
173 protected:
174 /** Default constructor to be overridden in the implementing class. @stable ICU 2.0*/
175 ForwardCharacterIterator();
176
177 /** Copy constructor to be overridden in the implementing class. @stable ICU 2.0*/
178 ForwardCharacterIterator(const ForwardCharacterIterator &other);
179
180 /**
181 * Assignment operator to be overridden in the implementing class. The base implementation here copies nothing and returns *this.
182 * @stable ICU 2.0
183 */
184 ForwardCharacterIterator &operator=(const ForwardCharacterIterator&) { return *this; }
185 };
186
187 /**
188 * Abstract class that defines an API for iteration
189 * on text objects.
190 * This is an interface for forward and backward iteration
191 * and random access into a text object.
192 *
193 * <p>The API provides backward compatibility to the Java and older ICU
194 * CharacterIterator classes but extends them significantly:
195 * <ol>
196 * <li>CharacterIterator is now a subclass of ForwardCharacterIterator.</li>
197 * <li>While the old API functions provided forward iteration with
198 * "pre-increment" semantics, the new one also provides functions
199 * with "post-increment" semantics. They are more efficient and should
200 * be the preferred iterator functions for new implementations.
201 * The backward iteration always had "pre-decrement" semantics, which
202 * are efficient.</li>
203 * <li>Just like ForwardCharacterIterator, it provides access to
204 * both code units and code points. Code point access versions are available
205 * for the old and the new iteration semantics.</li>
206 * <li>There are new functions for setting and moving the current position
207 * without returning a character, for efficiency.</li>
208 * </ol>
209 *
210 * See ForwardCharacterIterator for examples for using the new forward iteration
211 * functions. For backward iteration, there is also a hasPrevious() function
212 * that can be used analogously to hasNext().
213 * The old functions work as before and are shown below.</p>
214 *
215 * <p>Examples for some of the new functions:</p>
216 *
217 * Forward iteration with hasNext():
218 * \code
219 * void forward1(CharacterIterator &it) {
220 * UChar32 c;
221 * for(it.setToStart(); it.hasNext();) {
222 * c=it.next32PostInc();
223 * // use c
224 * }
225 * }
226 * \endcode
227 * Forward iteration more similar to loops with the old forward iteration,
228 * showing a way to convert simple for() loops:
229 * \code
230 * void forward2(CharacterIterator &it) {
231 * UChar c;
232 * for(c=it.firstPostInc(); c!=CharacterIterator::DONE; c=it.nextPostInc()) {
233 * // use c
234 * }
235 * }
236 * \endcode
237 * Backward iteration with setToEnd() and hasPrevious():
238 * \code
239 * void backward1(CharacterIterator &it) {
240 * UChar32 c;
241 * for(it.setToEnd(); it.hasPrevious();) {
242 * c=it.previous32();
243 * // use c
244 * }
245 * }
246 * \endcode
247 * Backward iteration with a more traditional for() loop:
248 * \code
249 * void backward2(CharacterIterator &it) {
250 * UChar c;
251 * for(c=it.last(); c!=CharacterIterator::DONE; c=it.previous()) {
252 * // use c
253 * }
254 * }
255 * \endcode
256 *
257 * Example for random access:
258 * \code
259 * void random(CharacterIterator &it) {
260 * // set to the third code point from the beginning
261 * it.move32(3, CharacterIterator::kStart);
262 * // get a code point from here without moving the position
263 * UChar32 c=it.current32();
264 * // get the position
265 * int32_t pos=it.getIndex();
266 * // get the previous code unit
267 * UChar u=it.previous();
268 * // move back one more code unit
269 * it.move(-1, CharacterIterator::kCurrent);
270 * // set the position back to where it was
271 * // and read the same code point c and move beyond it
272 * it.setIndex(pos);
273 * if(c!=it.next32PostInc()) {
274 * exit(1); // CharacterIterator inconsistent
275 * }
276 * }
277 * \endcode
278 *
279 * <p>Examples, especially for the old API:</p>
280 *
281 * Function processing characters, in this example simple output
282 * <pre>
283 * \code
284 * void processChar( UChar c )
285 * {
286 * cout << " " << c;
287 * }
288 * \endcode
289 * </pre>
290 * Traverse the text from start to finish
291 * <pre>
292 * \code
293 * void traverseForward(CharacterIterator& iter)
294 * {
295 * for(UChar c = iter.first(); c != CharacterIterator::DONE; c = iter.next()) {
296 * processChar(c);
297 * }
298 * }
299 * \endcode
300 * </pre>
301 * Traverse the text backwards, from end to start
302 * <pre>
303 * \code
304 * void traverseBackward(CharacterIterator& iter)
305 * {
306 * for(UChar c = iter.last(); c != CharacterIterator::DONE; c = iter.previous()) {
307 * processChar(c);
308 * }
309 * }
310 * \endcode
311 * </pre>
312 * Traverse both forward and backward from a given position in the text.
313 * The calls to Unicode::isLetter() and Unicode::isDigit() in this example represent some additional stopping criteria.
314 * <pre>
315 * \code
316 * void traverseOut(CharacterIterator& iter, int32_t pos)
317 * {
318 * UChar c;
319 * for (c = iter.setIndex(pos);
320 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
321 * c = iter.next()) {}
322 * int32_t end = iter.getIndex();
323 * for (c = iter.setIndex(pos);
324 * c != CharacterIterator::DONE && (Unicode::isLetter(c) || Unicode::isDigit(c));
325 * c = iter.previous()) {}
326 * int32_t start = iter.getIndex() + 1;
327 *
328 * cout << "start: " << start << " end: " << end << endl;
329 * for (c = iter.setIndex(start); iter.getIndex() < end; c = iter.next() ) {
330 * processChar(c);
331 * }
332 * }
333 * \endcode
334 * </pre>
335 * Creating a StringCharacterIterator and calling the test functions
336 * <pre>
337 * \code
338 * void CharacterIterator_Example( void )
339 * {
340 * cout << endl << "===== CharacterIterator_Example: =====" << endl;
341 * UnicodeString text("Ein kleiner Satz.");
342 * StringCharacterIterator iterator(text);
343 * cout << "----- traverseForward: -----------" << endl;
344 * traverseForward( iterator );
345 * cout << endl << endl << "----- traverseBackward: ----------" << endl;
346 * traverseBackward( iterator );
347 * cout << endl << endl << "----- traverseOut: ---------------" << endl;
348 * traverseOut( iterator, 7 );
349 * cout << endl << endl << "-----" << endl;
350 * }
351 * \endcode
352 * </pre>
353 *
354 * @stable ICU 2.0
355 */
356 class U_COMMON_API CharacterIterator : public ForwardCharacterIterator {
357 public:
358 /**
359 * Origin enumeration for the move() and move32() functions.
360 * @stable ICU 2.0
361 */
362 enum EOrigin { kStart, kCurrent, kEnd };
363
364 /**
365 * Destructor.
366 * @stable ICU 2.0
367 */
368 virtual ~CharacterIterator();
369
370 /**
371 * Returns a pointer to a new CharacterIterator of the same
372 * concrete class as this one, and referring to the same
373 * character in the same text-storage object as this one. The
374 * caller is responsible for deleting the new clone.
375 * @return a pointer to a new CharacterIterator
376 * @stable ICU 2.0
377 */
378 virtual CharacterIterator* clone(void) const = 0;
379
380 /**
381 * Sets the iterator to refer to the first code unit in its
382 * iteration range, and returns that code unit.
383 * This can be used to begin an iteration with next().
384 * @return the first code unit in its iteration range.
385 * @stable ICU 2.0
386 */
387 virtual UChar first(void) = 0;
388
389 /**
390 * Sets the iterator to refer to the first code unit in its
391 * iteration range, returns that code unit, and moves the position
392 * to the second code unit. This is an alternative to setToStart()
393 * for forward iteration with nextPostInc().
394 * @return the first code unit in its iteration range.
395 * @stable ICU 2.0
396 */
397 virtual UChar firstPostInc(void);
398
399 /**
400 * Sets the iterator to refer to the first code point in its
401 * iteration range, and returns that code point.
402 * This can be used to begin an iteration with next32().
403 * Note that an iteration with next32PostInc(), beginning with,
404 * e.g., setToStart() or first32PostInc(), is more efficient.
405 * @return the first code point in its iteration range.
406 * @stable ICU 2.0
407 */
408 virtual UChar32 first32(void) = 0;
409
410 /**
411 * Sets the iterator to refer to the first code point in its
412 * iteration range, returns that code point, and moves the position
413 * to the second code point. This is an alternative to setToStart()
414 * for forward iteration with next32PostInc().
415 * @return the first code point in its iteration range.
416 * @stable ICU 2.0
417 */
418 virtual UChar32 first32PostInc(void);
419
420 /**
421 * Sets the iterator to refer to the first code unit or code point in its
422 * iteration range. This can be used to begin a forward
423 * iteration with nextPostInc() or next32PostInc().
424 * @return the start position of the iteration range
425 * @stable ICU 2.0
426 */
427 inline int32_t setToStart();
428
429 /**
430 * Sets the iterator to refer to the last code unit in its
431 * iteration range, and returns that code unit.
432 * This can be used to begin an iteration with previous().
433 * @return the last code unit.
434 * @stable ICU 2.0
435 */
436 virtual UChar last(void) = 0;
437
438 /**
439 * Sets the iterator to refer to the last code point in its
440 * iteration range, and returns that code point.
441 * This can be used to begin an iteration with previous32().
442 * @return the last code point.
443 * @stable ICU 2.0
444 */
445 virtual UChar32 last32(void) = 0;
446
447 /**
448 * Sets the iterator to the end of its iteration range, just behind
449 * the last code unit or code point. This can be used to begin a backward
450 * iteration with previous() or previous32().
451 * @return the end position of the iteration range
452 * @stable ICU 2.0
453 */
454 inline int32_t setToEnd();
455
456 /**
457 * Sets the iterator to refer to the "position"-th code unit
458 * in the text-storage object the iterator refers to, and
459 * returns that code unit.
460 * @param position the "position"-th code unit in the text-storage object
461 * @return the "position"-th code unit.
462 * @stable ICU 2.0
463 */
464 virtual UChar setIndex(int32_t position) = 0;
465
466 /**
467 * Sets the iterator to refer to the beginning of the code point
468 * that contains the "position"-th code unit
469 * in the text-storage object the iterator refers to, and
470 * returns that code point.
471 * The current position is adjusted to the beginning of the code point
472 * (its first code unit).
473 * @param position the "position"-th code unit in the text-storage object
474 * @return the code point that contains the "position"-th code unit.
475 * @stable ICU 2.0
476 */
477 virtual UChar32 setIndex32(int32_t position) = 0;
478
479 /**
480 * Returns the code unit the iterator currently refers to.
481 * @return the current code unit.
482 * @stable ICU 2.0
483 */
484 virtual UChar current(void) const = 0;
485
486 /**
487 * Returns the code point the iterator currently refers to.
488 * @return the current code point.
489 * @stable ICU 2.0
490 */
491 virtual UChar32 current32(void) const = 0;
492
493 /**
494 * Advances to the next code unit in the iteration range
495 * (toward endIndex()), and returns that code unit. If there are
496 * no more code units to return, returns DONE.
497 * @return the next code unit.
498 * @stable ICU 2.0
499 */
500 virtual UChar next(void) = 0;
501
502 /**
503 * Advances to the next code point in the iteration range
504 * (toward endIndex()), and returns that code point. If there are
505 * no more code points to return, returns DONE.
506 * Note that iteration with "pre-increment" semantics is less
507 * efficient than iteration with "post-increment" semantics
508 * that is provided by next32PostInc().
509 * @return the next code point.
510 * @stable ICU 2.0
511 */
512 virtual UChar32 next32(void) = 0;
513
514 /**
515 * Advances to the previous code unit in the iteration range
516 * (toward startIndex()), and returns that code unit. If there are
517 * no more code units to return, returns DONE.
518 * @return the previous code unit.
519 * @stable ICU 2.0
520 */
521 virtual UChar previous(void) = 0;
522
523 /**
524 * Advances to the previous code point in the iteration range
525 * (toward startIndex()), and returns that code point. If there are
526 * no more code points to return, returns DONE.
527 * @return the previous code point.
528 * @stable ICU 2.0
529 */
530 virtual UChar32 previous32(void) = 0;
531
532 /**
533 * Returns FALSE if there are no more code units or code points
534 * before the current position in the iteration range.
535 * This is used with previous() or previous32() in backward
536 * iteration.
537 * @return FALSE if there are no more code units or code points
538 * before the current position in the iteration range, return TRUE otherwise.
539 * @stable ICU 2.0
540 */
541 virtual UBool hasPrevious() = 0;
542
543 /**
544 * Returns the numeric index in the underlying text-storage
545 * object of the character returned by first(). Since it's
546 * possible to create an iterator that iterates across only
547 * part of a text-storage object, this number isn't
548 * necessarily 0.
549 * @return the numeric index in the underlying text-storage
550 * object of the character returned by first().
551 * @stable ICU 2.0
552 */
553 inline int32_t startIndex(void) const;
554
555 /**
556 * Returns the numeric index in the underlying text-storage
557 * object of the position immediately BEYOND the character
558 * returned by last().
559 * @return the numeric index in the underlying text-storage
560 * object of the position immediately BEYOND the character
561 * returned by last().
562 * @stable ICU 2.0
563 */
564 inline int32_t endIndex(void) const;
565
566 /**
567 * Returns the numeric index in the underlying text-storage
568 * object of the character the iterator currently refers to
569 * (i.e., the character returned by current()).
570 * @return the numeric index in the text-storage object of
571 * the character the iterator currently refers to
572 * @stable ICU 2.0
573 */
574 inline int32_t getIndex(void) const;
575
576 /**
577 * Returns the length of the entire text in the underlying
578 * text-storage object.
579 * @return the length of the entire text in the text-storage object
580 * @stable ICU 2.0
581 */
582 inline int32_t getLength() const;
583
584 /**
585 * Moves the current position relative to the start or end of the
586 * iteration range, or relative to the current position itself.
587 * The movement is expressed in numbers of code units forward
588 * or backward by specifying a positive or negative delta.
589 * @param delta the position relative to origin. A positive delta means forward;
590 * a negative delta means backward.
591 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
592 * @return the new position
593 * @stable ICU 2.0
594 */
595 virtual int32_t move(int32_t delta, EOrigin origin) = 0;
596
597 /**
598 * Moves the current position relative to the start or end of the
599 * iteration range, or relative to the current position itself.
600 * The movement is expressed in numbers of code points forward
601 * or backward by specifying a positive or negative delta.
602 * @param delta the position relative to origin. A positive delta means forward;
603 * a negative delta means backward.
604 * @param origin Origin enumeration {kStart, kCurrent, kEnd}
605 * @return the new position
606 * @stable ICU 2.0
607 */
608 virtual int32_t move32(int32_t delta, EOrigin origin) = 0;
609
610 /**
611 * Copies the text under iteration into the UnicodeString
612 * referred to by "result".
613 * @param result Receives a copy of the text under iteration.
614 * @stable ICU 2.0
615 */
616 virtual void getText(UnicodeString& result) = 0;
617
618 protected:
619 /**
620 * Empty constructor.
621 * @stable ICU 2.0
622 */
623 CharacterIterator();
624
625 /**
626 * Constructor, just setting the length field in this base class.
627 * @stable ICU 2.0
628 */
629 CharacterIterator(int32_t length);
630
631 /**
632 * Constructor, just setting the length and position fields in this base class.
633 * @stable ICU 2.0
634 */
635 CharacterIterator(int32_t length, int32_t position);
636
637 /**
638 * Constructor, just setting the length, start, end, and position fields in this base class.
639 * @stable ICU 2.0
640 */
641 CharacterIterator(int32_t length, int32_t textBegin, int32_t textEnd, int32_t position);
642
643 /**
644 * Copy constructor.
645 *
646 * @param that The CharacterIterator to be copied
647 * @stable ICU 2.0
648 */
649 CharacterIterator(const CharacterIterator &that);
650
651 /**
652 * Assignment operator. Sets this CharacterIterator to have the same behavior
653 * as the one passed in.
654 * @param that The CharacterIterator passed in.
655 * @return the newly set CharacterIterator.
656 * @stable ICU 2.0
657 */
658 CharacterIterator &operator=(const CharacterIterator &that);
659
660 /**
661 * Base class text length field.
662 * This is necessary for correct getText() and hashCode().
663 * @stable ICU 2.0
664 */
665 int32_t textLength;
666
667 /**
668 * Base class field for the current position.
669 * @stable ICU 2.0
670 */
671 int32_t pos;
672
673 /**
674 * Base class field for the start of the iteration range.
675 * @stable ICU 2.0
676 */
677 int32_t begin;
678
679 /**
680 * Base class field for the end of the iteration range.
681 * @stable ICU 2.0
682 */
683 int32_t end;
684 };
685
686 inline UBool
687 ForwardCharacterIterator::operator!=(const ForwardCharacterIterator& that) const {
688 return !this->operator==(that); // inequality is simply the negation of the virtual operator==
689 }
690
// Moves the iterator to the start of its iteration range and returns that position.
691 inline int32_t
692 CharacterIterator::setToStart() {
693 return move(0, kStart); // zero offset from kStart; move() returns the new position
694 }
695
// Moves the iterator to the end of its iteration range and returns that position.
696 inline int32_t
697 CharacterIterator::setToEnd() {
698 return move(0, kEnd); // zero offset from kEnd; move() returns the new position
699 }
700
// Returns the base-class `begin` field, i.e. the start of the iteration range.
701 inline int32_t
702 CharacterIterator::startIndex(void) const {
703 return begin;
704 }
705
// Returns the base-class `end` field, i.e. the end of the iteration range.
706 inline int32_t
707 CharacterIterator::endIndex(void) const {
708 return end;
709 }
710
// Returns the base-class `pos` field, i.e. the current position.
711 inline int32_t
712 CharacterIterator::getIndex(void) const {
713 return pos;
714 }
715
// Returns the base-class `textLength` field, i.e. the stored text length.
716 inline int32_t
717 CharacterIterator::getLength(void) const {
718 return textLength;
719 }
720
721 U_NAMESPACE_END
722 #endif
723