1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  03/22/2000   helena      Creation.
7 **********************************************************************
8 */
9 
10 #ifndef SEARCH_H
11 #define SEARCH_H
12 
13 #include "unicode/utypes.h"
14 
15 /**
16  * \file
17  * \brief C++ API: SearchIterator object.
18  */
19 
20 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
21 
22 #include "unicode/uobject.h"
23 #include "unicode/unistr.h"
24 #include "unicode/chariter.h"
25 #include "unicode/brkiter.h"
26 #include "unicode/usearch.h"
27 
28 /** Forward declaration of the C search data struct.
29 * @stable ICU 2.0
30 */
31 struct USearch;
32 /** Typedef alias for <tt>struct USearch</tt>.
33 * @stable ICU 2.0
34 */
35 typedef struct USearch USearch;
36 
37 U_NAMESPACE_BEGIN
38 
39 /**
40  *
41  * <tt>SearchIterator</tt> is an abstract base class that provides
42  * methods to search for a pattern within a text string. Instances of
43  * <tt>SearchIterator</tt> maintain a current position and scan over the
44  * target text, returning the indices at which the pattern is matched and the
45  * length of each match.
46  * <p>
47  * <tt>SearchIterator</tt> defines a protocol for text searching.
48  * Subclasses provide concrete implementations of various search algorithms.
49  * For example, <tt>StringSearch</tt> implements language-sensitive pattern
50  * matching based on the comparison rules defined in a
51  * <tt>RuleBasedCollator</tt> object.
52  * <p>
53  * Other options for searching include using a BreakIterator to restrict
54  * the points at which matches are detected.
55  * <p>
56  * <tt>SearchIterator</tt> provides an API that is similar to that of
57  * other text iteration classes such as <tt>BreakIterator</tt>. Using
58  * this class, it is easy to scan through text looking for all occurrences of
59  * a given pattern. The following example uses a <tt>StringSearch</tt>
60  * object to find all instances of "fox" in the target string. Any other
61  * subclass of <tt>SearchIterator</tt> can be used in an identical
62  * manner.
63  * <pre><code>
64  * UnicodeString target("The quick brown fox jumped over the lazy fox");
65  * UnicodeString pattern("fox");
66  *
67  * SearchIterator *iter  = new StringSearch(pattern, target);
68  * UErrorCode      error = U_ZERO_ERROR;
69  * for (int pos = iter->first(error); pos != USEARCH_DONE;
70  *                               pos = iter->next(error)) {
71  *     printf("Found match at %d pos, length is %d\n", pos,
72  *                                             iter->getMatchedLength());
73  * }
74  * </code></pre>
75  *
76  * @see StringSearch
77  * @see RuleBasedCollator
78  */
79 class U_I18N_API SearchIterator : public UObject {
80 
81 public:
82 
83     // public constructors and destructors -------------------------------
84 
85     /**
86     * Copy constructor that creates a SearchIterator instance with the same
87     * behavior, and iterating over the same text.
88     * @param other the SearchIterator instance to be copied.
89     * @stable ICU 2.0
90     */
91     SearchIterator(const SearchIterator &other);
92 
93     /**
94      * Destructor. Cleans up the search iterator data struct.
95      * @stable ICU 2.0
96      */
97     virtual ~SearchIterator();
98 
99     // public get and set methods ----------------------------------------
100 
101     /**
102      * Sets the index to point to the given position, and clears any state
103      * that's affected. Pure virtual: subclasses must implement it.
104      * <p>
105      * This method takes the argument index and sets the position in the text
106      * string accordingly without checking if the index is pointing to a
107      * valid starting point to begin searching.
108      * @param position within the text to be set. If position is less
109      *             than or greater than the text range for searching,
110      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
111      * @param status for errors if any occur
112      * @stable ICU 2.0
113      */
114     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
115 
116     /**
117      * Return the current index in the text being searched.
118      * If the iteration has gone past the end of the text
119      * (or past the beginning for a backwards search), USEARCH_DONE
120      * is returned. Pure virtual: subclasses must implement it.
121      * @return current index in the text being searched.
122      * @stable ICU 2.0
123      */
124     virtual int32_t getOffset(void) const = 0;
125 
126     /**
127     * Sets the text searching attributes located in the enum
128     * USearchAttribute with values from the enum USearchAttributeValue.
129     * USEARCH_DEFAULT can be used for all attributes for resetting.
130     * @param attribute text attribute (enum USearchAttribute) to be set
131     * @param value text attribute value
132     * @param status for errors if any occur
133     * @stable ICU 2.0
134     */
135     void setAttribute(USearchAttribute       attribute,
136                       USearchAttributeValue  value,
137                       UErrorCode            &status);
138 
139     /**
140     * Gets the text searching attributes.
141     * @param attribute text attribute (enum USearchAttribute) to be retrieved
142     * @return text attribute value
143     * @stable ICU 2.0
144     */
145     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
146 
147     /**
148     * Returns the index to the match in the text string that was searched.
149     * This call returns a valid result only after a successful call to
150     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
151     * Just after construction, or after a searching method returns
152     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
153     * <p>
154     * Use <tt>getMatchedLength</tt> to get the matched string length.
155     * @return index of a substring within the text string that is being
156     *         searched.
157     * @see #first
158     * @see #next
159     * @see #previous
160     * @see #last
161     * @stable ICU 2.0
162     */
163     int32_t getMatchedStart(void) const;
164 
165     /**
166      * Returns the length of text in the string which matches the search
167      * pattern. This call returns a valid result only after a successful call
168      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
169      * Just after construction, or after a searching method returns
170      * <tt>USEARCH_DONE</tt>, this method will return 0.
171      * @return The length of the match in the target text, or 0 if there
172      *         is no match currently.
173      * @see #first
174      * @see #next
175      * @see #previous
176      * @see #last
177      * @stable ICU 2.0
178      */
179     int32_t getMatchedLength(void) const;
180 
181     /**
182      * Returns the text that was matched by the most recent call to
183      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
184      * If the iterator is not pointing at a valid match (e.g. just after
185      * construction or after <tt>USEARCH_DONE</tt> has been returned),
186      * returns an empty string.
187      * @param result stores the matched string or an empty string if a match
188      *        is not found.
189      * @see #first
190      * @see #next
191      * @see #previous
192      * @see #last
193      * @stable ICU 2.0
194      */
195     void getMatchedText(UnicodeString &result) const;
196 
197     /**
198      * Set the BreakIterator that will be used to restrict the points
199      * at which matches are detected. The user is responsible for deleting
200      * the BreakIterator.
201      * @param breakiter A BreakIterator that will be used to restrict the
202      *                points at which matches are detected. If a match is
203      *                found, but the match's start or end index is not a
204      *                boundary as determined by the <tt>BreakIterator</tt>,
205      *                the match will be rejected and another will be searched
206      *                for. If this parameter is <tt>NULL</tt>, no break
207      *                detection is attempted.
208      * @param status for errors if any occur
209      * @see BreakIterator
210      * @stable ICU 2.0
211      */
212     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
213 
214     /**
215      * Returns the BreakIterator that is used to restrict the points at
216      * which matches are detected.  This will be the same object that was
217      * passed to the constructor or to <tt>setBreakIterator</tt>.
218      * Note that <tt>NULL</tt> is a legal value; it means that break
219      * detection should not be attempted.
220      * @return BreakIterator used to restrict matchings.
221      * @see #setBreakIterator
222      * @stable ICU 2.0
223      */
224     const BreakIterator * getBreakIterator(void) const;
225 
226     /**
227      * Set the string text to be searched. Text iteration will hence begin at
228      * the start of the text string. This method is useful if you want to
229      * re-use an iterator to search for the same pattern within a different
230      * body of text. The user is responsible for deleting the text.
231      * @param text string to be searched.
232      * @param status for errors. If the text length is 0,
233      *        a U_ILLEGAL_ARGUMENT_ERROR is returned.
234      * @stable ICU 2.0
235      */
236     virtual void setText(const UnicodeString &text, UErrorCode &status);
237 
238     /**
239      * Set the string text to be searched. Text iteration will hence begin at
240      * the start of the text string. This method is useful if you want to
241      * re-use an iterator to search for the same pattern within a different
242      * body of text.
243      * <p>
244      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
245      * will be done during searching for this version. The block of text
246      * in <tt>CharacterIterator</tt> will be used as it is.
247      * The user is responsible for deleting the text.
248      * @param text string iterator to be searched.
249      * @param status for errors if any. If the text length is 0 then a
250      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
251      * @stable ICU 2.0
252      */
253     virtual void setText(CharacterIterator &text, UErrorCode &status);
254 
255     /**
256      * Return the string text to be searched.
257      * @return text string to be searched.
258      * @stable ICU 2.0
259      */
260     const UnicodeString & getText(void) const;
261 
262     // operator overloading ----------------------------------------------
263 
264     /**
265      * Equality operator.
266      * @param that SearchIterator instance to be compared.
267      * @return TRUE if both SearchIterators are of the same class, have the
268      *         same behavior, iterate over the same text and have the same
269      *         attributes. FALSE otherwise.
270      * @stable ICU 2.0
271      */
272     virtual UBool operator==(const SearchIterator &that) const;
273 
274     /**
275      * Not-equal operator.
276      * @param that SearchIterator instance to be compared.
277      * @return FALSE if operator== returns TRUE, and vice versa.
278      * @stable ICU 2.0
279      */
280     UBool operator!=(const SearchIterator &that) const;
281 
282     // public methods ----------------------------------------------------
283 
284     /**
285      * Returns a copy of SearchIterator with the same behavior, and
286      * iterating over the same text, as this one. Note that all data will be
287      * replicated, except for the text string to be searched.
288      * @return cloned object
289      * @stable ICU 2.0
290      */
291     virtual SearchIterator* safeClone(void) const = 0;
292 
293     /**
294      * Returns the first index at which the string text matches the search
295      * pattern. The iterator is adjusted so that its current index (as
296      * returned by <tt>getOffset</tt>) is the match position if one
297      * was found.
298      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
299      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
300      * @param  status for errors if any occur
301      * @return The character index of the first match, or
302      *         <tt>USEARCH_DONE</tt> if there are no matches.
303      * @see #getOffset
304      * @stable ICU 2.0
305      */
306     int32_t first(UErrorCode &status);
307 
308     /**
309      * Returns the first index equal or greater than <tt>position</tt> at which the
310      * string text matches the search pattern. The iterator is adjusted so
311      * that its current index (as returned by <tt>getOffset</tt>) is the
312      * match position if one was found.
313      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
314      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
315      * @param  position where search is to start from. If position is less
316      *             than or greater than the text range for searching,
317      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
318      * @param  status for errors if any occur
319      * @return The character index of the first match following
320      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
321      *         matches.
322      * @see #getOffset
323      * @stable ICU 2.0
324      */
325     int32_t following(int32_t position, UErrorCode &status);
326 
327     /**
328      * Returns the last index in the target text at which it matches the
329      * search pattern. The iterator is adjusted so that its current index
330      * (as returned by <tt>getOffset</tt>) is the match position if one was
331      * found.
332      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
333      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
334      * @param  status for errors if any occur
335      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
336      *         there are no matches.
337      * @see #getOffset
338      * @stable ICU 2.0
339      */
340     int32_t last(UErrorCode &status);
341 
342     /**
343      * Returns the first index less than <tt>position</tt> at which the string
344      * text matches the search pattern. The iterator is adjusted so that its
345      * current index (as returned by <tt>getOffset</tt>) is the match
346      * position if one was found. If a match is not found,
347      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
348      * adjusted to the index <tt>USEARCH_DONE</tt>.
349      * <p>
350      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
351      * result match is always less than <tt>position</tt>.
352      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
353      * <tt>position</tt>.
354      *
355      * @param  position where search is to start from. If position is less
356      *             than or greater than the text range for searching,
357      *             a U_INDEX_OUTOFBOUNDS_ERROR will be returned.
358      * @param  status for errors if any occur
359      * @return The character index of the first match preceding
360      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
361      *         no matches.
362      * @see #getOffset
363      * @stable ICU 2.0
364      */
365     int32_t preceding(int32_t position, UErrorCode &status);
366 
367     /**
368      * Returns the index of the next point at which the text matches the
369      * search pattern, starting from the current position.
370      * The iterator is adjusted so that its current index (as returned by
371      * <tt>getOffset</tt>) is the match position if one was found.
372      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
373      * the iterator will be adjusted to a position after the end of the text
374      * string.
375      * @param  status for errors if any occur
376      * @return The index of the next match after the current position,
377      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
378      * @see #getOffset
379      * @stable ICU 2.0
380      */
381      int32_t next(UErrorCode &status);
382 
383     /**
384      * Returns the index of the previous point at which the string text
385      * matches the search pattern, starting at the current position.
386      * The iterator is adjusted so that its current index (as returned by
387      * <tt>getOffset</tt>) is the match position if one was found.
388      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
389      * the iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
390      * @param  status for errors if any occur
391      * @return The index of the previous match before the current position,
392      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
393      * @see #getOffset
394      * @stable ICU 2.0
395      */
396     int32_t previous(UErrorCode &status);
397 
398     /**
399     * Resets the iteration.
400     * Search will begin at the start of the text string if a forward
401     * iteration is initiated before a backwards iteration. Otherwise if a
402     * backwards iteration is initiated before a forwards iteration, the
403     * search will begin at the end of the text string.
404     * @stable ICU 2.0
405     */
406     virtual void reset();
407 
408 protected:
409     // protected data members ---------------------------------------------
410 
411     /**
412     * C search data struct.
413     * @stable ICU 2.0
414     */
415     USearch *m_search_;
416 
417     /**
418     * Break iterator.
419     * Currently the C++ BreakIterator does not have getRules etc. to reproduce
420     * another in C. Hence we keep the original around and do the verification
421     * at the end of the match. The user is responsible for deleting this
422     * break iterator.
423     * @stable ICU 2.0
424     */
425     BreakIterator *m_breakiterator_;
426 
427     /**
428     * Unicode string version of the search text.
429     * @stable ICU 2.0
430     */
431     UnicodeString  m_text_;
432 
433     // protected constructors and destructors -----------------------------
434 
435     /**
436     * Default constructor.
437     * Initializes data to the default values.
438     * @stable ICU 2.0
439     */
440     SearchIterator();
441 
442     /**
443      * Constructor for use by subclasses.
444      * @param text The target text to be searched.
445      * @param breakiter A {@link BreakIterator} that is used to restrict the
446      *                points at which matches are detected. If
447      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
448      *                match, but the match's start or end index is not a
449      *                boundary as determined by the <tt>BreakIterator</tt>,
450      *                the match is rejected and <tt>handleNext</tt> or
451      *                <tt>handlePrev</tt> is called again. If this parameter
452      *                is <tt>NULL</tt>, no break detection is attempted.
453      * @see #handleNext
454      * @see #handlePrev
455      * @stable ICU 2.0
456      */
457     SearchIterator(const UnicodeString &text,
458                          BreakIterator *breakiter = NULL);
459 
460     /**
461      * Constructor for use by subclasses.
462      * <p>
463      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
464      * will be done during searching for this version. The block of text
465      * in <tt>CharacterIterator</tt> will be used as it is.
466      * @param text The target text to be searched.
467      * @param breakiter A {@link BreakIterator} that is used to restrict the
468      *                points at which matches are detected. If
469      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
470      *                match, but the match's start or end index is not a
471      *                boundary as determined by the <tt>BreakIterator</tt>,
472      *                the match is rejected and <tt>handleNext</tt> or
473      *                <tt>handlePrev</tt> is called again. If this parameter
474      *                is <tt>NULL</tt>, no break detection is attempted.
475      * @see #handleNext
476      * @see #handlePrev
477      * @stable ICU 2.0
478      */
479     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
480 
481     // protected methods --------------------------------------------------
482 
483     /**
484      * Assignment operator. Sets this iterator to have the same behavior,
485      * and iterate over the same text, as the one passed in.
486      * @param that instance to be copied.
487      * @stable ICU 2.0
488      */
489     SearchIterator & operator=(const SearchIterator &that);
490 
491     /**
492      * Abstract method which subclasses override to provide the mechanism
493      * for finding the next match in the target text. This allows different
494      * subclasses to provide different search algorithms.
495      * <p>
496      * If a match is found, the implementation should return the index at
497      * which the match starts and should call
498      * <tt>setMatchLength</tt> with the number of characters
499      * in the target text that make up the match. If no match is found, the
500      * method should return <tt>USEARCH_DONE</tt>.
501      * <p>
502      * @param position The index in the target text at which the search
503      *                 should start.
504      * @param status for error codes if any occur.
505      * @return index at which the match starts, else if match is not found
506      *         <tt>USEARCH_DONE</tt> is returned
507      * @see #setMatchLength
508      * @stable ICU 2.0
509      */
510     virtual int32_t handleNext(int32_t position, UErrorCode &status)
511                                                                          = 0;
512 
513     /**
514      * Abstract method which subclasses override to provide the mechanism for
515      * finding the previous match in the target text. This allows different
516      * subclasses to provide different search algorithms.
517      * <p>
518      * If a match is found, the implementation should return the index at
519      * which the match starts and should call
520      * <tt>setMatchLength</tt> with the number of characters
521      * in the target text that make up the match. If no match is found, the
522      * method should return <tt>USEARCH_DONE</tt>.
523      * <p>
524      * @param position The index in the target text at which the search
525      *                 should start.
526      * @param status for error codes if any occur.
527      * @return index at which the match starts, else if match is not found
528      *         <tt>USEARCH_DONE</tt> is returned
529      * @see #setMatchLength
530      * @stable ICU 2.0
531      */
532      virtual int32_t handlePrev(int32_t position, UErrorCode &status)
533                                                                          = 0;
534 
535     /**
536      * Sets the length of the currently matched string in the text string to
537      * be searched.
538      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
539      * methods should call this when they find a match in the target text.
540      * @param length length of the matched text.
541      * @see #handleNext
542      * @see #handlePrev
543      * @stable ICU 2.0
544      */
545     virtual void setMatchLength(int32_t length);
546 
547     /**
548      * Sets the offset of the currently matched string in the text string to
549      * be searched.
550      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
551      * methods should call this when they find a match in the target text.
552      * @param position start offset of the matched text.
553      * @see #handleNext
554      * @see #handlePrev
555      * @stable ICU 2.0
556      */
557     virtual void setMatchStart(int32_t position);
558 
559     /**
560     * Sets match not found.
561     * @stable ICU 2.0
562     */
563     void setMatchNotFound();
564 };
565 
566 inline UBool SearchIterator::operator!=(const SearchIterator &that) const
567 {   // Inequality is defined purely as the negation of operator==.
568    return !(this->operator==(that));
569 }
570 U_NAMESPACE_END
571 
572 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
573 
574 #endif
575 
576