1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  03/22/2000   helena      Creation.
9 **********************************************************************
10 */
11 
12 #ifndef SEARCH_H
13 #define SEARCH_H
14 
15 #include "unicode/utypes.h"
16 
17 #if U_SHOW_CPLUSPLUS_API
18 
19 /**
20  * \file
21  * \brief C++ API: SearchIterator object.
22  */
23 
24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
25 
26 #include "unicode/uobject.h"
27 #include "unicode/unistr.h"
28 #include "unicode/chariter.h"
29 #include "unicode/brkiter.h"
30 #include "unicode/usearch.h"
31 
32 /** Forward declaration of the C search data struct (<tt>USearch</tt>).
33 * @stable ICU 2.0
34 */
35 struct USearch;
36 /** Typedef so that <tt>USearch</tt> can be named without the <tt>struct</tt> keyword.
37 * @stable ICU 2.0
38 */
39 typedef struct USearch USearch;
40 
41 U_NAMESPACE_BEGIN
42 
43 /**
44  *
45  * <tt>SearchIterator</tt> is an abstract base class that provides
46  * methods to search for a pattern within a text string. Instances of
47  * <tt>SearchIterator</tt> maintain a current position and scan over the
48  * target text, returning the indices at which the pattern is matched and the length
49  * of each match.
50  * <p>
51  * <tt>SearchIterator</tt> defines a protocol for text searching.
52  * Subclasses provide concrete implementations of various search algorithms.
53  * For example, <tt>StringSearch</tt> implements language-sensitive pattern
54  * matching based on the comparison rules defined in a
55  * <tt>RuleBasedCollator</tt> object.
56  * <p>
57  * Other options for searching include using a BreakIterator to restrict
58  * the points at which matches are detected.
59  * <p>
60  * <tt>SearchIterator</tt> provides an API that is similar to that of
61  * other text iteration classes such as <tt>BreakIterator</tt>. Using
62  * this class, it is easy to scan through text looking for all occurrences of
63  * a given pattern. The following example uses a <tt>StringSearch</tt>
64  * object to find all instances of "fox" in the target string. Any other
65  * subclass of <tt>SearchIterator</tt> can be used in an identical
66  * manner.
67  * <pre><code>
68  * UnicodeString target("The quick brown fox jumped over the lazy fox");
69  * UnicodeString pattern("fox");
70  *
71  * SearchIterator *iter  = new StringSearch(pattern, target);
72  * UErrorCode      error = U_ZERO_ERROR;
73  * for (int pos = iter->first(error); pos != USEARCH_DONE;
74  *                               pos = iter->next(error)) {
75  *     printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength());
76  * }
77  * </code></pre>
78  *
79  * @see StringSearch
80  * @see RuleBasedCollator
81  */
82 class U_I18N_API SearchIterator : public UObject {
83 
84 public:
85 
86     // public constructors and destructors -------------------------------
87 
88     /**
89     * Copy constructor that creates a SearchIterator instance with the same
90     * behavior, and iterating over the same text.
91     * @param other the SearchIterator instance to be copied.
92     * @stable ICU 2.0
93     */
94     SearchIterator(const SearchIterator &other);
95 
96     /**
97      * Destructor. Cleans up the search iterator data struct.
98      * @stable ICU 2.0
99      */
100     virtual ~SearchIterator();
101 
102     // public get and set methods ----------------------------------------
103 
104     /**
105      * Sets the index to point to the given position, and clears any state
106      * that's affected.
107      * <p>
108      * This method takes the argument index and sets the position in the text
109      * string accordingly without checking if the index is pointing to a
110      * valid starting point to begin searching.
111      * @param position within the text to be set. If position is less
112      *             than or greater than the text range for searching,
113      *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
114      * @param status for errors if it occurs
115      * @stable ICU 2.0
116      */
117     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
118 
119     /**
120      * Return the current index in the text being searched.
121      * If the iteration has gone past the end of the text
122      * (or past the beginning for a backwards search), USEARCH_DONE
123      * is returned.
124      * @return current index in the text being searched.
125      * @stable ICU 2.0
126      */
127     virtual int32_t getOffset(void) const = 0;
128 
129     /**
130     * Sets the text searching attributes located in the enum
131     * USearchAttribute with values from the enum USearchAttributeValue.
132     * USEARCH_DEFAULT can be used for all attributes for resetting.
133     * @param attribute text attribute (enum USearchAttribute) to be set
134     * @param value text attribute value
135     * @param status for errors if it occurs
136     * @stable ICU 2.0
137     */
138     void setAttribute(USearchAttribute       attribute,
139                       USearchAttributeValue  value,
140                       UErrorCode            &status);
141 
142     /**
143     * Gets the text searching attributes
144     * @param attribute text attribute (enum USearchAttribute) to be retrieved
145     * @return text attribute value
146     * @stable ICU 2.0
147     */
148     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
149 
150     /**
151     * Returns the index to the match in the text string that was searched.
152     * This call returns a valid result only after a successful call to
153     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
154     * Just after construction, or after a searching method returns
155     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
156     * <p>
157     * Use getMatchedLength to get the matched string length.
158     * @return index of a substring within the text string that is being
159     *         searched.
160     * @see #first
161     * @see #next
162     * @see #previous
163     * @see #last
164     * @stable ICU 2.0
165     */
166     int32_t getMatchedStart(void) const;
167 
168     /**
169      * Returns the length of text in the string which matches the search
170      * pattern. This call returns a valid result only after a successful call
171      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
172      * Just after construction, or after a searching method returns
173      * <tt>USEARCH_DONE</tt>, this method will return 0.
174      * @return The length of the match in the target text, or 0 if there
175      *         is no match currently.
176      * @see #first
177      * @see #next
178      * @see #previous
179      * @see #last
180      * @stable ICU 2.0
181      */
182     int32_t getMatchedLength(void) const;
183 
184     /**
185      * Returns the text that was matched by the most recent call to
186      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
187      * If the iterator is not pointing at a valid match (e.g. just after
188      * construction or after <tt>USEARCH_DONE</tt> has been returned),
189      * returns an empty string.
190      * @param result stores the matched string or an empty string if a match
191      *        is not found.
192      * @see #first
193      * @see #next
194      * @see #previous
195      * @see #last
196      * @stable ICU 2.0
197      */
198     void getMatchedText(UnicodeString &result) const;
199 
200     /**
201      * Set the BreakIterator that will be used to restrict the points
202      * at which matches are detected. The user is responsible for deleting
203      * the breakiterator.
204      * @param breakiter A BreakIterator that will be used to restrict the
205      *                points at which matches are detected. If a match is
206      *                found, but the match's start or end index is not a
207      *                boundary as determined by the <tt>BreakIterator</tt>,
208      *                the match will be rejected and another will be searched
209      *                for. If this parameter is <tt>NULL</tt>, no break
210      *                detection is attempted.
211      * @param status for errors if it occurs
212      * @see BreakIterator
213      * @stable ICU 2.0
214      */
215     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
216 
217     /**
218      * Returns the BreakIterator that is used to restrict the points at
219      * which matches are detected.  This will be the same object that was
220      * passed to the constructor or to <tt>setBreakIterator</tt>.
221      * Note that <tt>NULL</tt> is a legal value; it means that break
222      * detection should not be attempted.
223      * @return BreakIterator used to restrict matchings.
224      * @see #setBreakIterator
225      * @stable ICU 2.0
226      */
227     const BreakIterator * getBreakIterator(void) const;
228 
229     /**
230      * Set the string text to be searched. Text iteration will hence begin at
231      * the start of the text string. This method is useful if you want to
232      * re-use an iterator to search for the same pattern within a different
233      * body of text. The user is responsible for deleting the text.
234      * @param text string to be searched.
235      * @param status for errors. If the text length is 0,
236      *        a U_ILLEGAL_ARGUMENT_ERROR is returned.
237      * @stable ICU 2.0
238      */
239     virtual void setText(const UnicodeString &text, UErrorCode &status);
240 
241     /**
242      * Set the string text to be searched. Text iteration will hence begin at
243      * the start of the text string. This method is useful if you want to
244      * re-use an iterator to search for the same pattern within a different
245      * body of text.
246      * <p>
247      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
248      * will be done during searching for this version. The block of text
249      * in <tt>CharacterIterator</tt> will be used as it is.
250      * The user is responsible for deleting the text.
251      * @param text string iterator to be searched.
252      * @param status for errors if any. If the text length is 0 then a
253      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
254      * @stable ICU 2.0
255      */
256     virtual void setText(CharacterIterator &text, UErrorCode &status);
257 
258     /**
259      * Return the string text to be searched.
260      * @return text string to be searched.
261      * @stable ICU 2.0
262      */
263     const UnicodeString & getText(void) const;
264 
265     // operator overloading ----------------------------------------------
266 
267     /**
268      * Equality operator.
269      * @param that SearchIterator instance to be compared.
270      * @return true if both SearchIterators are of the same class, have the
271      *         same behavior, iterate over the same text and have the same
272      *         attributes. false otherwise.
273      * @stable ICU 2.0
274      */
275     virtual UBool operator==(const SearchIterator &that) const;
276 
277     /**
278      * Not-equal operator.
279      * @param that SearchIterator instance to be compared.
280      * @return false if operator== returns true, and vice versa.
281      * @stable ICU 2.0
282      */
283     UBool operator!=(const SearchIterator &that) const;
284 
285     // public methods ----------------------------------------------------
286 
287     /**
288      * Returns a copy of SearchIterator with the same behavior, and
289      * iterating over the same text, as this one. Note that all data will be
290      * replicated, except for the text string to be searched.
291      * @return cloned object
292      * @stable ICU 2.0
293      */
294     virtual SearchIterator* safeClone(void) const = 0;
295 
296     /**
297      * Returns the first index at which the string text matches the search
298      * pattern. The iterator is adjusted so that its current index (as
299      * returned by <tt>getOffset</tt>) is the match position if one
300      * was found.
301      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
302      * the iterator will be adjusted to the index USEARCH_DONE.
303      * @param  status for errors if it occurs
304      * @return The character index of the first match, or
305      *         <tt>USEARCH_DONE</tt> if there are no matches.
306      * @see #getOffset
307      * @stable ICU 2.0
308      */
309     int32_t first(UErrorCode &status);
310 
311     /**
312      * Returns the first index equal to or greater than <tt>position</tt> at which the
313      * string text matches the search pattern. The iterator is adjusted so
314      * that its current index (as returned by <tt>getOffset</tt>) is the
315      * match position if one was found.
316      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
317      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
318      * @param  position where search is to start from. If position is less
319      *             than or greater than the text range for searching,
320      *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
321      * @param  status for errors if it occurs
322      * @return The character index of the first match following
323      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
324      *         matches.
325      * @see #getOffset
326      * @stable ICU 2.0
327      */
328     int32_t following(int32_t position, UErrorCode &status);
329 
330     /**
331      * Returns the last index in the target text at which it matches the
332      * search pattern. The iterator is adjusted so that its current index
333      * (as returned by <tt>getOffset</tt>) is the match position if one was
334      * found.
335      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
336      * the iterator will be adjusted to the index USEARCH_DONE.
337      * @param  status for errors if it occurs
338      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
339      *         there are no matches.
340      * @see #getOffset
341      * @stable ICU 2.0
342      */
343     int32_t last(UErrorCode &status);
344 
345     /**
346      * Returns the first index less than <tt>position</tt> at which the string
347      * text matches the search pattern. The iterator is adjusted so that its
348      * current index (as returned by <tt>getOffset</tt>) is the match
349      * position if one was found. If a match is not found,
350      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
351      * adjusted to the index USEARCH_DONE.
352      * <p>
353      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
354      * result match is always less than <tt>position</tt>.
355      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
356      * <tt>position</tt>.
357      *
358      * @param  position where search is to start from. If position is less
359      *             than or greater than the text range for searching,
360      *          a U_INDEX_OUTOFBOUNDS_ERROR will be returned
361      * @param  status for errors if it occurs
362      * @return The character index of the first match preceding
363      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
364      *         no matches.
365      * @see #getOffset
366      * @stable ICU 2.0
367      */
368     int32_t preceding(int32_t position, UErrorCode &status);
369 
370     /**
371      * Returns the index of the next point at which the text matches the
372      * search pattern, starting from the current position.
373      * The iterator is adjusted so that its current index (as returned by
374      * <tt>getOffset</tt>) is the match position if one was found.
375      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
376      * the iterator will be adjusted to a position after the end of the text
377      * string.
378      * @param  status for errors if it occurs
379      * @return The index of the next match after the current position,
380      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
381      * @see #getOffset
382      * @stable ICU 2.0
383      */
384      int32_t next(UErrorCode &status);
385 
386     /**
387      * Returns the index of the previous point at which the string text
388      * matches the search pattern, starting at the current position.
389      * The iterator is adjusted so that its current index (as returned by
390      * <tt>getOffset</tt>) is the match position if one was found.
391      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
392      * the iterator will be adjusted to the index USEARCH_DONE.
393      * @param  status for errors if it occurs
394      * @return The index of the previous match before the current position,
395      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
396      * @see #getOffset
397      * @stable ICU 2.0
398      */
399     int32_t previous(UErrorCode &status);
400 
401     /**
402     * Resets the iteration.
403     * Search will begin at the start of the text string if a forward
404     * iteration is initiated before a backwards iteration. Otherwise if a
405     * backwards iteration is initiated before a forwards iteration, the
406     * search will begin at the end of the text string.
407     * @stable ICU 2.0
408     */
409     virtual void reset();
410 
411 protected:
412     // protected data members ---------------------------------------------
413 
414     /**
415     * C search data struct
416     * @stable ICU 2.0
417     */
418     USearch *m_search_;
419 
420     /**
421     * Break iterator.
422     * Currently the C++ breakiterator does not have getRules etc to reproduce
423     * another in C. Hence we keep the original around and do the verification
424     * at the end of the match. The user is responsible for deleting this
425     * break iterator.
426     * @stable ICU 2.0
427     */
428     BreakIterator *m_breakiterator_;
429 
430     /**
431     * Unicode string version of the search text
432     * @stable ICU 2.0
433     */
434     UnicodeString  m_text_;
435 
436     // protected constructors and destructors -----------------------------
437 
438     /**
439     * Default constructor.
440     * Initializes data to the default values.
441     * @stable ICU 2.0
442     */
443     SearchIterator();
444 
445     /**
446      * Constructor for use by subclasses.
447      * @param text The target text to be searched.
448      * @param breakiter A {@link BreakIterator} that is used to restrict the
449      *                points at which matches are detected. If
450      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
451      *                match, but the match's start or end index is not a
452      *                boundary as determined by the <tt>BreakIterator</tt>,
453      *                the match is rejected and <tt>handleNext</tt> or
454      *                <tt>handlePrev</tt> is called again. If this parameter
455      *                is <tt>NULL</tt>, no break detection is attempted.
456      * @see #handleNext
457      * @see #handlePrev
458      * @stable ICU 2.0
459      */
460     SearchIterator(const UnicodeString &text,
461                          BreakIterator *breakiter = NULL);
462 
463     /**
464      * Constructor for use by subclasses.
465      * <p>
466      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
467      * will be done during searching for this version. The block of text
468      * in <tt>CharacterIterator</tt> will be used as it is.
469      * @param text The target text to be searched.
470      * @param breakiter A {@link BreakIterator} that is used to restrict the
471      *                points at which matches are detected. If
472      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
473      *                match, but the match's start or end index is not a
474      *                boundary as determined by the <tt>BreakIterator</tt>,
475      *                the match is rejected and <tt>handleNext</tt> or
476      *                <tt>handlePrev</tt> is called again. If this parameter
477      *                is <tt>NULL</tt>, no break detection is attempted.
478      * @see #handleNext
479      * @see #handlePrev
480      * @stable ICU 2.0
481      */
482     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
483 
484     // protected methods --------------------------------------------------
485 
486     /**
487      * Assignment operator. Sets this iterator to have the same behavior,
488      * and iterate over the same text, as the one passed in.
489      * @param that instance to be copied.
490      * @stable ICU 2.0
491      */
492     SearchIterator & operator=(const SearchIterator &that);
493 
494     /**
495      * Abstract method which subclasses override to provide the mechanism
496      * for finding the next match in the target text. This allows different
497      * subclasses to provide different search algorithms.
498      * <p>
499      * If a match is found, the implementation should return the index at
500      * which the match starts and should call
501      * <tt>setMatchLength</tt> with the number of characters
502      * in the target text that make up the match. If no match is found, the
503      * method should return USEARCH_DONE.
504      * <p>
505      * @param position The index in the target text at which the search
506      *                 should start.
507      * @param status for error codes if it occurs.
508      * @return index at which the match starts, else if match is not found
509      *         USEARCH_DONE is returned
510      * @see #setMatchLength
511      * @stable ICU 2.0
512      */
513     virtual int32_t handleNext(int32_t position, UErrorCode &status)
514                                                                          = 0;
515 
516     /**
517      * Abstract method which subclasses override to provide the mechanism for
518      * finding the previous match in the target text. This allows different
519      * subclasses to provide different search algorithms.
520      * <p>
521      * If a match is found, the implementation should return the index at
522      * which the match starts and should call
523      * <tt>setMatchLength</tt> with the number of characters
524      * in the target text that make up the match. If no match is found, the
525      * method should return USEARCH_DONE.
526      * <p>
527      * @param position The index in the target text at which the search
528      *                 should start.
529      * @param status for error codes if it occurs.
530      * @return index at which the match starts, else if match is not found
531      *         USEARCH_DONE is returned
532      * @see #setMatchLength
533      * @stable ICU 2.0
534      */
535      virtual int32_t handlePrev(int32_t position, UErrorCode &status)
536                                                                          = 0;
537 
538     /**
539      * Sets the length of the currently matched string in the text string to
540      * be searched.
541      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
542      * methods should call this when they find a match in the target text.
543      * @param length length of the matched text.
544      * @see #handleNext
545      * @see #handlePrev
546      * @stable ICU 2.0
547      */
548     virtual void setMatchLength(int32_t length);
549 
550     /**
551      * Sets the offset of the currently matched string in the text string to
552      * be searched.
553      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
554      * methods should call this when they find a match in the target text.
555      * @param position start offset of the matched text.
556      * @see #handleNext
557      * @see #handlePrev
558      * @stable ICU 2.0
559      */
560     virtual void setMatchStart(int32_t position);
561 
562     /**
563     * Sets match not found.
564     * @stable ICU 2.0
565     */
566     void setMatchNotFound();
567 };
568 
569 inline UBool SearchIterator::operator!=(const SearchIterator &that) const
570 {
571    return !(this->operator==(that)); // inequality is defined purely as the negation of operator==
572 }
573 U_NAMESPACE_END
574 
575 #endif /* #if !UCONFIG_NO_COLLATION */
576 
577 #endif /* U_SHOW_CPLUSPLUS_API */
578 
579 #endif
580 
581