1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2011 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  03/22/2000   helena      Creation.
9 **********************************************************************
10 */
11 
12 #ifndef SEARCH_H
13 #define SEARCH_H
14 
15 #include "unicode/utypes.h"
16 
17 /**
18  * \file
19  * \brief C++ API: SearchIterator object.
20  */
21 
22 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
23 
24 #include "unicode/uobject.h"
25 #include "unicode/unistr.h"
26 #include "unicode/chariter.h"
27 #include "unicode/brkiter.h"
28 #include "unicode/usearch.h"
29 
30 /** Forward declaration of the C search data struct.
31 * @stable ICU 2.0
32 */
33 struct USearch;
34 /** Typedef so the struct can be named without the <tt>struct</tt> keyword.
35 * @stable ICU 2.0
36 */
37 typedef struct USearch USearch;
38 
39 U_NAMESPACE_BEGIN
40 
41 /**
42  *
43  * <tt>SearchIterator</tt> is an abstract base class that provides
44  * methods to search for a pattern within a text string. Instances of
45  * <tt>SearchIterator</tt> maintain a current position and scan over the
46  * target text, returning the indices at which the pattern is matched and
47  * the length of each match.
48  * <p>
49  * <tt>SearchIterator</tt> defines a protocol for text searching.
50  * Subclasses provide concrete implementations of various search algorithms.
51  * For example, <tt>StringSearch</tt> implements language-sensitive pattern
52  * matching based on the comparison rules defined in a
53  * <tt>RuleBasedCollator</tt> object.
54  * <p>
55  * Other options for searching include using a BreakIterator to restrict
56  * the points at which matches are detected.
57  * <p>
58  * <tt>SearchIterator</tt> provides an API that is similar to that of
59  * other text iteration classes such as <tt>BreakIterator</tt>. Using
60  * this class, it is easy to scan through text looking for all occurrences of
61  * a given pattern. The following example uses a <tt>StringSearch</tt>
62  * object to find all instances of "fox" in the target string. Any other
63  * subclass of <tt>SearchIterator</tt> can be used in an identical
64  * manner.
65  * <pre><code>
66  * UnicodeString target("The quick brown fox jumped over the lazy fox");
67  * UnicodeString pattern("fox");
68  *
69  * SearchIterator *iter  = new StringSearch(pattern, target);
70  * UErrorCode      error = U_ZERO_ERROR;
71  * for (int pos = iter->first(error); pos != USEARCH_DONE;
72  *                               pos = iter->next(error)) {
73  *     printf("Found match at %d pos, length is %d\n", pos,
74  *                                             iter->getMatchedLength());
75  * }
76  * </code></pre>
77  *
78  * @see StringSearch
79  * @see RuleBasedCollator
80  */
81 class U_I18N_API SearchIterator : public UObject {
82 
83 public:
84 
85     // public constructors and destructors -------------------------------
86 
87     /**
88     * Copy constructor that creates a SearchIterator instance with the same
89     * behavior, and iterating over the same text.
90     * @param other the SearchIterator instance to be copied.
91     * @stable ICU 2.0
92     */
93     SearchIterator(const SearchIterator &other);
94 
95     /**
96      * Destructor. Cleans up the search iterator data struct.
97      * @stable ICU 2.0
98      */
99     virtual ~SearchIterator();
100 
101     // public get and set methods ----------------------------------------
102 
103     /**
104      * Sets the index to point to the given position, and clears any state
105      * that's affected.
106      * <p>
107      * This method takes the argument index and sets the position in the text
108      * string accordingly without checking if the index is pointing to a
109      * valid starting point to begin searching.
110      * @param position within the text to be set. If position is less
111      *             than or greater than the text range for searching,
112      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
113      * @param status for errors if it occurs
114      * @stable ICU 2.0
115      */
116     virtual void setOffset(int32_t position, UErrorCode &status) = 0;
117 
118     /**
119      * Return the current index in the text being searched.
120      * If the iteration has gone past the end of the text
121      * (or past the beginning for a backwards search), USEARCH_DONE
122      * is returned.
123      * @return current index in the text being searched.
124      * @stable ICU 2.0
125      */
126     virtual int32_t getOffset(void) const = 0;
127 
128     /**
129     * Sets the text searching attributes located in the enum
130     * USearchAttribute with values from the enum USearchAttributeValue.
131     * USEARCH_DEFAULT can be used for all attributes for resetting.
132     * @param attribute text attribute (enum USearchAttribute) to be set
133     * @param value text attribute value
134     * @param status for errors if it occurs
135     * @stable ICU 2.0
136     */
137     void setAttribute(USearchAttribute       attribute,
138                       USearchAttributeValue  value,
139                       UErrorCode            &status);
140 
141     /**
142     * Gets the text searching attributes
143     * @param attribute text attribute (enum USearchAttribute) to be retrieved
144     * @return text attribute value
145     * @stable ICU 2.0
146     */
147     USearchAttributeValue getAttribute(USearchAttribute  attribute) const;
148 
149     /**
150     * Returns the index to the match in the text string that was searched.
151     * This call returns a valid result only after a successful call to
152     * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
153     * Just after construction, or after a searching method returns
154     * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>.
155     * <p>
156     * Use getMatchedLength to get the matched string length.
157     * @return index of a substring within the text string that is being
158     *         searched.
159     * @see #first
160     * @see #next
161     * @see #previous
162     * @see #last
163     * @stable ICU 2.0
164     */
165     int32_t getMatchedStart(void) const;
166 
167     /**
168      * Returns the length of text in the string which matches the search
169      * pattern. This call returns a valid result only after a successful call
170      * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
171      * Just after construction, or after a searching method returns
172      * <tt>USEARCH_DONE</tt>, this method will return 0.
173      * @return The length of the match in the target text, or 0 if there
174      *         is no match currently.
175      * @see #first
176      * @see #next
177      * @see #previous
178      * @see #last
179      * @stable ICU 2.0
180      */
181     int32_t getMatchedLength(void) const;
182 
183     /**
184      * Returns the text that was matched by the most recent call to
185      * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>.
186      * If the iterator is not pointing at a valid match (e.g. just after
187      * construction or after <tt>USEARCH_DONE</tt> has been returned),
188      * returns an empty string.
189      * @param result stores the matched string or an empty string if a match
190      *        is not found.
191      * @see #first
192      * @see #next
193      * @see #previous
194      * @see #last
195      * @stable ICU 2.0
196      */
197     void getMatchedText(UnicodeString &result) const;
198 
199     /**
200      * Set the BreakIterator that will be used to restrict the points
201      * at which matches are detected. The user is responsible for deleting
202      * the breakiterator.
203      * @param breakiter A BreakIterator that will be used to restrict the
204      *                points at which matches are detected. If a match is
205      *                found, but the match's start or end index is not a
206      *                boundary as determined by the <tt>BreakIterator</tt>,
207      *                the match will be rejected and another will be searched
208      *                for. If this parameter is <tt>NULL</tt>, no break
209      *                detection is attempted.
210      * @param status for errors if it occurs
211      * @see BreakIterator
212      * @stable ICU 2.0
213      */
214     void setBreakIterator(BreakIterator *breakiter, UErrorCode &status);
215 
216     /**
217      * Returns the BreakIterator that is used to restrict the points at
218      * which matches are detected.  This will be the same object that was
219      * passed to the constructor or to <tt>setBreakIterator</tt>.
220      * Note that <tt>NULL</tt> is a legal value; it means that break
221      * detection should not be attempted.
222      * @return BreakIterator used to restrict matchings.
223      * @see #setBreakIterator
224      * @stable ICU 2.0
225      */
226     const BreakIterator * getBreakIterator(void) const;
227 
228     /**
229      * Set the string text to be searched. Text iteration will hence begin at
230      * the start of the text string. This method is useful if you want to
231      * re-use an iterator to search for the same pattern within a different
232      * body of text. The user is responsible for deleting the text.
233      * @param text string to be searched.
234      * @param status for errors. If the text length is 0,
235      *        an U_ILLEGAL_ARGUMENT_ERROR is returned.
236      * @stable ICU 2.0
237      */
238     virtual void setText(const UnicodeString &text, UErrorCode &status);
239 
240     /**
241      * Set the string text to be searched. Text iteration will hence begin at
242      * the start of the text string. This method is useful if you want to
243      * re-use an iterator to search for the same pattern within a different
244      * body of text.
245      * <p>
246      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
247      * will be done during searching for this version. The block of text
248      * in <tt>CharacterIterator</tt> will be used as it is.
249      * The user is responsible for deleting the text.
250      * @param text string iterator to be searched.
251      * @param status for errors if any. If the text length is 0 then an
252      *        U_ILLEGAL_ARGUMENT_ERROR is returned.
253      * @stable ICU 2.0
254      */
255     virtual void setText(CharacterIterator &text, UErrorCode &status);
256 
257     /**
258      * Return the string text to be searched.
259      * @return text string to be searched.
260      * @stable ICU 2.0
261      */
262     const UnicodeString & getText(void) const;
263 
264     // operator overloading ----------------------------------------------
265 
266     /**
267      * Equality operator.
268      * @param that SearchIterator instance to be compared.
269      * @return TRUE if both SearchIterators are of the same class, have the
270      *         same behavior, iterate over the same text and have the same
271      *         attributes. FALSE otherwise.
272      * @stable ICU 2.0
273      */
274     virtual UBool operator==(const SearchIterator &that) const;
275 
276     /**
277      * Not-equal operator.
278      * @param that SearchIterator instance to be compared.
279      * @return FALSE if operator== returns TRUE, and vice versa.
280      * @stable ICU 2.0
281      */
282     UBool operator!=(const SearchIterator &that) const;
283 
284     // public methods ----------------------------------------------------
285 
286     /**
287      * Returns a copy of SearchIterator with the same behavior, and
288      * iterating over the same text, as this one. Note that all data will be
289      * replicated, except for the text string to be searched.
290      * @return cloned object
291      * @stable ICU 2.0
292      */
293     virtual SearchIterator* safeClone(void) const = 0;
294 
295     /**
296      * Returns the first index at which the string text matches the search
297      * pattern. The iterator is adjusted so that its current index (as
298      * returned by <tt>getOffset</tt>) is the match position if one
299      * was found.
300      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
301      * the iterator will be adjusted to the index USEARCH_DONE.
302      * @param  status for errors if it occurs
303      * @return The character index of the first match, or
304      *         <tt>USEARCH_DONE</tt> if there are no matches.
305      * @see #getOffset
306      * @stable ICU 2.0
307      */
308     int32_t first(UErrorCode &status);
309 
310     /**
311      * Returns the first index equal or greater than <tt>position</tt> at which the
312      * string text matches the search pattern. The iterator is adjusted so
313      * that its current index (as returned by <tt>getOffset</tt>) is the
314      * match position if one was found.
315      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the
316      * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>.
317      * @param  position where search is to start from. If position is less
318      *             than or greater than the text range for searching,
319      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
320      * @param  status for errors if it occurs
321      * @return The character index of the first match following
322      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no
323      *         matches.
324      * @see #getOffset
325      * @stable ICU 2.0
326      */
327     int32_t following(int32_t position, UErrorCode &status);
328 
329     /**
330      * Returns the last index in the target text at which it matches the
331      * search pattern. The iterator is adjusted so that its current index
332      * (as returned by <tt>getOffset</tt>) is the match position if one was
333      * found.
334      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
335      * the iterator will be adjusted to the index USEARCH_DONE.
336      * @param  status for errors if it occurs
337      * @return The index of the last match, or <tt>USEARCH_DONE</tt> if
338      *         there are no matches.
339      * @see #getOffset
340      * @stable ICU 2.0
341      */
342     int32_t last(UErrorCode &status);
343 
344     /**
345      * Returns the first index less than <tt>position</tt> at which the string
346      * text matches the search pattern. The iterator is adjusted so that its
347      * current index (as returned by <tt>getOffset</tt>) is the match
348      * position if one was found. If a match is not found,
349      * <tt>USEARCH_DONE</tt> will be returned and the iterator will be
350      * adjusted to the index USEARCH_DONE.
351      * <p>
352      * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the
353      * result match is always less than <tt>position</tt>.
354      * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across
355      * <tt>position</tt>.
356      *
357      * @param  position where search is to start from. If position is less
358      *             than or greater than the text range for searching,
359      *          an U_INDEX_OUTOFBOUNDS_ERROR will be returned
360      * @param  status for errors if it occurs
361      * @return The character index of the first match preceding
362      *         <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are
363      *         no matches.
364      * @see #getOffset
365      * @stable ICU 2.0
366      */
367     int32_t preceding(int32_t position, UErrorCode &status);
368 
369     /**
370      * Returns the index of the next point at which the text matches the
371      * search pattern, starting from the current position.
372      * The iterator is adjusted so that its current index (as returned by
373      * <tt>getOffset</tt>) is the match position if one was found.
374      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
375      * the iterator will be adjusted to a position after the end of the text
376      * string.
377      * @param  status for errors if it occurs
378      * @return The index of the next match after the current position,
379      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
380      * @see #getOffset
381      * @stable ICU 2.0
382      */
383      int32_t next(UErrorCode &status);
384 
385     /**
386      * Returns the index of the previous point at which the string text
387      * matches the search pattern, starting at the current position.
388      * The iterator is adjusted so that its current index (as returned by
389      * <tt>getOffset</tt>) is the match position if one was found.
390      * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and
391      * the iterator will be adjusted to the index USEARCH_DONE.
392      * @param  status for errors if it occurs
393      * @return The index of the previous match before the current position,
394      *          or <tt>USEARCH_DONE</tt> if there are no more matches.
395      * @see #getOffset
396      * @stable ICU 2.0
397      */
398     int32_t previous(UErrorCode &status);
399 
400     /**
401     * Resets the iteration.
402     * Search will begin at the start of the text string if a forward
403     * iteration is initiated before a backwards iteration. Otherwise if a
404     * backwards iteration is initiated before a forwards iteration, the
405     * search will begin at the end of the text string.
406     * @stable ICU 2.0
407     */
408     virtual void reset();
409 
410 protected:
411     // protected data members ---------------------------------------------
412 
413     /**
414     * C search data struct
415     * @stable ICU 2.0
416     */
417     USearch *m_search_;
418 
419     /**
420     * Break iterator.
421     * Currently the C++ breakiterator does not have getRules etc to reproduce
422     * another in C. Hence we keep the original around and do the verification
423     * at the end of the match. The user is responsible for deleting this
424     * break iterator.
425     * @stable ICU 2.0
426     */
427     BreakIterator *m_breakiterator_;
428 
429     /**
430     * Unicode string version of the search text
431     * @stable ICU 2.0
432     */
433     UnicodeString  m_text_;
434 
435     // protected constructors and destructors -----------------------------
436 
437     /**
438     * Default constructor.
439     * Initializes data to the default values.
440     * @stable ICU 2.0
441     */
442     SearchIterator();
443 
444     /**
445      * Constructor for use by subclasses.
446      * @param text The target text to be searched.
447      * @param breakiter A {@link BreakIterator} that is used to restrict the
448      *                points at which matches are detected. If
449      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
450      *                match, but the match's start or end index is not a
451      *                boundary as determined by the <tt>BreakIterator</tt>,
452      *                the match is rejected and <tt>handleNext</tt> or
453      *                <tt>handlePrev</tt> is called again. If this parameter
454      *                is <tt>NULL</tt>, no break detection is attempted.
455      * @see #handleNext
456      * @see #handlePrev
457      * @stable ICU 2.0
458      */
459     SearchIterator(const UnicodeString &text,
460                          BreakIterator *breakiter = NULL);
461 
462     /**
463      * Constructor for use by subclasses.
464      * <p>
465      * Note: No parsing of the text within the <tt>CharacterIterator</tt>
466      * will be done during searching for this version. The block of text
467      * in <tt>CharacterIterator</tt> will be used as it is.
468      * @param text The target text to be searched.
469      * @param breakiter A {@link BreakIterator} that is used to restrict the
470      *                points at which matches are detected. If
471      *                <tt>handleNext</tt> or <tt>handlePrev</tt> finds a
472      *                match, but the match's start or end index is not a
473      *                boundary as determined by the <tt>BreakIterator</tt>,
474      *                the match is rejected and <tt>handleNext</tt> or
475      *                <tt>handlePrev</tt> is called again. If this parameter
476      *                is <tt>NULL</tt>, no break detection is attempted.
477      * @see #handleNext
478      * @see #handlePrev
479      * @stable ICU 2.0
480      */
481     SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL);
482 
483     // protected methods --------------------------------------------------
484 
485     /**
486      * Assignment operator. Sets this iterator to have the same behavior,
487      * and iterate over the same text, as the one passed in.
488      * @param that instance to be copied.
489      * @stable ICU 2.0
490      */
491     SearchIterator & operator=(const SearchIterator &that);
492 
493     /**
494      * Abstract method which subclasses override to provide the mechanism
495      * for finding the next match in the target text. This allows different
496      * subclasses to provide different search algorithms.
497      * <p>
498      * If a match is found, the implementation should return the index at
499      * which the match starts and should call
500      * <tt>setMatchLength</tt> with the number of characters
501      * in the target text that make up the match. If no match is found, the
502      * method should return USEARCH_DONE.
503      * <p>
504      * @param position The index in the target text at which the search
505      *                 should start.
506      * @param status for error codes if it occurs.
507      * @return index at which the match starts, else if match is not found
508      *         USEARCH_DONE is returned
509      * @see #setMatchLength
510      * @stable ICU 2.0
511      */
512     virtual int32_t handleNext(int32_t position, UErrorCode &status)
513                                                                          = 0;
514 
515     /**
516      * Abstract method which subclasses override to provide the mechanism for
517      * finding the previous match in the target text. This allows different
518      * subclasses to provide different search algorithms.
519      * <p>
520      * If a match is found, the implementation should return the index at
521      * which the match starts and should call
522      * <tt>setMatchLength</tt> with the number of characters
523      * in the target text that make up the match. If no match is found, the
524      * method should return USEARCH_DONE.
525      * <p>
526      * @param position The index in the target text at which the search
527      *                 should start.
528      * @param status for error codes if it occurs.
529      * @return index at which the match starts, else if match is not found
530      *         USEARCH_DONE is returned
531      * @see #setMatchLength
532      * @stable ICU 2.0
533      */
534      virtual int32_t handlePrev(int32_t position, UErrorCode &status)
535                                                                          = 0;
536 
537     /**
538      * Sets the length of the currently matched string in the text string to
539      * be searched.
540      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
541      * methods should call this when they find a match in the target text.
542      * @param length length of the matched text.
543      * @see #handleNext
544      * @see #handlePrev
545      * @stable ICU 2.0
546      */
547     virtual void setMatchLength(int32_t length);
548 
549     /**
550      * Sets the offset of the currently matched string in the text string to
551      * be searched.
552      * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt>
553      * methods should call this when they find a match in the target text.
554      * @param position start offset of the matched text.
555      * @see #handleNext
556      * @see #handlePrev
557      * @stable ICU 2.0
558      */
559     virtual void setMatchStart(int32_t position);
560 
561     /**
562     * Sets the match state to "not found".
563     * @stable ICU 2.0
564     */
565     void setMatchNotFound();
566 };
567 
568 inline UBool SearchIterator::operator!=(const SearchIterator &that) const
569 {   // Inequality is defined as the exact negation of operator==.
570    return !(this->operator==(that));
571 }
572 U_NAMESPACE_END
573 
574 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */
575 
576 #endif
577 
578