1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   file name:  regex.h
9 *   encoding:   US-ASCII
10 *   indentation:4
11 *
12 *   created on: 2002oct22
13 *   created by: Andy Heninger
14 *
15 *   ICU Regular Expressions, API for C++
16 */
17 
18 #ifndef REGEX_H
19 #define REGEX_H
20 
21 //#define REGEX_DEBUG
22 
23 /**
24  * \file
25  * \brief  C++ API:  Regular Expressions
26  *
27  * <h2>Regular Expression API</h2>
28  *
29  * <p>The ICU API for processing regular expressions consists of two classes,
30  *  <code>RegexPattern</code> and <code>RegexMatcher</code>.
31  *  <code>RegexPattern</code> objects represent a pre-processed, or compiled
32  *  regular expression.  They are created from a regular expression pattern string,
33  *  and can be used to create <code>RegexMatcher</code> objects for the pattern.</p>
34  *
35  * <p>Class <code>RegexMatcher</code> bundles together a regular expression
36  *  pattern and a target string to which the search pattern will be applied.
37  *  <code>RegexMatcher</code> includes API for doing plain find or search
38  *  operations, for search and replace operations, and for obtaining detailed
39  *  information about bounds of a match. </p>
40  *
41  * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
42  * expression pattern strings application code can be simplified and the explicit
43  * need for <code>RegexPattern</code> objects can usually be eliminated.
44  * </p>
45  */
46 
47 #include "unicode/utypes.h"
48 
49 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
50 
51 #include "unicode/uobject.h"
52 #include "unicode/unistr.h"
53 #include "unicode/utext.h"
54 #include "unicode/parseerr.h"
55 
56 #include "unicode/uregex.h"
57 
58 // Forward Declarations
59 
60 struct UHashtable;
61 
62 U_NAMESPACE_BEGIN
63 
64 struct Regex8BitSet;
65 class  RegexCImpl;
66 class  RegexMatcher;
67 class  RegexPattern;
68 struct REStackFrame;
69 class  RuleBasedBreakIterator;
70 class  UnicodeSet;
71 class  UVector;
72 class  UVector32;
73 class  UVector64;
74 
75 
76 /**
77   * Class <code>RegexPattern</code> represents a compiled regular expression.  It includes
78   * factory methods for creating a RegexPattern object from the source (string) form
79   * of a regular expression, methods for creating RegexMatchers that allow the pattern
80   * to be applied to input text, and a few convenience methods for simple common
81   * uses of regular expressions.
82   *
83   * <p>Class RegexPattern is not intended to be subclassed.</p>
84   *
85   * @stable ICU 2.4
86   */
87 class U_I18N_API RegexPattern U_FINAL : public UObject {
88 public:
89 
90     /**
91      * Default constructor.  Create a RegexPattern object that refers to no actual
92      *   pattern.  Not normally needed; RegexPattern objects are usually
93      *   created using the factory method <code>compile()</code>.
94      *
95      * @stable ICU 2.4
96      */
97     RegexPattern();
98 
99     /**
100      * Copy Constructor.  Create a new RegexPattern object that is equivalent
101      *                    to the source object.
102      * @param source the pattern object to be copied.
103      * @stable ICU 2.4
104      */
105     RegexPattern(const RegexPattern &source);
106 
107     /**
108      * Destructor.  Note that a RegexPattern object must persist so long as any
109      *  RegexMatcher objects that were created from the RegexPattern are active.
110      * @stable ICU 2.4
111      */
112     virtual ~RegexPattern();
113 
114     /**
115      * Comparison operator.  Two RegexPattern objects are considered equal if they
116      * were constructed from identical source patterns using the same match flag
117      * settings.
118      * @param that a RegexPattern object to compare with "this".
119      * @return TRUE if the objects are equivalent.
120      * @stable ICU 2.4
121      */
122     UBool           operator==(const RegexPattern& that) const;
123 
124     /**
125      * Inequality operator, the logical negation of operator==.  Two RegexPattern
126      * objects are considered equal if they were constructed from identical source
127      * patterns using the same match flag settings.
128      * @param that a RegexPattern object to compare with "this".
129      * @return TRUE if the objects are different.
130      * @stable ICU 2.4
131      */
132     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);}
133 
134     /**
135      * Assignment operator.  After assignment, this RegexPattern will behave identically
136      *     to the source object.
137      * @stable ICU 2.4
138      */
139     RegexPattern  &operator =(const RegexPattern &source);
140 
141     /**
142      * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
143      * intended to be subclassed, <code>clone()</code> and copy construction are
144      * equivalent operations.
145      * @return the copy of this RegexPattern
146      * @stable ICU 2.4
147      */
148     virtual RegexPattern  *clone() const;
149 
150 
151    /**
152     * Compiles the regular expression in string form into a RegexPattern
153     * object.  These compile methods, rather than the constructors, are the usual
154     * way that RegexPattern objects are created.
155     *
156     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
157     * objects created from the pattern are active.  RegexMatchers keep a pointer
158     * back to their pattern, so premature deletion of the pattern is a
159     * catastrophic error.</p>
160     *
161     * <p>All pattern match mode flags are set to their default values.</p>
162     *
163     * <p>Note that it is often more convenient to construct a RegexMatcher directly
164     *    from a pattern string rather than separately compiling the pattern and
165     *    then creating a RegexMatcher object from the pattern.</p>
166     *
167     * @param regex The regular expression to be compiled.
168     * @param pe    Receives the position (line and column numbers) of any error
169     *              within the regular expression.
170     * @param status A reference to a UErrorCode to receive any errors.
171     * @return      A RegexPattern object for the compiled pattern.
172     *
173     * @stable ICU 2.4
174     */
175     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
176         UParseError          &pe,
177         UErrorCode           &status);
178 
179    /**
180     * Compiles the regular expression in string form into a RegexPattern
181     * object.  These compile methods, rather than the constructors, are the usual
182     * way that RegexPattern objects are created.
183     *
184     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
185     * objects created from the pattern are active.  RegexMatchers keep a pointer
186     * back to their pattern, so premature deletion of the pattern is a
187     * catastrophic error.</p>
188     *
189     * <p>All pattern match mode flags are set to their default values.</p>
190     *
191     * <p>Note that it is often more convenient to construct a RegexMatcher directly
192     *    from a pattern string rather than separately compiling the pattern and
193     *    then creating a RegexMatcher object from the pattern.</p>
194     *
195     * @param regex The regular expression to be compiled. Note, the text referred
196     *              to by this UText must not be deleted during the lifetime of the
197     *              RegexPattern object or any RegexMatcher object created from it.
198     * @param pe    Receives the position (line and column numbers) of any error
199     *              within the regular expression.
200     * @param status A reference to a UErrorCode to receive any errors.
201     * @return      A RegexPattern object for the compiled pattern.
202     *
203     * @stable ICU 4.6
204     */
205     static RegexPattern * U_EXPORT2 compile( UText *regex,
206         UParseError          &pe,
207         UErrorCode           &status);
208 
209    /**
210     * Compiles the regular expression in string form into a RegexPattern
211     * object using the specified match mode flags.  These compile methods,
212     * rather than the constructors, are the usual way that RegexPattern objects
213     * are created.
214     *
215     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
216     * objects created from the pattern are active.  RegexMatchers keep a pointer
217     * back to their pattern, so premature deletion of the pattern is a
218     * catastrophic error.</p>
219     *
220     * <p>Note that it is often more convenient to construct a RegexMatcher directly
221     *    from a pattern string rather than separately compiling the pattern and
222     *    then creating a RegexMatcher object from the pattern.</p>
223     *
224     * @param regex The regular expression to be compiled.
225     * @param flags The match mode flags to be used.
226     * @param pe    Receives the position (line and column numbers) of any error
227     *              within the regular expression.
228     * @param status   A reference to a UErrorCode to receive any errors.
229     * @return      A RegexPattern object for the compiled pattern.
230     *
231     * @stable ICU 2.4
232     */
233     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
234         uint32_t             flags,
235         UParseError          &pe,
236         UErrorCode           &status);
237 
238    /**
239     * Compiles the regular expression in string form into a RegexPattern
240     * object using the specified match mode flags.  These compile methods,
241     * rather than the constructors, are the usual way that RegexPattern objects
242     * are created.
243     *
244     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
245     * objects created from the pattern are active.  RegexMatchers keep a pointer
246     * back to their pattern, so premature deletion of the pattern is a
247     * catastrophic error.</p>
248     *
249     * <p>Note that it is often more convenient to construct a RegexMatcher directly
250     *    from a pattern string rather than separately compiling the pattern and
251     *    then creating a RegexMatcher object from the pattern.</p>
252     *
253     * @param regex The regular expression to be compiled. Note, the text referred
254     *              to by this UText must not be deleted during the lifetime of the
255     *              RegexPattern object or any RegexMatcher object created from it.
256     * @param flags The match mode flags to be used.
257     * @param pe    Receives the position (line and column numbers) of any error
258     *              within the regular expression.
259     * @param status   A reference to a UErrorCode to receive any errors.
260     * @return      A RegexPattern object for the compiled pattern.
261     *
262     * @stable ICU 4.6
263     */
264     static RegexPattern * U_EXPORT2 compile( UText *regex,
265         uint32_t             flags,
266         UParseError          &pe,
267         UErrorCode           &status);
268 
269    /**
270     * Compiles the regular expression in string form into a RegexPattern
271     * object using the specified match mode flags.  These compile methods,
272     * rather than the constructors, are the usual way that RegexPattern objects
273     * are created.
274     *
275     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
276     * objects created from the pattern are active.  RegexMatchers keep a pointer
277     * back to their pattern, so premature deletion of the pattern is a
278     * catastrophic error.</p>
279     *
280     * <p>Note that it is often more convenient to construct a RegexMatcher directly
281     *    from a pattern string rather than separately compiling the pattern and
282     *    then creating a RegexMatcher object from the pattern.</p>
283     *
284     * @param regex The regular expression to be compiled.
285     * @param flags The match mode flags to be used.
286     * @param status   A reference to a UErrorCode to receive any errors.
287     * @return      A RegexPattern object for the compiled pattern.
288     *
289     * @stable ICU 2.6
290     */
291     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
292         uint32_t             flags,
293         UErrorCode           &status);
294 
295    /**
296     * Compiles the regular expression in string form into a RegexPattern
297     * object using the specified match mode flags.  These compile methods,
298     * rather than the constructors, are the usual way that RegexPattern objects
299     * are created.
300     *
301     * <p>Note that RegexPattern objects must not be deleted while RegexMatcher
302     * objects created from the pattern are active.  RegexMatchers keep a pointer
303     * back to their pattern, so premature deletion of the pattern is a
304     * catastrophic error.</p>
305     *
306     * <p>Note that it is often more convenient to construct a RegexMatcher directly
307     *    from a pattern string rather than separately compiling the pattern and
308     *    then creating a RegexMatcher object from the pattern.</p>
309     *
310     * @param regex The regular expression to be compiled. Note, the text referred
311     *              to by this UText must not be deleted during the lifetime of the
312     *              RegexPattern object or any RegexMatcher object created from it.
313     * @param flags The match mode flags to be used.
314     * @param status   A reference to a UErrorCode to receive any errors.
315     * @return      A RegexPattern object for the compiled pattern.
316     *
317     * @stable ICU 4.6
318     */
319     static RegexPattern * U_EXPORT2 compile( UText *regex,
320         uint32_t             flags,
321         UErrorCode           &status);
322 
323    /**
324     * Get the match mode flags that were used when compiling this pattern.
325     * @return  the match mode flags
326     * @stable ICU 2.4
327     */
328     virtual uint32_t flags() const;
329 
330    /**
331     * Creates a RegexMatcher that will match the given input against this pattern.  The
332     * RegexMatcher can then be used to perform match, find or replace operations
333     * on the input.  Note that a RegexPattern object must not be deleted while
334     * RegexMatchers created from it still exist and might possibly be used again.
335     * <p>
336     * The matcher will retain a reference to the supplied input string, and all regexp
337     * pattern matching operations happen directly on this original string.  It is
338     * critical that the string not be altered or deleted before use by the regular
339     * expression operations is complete.
340     *
341     * @param input    The input string to which the regular expression will be applied.
342     * @param status   A reference to a UErrorCode to receive any errors.
343     * @return         A RegexMatcher object for this pattern and input.
344     *
345     * @stable ICU 2.4
346     */
347     virtual RegexMatcher *matcher(const UnicodeString &input,
348         UErrorCode          &status) const;
349 
350 private:
351     /**
352      * Cause a compilation error if an application accidentally attempts to
353      *   create a matcher with a (UChar *) string as input rather than
354      *   a UnicodeString.  Avoids a dangling reference to a temporary string.
355      * <p>
356      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
357      * using one of the aliasing constructors, such as
358      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
359      * or in a UText, using
360      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
361      *
362      */
363     RegexMatcher *matcher(const UChar *input,
364         UErrorCode          &status) const;
365 public:
366 
367 
368    /**
369     * Creates a RegexMatcher that will match against this pattern.  The
370     * RegexMatcher can be used to perform match, find or replace operations.
371     * Note that a RegexPattern object must not be deleted while
372     * RegexMatchers created from it still exist and might possibly be used again.
373     *
374     * @param status   A reference to a UErrorCode to receive any errors.
375     * @return      A RegexMatcher object for this pattern.
376     *
377     * @stable ICU 2.6
378     */
379     virtual RegexMatcher *matcher(UErrorCode  &status) const;
380 
381 
382    /**
383     * Test whether a string matches a regular expression.  This convenience function
384     * both compiles the regular expression and applies it in a single operation.
385     * Note that if the same pattern needs to be applied repeatedly, this method will be
386     * less efficient than creating and reusing a RegexMatcher object.
387     *
388     * @param regex The regular expression
389     * @param input The string data to be matched
390     * @param pe Receives the position of any syntax errors within the regular expression
391     * @param status A reference to a UErrorCode to receive any errors.
392     * @return True if the regular expression exactly matches the full input string.
393     *
394     * @stable ICU 2.4
395     */
396     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
397         const UnicodeString   &input,
398               UParseError     &pe,
399               UErrorCode      &status);
400 
401    /**
402     * Test whether a string matches a regular expression.  This convenience function
403     * both compiles the regular expression and applies it in a single operation.
404     * Note that if the same pattern needs to be applied repeatedly, this method will be
405     * less efficient than creating and reusing a RegexMatcher object.
406     *
407     * @param regex The regular expression
408     * @param input The string data to be matched
409     * @param pe Receives the position of any syntax errors within the regular expression
410     * @param status A reference to a UErrorCode to receive any errors.
411     * @return True if the regular expression exactly matches the full input string.
412     *
413     * @stable ICU 4.6
414     */
415     static UBool U_EXPORT2 matches(UText *regex,
416         UText           *input,
417         UParseError     &pe,
418         UErrorCode      &status);
419 
420    /**
421     * Returns the regular expression from which this pattern was compiled. This method will work
422     * even if the pattern was compiled from a UText.
423     *
424     * Note: If the pattern was originally compiled from a UText, and that UText was modified,
425     * the returned string may no longer reflect the RegexPattern object.
426     * @stable ICU 2.4
427     */
428     virtual UnicodeString pattern() const;
429 
430 
431    /**
432     * Returns the regular expression from which this pattern was compiled. This method will work
433     * even if the pattern was compiled from a UnicodeString.
434     *
435     * Note: This is the original input, not a clone. If the pattern was originally compiled from a
436     * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
437     * object.
438     * @param status A reference to a UErrorCode to receive any errors.
439     * @stable ICU 4.6
440     */
441     virtual UText *patternText(UErrorCode      &status) const;
442 
443 
444     /**
445      * Get the group number corresponding to a named capture group.
446      * The returned number can be used with any function that accesses
447      * capture groups by number.
448      *
449      * The function returns an error status if the specified name does not
450      * appear in the pattern.
451      *
452      * @param  groupName   The capture group name.
453      * @param  status      A UErrorCode to receive any errors.
454      * @return             The group number corresponding to the named capture group.
455      * @stable ICU 55
456      */
457     virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
458 
459 
460     /**
461      * Get the group number corresponding to a named capture group.
462      * The returned number can be used with any function that accesses
463      * capture groups by number.
464      *
465      * The function returns an error status if the specified name does not
466      * appear in the pattern.
467      *
468      * @param  groupName   The capture group name,
469      *                     platform invariant characters only.
470      * @param  nameLength  The length of the name, or -1 if the name is
471      *                     nul-terminated.
472      * @param  status      A UErrorCode to receive any errors.
473      * @return             The group number corresponding to the named capture group.
474      * @stable ICU 55
475      */
476     virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
477 
478 
479     /**
480      * Split a string into fields.  Somewhat like split() from Perl or Java.
481      * Pattern matches identify delimiters that separate the input
482      * into fields.  The input data between the delimiters becomes the
483      * fields themselves.
484      *
485      * If the delimiter pattern includes capture groups, the captured text will
486      * also appear in the destination array of output strings, interspersed
487      * with the fields.  This is similar to Perl, but differs from Java,
488      * which ignores the presence of capture groups in the pattern.
489      *
490      * Trailing empty fields will always be returned, assuming sufficient
491      * destination capacity.  This differs from the default behavior for Java
492      * and Perl where trailing empty fields are not returned.
493      *
494      * The number of strings produced by the split operation is returned.
495      * This count includes the strings from capture groups in the delimiter pattern.
496      * This behavior differs from Java, which ignores capture groups.
497      *
498      * For the best performance on split() operations,
499      * <code>RegexMatcher::split</code> is preferable to this function.
500      *
501      * @param input   The string to be split into fields.  The field delimiters
502      *                match the pattern (in the "this" object)
503      * @param dest    An array of UnicodeStrings to receive the results of the split.
504      *                This is an array of actual UnicodeString objects, not an
505      *                array of pointers to strings.  Local (stack based) arrays can
506      *                work well here.
507      * @param destCapacity  The number of elements in the destination array.
508      *                If the number of fields found is less than destCapacity, the
509      *                extra strings in the destination array are not altered.
510      *                If the number of destination strings is less than the number
511      *                of fields, the trailing part of the input string, including any
512      *                field delimiters, is placed in the last destination string.
513      * @param status  A reference to a UErrorCode to receive any errors.
514      * @return        The number of fields into which the input string was split.
515      * @stable ICU 2.4
516      */
517     virtual int32_t  split(const UnicodeString &input,
518         UnicodeString    dest[],
519         int32_t          destCapacity,
520         UErrorCode       &status) const;
521 
522 
523     /**
524      * Split a string into fields.  Somewhat like split() from Perl or Java.
525      * Pattern matches identify delimiters that separate the input
526      * into fields.  The input data between the delimiters becomes the
527      * fields themselves.
528      *
529      * If the delimiter pattern includes capture groups, the captured text will
530      * also appear in the destination array of output strings, interspersed
531      * with the fields.  This is similar to Perl, but differs from Java,
532      * which ignores the presence of capture groups in the pattern.
533      *
534      * Trailing empty fields will always be returned, assuming sufficient
535      * destination capacity.  This differs from the default behavior for Java
536      * and Perl where trailing empty fields are not returned.
537      *
538      * The number of strings produced by the split operation is returned.
539      * This count includes the strings from capture groups in the delimiter pattern.
540      * This behavior differs from Java, which ignores capture groups.
541      *
542      *  For the best performance on split() operations,
543      *  <code>RegexMatcher::split</code> is preferable to this function.
544      *
545      * @param input   The string to be split into fields.  The field delimiters
546      *                match the pattern (in the "this" object)
547      * @param dest    An array of mutable UText structs to receive the results of the split.
548      *                If a field is NULL, a new UText is allocated to contain the results for
549      *                that field. This new UText is not guaranteed to be mutable.
550      * @param destCapacity  The number of elements in the destination array.
551      *                If the number of fields found is less than destCapacity, the
552      *                extra strings in the destination array are not altered.
553      *                If the number of destination strings is less than the number
554      *                of fields, the trailing part of the input string, including any
555      *                field delimiters, is placed in the last destination string.
556      * @param status  A reference to a UErrorCode to receive any errors.
557      * @return        The number of destination strings used.
558      *
559      * @stable ICU 4.6
560      */
561     virtual int32_t  split(UText *input,
562         UText            *dest[],
563         int32_t          destCapacity,
564         UErrorCode       &status) const;
565 
566 
567     /**
568      * ICU "poor man's RTTI", returns a UClassID for the actual class.
569      *
570      * @stable ICU 2.4
571      */
572     virtual UClassID getDynamicClassID() const;
573 
574     /**
575      * ICU "poor man's RTTI", returns a UClassID for this class.
576      *
577      * @stable ICU 2.4
578      */
579     static UClassID U_EXPORT2 getStaticClassID();
580 
581 private:
582     //
583     //  Implementation Data
584     //
585     UText          *fPattern;      // The original pattern string.
586     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
587     uint32_t        fFlags;        // The flags used when compiling the pattern.
588                                    //
589     UVector64       *fCompiledPat; // The compiled pattern p-code.
590     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
591                                    //   after un-escaping, for use during the match.
592 
593     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
594     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
595 
596 
597     UErrorCode      fDeferredStatus; // status if some prior error has left this
598                                    //  RegexPattern in an unusable state.
599 
600     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
601                                    //   >= this value.  For some patterns, this calculated
602                                    //   value may be less than the true shortest
603                                    //   possible match.
604 
605     int32_t         fFrameSize;    // Size of a state stack frame in the
606                                    //   execution engine.
607 
608     int32_t         fDataSize;     // The size of the data needed by the pattern that
609                                    //   does not go on the state stack, but has just
610                                    //   a single copy per matcher.
611 
612     UVector32       *fGroupMap;    // Map from capture group number to position of
613                                    //   the group's variables in the matcher stack frame.
614 
615     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
616                                    //   regex character classes, e.g. Word.
617 
618     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
619                                    //  sets for predefined regex classes.
620 
621     int32_t         fStartType;    // Info on how a match must start.
622     int32_t         fInitialStringIdx;     //
623     int32_t         fInitialStringLen;
624     UnicodeSet     *fInitialChars;
625     UChar32         fInitialChar;
626     Regex8BitSet   *fInitialChars8;
627     UBool           fNeedsAltInput;
628 
629     UHashtable     *fNamedCaptureMap;  // Map from capture group names to numbers.
630 
631     friend class RegexCompile;
632     friend class RegexMatcher;
633     friend class RegexCImpl;
634 
635     //
636     //  Implementation Methods
637     //
638     void        init();            // Common initialization, for use by constructors.
639     void        zap();             // Common cleanup
640 
641     void        dumpOp(int32_t index) const;  // NOTE(review): presumably a debug dump of one compiled op, cf. dumpPattern() - confirm
642 
643   public:
644 #ifndef U_HIDE_INTERNAL_API
645     /**
646       * Dump a compiled pattern. Internal debug function.
647       * @internal
648       */
649     void        dumpPattern() const;
650 #endif  /* U_HIDE_INTERNAL_API */
651 };
652 
653 
654 
655 /**
656  *  class RegexMatcher bundles together a regular expression pattern and
657  *  input text to which the expression can be applied.  It includes methods
658  *  for testing for matches, and for find and replace operations.
659  *
660  * <p>Class RegexMatcher is not intended to be subclassed.</p>
661  *
662  * @stable ICU 2.4
663  */
664 class U_I18N_API RegexMatcher U_FINAL : public UObject {
665 public:
666 
667     /**
668       * Construct a RegexMatcher for a regular expression.
669       * This is a convenience method that avoids the need to explicitly create
670       * a RegexPattern object.  Note that if several RegexMatchers need to be
671       * created for the same expression, it will be more efficient to
672       * separately create and cache a RegexPattern object, and use
673       * its matcher() method to create the RegexMatcher objects.
674       *
675       *  @param regexp The Regular Expression to be compiled.
676       *  @param flags  Regular expression options, such as case insensitive matching.
677       *                @see UREGEX_CASE_INSENSITIVE
678       *  @param status Any errors are reported by setting this UErrorCode variable.
679       *  @stable ICU 2.6
680       */
681     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
682 
683     /**
684       * Construct a RegexMatcher for a regular expression.
685       * This is a convenience method that avoids the need to explicitly create
686       * a RegexPattern object.  Note that if several RegexMatchers need to be
687       * created for the same expression, it will be more efficient to
688       * separately create and cache a RegexPattern object, and use
689       * its matcher() method to create the RegexMatcher objects.
690       *
691       *  @param regexp The regular expression to be compiled.
692       *  @param flags  Regular expression options, such as case insensitive matching.
693       *                @see UREGEX_CASE_INSENSITIVE
694       *  @param status Any errors are reported by setting this UErrorCode variable.
695       *
696       *  @stable ICU 4.6
697       */
698     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
699 
700     /**
701       * Construct a RegexMatcher for a regular expression.
702       * This is a convenience method that avoids the need to explicitly create
703       * a RegexPattern object.  Note that if several RegexMatchers need to be
704       * created for the same expression, it will be more efficient to
705       * separately create and cache a RegexPattern object, and use
706       * its matcher() method to create the RegexMatcher objects.
707       * <p>
708       * The matcher will retain a reference to the supplied input string, and all regexp
709       * pattern matching operations happen directly on the original string.  It is
710       * critical that the string not be altered or deleted before use by the regular
711       * expression operations is complete.
712       *
713       *  @param regexp The Regular Expression to be compiled.
714       *  @param input  The string to match.  The matcher retains a reference to the
715       *                caller's string; no copy is made.
716       *  @param flags  Regular expression options, such as case insensitive matching.
717       *                @see UREGEX_CASE_INSENSITIVE
718       *  @param status Any errors are reported by setting this UErrorCode variable.
719       *  @stable ICU 2.6
720       */
721     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
722         uint32_t flags, UErrorCode &status);
723 
724     /**
725       * Construct a RegexMatcher for a regular expression.
726       * This is a convenience method that avoids the need to explicitly create
727       * a RegexPattern object.  Note that if several RegexMatchers need to be
728       * created for the same expression, it will be more efficient to
729       * separately create and cache a RegexPattern object, and use
730       * its matcher() method to create the RegexMatcher objects.
731       * <p>
732       * The matcher will make a shallow clone of the supplied input text, and all regexp
733       * pattern matching operations happen on this clone.  While read-only operations on
734       * the supplied text are permitted, it is critical that the underlying string not be
735       * altered or deleted before use by the regular expression operations is complete.
736       *
737       *  @param regexp The Regular Expression to be compiled.
738       *  @param input  The string to match.  The matcher retains a shallow clone of the text.
739       *  @param flags  Regular expression options, such as case insensitive matching.
740       *                @see UREGEX_CASE_INSENSITIVE
741       *  @param status Any errors are reported by setting this UErrorCode variable.
742       *
743       *  @stable ICU 4.6
744       */
745     RegexMatcher(UText *regexp, UText *input,
746         uint32_t flags, UErrorCode &status);
747 
748 private:
749     /**
750      * Cause a compilation error if an application accidentally attempts to
751      *   create a matcher with a (UChar *) string as input rather than
752      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
753      * <p>
754      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
755      * using one of the aliasing constructors, such as
756      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
757      * or in a UText, using
758      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
759      *
760      */
761     RegexMatcher(const UnicodeString &regexp, const UChar *input,
762         uint32_t flags, UErrorCode &status);
763 public:
764 
765 
766    /**
767     *   Destructor.
768     *
769     *  @stable ICU 2.4
770     */
771     virtual ~RegexMatcher();
772 
773 
774    /**
775     *   Attempts to match the entire input region against the pattern.
776     *    @param   status     A reference to a UErrorCode to receive any errors.
777     *    @return TRUE if there is a match
778     *    @stable ICU 2.4
779     */
780     virtual UBool matches(UErrorCode &status);
781 
782 
783    /**
784     *   Resets the matcher, then attempts to match the input beginning
785     *   at the specified startIndex, and extending to the end of the input.
786     *   The input region is reset to include the entire input string.
787     *   A successful match must extend to the end of the input.
788     *    @param   startIndex The input string (native) index at which to begin matching.
789     *    @param   status     A reference to a UErrorCode to receive any errors.
790     *    @return TRUE if there is a match
791     *    @stable ICU 2.8
792     */
793     virtual UBool matches(int64_t startIndex, UErrorCode &status);
794 
795 
796    /**
797     *   Attempts to match the input string, starting from the beginning of the region,
798     *   against the pattern.  Like the matches() method, this function
799     *   always starts at the beginning of the input region;
800     *   unlike that function, it does not require that the entire region be matched.
801     *
802     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
803     *     <code>end()</code>, and <code>group()</code> functions.</p>
804     *
805     *    @param   status     A reference to a UErrorCode to receive any errors.
806     *    @return  TRUE if there is a match at the start of the input string.
807     *    @stable ICU 2.4
808     */
809     virtual UBool lookingAt(UErrorCode &status);
810 
811 
812   /**
813     *   Attempts to match the input string, starting from the specified index, against the pattern.
814     *   The match may be of any length, and is not required to extend to the end
815     *   of the input string.  Contrast with matches().
816     *
817     *   <p>If the match succeeds then more information can be obtained via the <code>start()</code>,
818     *     <code>end()</code>, and <code>group()</code> functions.</p>
819     *
820     *    @param   startIndex The input string (native) index at which to begin matching.
821     *    @param   status     A reference to a UErrorCode to receive any errors.
822     *    @return  TRUE if there is a match.
823     *    @stable ICU 2.8
824     */
825     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
826 
827 
828    /**
829     *  Find the next pattern match in the input string.
830     *  The find begins searching the input at the location following the end of
831     *  the previous match, or at the start of the string if there is no previous match.
832     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
833     *  will provide more information regarding the match.
834     *  <p>Note that if the input string is changed by the application,
835     *     use find(startPos, status) instead of find(), because the saved starting
836     *     position may not be valid with the altered input string.</p>
837     *  @return  TRUE if a match is found.
838     *  @stable ICU 2.4
839     */
840     virtual UBool find();
841 
842 
843    /**
844     *  Find the next pattern match in the input string.
845     *  The find begins searching the input at the location following the end of
846     *  the previous match, or at the start of the string if there is no previous match.
847     *  If a match is found, <code>start(), end()</code> and <code>group()</code>
848     *  will provide more information regarding the match.
849     *  <p>Note that if the input string is changed by the application,
850     *     use find(startPos, status) instead of find(), because the saved starting
851     *     position may not be valid with the altered input string.</p>
852     *  @param   status  A reference to a UErrorCode to receive any errors.
853     *  @return  TRUE if a match is found.
854     * @stable ICU 55
855     */
856     virtual UBool find(UErrorCode &status);
857 
858    /**
859     *   Resets this RegexMatcher and then attempts to find the next substring of the
860     *   input string that matches the pattern, starting at the specified index.
861     *
862     *   @param   start     The (native) index in the input string to begin the search.
863     *   @param   status    A reference to a UErrorCode to receive any errors.
864     *   @return  TRUE if a match is found.
865     *   @stable ICU 2.4
866     */
867     virtual UBool find(int64_t start, UErrorCode &status);
868 
869 
870    /**
871     *   Returns a string containing the text matched by the previous match.
872     *   If the pattern can match an empty string, an empty string may be returned.
873     *   @param   status      A reference to a UErrorCode to receive any errors.
874     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
875     *                        has been attempted or the last match failed.
876     *   @return  a string containing the matched input text.
877     *   @stable ICU 2.4
878     */
879     virtual UnicodeString group(UErrorCode &status) const;
880 
881 
882    /**
883     *    Returns a string containing the text captured by the given group
884     *    during the previous match operation.  Group(0) is the entire match.
885     *
886     *    A zero length string is returned both for capture groups that did not
887     *    participate in the match and for actual zero length matches.
888     *    To distinguish between these two cases use the function start(),
889     *    which returns -1 for non-participating groups.
890     *
891     *    @param groupNum the capture group number
892     *    @param   status     A reference to a UErrorCode to receive any errors.
893     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
894     *                        has been attempted or the last match failed and
895     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
896     *    @return the captured text
897     *    @stable ICU 2.4
898     */
899     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
900 
901    /**
902     *   Returns the number of capturing groups in this matcher's pattern.
903     *   @return the number of capture groups
904     *   @stable ICU 2.4
905     */
906     virtual int32_t groupCount() const;
907 
908 
909    /**
910     *   Returns a shallow clone of the entire live input string with the UText current native index
911     *   set to the beginning of the requested group.
912     *
913     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText
914     *   @param   group_len   A reference to receive the length of the desired capture group
915     *   @param   status      A reference to a UErrorCode to receive any errors.
916     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
917     *                        has been attempted or the last match failed and
918     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
919     *   @return dest if non-NULL, a shallow copy of the input text otherwise
920     *
921     *   @stable ICU 4.6
922     */
923     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
924 
925    /**
926     *   Returns a shallow clone of the entire live input string with the UText current native index
927     *   set to the beginning of the requested group.
928     *
929     *   A group length of zero is returned both for capture groups that did not
930     *   participate in the match and for actual zero length matches.
931     *   To distinguish between these two cases use the function start(),
932     *   which returns -1 for non-participating groups.
933     *
934     *   @param   groupNum   The capture group number.
935     *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText.
936     *   @param   group_len   A reference to receive the length of the desired capture group
937     *   @param   status      A reference to a UErrorCode to receive any errors.
938     *                        Possible errors are  U_REGEX_INVALID_STATE if no match
939     *                        has been attempted or the last match failed and
940     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
941     *   @return dest if non-NULL, a shallow copy of the input text otherwise
942     *
943     *   @stable ICU 4.6
944     */
945     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
946 
947    /**
948     *   Returns the index in the input string of the start of the text matched
949     *   during the previous match operation.
950     *    @param   status      a reference to a UErrorCode to receive any errors.
951     *    @return              The (native) position in the input string of the start of the last match.
952     *    @stable ICU 2.4
953     */
954     virtual int32_t start(UErrorCode &status) const;
955 
956    /**
957     *   Returns the index in the input string of the start of the text matched
958     *   during the previous match operation.
959     *    @param   status      a reference to a UErrorCode to receive any errors.
960     *    @return              The (native) position in the input string of the start of the last match.
961     *   @stable ICU 4.6
962     */
963     virtual int64_t start64(UErrorCode &status) const;
964 
965 
966    /**
967     *   Returns the index in the input string of the start of the text matched by the
968     *    specified capture group during the previous match operation.  Return -1 if
969     *    the capture group exists in the pattern, but was not part of the last match.
970     *
971     *    @param  group       the capture group number
972     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
973     *                        errors are  U_REGEX_INVALID_STATE if no match has been
974     *                        attempted or the last match failed, and
975     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
976     *    @return the (native) start position of substring matched by the specified group.
977     *    @stable ICU 2.4
978     */
979     virtual int32_t start(int32_t group, UErrorCode &status) const;
980 
981    /**
982     *   Returns the index in the input string of the start of the text matched by the
983     *    specified capture group during the previous match operation.  Return -1 if
984     *    the capture group exists in the pattern, but was not part of the last match.
985     *
986     *    @param  group       the capture group number.
987     *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
988     *                        errors are  U_REGEX_INVALID_STATE if no match has been
989     *                        attempted or the last match failed, and
990     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
991     *    @return the (native) start position of substring matched by the specified group.
992     *    @stable ICU 4.6
993     */
994     virtual int64_t start64(int32_t group, UErrorCode &status) const;
995 
996    /**
997     *    Returns the index in the input string of the first character following the
998     *    text matched during the previous match operation.
999     *
1000     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1001     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1002     *                        attempted or the last match failed.
1003     *    @return the index of the last character matched, plus one.
1004     *                        The index value returned is a native index, corresponding to
1005     *                        code units for the underlying encoding type, for example,
1006     *                        a byte index for UTF-8.
1007     *   @stable ICU 2.4
1008     */
1009     virtual int32_t end(UErrorCode &status) const;
1010 
1011    /**
1012     *    Returns the index in the input string of the first character following the
1013     *    text matched during the previous match operation.
1014     *
1015     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1016     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1017     *                        attempted or the last match failed.
1018     *    @return the index of the last character matched, plus one.
1019     *                        The index value returned is a native index, corresponding to
1020     *                        code units for the underlying encoding type, for example,
1021     *                        a byte index for UTF-8.
1022     *   @stable ICU 4.6
1023     */
1024     virtual int64_t end64(UErrorCode &status) const;
1025 
1026 
1027    /**
1028     *    Returns the index in the input string of the character following the
1029     *    text matched by the specified capture group during the previous match operation.
1030     *
1031     *    @param group  the capture group number
1032     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1033     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1034     *                        attempted or the last match failed and
1035     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
1036     *    @return  the index of the first character following the text
1037     *              captured by the specified group during the previous match operation.
1038     *              Return -1 if the capture group exists in the pattern but was not part of the match.
1039     *              The index value returned is a native index, corresponding to
1040     *              code units for the underlying encoding type, for example,
1041     *              a byte index for UTF-8.
1042     *    @stable ICU 2.4
1043     */
1044     virtual int32_t end(int32_t group, UErrorCode &status) const;
1045 
1046    /**
1047     *    Returns the index in the input string of the character following the
1048     *    text matched by the specified capture group during the previous match operation.
1049     *
1050     *    @param group  the capture group number
1051     *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1052     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1053     *                        attempted or the last match failed and
1054     *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
1055     *    @return  the index of the first character following the text
1056     *              captured by the specified group during the previous match operation.
1057     *              Return -1 if the capture group exists in the pattern but was not part of the match.
1058     *              The index value returned is a native index, corresponding to
1059     *              code units for the underlying encoding type, for example,
1060     *              a byte index for UTF-8.
1061     *   @stable ICU 4.6
1062     */
1063     virtual int64_t end64(int32_t group, UErrorCode &status) const;
1064 
1065    /**
1066     *   Resets this matcher.  The effect is to remove any memory of previous matches,
1067     *       and to cause subsequent find() operations to begin at the beginning of
1068     *       the input string.
1069     *
1070     *   @return this RegexMatcher.
1071     *   @stable ICU 2.4
1072     */
1073     virtual RegexMatcher &reset();
1074 
1075 
1076    /**
1077     *   Resets this matcher, and sets the current input position.
1078     *   The effect is to remove any memory of previous matches,
1079     *       and to cause subsequent find() operations to begin at
1080     *       the specified (native) position in the input string.
1081     * <p>
1082     *   The matcher's region is reset to its default, which is the entire input string.
1083     * <p>
1084     *   An alternative to this function is to set a match region beginning at the desired index.
1085     *
1086     *   @param   index   The (native) position in the input string at which subsequent find() operations begin.
1087     *   @param   status  A reference to a UErrorCode to receive any errors.
1088     *   @return this RegexMatcher.
1089     *   @stable ICU 2.8
1090     */
1091     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1092 
1093 
1094    /**
1095     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1096     *     to be reused, which is more efficient than creating a new RegexMatcher for
1097     *     each input string to be processed.
1098     *   @param input The new string on which subsequent pattern matches will operate.
1099     *                The matcher retains a reference to the caller's string, and operates
1100     *                directly on that.  Ownership of the string remains with the caller.
1101     *                Because no copy of the string is made, it is essential that the
1102     *                caller not delete the string until after regexp operations on it
1103     *                are done.
1104     *                Note that while a reset on the matcher with an input string that is then
1105     *                modified across/during matcher operations may be supported currently for UnicodeString,
1106     *                this was not originally intended behavior, and support for this is not guaranteed
1107     *                in upcoming versions of ICU.
1108     *   @return this RegexMatcher.
1109     *   @stable ICU 2.4
1110     */
1111     virtual RegexMatcher &reset(const UnicodeString &input);
1112 
1113 
1114    /**
1115     *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1116     *     to be reused, which is more efficient than creating a new RegexMatcher for
1117     *     each input string to be processed.
1118     *   @param input The new string on which subsequent pattern matches will operate.
1119     *                The matcher makes a shallow clone of the given text; ownership of the
1120     *                original string remains with the caller. Because no deep copy of the
1121     *                text is made, it is essential that the caller not modify the string
1122     *                until after regexp operations on it are done.
1123     *   @return this RegexMatcher.
1124     *
1125     *   @stable ICU 4.6
1126     */
1127     virtual RegexMatcher &reset(UText *input);
1128 
1129 
1130   /**
1131     *  Set the subject text string upon which the regular expression is looking for matches
1132     *  without changing any other aspect of the matching state.
1133     *  The new and previous text strings must have the same content.
1134     *
1135     *  This function is intended for use in environments where ICU is operating on
1136     *  strings that may move around in memory.  It provides a mechanism for notifying
1137     *  ICU that the string has been relocated, and providing a new UText to access the
1138     *  string in its new position.
1139     *
1140     *  Note that the regular expression implementation never copies the underlying text
1141     *  of a string being matched, but always operates directly on the original text
1142     *  provided by the user. Refreshing simply drops the references to the old text
1143     *  and replaces them with references to the new.
1144     *
1145     *  Caution:  this function is normally used only by very specialized,
1146     *  system-level code.  One example use case is with garbage collection that moves
1147     *  the text in memory.
1148     *
1149     * @param input      The new (moved) text string.
1150     * @param status     Receives errors detected by this function.
1151     *
1152     * @stable ICU 4.8
1153     */
1154     virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1155 
1156 private:
1157     /**
1158      * Cause a compilation error if an application accidentally attempts to
1159      *   reset a matcher with a (UChar *) string as input rather than
1160      *   a UnicodeString.    Avoids a dangling reference to a temporary string.
1161      * <p>
1162      * To efficiently work with UChar *strings, wrap the data in a UnicodeString
1163      * using one of the aliasing constructors, such as
1164      * <code>UnicodeString(UBool isTerminated, const UChar *text, int32_t textLength);</code>
1165      * or in a UText, using
1166      * <code>utext_openUChars(UText *ut, const UChar *text, int64_t textLength, UErrorCode *status);</code>
1167      *
1168      */
1169     RegexMatcher &reset(const UChar *input);
1170 public:
1171 
1172    /**
1173     *   Returns the input string being matched.  Ownership of the string belongs to
1174     *   the matcher; it should not be altered or deleted. This method will work even if the input
1175     *   was originally supplied as a UText.
1176     *   @return the input string
1177     *   @stable ICU 2.4
1178     */
1179     virtual const UnicodeString &input() const;
1180 
1181    /**
1182     *   Returns the input string being matched.  This is the live input text; it should not be
1183     *   altered or deleted. This method will work even if the input was originally supplied as
1184     *   a UnicodeString.
1185     *   @return the input text
1186     *
1187     *   @stable ICU 4.6
1188     */
1189     virtual UText *inputText() const;
1190 
1191    /**
1192     *   Returns the input string being matched, either by copying it into the provided
1193     *   UText parameter or by returning a shallow clone of the live input. Note that copying
1194     *   the entire input may cause significant performance and memory issues.
1195     *   @param dest The UText into which the input should be copied, or NULL to create a new UText
1196     *   @param status error code
1197     *   @return dest if non-NULL, a shallow copy of the input text otherwise
1198     *
1199     *   @stable ICU 4.6
1200     */
1201     virtual UText *getInput(UText *dest, UErrorCode &status) const;
1202 
1203 
1204    /** Sets the limits of this matcher's region.
1205      * The region is the part of the input string that will be searched to find a match.
1206      * Invoking this method resets the matcher, and then sets the region to start
1207      * at the index specified by the start parameter and end at the index specified
1208      * by the limit parameter.
1209      *
1210      * Depending on the transparency and anchoring being used (see useTransparentBounds
1211      * and useAnchoringBounds), certain constructs such as anchors may behave differently
1212      * at or around the boundaries of the region.
1213      *
1214      * The function will fail if start is greater than limit, or if either index
1215      *  is less than zero or greater than the length of the string being matched.
1216      *
1217      * @param start  The (native) index to begin searches at.
1218      * @param limit  The index to end searches at (exclusive).
1219      * @param status A reference to a UErrorCode to receive any errors.
1220      * @stable ICU 4.0
1221      */
1222      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1223 
1224    /**
1225      * Identical to region(start, limit, status) but also allows a start position without
1226      *  resetting the region state.
1227      * @param regionStart The region start
1228      * @param regionLimit the limit of the region
1229      * @param startIndex  The (native) index within the region bounds at which to begin searches.
1230      * @param status A reference to a UErrorCode to receive any errors.
1231      *                If startIndex is not within the specified region bounds,
1232      *                U_INDEX_OUTOFBOUNDS_ERROR is returned.
1233      * @stable ICU 4.6
1234      */
1235      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1236 
1237    /**
1238      * Reports the start index of this matcher's region. The searches this matcher
1239      * conducts are limited to finding matches within regionStart (inclusive) and
1240      * regionEnd (exclusive).
1241      *
1242      * @return The starting (native) index of this matcher's region.
1243      * @stable ICU 4.0
1244      */
1245      virtual int32_t regionStart() const;
1246 
1247    /**
1248      * Reports the start index of this matcher's region. The searches this matcher
1249      * conducts are limited to finding matches within regionStart (inclusive) and
1250      * regionEnd (exclusive).
1251      *
1252      * @return The starting (native) index of this matcher's region.
1253      * @stable ICU 4.6
1254      */
1255      virtual int64_t regionStart64() const;
1256 
1257 
1258     /**
1259       * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1260       * this matcher conducts are limited to finding matches within regionStart
1261       * (inclusive) and regionEnd (exclusive).
1262       *
1263       * @return The ending point (native) of this matcher's region.
1264       * @stable ICU 4.0
1265       */
1266       virtual int32_t regionEnd() const;
1267 
1268    /**
1269      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1270      * this matcher conducts are limited to finding matches within regionStart
1271      * (inclusive) and regionEnd (exclusive).
1272      *
1273      * @return The ending point (native) of this matcher's region.
1274      * @stable ICU 4.6
1275      */
1276       virtual int64_t regionEnd64() const;
1277 
1278     /**
1279       * Queries the transparency of region bounds for this matcher.
1280       * See useTransparentBounds for a description of transparent and opaque bounds.
1281       * By default, a matcher uses opaque region boundaries.
1282       *
1283       * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
1284       * @stable ICU 4.0
1285       */
1286       virtual UBool hasTransparentBounds() const;
1287 
1288     /**
1289       * Sets the transparency of region bounds for this matcher.
1290       * Invoking this function with an argument of TRUE will set this matcher to use transparent bounds.
1291       * If the boolean argument is FALSE, then opaque bounds will be used.
1292       *
1293       * Using transparent bounds, the boundaries of this matcher's region are transparent
1294       * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
1295       * see text beyond the boundaries of the region while checking for a match.
1296       *
1297       * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
1298       * lookbehind, and boundary matching constructs.
1299       *
1300       * By default, a matcher uses opaque bounds.
1301       *
1302       * @param   b TRUE for transparent bounds; FALSE for opaque bounds.
1303       * @return  This Matcher.
1304       * @stable ICU 4.0
1305       **/
1306       virtual RegexMatcher &useTransparentBounds(UBool b);
1307 
1308 
1309     /**
1310       * Return true if this matcher is using anchoring bounds.
1311       * By default, matchers use anchoring region bounds.
1312       *
1313       * @return TRUE if this matcher is using anchoring bounds.
1314       * @stable ICU 4.0
1315       */
1316       virtual UBool hasAnchoringBounds() const;
1317 
1318 
1319     /**
1320       * Set whether this matcher is using Anchoring Bounds for its region.
1321       * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
1322       * and end of the region.  Without Anchoring Bounds, anchors will only match at
1323       * the positions they would in the complete text.
1324       *
1325       * Anchoring Bounds are the default for regions.
1326       *
1327       * @param b TRUE to enable anchoring bounds; FALSE to disable them.
1328       * @return  This Matcher
1329       * @stable ICU 4.0
1330       */
1331       virtual RegexMatcher &useAnchoringBounds(UBool b);
1332 
1333 
1334     /**
1335       * Return TRUE if the most recent matching operation attempted to access
1336       *  additional input beyond the available input text.
1337       *  In this case, additional input text could change the results of the match.
1338       *
1339       *  hitEnd() is defined for both successful and unsuccessful matches.
1340       *  In either case hitEnd() will return TRUE if the end of the text was
1341       *  reached at any point during the matching process.
1342       *
1343       *  @return  TRUE if the most recent match hit the end of input
1344       *  @stable ICU 4.0
1345       */
1346       virtual UBool hitEnd() const;
1347 
1348     /**
1349       * Return TRUE if the most recent match succeeded and additional input could cause
1350       * it to fail. If this method returns FALSE and a match was found, then more input
1351       * might change the match but the match won't be lost. If a match was not found,
1352       * then requireEnd has no meaning.
1353       *
1354       * @return TRUE if more input could cause the most recent match to no longer match.
1355       * @stable ICU 4.0
1356       */
1357       virtual UBool requireEnd() const;
1358 
1359 
1360    /**
1361     *    Returns the pattern that is interpreted by this matcher.
1362     *    @return  a const reference to the RegexPattern for this RegexMatcher
1363     *    @stable ICU 2.4
1364     */
1365     virtual const RegexPattern &pattern() const;
1366 
1367 
1368    /**
1369     *    Replaces every substring of the input that matches the pattern
1370     *    with the given replacement string.  This is a convenience function that
1371     *    provides a complete find-and-replace-all operation.
1372     *
1373     *    This method first resets this matcher. It then scans the input string
1374     *    looking for matches of the pattern. Input that is not part of any
1375     *    match is left unchanged; each match is replaced in the result by the
1376     *    replacement string. The replacement string may contain references to
1377     *    capture groups.
1378     *
1379     *    @param   replacement a UnicodeString containing the replacement text.
1380     *    @param   status      a reference to a UErrorCode to receive any errors.
1381     *    @return              a UnicodeString containing the results of the find and replace.
1382     *    @stable ICU 2.4
1383     */
1384     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1385 
1386 
1387    /**
1388     *    Replaces every substring of the input that matches the pattern
1389     *    with the given replacement text.  This is a convenience function that
1390     *    provides a complete find-and-replace-all operation.
1391     *
1392     *    This method first resets this matcher. It then scans the input text
1393     *    looking for matches of the pattern. Input that is not part of any
1394     *    match is left unchanged; each match is replaced in the result by the
1395     *    replacement text. The replacement text may contain references to
1396     *    capture groups.
1397     *
1398     *    @param   replacement a UText containing the replacement text.
1399     *    @param   dest        a mutable UText in which the results are placed.
1400     *                          If NULL, a new UText will be created (which may not be mutable).
1401     *    @param   status      a reference to a UErrorCode to receive any errors.
1402     *    @return              a UText containing the results of the find and replace.
1403     *                          If a pre-allocated UText was provided, it will always be used and returned.
1404     *
1405     *    @stable ICU 4.6
1406     */
1407     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1408 
1409 
1410    /**
1411     * Replaces the first substring of the input that matches
1412     * the pattern with the replacement string.   This is a convenience
1413     * function that provides a complete find-and-replace operation.
1414     *
1415     * <p>This function first resets this RegexMatcher. It then scans the input string
1416     * looking for a match of the pattern. Input that is not part
1417     * of the match is appended directly to the result string; the match is replaced
1418     * in the result by the replacement string. The replacement string may contain
1419     * references to captured groups.</p>
1420     *
1421     * <p>The state of the matcher (the position at which a subsequent find()
1422     *    would begin) after completing a replaceFirst() is not specified.  The
1423     *    RegexMatcher should be reset before doing additional find() operations.</p>
1424     *
1425     *    @param   replacement a UnicodeString containing the replacement text.
1426     *    @param   status      a reference to a UErrorCode to receive any errors.
1427     *    @return              a UnicodeString containing the results of the find and replace.
1428     *    @stable ICU 2.4
1429     */
1430     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1431 
1432 
1433    /**
1434     * Replaces the first substring of the input that matches
1435     * the pattern with the replacement text.   This is a convenience
1436     * function that provides a complete find-and-replace operation.
1437     *
1438     * <p>This function first resets this RegexMatcher. It then scans the input text
1439     * looking for a match of the pattern. Input that is not part
1440     * of the match is appended directly to the result; the match is replaced
1441     * in the result by the replacement text. The replacement text may contain
1442     * references to captured groups.</p>
1443     *
1444     * <p>The state of the matcher (the position at which a subsequent find()
1445     *    would begin) after completing a replaceFirst() is not specified.  The
1446     *    RegexMatcher should be reset before doing additional find() operations.</p>
1447     *
1448     *    @param   replacement a UText containing the replacement text.
1449     *    @param   dest        a mutable UText in which the results are placed.
1450     *                          If NULL, a new UText will be created (which may not be mutable).
1451     *    @param   status      a reference to a UErrorCode to receive any errors.
1452     *    @return              a UText containing the results of the find and replace.
1453     *                          If a pre-allocated UText was provided, it will always be used and returned.
1454     *
1455     *    @stable ICU 4.6
1456     */
1457     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1458 
1459 
1460    /**
1461     *   Implements a replace operation intended to be used as part of an
1462     *   incremental find-and-replace.
1463     *
1464     *   <p>The input string, starting from the end of the previous replacement and ending at
1465     *   the start of the current match, is appended to the destination string.  Then the
1466     *   replacement string is appended to the destination string,
1467     *   including handling any substitutions of captured text.</p>
1468     *
1469     *   <p>For simple, prepackaged, non-incremental find-and-replace
1470     *   operations, see replaceFirst() or replaceAll().</p>
1471     *
1472     *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
1473     *   @param   replacement A UnicodeString that provides the text to be substituted for
1474     *                        the input text that matched the regexp pattern.  The replacement
1475     *                        text may contain references to captured text from the
1476     *                        input.
1477     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1478     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1479     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1480     *                        if the replacement text specifies a capture group that
1481     *                        does not exist in the pattern.
1482     *
1483     *   @return  this  RegexMatcher
1484     *   @stable ICU 2.4
1485     *
1486     */
1487     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1488         const UnicodeString &replacement, UErrorCode &status);
1489 
1490 
1491    /**
1492     *   Implements a replace operation intended to be used as part of an
1493     *   incremental find-and-replace.
1494     *
1495     *   <p>The input text, starting from the end of the previous replacement and ending at
1496     *   the start of the current match, is appended to the destination text.  Then the
1497     *   replacement text is appended to the destination text,
1498     *   including handling any substitutions of captured text.</p>
1499     *
1500     *   <p>For simple, prepackaged, non-incremental find-and-replace
1501     *   operations, see replaceFirst() or replaceAll().</p>
1502     *
1503     *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
1504     *                         Must not be NULL.
1505     *   @param   replacement A UText that provides the text to be substituted for
1506     *                        the input text that matched the regexp pattern.  The replacement
1507     *                        text may contain references to captured text from the input.
1508     *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1509     *                        errors are  U_REGEX_INVALID_STATE if no match has been
1510     *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1511     *                        if the replacement text specifies a capture group that
1512     *                        does not exist in the pattern.
1513     *
1514     *   @return  this  RegexMatcher
1515     *
1516     *   @stable ICU 4.6
1517     */
1518     virtual RegexMatcher &appendReplacement(UText *dest,
1519         UText *replacement, UErrorCode &status);
1520 
1521 
1522    /**
1523     * As the final step in a find-and-replace operation, append the remainder
1524     * of the input string, starting at the position following the last appendReplacement(),
1525     * to the destination string. <code>appendTail()</code> is intended to be invoked after one
1526     * or more invocations of <code>RegexMatcher::appendReplacement()</code>.
1527     *
1528     *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
1529     *  @return  the destination string, dest.
1530     *  @stable ICU 2.4
1531     */
1532     virtual UnicodeString &appendTail(UnicodeString &dest);
1533 
1534 
1535    /**
1536     * As the final step in a find-and-replace operation, append the remainder
1537     * of the input text, starting at the position following the last appendReplacement(),
1538     * to the destination text. <code>appendTail()</code> is intended to be invoked after one
1539     * or more invocations of <code>RegexMatcher::appendReplacement()</code>.
1540     *
1541     *  @param dest A mutable UText to which the results of the find-and-replace are appended.
1542     *               Must not be NULL.
1543     *  @param status A reference to a UErrorCode to receive any errors.
1544     *  @return  the destination UText.
1545     *
1546     *  @stable ICU 4.6
1547     */
1548     virtual UText *appendTail(UText *dest, UErrorCode &status);
1549 
1550 
1551     /**
1552      * Split a string into fields.  Somewhat like split() from Perl.
1553      * The pattern matches identify delimiters that separate the input
1554      *  into fields.  The input data between the matches becomes the
1555      *  fields themselves.
1556      *
1557      * @param input   The string to be split into fields.  The field delimiters
1558      *                are the matches of this matcher's pattern.  This matcher
1559      *                will be reset to this input string.
1560      * @param dest    An array of UnicodeStrings to receive the results of the split.
1561      *                This is an array of actual UnicodeString objects, not an
1562      *                array of pointers to strings.  Local (stack based) arrays can
1563      *                work well here.
1564      * @param destCapacity  The number of elements in the destination array.
1565      *                If the number of fields found is less than destCapacity, the
1566      *                extra strings in the destination array are not altered.
1567      *                If the number of destination strings is less than the number
1568      *                of fields, the trailing part of the input string, including any
1569      *                field delimiters, is placed in the last destination string.
1570      * @param status  A reference to a UErrorCode to receive any errors.
1571      * @return        The number of fields into which the input string was split.
1572      * @stable ICU 2.6
1573      */
1574     virtual int32_t  split(const UnicodeString &input,
1575         UnicodeString    dest[],
1576         int32_t          destCapacity,
1577         UErrorCode       &status);
1578 
1579 
1580     /**
1581      * Split a text into fields.  Somewhat like split() from Perl.
1582      * The pattern matches identify delimiters that separate the input
1583      *  into fields.  The input data between the matches becomes the
1584      *  fields themselves.
1585      *
1586      * @param input   The text to be split into fields.  The field delimiters
1587      *                are the matches of this matcher's pattern.  This matcher
1588      *                will be reset to this input text.
1589      * @param dest    An array of mutable UText structs to receive the results of the split.
1590      *                If a field is NULL, a new UText is allocated to contain the results for
1591      *                that field. This new UText is not guaranteed to be mutable.
1592      * @param destCapacity  The number of elements in the destination array.
1593      *                If the number of fields found is less than destCapacity, the
1594      *                extra elements in the destination array are not altered.
1595      *                If the number of destination elements is less than the number
1596      *                of fields, the trailing part of the input text, including any
1597      *                field delimiters, is placed in the last destination element.
1598      * @param status  A reference to a UErrorCode to receive any errors.
1599      * @return        The number of fields into which the input text was split.
1600      *
1601      * @stable ICU 4.6
1602      */
1603     virtual int32_t  split(UText *input,
1604         UText           *dest[],
1605         int32_t          destCapacity,
1606         UErrorCode       &status);
1607 
1608   /**
1609     *   Set a processing time limit for match operations with this Matcher.
1610     *
1611     *   Some patterns, when matching certain strings, can run in exponential time.
1612     *   For practical purposes, the match operation may appear to be in an
1613     *   infinite loop.
1614     *   When a limit is set a match operation will fail with an error if the
1615     *   limit is exceeded.
1616     *   <p>
1617     *   The units of the limit are steps of the match engine.
1618     *   Correspondence with actual processor time will depend on the speed
1619     *   of the processor and the details of the specific pattern, but will
1620     *   typically be on the order of milliseconds.
1621     *   <p>
1622     *   By default, the matching time is not limited.
1623     *
1624     *
1625     *   @param   limit       The limit value, or 0 for no limit.
1626     *   @param   status      A reference to a UErrorCode to receive any errors.
1627     *   @stable ICU 4.0
1628     */
1629     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1630 
1631   /**
1632     * Get the time limit, if any, for match operations made with this Matcher.
1633     *
1634     *   @return the maximum allowed time for a match, in units of processing steps, or zero if there is no limit.
1635     *   @stable ICU 4.0
1636     */
1637     virtual int32_t getTimeLimit() const;
1638 
1639   /**
1640     *  Set the amount of heap storage available for use by the match backtracking stack.
1641     *  The matcher is also reset, discarding any results from previous matches.
1642     *  <p>
1643     *  ICU uses a backtracking regular expression engine, with the backtrack stack
1644     *  maintained on the heap.  This function sets the limit to the amount of memory
1645     *  that can be used for this purpose.  A backtracking stack overflow will
1646     *  result in an error from the match operation that caused it.
1647     *  <p>
1648     *  A limit is desirable because a malicious or poorly designed pattern can use
1649     *  excessive memory, potentially crashing the process.  A limit is enabled
1650     *  by default.
1651     *  <p>
1652     *  @param limit  The maximum size, in bytes, of the matching backtrack stack.
1653     *                A value of zero means no limit.
1654     *                The limit must be greater than or equal to zero.
1655     *
1656     *  @param status   A reference to a UErrorCode to receive any errors.
1657     *
1658     *  @stable ICU 4.0
1659     */
1660     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
1661 
1662   /**
1663     *  Get the size of the heap storage available for use by the backtracking stack.
1664     *
1665     *  @return  the maximum backtracking stack size, in bytes, or zero if the
1666     *           stack size is unlimited.
1667     *  @stable ICU 4.0
1668     */
1669     virtual int32_t  getStackLimit() const;
1670 
1671 
1672   /**
1673     * Set a callback function for use with this Matcher.
1674     * During matching operations the function will be called periodically,
1675     * giving the application the opportunity to terminate a long-running
1676     * match.
1677     *
1678     *    @param   callback    A pointer to the user-supplied URegexMatchCallback function.
1679     *    @param   context     User context pointer.  The value supplied at the
1680     *                         time the callback function is set will be saved
1681     *                         and passed to the callback each time that it is called.
1682     *    @param   status      A reference to a UErrorCode to receive any errors.
1683     *    @stable ICU 4.0
1684     */
1685     virtual void setMatchCallback(URegexMatchCallback     *callback,
1686                                   const void              *context,
1687                                   UErrorCode              &status);
1688 
1689 
1690   /**
1691     *  Get the match callback function for this Matcher.
1692     *
1693     *    @param   callback    Out parameter, receives a pointer to the user-supplied
1694     *                         callback function.
1695     *    @param   context     Out parameter, receives the user context pointer that
1696     *                         was set when setMatchCallback() was called.
1697     *    @param   status      A reference to a UErrorCode to receive any errors.
1698     *    @stable ICU 4.0
1699     */
1700     virtual void getMatchCallback(URegexMatchCallback     *&callback,
1701                                   const void              *&context,
1702                                   UErrorCode              &status);
1703 
1704 
1705   /**
1706     * Set a progress callback function for use with find operations on this Matcher.
1707     * During find operations, the callback will be invoked after each return from a
1708     * match attempt, giving the application the opportunity to terminate a long-running
1709     * find operation.
1710     *
1711     *    @param   callback    A pointer to the user-supplied URegexFindProgressCallback function.
1712     *    @param   context     User context pointer.  The value supplied at the
1713     *                         time the callback function is set will be saved
1714     *                         and passed to the callback each time that it is called.
1715     *    @param   status      A reference to a UErrorCode to receive any errors.
1716     *    @stable ICU 4.6
1717     */
1718     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
1719                                               const void                              *context,
1720                                               UErrorCode                              &status);
1721 
1722 
1723   /**
1724     *  Get the find progress callback function for this Matcher.
1725     *
1726     *    @param   callback    Out parameter, receives a pointer to the user-supplied
1727     *                         callback function.
1728     *    @param   context     Out parameter, receives the user context pointer that
1729     *                         was set when setFindProgressCallback() was called.
1730     *    @param   status      A reference to a UErrorCode to receive any errors.
1731     *    @stable ICU 4.6
1732     */
1733     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
1734                                               const void                      *&context,
1735                                               UErrorCode                      &status);
1736 
1737 #ifndef U_HIDE_INTERNAL_API
1738    /**
1739      *   setTrace   Debug function, enable/disable tracing of the matching engine.
1740      *              For internal ICU development use only.  DO NOT USE!!!!
1741      *   @internal
1742      */
1743     void setTrace(UBool state);
1744 #endif  /* U_HIDE_INTERNAL_API */
1745 
1746     /**
1747      * ICU "poor man's RTTI", returns a UClassID for this class.
1748      *
1749      * @stable ICU 2.2
1750      */
1751     static UClassID U_EXPORT2 getStaticClassID();
1752 
1753     /**
1754      * ICU "poor man's RTTI", returns a UClassID for the actual class of this object.
1755      *
1756      * @stable ICU 2.2
1757      */
1758     virtual UClassID getDynamicClassID() const;
1759 
1760 private:
1761     // Constructors and other object boilerplate are private.
1762     // Instances of RegexMatcher cannot be assigned, copied, cloned, etc.
1763     RegexMatcher();                  // default constructor not implemented
1764     RegexMatcher(const RegexPattern *pat);   // private: usable only by members and the friends declared below
1765     RegexMatcher(const RegexMatcher &other);             // private copy constructor: copying is not supported
1766     RegexMatcher &operator =(const RegexMatcher &rhs);   // private assignment: assigning is not supported
1767     void init(UErrorCode &status);                      // Common initialization
1768     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
1769 
1770     friend class RegexPattern;
1771     friend class RegexCImpl;
1772 public:     // Public section holding only internal API; hidden when U_HIDE_INTERNAL_API is defined.
1773 #ifndef U_HIDE_INTERNAL_API
1774     /** Internal use only.  @internal  */
1775     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
1776 #endif  /* U_HIDE_INTERNAL_API */
1777 private:
1778 
1779     //
1780     //  MatchAt   This is the internal interface to the match engine itself.
1781     //            Match status comes back in matcher member variables.
1782     //
1783     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1784     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
1785     UBool                isWordBoundary(int64_t pos);         // perform Perl-like \b test
1786     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
1787     REStackFrame        *resetStack();
1788     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1789     void                 IncrementTime(UErrorCode &status);
1790 
1791     // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1792     inline UBool         findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1793 
1794     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1795 
1796     UBool                findUsingChunk(UErrorCode &status);
1797     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);  // NOTE(review): looks like an int32_t-indexed counterpart of MatchAt - confirm
1798     UBool                isChunkWordBoundary(int32_t pos);                                 // NOTE(review): looks like an int32_t-indexed counterpart of isWordBoundary - confirm
1799 
1800     const RegexPattern  *fPattern;
1801     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
1802                                            //   should delete it when through.
1803 
1804     const UnicodeString *fInput;           // The string being matched. Only used for input()
1805     UText               *fInputText;       // The text being matched. Is never NULL.
1806     UText               *fAltInputText;    // A shallow copy of the text being matched.
1807                                            //   Only created if the pattern contains backreferences.
1808     int64_t              fInputLength;     // Full length of the input text.
1809     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
1810 
1811     int64_t              fRegionStart;     // Start of the input region, default = 0.
1812     int64_t              fRegionLimit;     // End of input region, default to input.length.
1813 
1814     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
1815     int64_t              fAnchorLimit;     //   See useAnchoringBounds
1816 
1817     int64_t              fLookStart;       // Region bounds for look-ahead/behind and
1818     int64_t              fLookLimit;       //   other boundary tests.  See
1819                                            //   useTransparentBounds
1820 
1821     int64_t              fActiveStart;     // Currently active bounds for matching.
1822     int64_t              fActiveLimit;     //   Usually is the same as region, but
1823                                            //   is changed to fLookStart/Limit when
1824                                            //   entering look around regions.
1825 
1826     UBool                fTransparentBounds;  // True if using transparent bounds.
1827     UBool                fAnchoringBounds; // True if using anchoring bounds.
1828 
1829     UBool                fMatch;           // True if the last attempted match was successful.
1830     int64_t              fMatchStart;      // Position of the start of the most recent match
1831     int64_t              fMatchEnd;        // First position after the end of the most recent match
1832                                            //   Zero if no previous match, even when a region
1833                                            //   is active.
1834     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
1835                                            //   or -1 if there was no previous match.
1836     int64_t              fAppendPosition;  // First position after the end of the previous
1837                                            //   appendReplacement().  As described by the
1838                                            //   JavaDoc for Java Matcher, where it is called
1839                                            //   "append position"
1840     UBool                fHitEnd;          // True if the last match touched the end of input.
1841     UBool                fRequireEnd;      // True if the last match required end-of-input
1842                                            //    (matched $ or Z)
1843 
1844     UVector64           *fStack;           // NOTE(review): presumably the backtrack stack bounded by fStackLimit - confirm.
1845     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
1846                                            //   which will contain the capture group results.
1847                                            //   NOT valid while match engine is running.
1848 
1849     int64_t             *fData;            // Data area for use by the compiled pattern.
1850     int64_t             fSmallData[8];     //   Use this for data if it's enough.
1851 
1852     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
1853                                            //   match engine run.  Zero for unlimited.
1854 
1855     int32_t             fTime;             // Match time, accumulates while matching.
1856     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
1857                                            //   Kept separately from fTime to keep as much
1858                                            //   code as possible out of the inline
1859                                            //   StateSave function.
1860 
1861     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
1862                                            //   stack, in bytes.  Zero for unlimited.
1863 
1864     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
1865                                            //   NULL if there is no callback.
1866     const void         *fCallbackContext;  // User Context ptr for match callback function.
1867 
1868     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback funct.
1869                                                            //   NULL if there is no callback.
1870     const void         *fFindProgressCallbackContext;      // User Context ptr for find progress callback function.
1871 
1872 
1873     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1874 
1875     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
1876 
1877     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
1878                                            //   reported, or that permanently disables this matcher.
1879 
1880     RuleBasedBreakIterator  *fWordBreakItr;   // NOTE(review): presumably backs the RBBI based \b test in isUWordBoundary() - confirm.
1881 };
1882 
1883 U_NAMESPACE_END
1884 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
1885 #endif
1886