1 /*
2 *******************************************************************************
3 *   Copyright (C) 2004-2015, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  uregex.cpp
7 */
8 
9 #include "unicode/utypes.h"
10 
11 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
12 
13 #include "unicode/regex.h"
14 #include "unicode/uregex.h"
15 #include "unicode/unistr.h"
16 #include "unicode/ustring.h"
17 #include "unicode/uchar.h"
18 #include "unicode/uobject.h"
19 #include "unicode/utf16.h"
20 #include "cmemory.h"
21 #include "uassert.h"
22 #include "uhash.h"
23 #include "umutex.h"
24 #include "uvectr32.h"
25 
26 #include "regextxt.h"
27 
28 U_NAMESPACE_BEGIN
29 
// Space left in a buffer of capacity `len` once `idx` units have been used.
// Yields (len - idx) while that difference is positive, and 0 otherwise.
// Both arguments are evaluated more than once, so pass side-effect-free expressions.
#define REMAINING_CAPACITY(idx,len) ((((len)-(idx))<=0)?0:((len)-(idx)))
31 
32 struct RegularExpression: public UMemory {
33 public:
34     RegularExpression();
35     ~RegularExpression();
36     int32_t           fMagic;
37     RegexPattern     *fPat;
38     u_atomic_int32_t *fPatRefCount;
39     UChar            *fPatString;
40     int32_t           fPatStringLen;
41     RegexMatcher     *fMatcher;
42     const UChar      *fText;         // Text from setText()
43     int32_t           fTextLength;   // Length provided by user with setText(), which
44                                      //  may be -1.
45     UBool             fOwnsText;
46 };
47 
48 static const int32_t REXP_MAGIC = 0x72657870; // "rexp" in ASCII
49 
RegularExpression()50 RegularExpression::RegularExpression() {
51     fMagic        = REXP_MAGIC;
52     fPat          = NULL;
53     fPatRefCount  = NULL;
54     fPatString    = NULL;
55     fPatStringLen = 0;
56     fMatcher      = NULL;
57     fText         = NULL;
58     fTextLength   = 0;
59     fOwnsText     = FALSE;
60 }
61 
~RegularExpression()62 RegularExpression::~RegularExpression() {
63     delete fMatcher;
64     fMatcher = NULL;
65     if (fPatRefCount!=NULL && umtx_atomic_dec(fPatRefCount)==0) {
66         delete fPat;
67         uprv_free(fPatString);
68         uprv_free((void *)fPatRefCount);
69     }
70     if (fOwnsText && fText!=NULL) {
71         uprv_free((void *)fText);
72     }
73     fMagic = 0;
74 }
75 
76 U_NAMESPACE_END
77 
78 U_NAMESPACE_USE
79 
80 //----------------------------------------------------------------------------------------
81 //
82 //   validateRE    Do boilerplate style checks on API function parameters.
83 //                 Return TRUE if they look OK.
84 //----------------------------------------------------------------------------------------
validateRE(const RegularExpression * re,UBool requiresText,UErrorCode * status)85 static UBool validateRE(const RegularExpression *re, UBool requiresText, UErrorCode *status) {
86     if (U_FAILURE(*status)) {
87         return FALSE;
88     }
89     if (re == NULL || re->fMagic != REXP_MAGIC) {
90         *status = U_ILLEGAL_ARGUMENT_ERROR;
91         return FALSE;
92     }
93     // !!! Not sure how to update this with the new UText backing, which is stored in re->fMatcher anyway
94     if (requiresText && re->fText == NULL && !re->fOwnsText) {
95         *status = U_REGEX_INVALID_STATE;
96         return FALSE;
97     }
98     return TRUE;
99 }
100 
101 //----------------------------------------------------------------------------------------
102 //
103 //    uregex_open
104 //
105 //----------------------------------------------------------------------------------------
106 U_CAPI URegularExpression *  U_EXPORT2
uregex_open(const UChar * pattern,int32_t patternLength,uint32_t flags,UParseError * pe,UErrorCode * status)107 uregex_open( const  UChar          *pattern,
108                     int32_t         patternLength,
109                     uint32_t        flags,
110                     UParseError    *pe,
111                     UErrorCode     *status) {
112 
113     if (U_FAILURE(*status)) {
114         return NULL;
115     }
116     if (pattern == NULL || patternLength < -1 || patternLength == 0) {
117         *status = U_ILLEGAL_ARGUMENT_ERROR;
118         return NULL;
119     }
120     int32_t actualPatLen = patternLength;
121     if (actualPatLen == -1) {
122         actualPatLen = u_strlen(pattern);
123     }
124 
125     RegularExpression  *re     = new RegularExpression;
126     u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
127     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(actualPatLen+1));
128     if (re == NULL || refC == NULL || patBuf == NULL) {
129         *status = U_MEMORY_ALLOCATION_ERROR;
130         delete re;
131         uprv_free((void *)refC);
132         uprv_free(patBuf);
133         return NULL;
134     }
135     re->fPatRefCount = refC;
136     *re->fPatRefCount = 1;
137 
138     //
139     // Make a copy of the pattern string, so we can return it later if asked.
140     //    For compiling the pattern, we will use a UText wrapper around
141     //    this local copy, to avoid making even more copies.
142     //
143     re->fPatString    = patBuf;
144     re->fPatStringLen = patternLength;
145     u_memcpy(patBuf, pattern, actualPatLen);
146     patBuf[actualPatLen] = 0;
147 
148     UText patText = UTEXT_INITIALIZER;
149     utext_openUChars(&patText, patBuf, patternLength, status);
150 
151     //
152     // Compile the pattern
153     //
154     if (pe != NULL) {
155         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
156     } else {
157         re->fPat = RegexPattern::compile(&patText, flags, *status);
158     }
159     utext_close(&patText);
160 
161     if (U_FAILURE(*status)) {
162         goto ErrorExit;
163     }
164 
165     //
166     // Create the matcher object
167     //
168     re->fMatcher = re->fPat->matcher(*status);
169     if (U_SUCCESS(*status)) {
170         return (URegularExpression*)re;
171     }
172 
173 ErrorExit:
174     delete re;
175     return NULL;
176 
177 }
178 
179 //----------------------------------------------------------------------------------------
180 //
181 //    uregex_openUText
182 //
183 //----------------------------------------------------------------------------------------
184 U_CAPI URegularExpression *  U_EXPORT2
uregex_openUText(UText * pattern,uint32_t flags,UParseError * pe,UErrorCode * status)185 uregex_openUText(UText          *pattern,
186                  uint32_t        flags,
187                  UParseError    *pe,
188                  UErrorCode     *status) {
189 
190     if (U_FAILURE(*status)) {
191         return NULL;
192     }
193     if (pattern == NULL) {
194         *status = U_ILLEGAL_ARGUMENT_ERROR;
195         return NULL;
196     }
197 
198     int64_t patternNativeLength = utext_nativeLength(pattern);
199 
200     if (patternNativeLength == 0) {
201         *status = U_ILLEGAL_ARGUMENT_ERROR;
202         return NULL;
203     }
204 
205     RegularExpression *re     = new RegularExpression;
206 
207     UErrorCode lengthStatus = U_ZERO_ERROR;
208     int32_t pattern16Length = utext_extract(pattern, 0, patternNativeLength, NULL, 0, &lengthStatus);
209 
210     u_atomic_int32_t   *refC   = (u_atomic_int32_t *)uprv_malloc(sizeof(int32_t));
211     UChar              *patBuf = (UChar *)uprv_malloc(sizeof(UChar)*(pattern16Length+1));
212     if (re == NULL || refC == NULL || patBuf == NULL) {
213         *status = U_MEMORY_ALLOCATION_ERROR;
214         delete re;
215         uprv_free((void *)refC);
216         uprv_free(patBuf);
217         return NULL;
218     }
219     re->fPatRefCount = refC;
220     *re->fPatRefCount = 1;
221 
222     //
223     // Make a copy of the pattern string, so we can return it later if asked.
224     //    For compiling the pattern, we will use a read-only UText wrapper
225     //    around this local copy, to avoid making even more copies.
226     //
227     re->fPatString    = patBuf;
228     re->fPatStringLen = pattern16Length;
229     utext_extract(pattern, 0, patternNativeLength, patBuf, pattern16Length+1, status);
230 
231     UText patText = UTEXT_INITIALIZER;
232     utext_openUChars(&patText, patBuf, pattern16Length, status);
233 
234     //
235     // Compile the pattern
236     //
237     if (pe != NULL) {
238         re->fPat = RegexPattern::compile(&patText, flags, *pe, *status);
239     } else {
240         re->fPat = RegexPattern::compile(&patText, flags, *status);
241     }
242     utext_close(&patText);
243 
244     if (U_FAILURE(*status)) {
245         goto ErrorExit;
246     }
247 
248     //
249     // Create the matcher object
250     //
251     re->fMatcher = re->fPat->matcher(*status);
252     if (U_SUCCESS(*status)) {
253         return (URegularExpression*)re;
254     }
255 
256 ErrorExit:
257     delete re;
258     return NULL;
259 
260 }
261 
262 //----------------------------------------------------------------------------------------
263 //
264 //    uregex_close
265 //
266 //----------------------------------------------------------------------------------------
267 U_CAPI void  U_EXPORT2
uregex_close(URegularExpression * re2)268 uregex_close(URegularExpression  *re2) {
269     RegularExpression *re = (RegularExpression*)re2;
270     UErrorCode  status = U_ZERO_ERROR;
271     if (validateRE(re, FALSE, &status) == FALSE) {
272         return;
273     }
274     delete re;
275 }
276 
277 
278 //----------------------------------------------------------------------------------------
279 //
280 //    uregex_clone
281 //
282 //----------------------------------------------------------------------------------------
283 U_CAPI URegularExpression * U_EXPORT2
uregex_clone(const URegularExpression * source2,UErrorCode * status)284 uregex_clone(const URegularExpression *source2, UErrorCode *status)  {
285     RegularExpression *source = (RegularExpression*)source2;
286     if (validateRE(source, FALSE, status) == FALSE) {
287         return NULL;
288     }
289 
290     RegularExpression *clone = new RegularExpression;
291     if (clone == NULL) {
292         *status = U_MEMORY_ALLOCATION_ERROR;
293         return NULL;
294     }
295 
296     clone->fMatcher = source->fPat->matcher(*status);
297     if (U_FAILURE(*status)) {
298         delete clone;
299         return NULL;
300     }
301 
302     clone->fPat          = source->fPat;
303     clone->fPatRefCount  = source->fPatRefCount;
304     clone->fPatString    = source->fPatString;
305     clone->fPatStringLen = source->fPatStringLen;
306     umtx_atomic_inc(source->fPatRefCount);
307     // Note:  fText is not cloned.
308 
309     return (URegularExpression*)clone;
310 }
311 
312 
313 
314 
315 //------------------------------------------------------------------------------
316 //
317 //    uregex_pattern
318 //
319 //------------------------------------------------------------------------------
320 U_CAPI const UChar * U_EXPORT2
uregex_pattern(const URegularExpression * regexp2,int32_t * patLength,UErrorCode * status)321 uregex_pattern(const  URegularExpression *regexp2,
322                       int32_t            *patLength,
323                       UErrorCode         *status)  {
324     RegularExpression *regexp = (RegularExpression*)regexp2;
325 
326     if (validateRE(regexp, FALSE, status) == FALSE) {
327         return NULL;
328     }
329     if (patLength != NULL) {
330         *patLength = regexp->fPatStringLen;
331     }
332     return regexp->fPatString;
333 }
334 
335 
336 //------------------------------------------------------------------------------
337 //
338 //    uregex_patternUText
339 //
340 //------------------------------------------------------------------------------
341 U_CAPI UText * U_EXPORT2
uregex_patternUText(const URegularExpression * regexp2,UErrorCode * status)342 uregex_patternUText(const URegularExpression *regexp2,
343                           UErrorCode         *status)  {
344     RegularExpression *regexp = (RegularExpression*)regexp2;
345     return regexp->fPat->patternText(*status);
346 }
347 
348 
349 //------------------------------------------------------------------------------
350 //
351 //    uregex_flags
352 //
353 //------------------------------------------------------------------------------
354 U_CAPI int32_t U_EXPORT2
uregex_flags(const URegularExpression * regexp2,UErrorCode * status)355 uregex_flags(const URegularExpression *regexp2, UErrorCode *status)  {
356     RegularExpression *regexp = (RegularExpression*)regexp2;
357     if (validateRE(regexp, FALSE, status) == FALSE) {
358         return 0;
359     }
360     int32_t flags = regexp->fPat->flags();
361     return flags;
362 }
363 
364 
365 //------------------------------------------------------------------------------
366 //
367 //    uregex_setText
368 //
369 //------------------------------------------------------------------------------
370 U_CAPI void U_EXPORT2
uregex_setText(URegularExpression * regexp2,const UChar * text,int32_t textLength,UErrorCode * status)371 uregex_setText(URegularExpression *regexp2,
372                const UChar        *text,
373                int32_t             textLength,
374                UErrorCode         *status)  {
375     RegularExpression *regexp = (RegularExpression*)regexp2;
376     if (validateRE(regexp, FALSE, status) == FALSE) {
377         return;
378     }
379     if (text == NULL || textLength < -1) {
380         *status = U_ILLEGAL_ARGUMENT_ERROR;
381         return;
382     }
383 
384     if (regexp->fOwnsText && regexp->fText != NULL) {
385         uprv_free((void *)regexp->fText);
386     }
387 
388     regexp->fText       = text;
389     regexp->fTextLength = textLength;
390     regexp->fOwnsText   = FALSE;
391 
392     UText input = UTEXT_INITIALIZER;
393     utext_openUChars(&input, text, textLength, status);
394     regexp->fMatcher->reset(&input);
395     utext_close(&input); // reset() made a shallow clone, so we don't need this copy
396 }
397 
398 
399 //------------------------------------------------------------------------------
400 //
401 //    uregex_setUText
402 //
403 //------------------------------------------------------------------------------
404 U_CAPI void U_EXPORT2
uregex_setUText(URegularExpression * regexp2,UText * text,UErrorCode * status)405 uregex_setUText(URegularExpression *regexp2,
406                 UText              *text,
407                 UErrorCode         *status) {
408     RegularExpression *regexp = (RegularExpression*)regexp2;
409     if (validateRE(regexp, FALSE, status) == FALSE) {
410         return;
411     }
412     if (text == NULL) {
413         *status = U_ILLEGAL_ARGUMENT_ERROR;
414         return;
415     }
416 
417     if (regexp->fOwnsText && regexp->fText != NULL) {
418         uprv_free((void *)regexp->fText);
419     }
420 
421     regexp->fText       = NULL; // only fill it in on request
422     regexp->fTextLength = -1;
423     regexp->fOwnsText   = TRUE;
424     regexp->fMatcher->reset(text);
425 }
426 
427 
428 
429 //------------------------------------------------------------------------------
430 //
431 //    uregex_getText
432 //
433 //------------------------------------------------------------------------------
434 U_CAPI const UChar * U_EXPORT2
uregex_getText(URegularExpression * regexp2,int32_t * textLength,UErrorCode * status)435 uregex_getText(URegularExpression *regexp2,
436                int32_t            *textLength,
437                UErrorCode         *status)  {
438     RegularExpression *regexp = (RegularExpression*)regexp2;
439     if (validateRE(regexp, FALSE, status) == FALSE) {
440         return NULL;
441     }
442 
443     if (regexp->fText == NULL) {
444         // need to fill in the text
445         UText *inputText = regexp->fMatcher->inputText();
446         int64_t inputNativeLength = utext_nativeLength(inputText);
447         if (UTEXT_FULL_TEXT_IN_CHUNK(inputText, inputNativeLength)) {
448             regexp->fText = inputText->chunkContents;
449             regexp->fTextLength = (int32_t)inputNativeLength;
450             regexp->fOwnsText = FALSE; // because the UText owns it
451         } else {
452             UErrorCode lengthStatus = U_ZERO_ERROR;
453             regexp->fTextLength = utext_extract(inputText, 0, inputNativeLength, NULL, 0, &lengthStatus); // buffer overflow error
454             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(regexp->fTextLength+1));
455 
456             utext_extract(inputText, 0, inputNativeLength, inputChars, regexp->fTextLength+1, status);
457             regexp->fText = inputChars;
458             regexp->fOwnsText = TRUE; // should already be set but just in case
459         }
460     }
461 
462     if (textLength != NULL) {
463         *textLength = regexp->fTextLength;
464     }
465     return regexp->fText;
466 }
467 
468 
469 //------------------------------------------------------------------------------
470 //
471 //    uregex_getUText
472 //
473 //------------------------------------------------------------------------------
474 U_CAPI UText * U_EXPORT2
uregex_getUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)475 uregex_getUText(URegularExpression *regexp2,
476                 UText              *dest,
477                 UErrorCode         *status)  {
478     RegularExpression *regexp = (RegularExpression*)regexp2;
479     if (validateRE(regexp, FALSE, status) == FALSE) {
480         return dest;
481     }
482     return regexp->fMatcher->getInput(dest, *status);
483 }
484 
485 
486 //------------------------------------------------------------------------------
487 //
488 //    uregex_refreshUText
489 //
490 //------------------------------------------------------------------------------
491 U_CAPI void U_EXPORT2
uregex_refreshUText(URegularExpression * regexp2,UText * text,UErrorCode * status)492 uregex_refreshUText(URegularExpression *regexp2,
493                     UText              *text,
494                     UErrorCode         *status) {
495     RegularExpression *regexp = (RegularExpression*)regexp2;
496     if (validateRE(regexp, FALSE, status) == FALSE) {
497         return;
498     }
499     regexp->fMatcher->refreshInputText(text, *status);
500 }
501 
502 
503 //------------------------------------------------------------------------------
504 //
505 //    uregex_matches
506 //
507 //------------------------------------------------------------------------------
508 U_CAPI UBool U_EXPORT2
uregex_matches(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)509 uregex_matches(URegularExpression *regexp2,
510                int32_t            startIndex,
511                UErrorCode        *status)  {
512     return uregex_matches64( regexp2, (int64_t)startIndex, status);
513 }
514 
515 U_CAPI UBool U_EXPORT2
uregex_matches64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)516 uregex_matches64(URegularExpression *regexp2,
517                  int64_t            startIndex,
518                  UErrorCode        *status)  {
519     RegularExpression *regexp = (RegularExpression*)regexp2;
520     UBool result = FALSE;
521     if (validateRE(regexp, TRUE, status) == FALSE) {
522         return result;
523     }
524     if (startIndex == -1) {
525         result = regexp->fMatcher->matches(*status);
526     } else {
527         result = regexp->fMatcher->matches(startIndex, *status);
528     }
529     return result;
530 }
531 
532 
533 //------------------------------------------------------------------------------
534 //
535 //    uregex_lookingAt
536 //
537 //------------------------------------------------------------------------------
538 U_CAPI UBool U_EXPORT2
uregex_lookingAt(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)539 uregex_lookingAt(URegularExpression *regexp2,
540                  int32_t             startIndex,
541                  UErrorCode         *status)  {
542     return uregex_lookingAt64( regexp2, (int64_t)startIndex, status);
543 }
544 
545 U_CAPI UBool U_EXPORT2
uregex_lookingAt64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)546 uregex_lookingAt64(URegularExpression *regexp2,
547                    int64_t             startIndex,
548                    UErrorCode         *status)  {
549     RegularExpression *regexp = (RegularExpression*)regexp2;
550     UBool result = FALSE;
551     if (validateRE(regexp, TRUE, status) == FALSE) {
552         return result;
553     }
554     if (startIndex == -1) {
555         result = regexp->fMatcher->lookingAt(*status);
556     } else {
557         result = regexp->fMatcher->lookingAt(startIndex, *status);
558     }
559     return result;
560 }
561 
562 
563 
564 //------------------------------------------------------------------------------
565 //
566 //    uregex_find
567 //
568 //------------------------------------------------------------------------------
569 U_CAPI UBool U_EXPORT2
uregex_find(URegularExpression * regexp2,int32_t startIndex,UErrorCode * status)570 uregex_find(URegularExpression *regexp2,
571             int32_t             startIndex,
572             UErrorCode         *status)  {
573     return uregex_find64( regexp2, (int64_t)startIndex, status);
574 }
575 
576 U_CAPI UBool U_EXPORT2
uregex_find64(URegularExpression * regexp2,int64_t startIndex,UErrorCode * status)577 uregex_find64(URegularExpression *regexp2,
578               int64_t             startIndex,
579               UErrorCode         *status)  {
580     RegularExpression *regexp = (RegularExpression*)regexp2;
581     UBool result = FALSE;
582     if (validateRE(regexp, TRUE, status) == FALSE) {
583         return result;
584     }
585     if (startIndex == -1) {
586         regexp->fMatcher->resetPreserveRegion();
587         result = regexp->fMatcher->find(*status);
588     } else {
589         result = regexp->fMatcher->find(startIndex, *status);
590     }
591     return result;
592 }
593 
594 
595 //------------------------------------------------------------------------------
596 //
597 //    uregex_findNext
598 //
599 //------------------------------------------------------------------------------
600 U_CAPI UBool U_EXPORT2
uregex_findNext(URegularExpression * regexp2,UErrorCode * status)601 uregex_findNext(URegularExpression *regexp2,
602                 UErrorCode         *status)  {
603     RegularExpression *regexp = (RegularExpression*)regexp2;
604     if (validateRE(regexp, TRUE, status) == FALSE) {
605         return FALSE;
606     }
607     UBool result = regexp->fMatcher->find(*status);
608     return result;
609 }
610 
611 //------------------------------------------------------------------------------
612 //
613 //    uregex_groupCount
614 //
615 //------------------------------------------------------------------------------
616 U_CAPI int32_t U_EXPORT2
uregex_groupCount(URegularExpression * regexp2,UErrorCode * status)617 uregex_groupCount(URegularExpression *regexp2,
618                   UErrorCode         *status)  {
619     RegularExpression *regexp = (RegularExpression*)regexp2;
620     if (validateRE(regexp, FALSE, status) == FALSE) {
621         return 0;
622     }
623     int32_t  result = regexp->fMatcher->groupCount();
624     return result;
625 }
626 
627 
628 //------------------------------------------------------------------------------
629 //
630 //    uregex_groupNumberFromName
631 //
632 //------------------------------------------------------------------------------
633 int32_t
uregex_groupNumberFromName(URegularExpression * regexp2,const UChar * groupName,int32_t nameLength,UErrorCode * status)634 uregex_groupNumberFromName(URegularExpression *regexp2,
635                            const UChar        *groupName,
636                            int32_t             nameLength,
637                            UErrorCode          *status) {
638     RegularExpression *regexp = (RegularExpression*)regexp2;
639     if (validateRE(regexp, FALSE, status) == FALSE) {
640         return 0;
641     }
642     int32_t  result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);
643     return result;
644 }
645 
646 int32_t
uregex_groupNumberFromCName(URegularExpression * regexp2,const char * groupName,int32_t nameLength,UErrorCode * status)647 uregex_groupNumberFromCName(URegularExpression *regexp2,
648                             const char         *groupName,
649                             int32_t             nameLength,
650                             UErrorCode          *status) {
651     RegularExpression *regexp = (RegularExpression*)regexp2;
652     if (validateRE(regexp, FALSE, status) == FALSE) {
653         return 0;
654     }
655     return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);
656 }
657 
658 //------------------------------------------------------------------------------
659 //
660 //    uregex_group
661 //
662 //------------------------------------------------------------------------------
663 U_CAPI int32_t U_EXPORT2
uregex_group(URegularExpression * regexp2,int32_t groupNum,UChar * dest,int32_t destCapacity,UErrorCode * status)664 uregex_group(URegularExpression *regexp2,
665              int32_t             groupNum,
666              UChar              *dest,
667              int32_t             destCapacity,
668              UErrorCode          *status)  {
669     RegularExpression *regexp = (RegularExpression*)regexp2;
670     if (validateRE(regexp, TRUE, status) == FALSE) {
671         return 0;
672     }
673     if (destCapacity < 0 || (destCapacity > 0 && dest == NULL)) {
674         *status = U_ILLEGAL_ARGUMENT_ERROR;
675         return 0;
676     }
677 
678     if (destCapacity == 0 || regexp->fText != NULL) {
679         // If preflighting or if we already have the text as UChars,
680         // this is a little cheaper than extracting from the UText
681 
682         //
683         // Pick up the range of characters from the matcher
684         //
685         int32_t  startIx = regexp->fMatcher->start(groupNum, *status);
686         int32_t  endIx   = regexp->fMatcher->end  (groupNum, *status);
687         if (U_FAILURE(*status)) {
688             return 0;
689         }
690 
691         //
692         // Trim length based on buffer capacity
693         //
694         int32_t fullLength = endIx - startIx;
695         int32_t copyLength = fullLength;
696         if (copyLength < destCapacity) {
697             dest[copyLength] = 0;
698         } else if (copyLength == destCapacity) {
699             *status = U_STRING_NOT_TERMINATED_WARNING;
700         } else {
701             copyLength = destCapacity;
702             *status = U_BUFFER_OVERFLOW_ERROR;
703         }
704 
705         //
706         // Copy capture group to user's buffer
707         //
708         if (copyLength > 0) {
709             u_memcpy(dest, &regexp->fText[startIx], copyLength);
710         }
711         return fullLength;
712     } else {
713         int64_t  start = regexp->fMatcher->start64(groupNum, *status);
714         int64_t  limit = regexp->fMatcher->end64(groupNum, *status);
715         if (U_FAILURE(*status)) {
716             return 0;
717         }
718         // Note edge cases:
719         //   Group didn't match: start == end == -1. UText trims to 0, UText gives zero length result.
720         //   Zero Length Match: start == end.
721         int32_t length = utext_extract(regexp->fMatcher->inputText(), start, limit, dest, destCapacity, status);
722         return length;
723     }
724 
725 }
726 
727 
728 //------------------------------------------------------------------------------
729 //
730 //    uregex_groupUText
731 //
732 //------------------------------------------------------------------------------
733 U_CAPI UText * U_EXPORT2
uregex_groupUText(URegularExpression * regexp2,int32_t groupNum,UText * dest,int64_t * groupLength,UErrorCode * status)734 uregex_groupUText(URegularExpression *regexp2,
735                   int32_t             groupNum,
736                   UText              *dest,
737                   int64_t            *groupLength,
738                   UErrorCode         *status)  {
739     RegularExpression *regexp = (RegularExpression*)regexp2;
740     if (validateRE(regexp, TRUE, status) == FALSE) {
741         UErrorCode emptyTextStatus = U_ZERO_ERROR;
742         return (dest ? dest : utext_openUChars(NULL, NULL, 0, &emptyTextStatus));
743     }
744 
745     return regexp->fMatcher->group(groupNum, dest, *groupLength, *status);
746 }
747 
748 //------------------------------------------------------------------------------
749 //
750 //    uregex_start
751 //
752 //------------------------------------------------------------------------------
753 U_CAPI int32_t U_EXPORT2
uregex_start(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)754 uregex_start(URegularExpression *regexp2,
755              int32_t             groupNum,
756              UErrorCode          *status)  {
757     return (int32_t)uregex_start64( regexp2, groupNum, status);
758 }
759 
760 U_CAPI int64_t U_EXPORT2
uregex_start64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)761 uregex_start64(URegularExpression *regexp2,
762                int32_t             groupNum,
763                UErrorCode          *status)  {
764     RegularExpression *regexp = (RegularExpression*)regexp2;
765     if (validateRE(regexp, TRUE, status) == FALSE) {
766         return 0;
767     }
768     int32_t result = regexp->fMatcher->start(groupNum, *status);
769     return result;
770 }
771 
772 //------------------------------------------------------------------------------
773 //
774 //    uregex_end
775 //
776 //------------------------------------------------------------------------------
777 U_CAPI int32_t U_EXPORT2
uregex_end(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)778 uregex_end(URegularExpression   *regexp2,
779            int32_t               groupNum,
780            UErrorCode           *status)  {
781     return (int32_t)uregex_end64( regexp2, groupNum, status);
782 }
783 
784 U_CAPI int64_t U_EXPORT2
uregex_end64(URegularExpression * regexp2,int32_t groupNum,UErrorCode * status)785 uregex_end64(URegularExpression   *regexp2,
786              int32_t               groupNum,
787              UErrorCode           *status)  {
788     RegularExpression *regexp = (RegularExpression*)regexp2;
789     if (validateRE(regexp, TRUE, status) == FALSE) {
790         return 0;
791     }
792     int32_t result = regexp->fMatcher->end(groupNum, *status);
793     return result;
794 }
795 
796 //------------------------------------------------------------------------------
797 //
798 //    uregex_reset
799 //
800 //------------------------------------------------------------------------------
801 U_CAPI void U_EXPORT2
uregex_reset(URegularExpression * regexp2,int32_t index,UErrorCode * status)802 uregex_reset(URegularExpression    *regexp2,
803              int32_t               index,
804              UErrorCode            *status)  {
805     uregex_reset64( regexp2, (int64_t)index, status);
806 }
807 
808 U_CAPI void U_EXPORT2
uregex_reset64(URegularExpression * regexp2,int64_t index,UErrorCode * status)809 uregex_reset64(URegularExpression    *regexp2,
810                int64_t               index,
811                UErrorCode            *status)  {
812     RegularExpression *regexp = (RegularExpression*)regexp2;
813     if (validateRE(regexp, TRUE, status) == FALSE) {
814         return;
815     }
816     regexp->fMatcher->reset(index, *status);
817 }
818 
819 
820 //------------------------------------------------------------------------------
821 //
822 //    uregex_setRegion
823 //
824 //------------------------------------------------------------------------------
825 U_CAPI void U_EXPORT2
uregex_setRegion(URegularExpression * regexp2,int32_t regionStart,int32_t regionLimit,UErrorCode * status)826 uregex_setRegion(URegularExpression   *regexp2,
827                  int32_t               regionStart,
828                  int32_t               regionLimit,
829                  UErrorCode           *status)  {
830     uregex_setRegion64( regexp2, (int64_t)regionStart, (int64_t)regionLimit, status);
831 }
832 
833 U_CAPI void U_EXPORT2
uregex_setRegion64(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,UErrorCode * status)834 uregex_setRegion64(URegularExpression   *regexp2,
835                    int64_t               regionStart,
836                    int64_t               regionLimit,
837                    UErrorCode           *status)  {
838     RegularExpression *regexp = (RegularExpression*)regexp2;
839     if (validateRE(regexp, TRUE, status) == FALSE) {
840         return;
841     }
842     regexp->fMatcher->region(regionStart, regionLimit, *status);
843 }
844 
845 
846 //------------------------------------------------------------------------------
847 //
848 //    uregex_setRegionAndStart
849 //
850 //------------------------------------------------------------------------------
851 U_CAPI void U_EXPORT2
uregex_setRegionAndStart(URegularExpression * regexp2,int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode * status)852 uregex_setRegionAndStart(URegularExpression   *regexp2,
853                  int64_t               regionStart,
854                  int64_t               regionLimit,
855                  int64_t               startIndex,
856                  UErrorCode           *status)  {
857     RegularExpression *regexp = (RegularExpression*)regexp2;
858     if (validateRE(regexp, TRUE, status) == FALSE) {
859         return;
860     }
861     regexp->fMatcher->region(regionStart, regionLimit, startIndex, *status);
862 }
863 
864 //------------------------------------------------------------------------------
865 //
866 //    uregex_regionStart
867 //
868 //------------------------------------------------------------------------------
869 U_CAPI int32_t U_EXPORT2
uregex_regionStart(const URegularExpression * regexp2,UErrorCode * status)870 uregex_regionStart(const  URegularExpression   *regexp2,
871                           UErrorCode           *status)  {
872     return (int32_t)uregex_regionStart64(regexp2, status);
873 }
874 
875 U_CAPI int64_t U_EXPORT2
uregex_regionStart64(const URegularExpression * regexp2,UErrorCode * status)876 uregex_regionStart64(const  URegularExpression   *regexp2,
877                             UErrorCode           *status)  {
878     RegularExpression *regexp = (RegularExpression*)regexp2;
879     if (validateRE(regexp, TRUE, status) == FALSE) {
880         return 0;
881     }
882     return regexp->fMatcher->regionStart();
883 }
884 
885 
886 //------------------------------------------------------------------------------
887 //
888 //    uregex_regionEnd
889 //
890 //------------------------------------------------------------------------------
891 U_CAPI int32_t U_EXPORT2
uregex_regionEnd(const URegularExpression * regexp2,UErrorCode * status)892 uregex_regionEnd(const  URegularExpression   *regexp2,
893                         UErrorCode           *status)  {
894     return (int32_t)uregex_regionEnd64(regexp2, status);
895 }
896 
897 U_CAPI int64_t U_EXPORT2
uregex_regionEnd64(const URegularExpression * regexp2,UErrorCode * status)898 uregex_regionEnd64(const  URegularExpression   *regexp2,
899                           UErrorCode           *status)  {
900     RegularExpression *regexp = (RegularExpression*)regexp2;
901     if (validateRE(regexp, TRUE, status) == FALSE) {
902         return 0;
903     }
904     return regexp->fMatcher->regionEnd();
905 }
906 
907 
908 //------------------------------------------------------------------------------
909 //
910 //    uregex_hasTransparentBounds
911 //
912 //------------------------------------------------------------------------------
913 U_CAPI UBool U_EXPORT2
uregex_hasTransparentBounds(const URegularExpression * regexp2,UErrorCode * status)914 uregex_hasTransparentBounds(const  URegularExpression   *regexp2,
915                                    UErrorCode           *status)  {
916     RegularExpression *regexp = (RegularExpression*)regexp2;
917     if (validateRE(regexp, FALSE, status) == FALSE) {
918         return FALSE;
919     }
920     return regexp->fMatcher->hasTransparentBounds();
921 }
922 
923 
924 //------------------------------------------------------------------------------
925 //
926 //    uregex_useTransparentBounds
927 //
928 //------------------------------------------------------------------------------
929 U_CAPI void U_EXPORT2
uregex_useTransparentBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)930 uregex_useTransparentBounds(URegularExpression    *regexp2,
931                             UBool                  b,
932                             UErrorCode            *status)  {
933     RegularExpression *regexp = (RegularExpression*)regexp2;
934     if (validateRE(regexp, FALSE, status) == FALSE) {
935         return;
936     }
937     regexp->fMatcher->useTransparentBounds(b);
938 }
939 
940 
941 //------------------------------------------------------------------------------
942 //
943 //    uregex_hasAnchoringBounds
944 //
945 //------------------------------------------------------------------------------
946 U_CAPI UBool U_EXPORT2
uregex_hasAnchoringBounds(const URegularExpression * regexp2,UErrorCode * status)947 uregex_hasAnchoringBounds(const  URegularExpression   *regexp2,
948                                  UErrorCode           *status)  {
949     RegularExpression *regexp = (RegularExpression*)regexp2;
950     if (validateRE(regexp, FALSE, status) == FALSE) {
951         return FALSE;
952     }
953     return regexp->fMatcher->hasAnchoringBounds();
954 }
955 
956 
957 //------------------------------------------------------------------------------
958 //
959 //    uregex_useAnchoringBounds
960 //
961 //------------------------------------------------------------------------------
962 U_CAPI void U_EXPORT2
uregex_useAnchoringBounds(URegularExpression * regexp2,UBool b,UErrorCode * status)963 uregex_useAnchoringBounds(URegularExpression    *regexp2,
964                           UBool                  b,
965                           UErrorCode            *status)  {
966     RegularExpression *regexp = (RegularExpression*)regexp2;
967     if (validateRE(regexp, FALSE, status) == FALSE) {
968         return;
969     }
970     regexp->fMatcher->useAnchoringBounds(b);
971 }
972 
973 
974 //------------------------------------------------------------------------------
975 //
976 //    uregex_hitEnd
977 //
978 //------------------------------------------------------------------------------
979 U_CAPI UBool U_EXPORT2
uregex_hitEnd(const URegularExpression * regexp2,UErrorCode * status)980 uregex_hitEnd(const  URegularExpression   *regexp2,
981                      UErrorCode           *status)  {
982     RegularExpression *regexp = (RegularExpression*)regexp2;
983     if (validateRE(regexp, TRUE, status) == FALSE) {
984         return FALSE;
985     }
986     return regexp->fMatcher->hitEnd();
987 }
988 
989 
990 //------------------------------------------------------------------------------
991 //
992 //    uregex_requireEnd
993 //
994 //------------------------------------------------------------------------------
995 U_CAPI UBool U_EXPORT2
uregex_requireEnd(const URegularExpression * regexp2,UErrorCode * status)996 uregex_requireEnd(const  URegularExpression   *regexp2,
997                          UErrorCode           *status)  {
998     RegularExpression *regexp = (RegularExpression*)regexp2;
999     if (validateRE(regexp, TRUE, status) == FALSE) {
1000         return FALSE;
1001     }
1002     return regexp->fMatcher->requireEnd();
1003 }
1004 
1005 
1006 //------------------------------------------------------------------------------
1007 //
1008 //    uregex_setTimeLimit
1009 //
1010 //------------------------------------------------------------------------------
1011 U_CAPI void U_EXPORT2
uregex_setTimeLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1012 uregex_setTimeLimit(URegularExpression   *regexp2,
1013                     int32_t               limit,
1014                     UErrorCode           *status) {
1015     RegularExpression *regexp = (RegularExpression*)regexp2;
1016     if (validateRE(regexp, FALSE, status)) {
1017         regexp->fMatcher->setTimeLimit(limit, *status);
1018     }
1019 }
1020 
1021 
1022 
1023 //------------------------------------------------------------------------------
1024 //
1025 //    uregex_getTimeLimit
1026 //
1027 //------------------------------------------------------------------------------
1028 U_CAPI int32_t U_EXPORT2
uregex_getTimeLimit(const URegularExpression * regexp2,UErrorCode * status)1029 uregex_getTimeLimit(const  URegularExpression   *regexp2,
1030                            UErrorCode           *status) {
1031     int32_t retVal = 0;
1032     RegularExpression *regexp = (RegularExpression*)regexp2;
1033     if (validateRE(regexp, FALSE, status)) {
1034         retVal = regexp->fMatcher->getTimeLimit();
1035     }
1036     return retVal;
1037 }
1038 
1039 
1040 
1041 //------------------------------------------------------------------------------
1042 //
1043 //    uregex_setStackLimit
1044 //
1045 //------------------------------------------------------------------------------
1046 U_CAPI void U_EXPORT2
uregex_setStackLimit(URegularExpression * regexp2,int32_t limit,UErrorCode * status)1047 uregex_setStackLimit(URegularExpression   *regexp2,
1048                      int32_t               limit,
1049                      UErrorCode           *status) {
1050     RegularExpression *regexp = (RegularExpression*)regexp2;
1051     if (validateRE(regexp, FALSE, status)) {
1052         regexp->fMatcher->setStackLimit(limit, *status);
1053     }
1054 }
1055 
1056 
1057 
1058 //------------------------------------------------------------------------------
1059 //
1060 //    uregex_getStackLimit
1061 //
1062 //------------------------------------------------------------------------------
1063 U_CAPI int32_t U_EXPORT2
uregex_getStackLimit(const URegularExpression * regexp2,UErrorCode * status)1064 uregex_getStackLimit(const  URegularExpression   *regexp2,
1065                             UErrorCode           *status) {
1066     int32_t retVal = 0;
1067     RegularExpression *regexp = (RegularExpression*)regexp2;
1068     if (validateRE(regexp, FALSE, status)) {
1069         retVal = regexp->fMatcher->getStackLimit();
1070     }
1071     return retVal;
1072 }
1073 
1074 
1075 //------------------------------------------------------------------------------
1076 //
1077 //    uregex_setMatchCallback
1078 //
1079 //------------------------------------------------------------------------------
1080 U_CAPI void U_EXPORT2
uregex_setMatchCallback(URegularExpression * regexp2,URegexMatchCallback * callback,const void * context,UErrorCode * status)1081 uregex_setMatchCallback(URegularExpression      *regexp2,
1082                         URegexMatchCallback     *callback,
1083                         const void              *context,
1084                         UErrorCode              *status) {
1085     RegularExpression *regexp = (RegularExpression*)regexp2;
1086     if (validateRE(regexp, FALSE, status)) {
1087         regexp->fMatcher->setMatchCallback(callback, context, *status);
1088     }
1089 }
1090 
1091 
1092 //------------------------------------------------------------------------------
1093 //
1094 //    uregex_getMatchCallback
1095 //
1096 //------------------------------------------------------------------------------
1097 U_CAPI void U_EXPORT2
uregex_getMatchCallback(const URegularExpression * regexp2,URegexMatchCallback ** callback,const void ** context,UErrorCode * status)1098 uregex_getMatchCallback(const URegularExpression    *regexp2,
1099                         URegexMatchCallback        **callback,
1100                         const void                 **context,
1101                         UErrorCode                  *status) {
1102     RegularExpression *regexp = (RegularExpression*)regexp2;
1103      if (validateRE(regexp, FALSE, status)) {
1104          regexp->fMatcher->getMatchCallback(*callback, *context, *status);
1105      }
1106 }
1107 
1108 
1109 //------------------------------------------------------------------------------
1110 //
//    uregex_setFindProgressCallback
1112 //
1113 //------------------------------------------------------------------------------
1114 U_CAPI void U_EXPORT2
uregex_setFindProgressCallback(URegularExpression * regexp2,URegexFindProgressCallback * callback,const void * context,UErrorCode * status)1115 uregex_setFindProgressCallback(URegularExpression              *regexp2,
1116                                 URegexFindProgressCallback      *callback,
1117                                 const void                      *context,
1118                                 UErrorCode                      *status) {
1119     RegularExpression *regexp = (RegularExpression*)regexp2;
1120     if (validateRE(regexp, FALSE, status)) {
1121         regexp->fMatcher->setFindProgressCallback(callback, context, *status);
1122     }
1123 }
1124 
1125 
1126 //------------------------------------------------------------------------------
1127 //
//    uregex_getFindProgressCallback
1129 //
1130 //------------------------------------------------------------------------------
1131 U_CAPI void U_EXPORT2
uregex_getFindProgressCallback(const URegularExpression * regexp2,URegexFindProgressCallback ** callback,const void ** context,UErrorCode * status)1132 uregex_getFindProgressCallback(const URegularExpression          *regexp2,
1133                                 URegexFindProgressCallback        **callback,
1134                                 const void                        **context,
1135                                 UErrorCode                        *status) {
1136     RegularExpression *regexp = (RegularExpression*)regexp2;
1137      if (validateRE(regexp, FALSE, status)) {
1138          regexp->fMatcher->getFindProgressCallback(*callback, *context, *status);
1139      }
1140 }
1141 
1142 
1143 //------------------------------------------------------------------------------
1144 //
1145 //    uregex_replaceAll
1146 //
1147 //------------------------------------------------------------------------------
1148 U_CAPI int32_t U_EXPORT2
uregex_replaceAll(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1149 uregex_replaceAll(URegularExpression    *regexp2,
1150                   const UChar           *replacementText,
1151                   int32_t                replacementLength,
1152                   UChar                 *destBuf,
1153                   int32_t                destCapacity,
1154                   UErrorCode            *status)  {
1155     RegularExpression *regexp = (RegularExpression*)regexp2;
1156     if (validateRE(regexp, TRUE, status) == FALSE) {
1157         return 0;
1158     }
1159     if (replacementText == NULL || replacementLength < -1 ||
1160         (destBuf == NULL && destCapacity > 0) ||
1161         destCapacity < 0) {
1162         *status = U_ILLEGAL_ARGUMENT_ERROR;
1163         return 0;
1164     }
1165 
1166     int32_t   len = 0;
1167 
1168     uregex_reset(regexp2, 0, status);
1169 
1170     // Note: Seperate error code variables for findNext() and appendReplacement()
1171     //       are used so that destination buffer overflow errors
1172     //       in appendReplacement won't stop findNext() from working.
1173     //       appendReplacement() and appendTail() special case incoming buffer
1174     //       overflow errors, continuing to return the correct length.
1175     UErrorCode  findStatus = *status;
1176     while (uregex_findNext(regexp2, &findStatus)) {
1177         len += uregex_appendReplacement(regexp2, replacementText, replacementLength,
1178                                         &destBuf, &destCapacity, status);
1179     }
1180     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1181 
1182     if (U_FAILURE(findStatus)) {
1183         // If anything went wrong with the findNext(), make that error trump
1184         //   whatever may have happened with the append() operations.
1185         //   Errors in findNext() are not expected.
1186         *status = findStatus;
1187     }
1188 
1189     return len;
1190 }
1191 
1192 
1193 //------------------------------------------------------------------------------
1194 //
1195 //    uregex_replaceAllUText
1196 //
1197 //------------------------------------------------------------------------------
1198 U_CAPI UText * U_EXPORT2
uregex_replaceAllUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1199 uregex_replaceAllUText(URegularExpression    *regexp2,
1200                        UText                 *replacementText,
1201                        UText                 *dest,
1202                        UErrorCode            *status)  {
1203     RegularExpression *regexp = (RegularExpression*)regexp2;
1204     if (validateRE(regexp, TRUE, status) == FALSE) {
1205         return 0;
1206     }
1207     if (replacementText == NULL) {
1208         *status = U_ILLEGAL_ARGUMENT_ERROR;
1209         return 0;
1210     }
1211 
1212     dest = regexp->fMatcher->replaceAll(replacementText, dest, *status);
1213     return dest;
1214 }
1215 
1216 
1217 //------------------------------------------------------------------------------
1218 //
1219 //    uregex_replaceFirst
1220 //
1221 //------------------------------------------------------------------------------
1222 U_CAPI int32_t U_EXPORT2
uregex_replaceFirst(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar * destBuf,int32_t destCapacity,UErrorCode * status)1223 uregex_replaceFirst(URegularExpression  *regexp2,
1224                     const UChar         *replacementText,
1225                     int32_t              replacementLength,
1226                     UChar               *destBuf,
1227                     int32_t              destCapacity,
1228                     UErrorCode          *status)  {
1229     RegularExpression *regexp = (RegularExpression*)regexp2;
1230     if (validateRE(regexp, TRUE, status) == FALSE) {
1231         return 0;
1232     }
1233     if (replacementText == NULL || replacementLength < -1 ||
1234         (destBuf == NULL && destCapacity > 0) ||
1235         destCapacity < 0) {
1236         *status = U_ILLEGAL_ARGUMENT_ERROR;
1237         return 0;
1238     }
1239 
1240     int32_t   len = 0;
1241     UBool     findSucceeded;
1242     uregex_reset(regexp2, 0, status);
1243     findSucceeded = uregex_find(regexp2, 0, status);
1244     if (findSucceeded) {
1245         len = uregex_appendReplacement(regexp2, replacementText, replacementLength,
1246                                        &destBuf, &destCapacity, status);
1247     }
1248     len += uregex_appendTail(regexp2, &destBuf, &destCapacity, status);
1249 
1250     return len;
1251 }
1252 
1253 
1254 //------------------------------------------------------------------------------
1255 //
1256 //    uregex_replaceFirstUText
1257 //
1258 //------------------------------------------------------------------------------
1259 U_CAPI UText * U_EXPORT2
uregex_replaceFirstUText(URegularExpression * regexp2,UText * replacementText,UText * dest,UErrorCode * status)1260 uregex_replaceFirstUText(URegularExpression  *regexp2,
1261                          UText                 *replacementText,
1262                          UText                 *dest,
1263                          UErrorCode            *status)  {
1264     RegularExpression *regexp = (RegularExpression*)regexp2;
1265     if (validateRE(regexp, TRUE, status) == FALSE) {
1266         return 0;
1267     }
1268     if (replacementText == NULL) {
1269         *status = U_ILLEGAL_ARGUMENT_ERROR;
1270         return 0;
1271     }
1272 
1273     dest = regexp->fMatcher->replaceFirst(replacementText, dest, *status);
1274     return dest;
1275 }
1276 
1277 
1278 //------------------------------------------------------------------------------
1279 //
1280 //    uregex_appendReplacement
1281 //
1282 //------------------------------------------------------------------------------
1283 
1284 U_NAMESPACE_BEGIN
//
//  Dummy class, because these functions need to be friends of class RegexMatcher,
//               and stand-alone C functions don't work as friends.
//
//  The class holds no data; it only groups static functions.  The definition of
//  appendReplacement() in this file reads RegexMatcher members such as fMatch,
//  fInputText, fMatchStart and fLastMatchEnd directly, which is why the
//  friendship is needed.
//
class RegexCImpl {
 public:
   // Appends the text between the previous match and the current match,
   // followed by the expanded replacement text, to *destBuf.
   // Returns the length of the text produced by the call.
   inline static  int32_t appendReplacement(RegularExpression    *regexp,
                      const UChar           *replacementText,
                      int32_t                replacementLength,
                      UChar                **destBuf,
                      int32_t               *destCapacity,
                      UErrorCode            *status);

   // NOTE(review): defined outside the visible part of this file; presumably
   // the implementation behind uregex_appendTail() - confirm.
   inline static int32_t appendTail(RegularExpression    *regexp,
        UChar                **destBuf,
        int32_t               *destCapacity,
        UErrorCode            *status);

    // NOTE(review): defined outside the visible part of this file; presumably
    // the implementation behind uregex_split() - confirm.
    inline static int32_t split(RegularExpression    *regexp,
        UChar                 *destBuf,
        int32_t                destCapacity,
        int32_t               *requiredCapacity,
        UChar                 *destFields[],
        int32_t                destFieldsCapacity,
        UErrorCode            *status);
};
1311 
1312 U_NAMESPACE_END
1313 
1314 
1315 
// UTF-16 code units that get special treatment when replacement text is scanned.
static const UChar BACKSLASH  = 0x5c;   // U+005C, the escape character
static const UChar DOLLARSIGN = 0x24;   // U+0024 '$', introduces a capture group reference
static const UChar LEFTBRACKET = 0x7b;  // U+007B '{' (a curly brace, despite the name)
static const UChar RIGHTBRACKET = 0x7d; // U+007D '}' (a curly brace, despite the name)
1320 
//
//  Store one code unit at position *idx of an output buffer, provided that
//  position lies below bufCapacity, then advance *idx.
//      The index is advanced unconditionally, so that it keeps counting the
//      required size during preflighting even when nothing can be stored.
//      This little sequence is used a LOT.
//
static inline void appendToBuf(UChar c, int32_t *idx, UChar *buf, int32_t bufCapacity) {
    const int32_t slot = (*idx)++;
    if (slot < bufCapacity) {
        buf[slot] = c;
    }
}
1332 
1333 
1334 //
1335 //  appendReplacement, the actual implementation.
1336 //
appendReplacement(RegularExpression * regexp,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1337 int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
1338                                       const UChar           *replacementText,
1339                                       int32_t                replacementLength,
1340                                       UChar                **destBuf,
1341                                       int32_t               *destCapacity,
1342                                       UErrorCode            *status)  {
1343 
1344     // If we come in with a buffer overflow error, don't suppress the operation.
1345     //  A series of appendReplacements, appendTail need to correctly preflight
1346     //  the buffer size when an overflow happens somewhere in the middle.
1347     UBool pendingBufferOverflow = FALSE;
1348     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1349         pendingBufferOverflow = TRUE;
1350         *status = U_ZERO_ERROR;
1351     }
1352 
1353     //
1354     // Validate all paramters
1355     //
1356     if (validateRE(regexp, TRUE, status) == FALSE) {
1357         return 0;
1358     }
1359     if (replacementText == NULL || replacementLength < -1 ||
1360         destCapacity == NULL || destBuf == NULL ||
1361         (*destBuf == NULL && *destCapacity > 0) ||
1362         *destCapacity < 0) {
1363         *status = U_ILLEGAL_ARGUMENT_ERROR;
1364         return 0;
1365     }
1366 
1367     RegexMatcher *m = regexp->fMatcher;
1368     if (m->fMatch == FALSE) {
1369         *status = U_REGEX_INVALID_STATE;
1370         return 0;
1371     }
1372 
1373     UChar    *dest             = *destBuf;
1374     int32_t   capacity         = *destCapacity;
1375     int32_t   destIdx          =  0;
1376     int32_t   i;
1377 
1378     // If it wasn't supplied by the caller,  get the length of the replacement text.
1379     //   TODO:  slightly smarter logic in the copy loop could watch for the NUL on
1380     //          the fly and avoid this step.
1381     if (replacementLength == -1) {
1382         replacementLength = u_strlen(replacementText);
1383     }
1384 
1385     // Copy input string from the end of previous match to start of current match
1386     if (regexp->fText != NULL) {
1387         int32_t matchStart;
1388         int32_t lastMatchEnd;
1389         if (UTEXT_USES_U16(m->fInputText)) {
1390             lastMatchEnd = (int32_t)m->fLastMatchEnd;
1391             matchStart = (int32_t)m->fMatchStart;
1392         } else {
1393             // !!!: Would like a better way to do this!
1394             UErrorCode tempStatus = U_ZERO_ERROR;
1395             lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
1396             tempStatus = U_ZERO_ERROR;
1397             matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
1398         }
1399         for (i=lastMatchEnd; i<matchStart; i++) {
1400             appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
1401         }
1402     } else {
1403         UErrorCode possibleOverflowError = U_ZERO_ERROR; // ignore
1404         destIdx += utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart,
1405                                  dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity),
1406                                  &possibleOverflowError);
1407     }
1408     U_ASSERT(destIdx >= 0);
1409 
1410     // scan the replacement text, looking for substitutions ($n) and \escapes.
1411     int32_t  replIdx = 0;
1412     while (replIdx < replacementLength && U_SUCCESS(*status)) {
1413         UChar  c = replacementText[replIdx];
1414         replIdx++;
1415         if (c != DOLLARSIGN && c != BACKSLASH) {
1416             // Common case, no substitution, no escaping,
1417             //  just copy the char to the dest buf.
1418             appendToBuf(c, &destIdx, dest, capacity);
1419             continue;
1420         }
1421 
1422         if (c == BACKSLASH) {
1423             // Backslash Escape.  Copy the following char out without further checks.
1424             //                    Note:  Surrogate pairs don't need any special handling
1425             //                           The second half wont be a '$' or a '\', and
1426             //                           will move to the dest normally on the next
1427             //                           loop iteration.
1428             if (replIdx >= replacementLength) {
1429                 break;
1430             }
1431             c = replacementText[replIdx];
1432 
1433             if (c==0x55/*U*/ || c==0x75/*u*/) {
1434                 // We have a \udddd or \Udddddddd escape sequence.
1435                 UChar32 escapedChar =
1436                     u_unescapeAt(uregex_ucstr_unescape_charAt,
1437                        &replIdx,                   // Index is updated by unescapeAt
1438                        replacementLength,          // Length of replacement text
1439                        (void *)replacementText);
1440 
1441                 if (escapedChar != (UChar32)0xFFFFFFFF) {
1442                     if (escapedChar <= 0xffff) {
1443                         appendToBuf((UChar)escapedChar, &destIdx, dest, capacity);
1444                     } else {
1445                         appendToBuf(U16_LEAD(escapedChar), &destIdx, dest, capacity);
1446                         appendToBuf(U16_TRAIL(escapedChar), &destIdx, dest, capacity);
1447                     }
1448                     continue;
1449                 }
1450                 // Note:  if the \u escape was invalid, just fall through and
1451                 //        treat it as a plain \<anything> escape.
1452             }
1453 
1454             // Plain backslash escape.  Just put out the escaped character.
1455             appendToBuf(c, &destIdx, dest, capacity);
1456 
1457             replIdx++;
1458             continue;
1459         }
1460 
1461         // We've got a $.  Pick up the following capture group name or number.
1462         // For numbers, consume only digits that produce a valid capture group for the pattern.
1463 
1464         int32_t groupNum  = 0;
1465         U_ASSERT(c == DOLLARSIGN);
1466         UChar32 c32;
1467         U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1468         if (u_isdigit(c32)) {
1469             int32_t numDigits = 0;
1470             int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
1471             for (;;) {
1472                 if (replIdx >= replacementLength) {
1473                     break;
1474                 }
1475                 U16_GET(replacementText, 0, replIdx, replacementLength, c32);
1476                 if (u_isdigit(c32) == FALSE) {
1477                     break;
1478                 }
1479 
1480                 int32_t digitVal = u_charDigitValue(c32);
1481                 if (groupNum * 10 + digitVal <= numCaptureGroups) {
1482                     groupNum = groupNum * 10 + digitVal;
1483                     U16_FWD_1(replacementText, replIdx, replacementLength);
1484                     numDigits++;
1485                 } else {
1486                     if (numDigits == 0) {
1487                         *status = U_INDEX_OUTOFBOUNDS_ERROR;
1488                     }
1489                     break;
1490                 }
1491             }
1492         } else if (c32 == LEFTBRACKET) {
1493             // Scan for Named Capture Group, ${name}.
1494             UnicodeString groupName;
1495             U16_FWD_1(replacementText, replIdx, replacementLength);
1496             while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) {
1497                 if (replIdx >= replacementLength) {
1498                     *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1499                     break;
1500                 }
1501                 U16_NEXT(replacementText, replIdx, replacementLength, c32);
1502                 if ((c32 >= 0x41 && c32 <= 0x5a) ||           // A..Z
1503                         (c32 >= 0x61 && c32 <= 0x7a) ||       // a..z
1504                         (c32 >= 0x31 && c32 <= 0x39)) {       // 0..9
1505                     groupName.append(c32);
1506                 } else if (c32 == RIGHTBRACKET) {
1507                     groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
1508                     if (groupNum == 0) {
1509                         // Name not defined by pattern.
1510                         *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1511                     }
1512                 } else {
1513                     // Character was something other than a name char or a closing '}'
1514                     *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1515                 }
1516             }
1517         } else {
1518             // $ not followed by {name} or digits.
1519             *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
1520         }
1521 
1522 
1523         // Finally, append the capture group data to the destination.
1524         if (U_SUCCESS(*status)) {
1525             destIdx += uregex_group((URegularExpression*)regexp, groupNum,
1526                                     dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
1527             if (*status == U_BUFFER_OVERFLOW_ERROR) {
1528                 // Ignore buffer overflow when extracting the group.  We need to
1529                 //   continue on to get full size of the untruncated result.  We will
1530                 //   raise our own buffer overflow error at the end.
1531                 *status = U_ZERO_ERROR;
1532             }
1533         }
1534 
1535         if (U_FAILURE(*status)) {
1536             // bad group number or name.
1537             break;
1538         }
1539     }
1540 
1541     //
1542     //  Nul Terminate the dest buffer if possible.
1543     //  Set the appropriate buffer overflow or not terminated error, if needed.
1544     //
1545     if (destIdx < capacity) {
1546         dest[destIdx] = 0;
1547     } else if (U_SUCCESS(*status)) {
1548         if (destIdx == *destCapacity) {
1549             *status = U_STRING_NOT_TERMINATED_WARNING;
1550         } else {
1551             *status = U_BUFFER_OVERFLOW_ERROR;
1552         }
1553     }
1554 
1555     //
1556     // Return an updated dest buffer and capacity to the caller.
1557     //
1558     if (destIdx > 0 &&  *destCapacity > 0) {
1559         if (destIdx < capacity) {
1560             *destBuf      += destIdx;
1561             *destCapacity -= destIdx;
1562         } else {
1563             *destBuf      += capacity;
1564             *destCapacity =  0;
1565         }
1566     }
1567 
1568     // If we came in with a buffer overflow, make sure we go out with one also.
1569     //   (A zero length match right at the end of the previous match could
1570     //    make this function succeed even though a previous call had overflowed the buf)
1571     if (pendingBufferOverflow && U_SUCCESS(*status)) {
1572         *status = U_BUFFER_OVERFLOW_ERROR;
1573     }
1574 
1575     return destIdx;
1576 }
1577 
1578 //
1579 //   appendReplacement   the actual API function,
1580 //
1581 U_CAPI int32_t U_EXPORT2
uregex_appendReplacement(URegularExpression * regexp2,const UChar * replacementText,int32_t replacementLength,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1582 uregex_appendReplacement(URegularExpression    *regexp2,
1583                          const UChar           *replacementText,
1584                          int32_t                replacementLength,
1585                          UChar                **destBuf,
1586                          int32_t               *destCapacity,
1587                          UErrorCode            *status) {
1588 
1589     RegularExpression *regexp = (RegularExpression*)regexp2;
1590     return RegexCImpl::appendReplacement(
1591         regexp, replacementText, replacementLength,destBuf, destCapacity, status);
1592 }
1593 
1594 //
1595 //   uregex_appendReplacementUText...can just use the normal C++ method
1596 //
1597 U_CAPI void U_EXPORT2
uregex_appendReplacementUText(URegularExpression * regexp2,UText * replText,UText * dest,UErrorCode * status)1598 uregex_appendReplacementUText(URegularExpression    *regexp2,
1599                               UText                 *replText,
1600                               UText                 *dest,
1601                               UErrorCode            *status)  {
1602     RegularExpression *regexp = (RegularExpression*)regexp2;
1603     regexp->fMatcher->appendReplacement(dest, replText, *status);
1604 }
1605 
1606 
1607 //------------------------------------------------------------------------------
1608 //
1609 //    uregex_appendTail
1610 //
1611 //------------------------------------------------------------------------------
appendTail(RegularExpression * regexp,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1612 int32_t RegexCImpl::appendTail(RegularExpression    *regexp,
1613                                UChar                **destBuf,
1614                                int32_t               *destCapacity,
1615                                UErrorCode            *status)
1616 {
1617 
1618     // If we come in with a buffer overflow error, don't suppress the operation.
1619     //  A series of appendReplacements, appendTail need to correctly preflight
1620     //  the buffer size when an overflow happens somewhere in the middle.
1621     UBool pendingBufferOverflow = FALSE;
1622     if (*status == U_BUFFER_OVERFLOW_ERROR && destCapacity != NULL && *destCapacity == 0) {
1623         pendingBufferOverflow = TRUE;
1624         *status = U_ZERO_ERROR;
1625     }
1626 
1627     if (validateRE(regexp, TRUE, status) == FALSE) {
1628         return 0;
1629     }
1630 
1631     if (destCapacity == NULL || destBuf == NULL ||
1632         (*destBuf == NULL && *destCapacity > 0) ||
1633         *destCapacity < 0)
1634     {
1635         *status = U_ILLEGAL_ARGUMENT_ERROR;
1636         return 0;
1637     }
1638 
1639     RegexMatcher *m = regexp->fMatcher;
1640 
1641     int32_t  destIdx     = 0;
1642     int32_t  destCap     = *destCapacity;
1643     UChar    *dest       = *destBuf;
1644 
1645     if (regexp->fText != NULL) {
1646         int32_t srcIdx;
1647         int64_t nativeIdx = (m->fMatch ? m->fMatchEnd : m->fLastMatchEnd);
1648         if (nativeIdx == -1) {
1649             srcIdx = 0;
1650         } else if (UTEXT_USES_U16(m->fInputText)) {
1651             srcIdx = (int32_t)nativeIdx;
1652         } else {
1653             UErrorCode status = U_ZERO_ERROR;
1654             srcIdx = utext_extract(m->fInputText, 0, nativeIdx, NULL, 0, &status);
1655         }
1656 
1657         for (;;) {
1658             U_ASSERT(destIdx >= 0);
1659 
1660             if (srcIdx == regexp->fTextLength) {
1661                 break;
1662             }
1663             UChar c = regexp->fText[srcIdx];
1664             if (c == 0 && regexp->fTextLength == -1) {
1665                 regexp->fTextLength = srcIdx;
1666                 break;
1667             }
1668 
1669             if (destIdx < destCap) {
1670                 dest[destIdx] = c;
1671             } else {
1672                 // We've overflowed the dest buffer.
1673                 //  If the total input string length is known, we can
1674                 //    compute the total buffer size needed without scanning through the string.
1675                 if (regexp->fTextLength > 0) {
1676                     destIdx += (regexp->fTextLength - srcIdx);
1677                     break;
1678                 }
1679             }
1680             srcIdx++;
1681             destIdx++;
1682         }
1683     } else {
1684         int64_t  srcIdx;
1685         if (m->fMatch) {
1686             // The most recent call to find() succeeded.
1687             srcIdx = m->fMatchEnd;
1688         } else {
1689             // The last call to find() on this matcher failed().
1690             //   Look back to the end of the last find() that succeeded for src index.
1691             srcIdx = m->fLastMatchEnd;
1692             if (srcIdx == -1)  {
1693                 // There has been no successful match with this matcher.
1694                 //   We want to copy the whole string.
1695                 srcIdx = 0;
1696             }
1697         }
1698 
1699         destIdx = utext_extract(m->fInputText, srcIdx, m->fInputLength, dest, destCap, status);
1700     }
1701 
1702     //
1703     //  NUL terminate the output string, if possible, otherwise issue the
1704     //   appropriate error or warning.
1705     //
1706     if (destIdx < destCap) {
1707         dest[destIdx] = 0;
1708     } else  if (destIdx == destCap) {
1709         *status = U_STRING_NOT_TERMINATED_WARNING;
1710     } else {
1711         *status = U_BUFFER_OVERFLOW_ERROR;
1712     }
1713 
1714     //
1715     // Update the user's buffer ptr and capacity vars to reflect the
1716     //   amount used.
1717     //
1718     if (destIdx < destCap) {
1719         *destBuf      += destIdx;
1720         *destCapacity -= destIdx;
1721     } else if (*destBuf != NULL) {
1722         *destBuf      += destCap;
1723         *destCapacity  = 0;
1724     }
1725 
1726     if (pendingBufferOverflow && U_SUCCESS(*status)) {
1727         *status = U_BUFFER_OVERFLOW_ERROR;
1728     }
1729 
1730     return destIdx;
1731 }
1732 
1733 
1734 //
1735 //   appendTail   the actual API function
1736 //
1737 U_CAPI int32_t U_EXPORT2
uregex_appendTail(URegularExpression * regexp2,UChar ** destBuf,int32_t * destCapacity,UErrorCode * status)1738 uregex_appendTail(URegularExpression    *regexp2,
1739                   UChar                **destBuf,
1740                   int32_t               *destCapacity,
1741                   UErrorCode            *status)  {
1742     RegularExpression *regexp = (RegularExpression*)regexp2;
1743     return RegexCImpl::appendTail(regexp, destBuf, destCapacity, status);
1744 }
1745 
1746 
1747 //
1748 //   uregex_appendTailUText...can just use the normal C++ method
1749 //
1750 U_CAPI UText * U_EXPORT2
uregex_appendTailUText(URegularExpression * regexp2,UText * dest,UErrorCode * status)1751 uregex_appendTailUText(URegularExpression    *regexp2,
1752                        UText                 *dest,
1753                        UErrorCode            *status)  {
1754     RegularExpression *regexp = (RegularExpression*)regexp2;
1755     return regexp->fMatcher->appendTail(dest, *status);
1756 }
1757 
1758 
//------------------------------------------------------------------------------
//
//    copyString     Internal utility, currently compiled out.
//
//                   Appends srcLen UChars from srcPtr to destBuffer starting
//                   at *destIndex, followed by a NUL when it fits.  Once the
//                   buffer is full nothing more is written, but *destIndex
//                   still advances by the full source length plus one for
//                   the NUL, so the caller gets a preflight size.
//
//------------------------------------------------------------------------------
#if 0
static void copyString(UChar        *destBuffer,    //  Destination buffer.
                       int32_t       destCapacity,  //  Total capacity of dest buffer
                       int32_t      *destIndex,     //  In: where to start writing.
                                                    //  Out: position after the NUL,
                                                    //    not clipped to destCapacity.
                       const UChar  *srcPtr,        //  Pointer to source string
                       int32_t       srcLen)        //  Source string len.
{
    int32_t writeIdx = *destIndex;

    for (int32_t readIdx = 0; readIdx < srcLen; ++readIdx) {
        if (writeIdx >= destCapacity) {
            // Buffer exhausted: account for everything not yet copied and stop.
            writeIdx += srcLen - readIdx;
            break;
        }
        destBuffer[writeIdx++] = srcPtr[readIdx];
    }

    if (writeIdx < destCapacity) {
        destBuffer[writeIdx] = 0;
    }
    *destIndex = writeIdx + 1;      // The NUL is counted whether or not it was stored.
}
#endif
1796 
1797 //------------------------------------------------------------------------------
1798 //
1799 //    uregex_split
1800 //
1801 //------------------------------------------------------------------------------
split(RegularExpression * regexp,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1802 int32_t RegexCImpl::split(RegularExpression     *regexp,
1803                           UChar                 *destBuf,
1804                           int32_t                destCapacity,
1805                           int32_t               *requiredCapacity,
1806                           UChar                 *destFields[],
1807                           int32_t                destFieldsCapacity,
1808                           UErrorCode            *status) {
1809     //
1810     // Reset for the input text
1811     //
1812     regexp->fMatcher->reset();
1813     UText *inputText = regexp->fMatcher->fInputText;
1814     int64_t   nextOutputStringStart = 0;
1815     int64_t   inputLen = regexp->fMatcher->fInputLength;
1816     if (inputLen == 0) {
1817         return 0;
1818     }
1819 
1820     //
1821     // Loop through the input text, searching for the delimiter pattern
1822     //
1823     int32_t   i;             // Index of the field being processed.
1824     int32_t   destIdx = 0;   // Next available position in destBuf;
1825     int32_t   numCaptureGroups = regexp->fMatcher->groupCount();
1826     UErrorCode  tStatus = U_ZERO_ERROR;   // Want to ignore any buffer overflow errors so that the strings are still counted
1827     for (i=0; ; i++) {
1828         if (i>=destFieldsCapacity-1) {
1829             // There are one or zero output strings left.
1830             // Fill the last output string with whatever is left from the input, then exit the loop.
1831             //  ( i will be == destFieldsCapacity if we filled the output array while processing
1832             //    capture groups of the delimiter expression, in which case we will discard the
1833             //    last capture group saved in favor of the unprocessed remainder of the
1834             //    input string.)
1835             if (inputLen > nextOutputStringStart) {
1836                 if (i != destFieldsCapacity-1) {
1837                     // No fields are left.  Recycle the last one for holding the trailing part of
1838                     //   the input string.
1839                     i = destFieldsCapacity-1;
1840                     destIdx = (int32_t)(destFields[i] - destFields[0]);
1841                 }
1842 
1843                 destFields[i] = &destBuf[destIdx];
1844                 destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1845                                              &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1846             }
1847             break;
1848         }
1849 
1850         if (regexp->fMatcher->find()) {
1851             // We found another delimiter.  Move everything from where we started looking
1852             //  up until the start of the delimiter into the next output string.
1853             destFields[i] = &destBuf[destIdx];
1854 
1855             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, regexp->fMatcher->fMatchStart,
1856                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), &tStatus);
1857             if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1858                 tStatus = U_ZERO_ERROR;
1859             } else {
1860                 *status = tStatus;
1861             }
1862             nextOutputStringStart = regexp->fMatcher->fMatchEnd;
1863 
1864             // If the delimiter pattern has capturing parentheses, the captured
1865             //  text goes out into the next n destination strings.
1866             int32_t groupNum;
1867             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
1868                 // If we've run out of output string slots, bail out.
1869                 if (i==destFieldsCapacity-1) {
1870                     break;
1871                 }
1872                 i++;
1873 
1874                 // Set up to extract the capture group contents into the dest buffer.
1875                 destFields[i] = &destBuf[destIdx];
1876                 tStatus = U_ZERO_ERROR;
1877                 int32_t t = uregex_group((URegularExpression*)regexp,
1878                                          groupNum,
1879                                          destFields[i],
1880                                          REMAINING_CAPACITY(destIdx, destCapacity),
1881                                          &tStatus);
1882                 destIdx += t + 1;    // Record the space used in the output string buffer.
1883                                      //  +1 for the NUL that terminates the string.
1884                 if (tStatus == U_BUFFER_OVERFLOW_ERROR) {
1885                     tStatus = U_ZERO_ERROR;
1886                 } else {
1887                     *status = tStatus;
1888                 }
1889             }
1890 
1891             if (nextOutputStringStart == inputLen) {
1892                 // The delimiter was at the end of the string.
1893                 // Output an empty string, and then we are done.
1894                 if (destIdx < destCapacity) {
1895                     destBuf[destIdx] = 0;
1896                 }
1897                 if (i < destFieldsCapacity-1) {
1898                    ++i;
1899                 }
1900                 if (destIdx < destCapacity) {
1901                     destFields[i] = destBuf + destIdx;
1902                 }
1903                 ++destIdx;
1904                 break;
1905             }
1906 
1907         }
1908         else
1909         {
1910             // We ran off the end of the input while looking for the next delimiter.
1911             // All the remaining text goes into the current output string.
1912             destFields[i] = &destBuf[destIdx];
1913             destIdx += 1 + utext_extract(inputText, nextOutputStringStart, inputLen,
1914                                          &destBuf[destIdx], REMAINING_CAPACITY(destIdx, destCapacity), status);
1915             break;
1916         }
1917     }
1918 
1919     // Zero out any unused portion of the destFields array
1920     int j;
1921     for (j=i+1; j<destFieldsCapacity; j++) {
1922         destFields[j] = NULL;
1923     }
1924 
1925     if (requiredCapacity != NULL) {
1926         *requiredCapacity = destIdx;
1927     }
1928     if (destIdx > destCapacity) {
1929         *status = U_BUFFER_OVERFLOW_ERROR;
1930     }
1931     return i+1;
1932 }
1933 
1934 //
1935 //   uregex_split   The actual API function
1936 //
1937 U_CAPI int32_t U_EXPORT2
uregex_split(URegularExpression * regexp2,UChar * destBuf,int32_t destCapacity,int32_t * requiredCapacity,UChar * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1938 uregex_split(URegularExpression      *regexp2,
1939              UChar                   *destBuf,
1940              int32_t                  destCapacity,
1941              int32_t                 *requiredCapacity,
1942              UChar                   *destFields[],
1943              int32_t                  destFieldsCapacity,
1944              UErrorCode              *status) {
1945     RegularExpression *regexp = (RegularExpression*)regexp2;
1946     if (validateRE(regexp, TRUE, status) == FALSE) {
1947         return 0;
1948     }
1949     if ((destBuf == NULL && destCapacity > 0) ||
1950         destCapacity < 0 ||
1951         destFields == NULL ||
1952         destFieldsCapacity < 1 ) {
1953         *status = U_ILLEGAL_ARGUMENT_ERROR;
1954         return 0;
1955     }
1956 
1957     return RegexCImpl::split(regexp, destBuf, destCapacity, requiredCapacity, destFields, destFieldsCapacity, status);
1958 }
1959 
1960 
1961 //
1962 //   uregex_splitUText...can just use the normal C++ method
1963 //
1964 U_CAPI int32_t U_EXPORT2
uregex_splitUText(URegularExpression * regexp2,UText * destFields[],int32_t destFieldsCapacity,UErrorCode * status)1965 uregex_splitUText(URegularExpression    *regexp2,
1966                   UText                 *destFields[],
1967                   int32_t                destFieldsCapacity,
1968                   UErrorCode            *status) {
1969     RegularExpression *regexp = (RegularExpression*)regexp2;
1970     return regexp->fMatcher->split(regexp->fMatcher->inputText(), destFields, destFieldsCapacity, *status);
1971 }
1972 
1973 
1974 #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1975 
1976