1 /*
2 *******************************************************************************
3 *
4 *   © 2016 and later: Unicode, Inc. and others.
5 *   License & terms of use: http://www.unicode.org/copyright.html#License
6 *
7 *******************************************************************************
8 *******************************************************************************
9 *
10 *   Copyright (C) 2003-2014, International Business Machines
11 *   Corporation and others.  All Rights Reserved.
12 *
13 *******************************************************************************
14 *   file name:  uciter8.c
15 *   encoding:   UTF-8
16 *   tab size:   8 (not used)
17 *   indentation:4
18 *
19 *   created on: 2003jan10
20 *   created by: Markus W. Scherer
21 *
22 *   This file contains sample code that illustrates reading
23 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
24 *   and also accepting single surrogates.
25 */
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include "unicode/utypes.h"
30 #include "unicode/uiter.h"
31 #include "uit_len8.h"
32 
33 #ifndef UPRV_LENGTHOF
34 #define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
35 #endif
36 
37 #define log_err printf
38 
39 /* UCharIterator test ------------------------------------------------------- */
40 
41 /*
42  * The following code is a copy of the UCharIterator test code in
43  * source/test/cintltst/custrtst.c,
44  * testing the lenient-8 iterator instead of the UTF-8 one.
45  */
46 
47 /*
48  * Compare results from two iterators, should be same.
49  * Assume that the text is not empty and that
50  * iteration start==0 and iteration limit==length.
51  */
52 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)53 compareIterators(UCharIterator *iter1, const char *n1,
54                  UCharIterator *iter2, const char *n2) {
55     int32_t i, pos1, pos2, middle, length;
56     UChar32 c1, c2;
57 
58     /* compare lengths */
59     length=iter1->getIndex(iter1, UITER_LENGTH);
60     pos2=iter2->getIndex(iter2, UITER_LENGTH);
61     if(length!=pos2) {
62         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
63         return;
64     }
65 
66     /* set into the middle */
67     middle=length/2;
68 
69     pos1=iter1->move(iter1, middle, UITER_ZERO);
70     if(pos1!=middle) {
71         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
72         return;
73     }
74 
75     pos2=iter2->move(iter2, middle, UITER_ZERO);
76     if(pos2!=middle) {
77         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
78         return;
79     }
80 
81     /* test current() */
82     c1=iter1->current(iter1);
83     c2=iter2->current(iter2);
84     if(c1!=c2) {
85         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
86         return;
87     }
88 
89     /* move forward 3 UChars */
90     for(i=0; i<3; ++i) {
91         c1=iter1->next(iter1);
92         c2=iter2->next(iter2);
93         if(c1!=c2) {
94             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
95             return;
96         }
97     }
98 
99     /* move backward 5 UChars */
100     for(i=0; i<5; ++i) {
101         c1=iter1->previous(iter1);
102         c2=iter2->previous(iter2);
103         if(c1!=c2) {
104             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
105             return;
106         }
107     }
108 
109     /* iterate forward from the beginning */
110     pos1=iter1->move(iter1, 0, UITER_START);
111     if(pos1<0) {
112         log_err("%s->move(start) failed\n", n1);
113         return;
114     }
115     if(!iter1->hasNext(iter1)) {
116         log_err("%s->hasNext() at the start returns FALSE\n", n1);
117         return;
118     }
119 
120     pos2=iter2->move(iter2, 0, UITER_START);
121     if(pos2<0) {
122         log_err("%s->move(start) failed\n", n2);
123         return;
124     }
125     if(!iter2->hasNext(iter2)) {
126         log_err("%s->hasNext() at the start returns FALSE\n", n2);
127         return;
128     }
129 
130     do {
131         c1=iter1->next(iter1);
132         c2=iter2->next(iter2);
133         if(c1!=c2) {
134             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
135             return;
136         }
137     } while(c1>=0);
138 
139     if(iter1->hasNext(iter1)) {
140         log_err("%s->hasNext() at the end returns TRUE\n", n1);
141         return;
142     }
143     if(iter2->hasNext(iter2)) {
144         log_err("%s->hasNext() at the end returns TRUE\n", n2);
145         return;
146     }
147 
148     /* back to the middle */
149     pos1=iter1->move(iter1, middle, UITER_ZERO);
150     if(pos1!=middle) {
151         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
152         return;
153     }
154 
155     pos2=iter2->move(iter2, middle, UITER_ZERO);
156     if(pos2!=middle) {
157         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
158         return;
159     }
160 
161     /* move to index 1 */
162     pos1=iter1->move(iter1, 1, UITER_ZERO);
163     if(pos1!=1) {
164         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
165         return;
166     }
167 
168     pos2=iter2->move(iter2, 1, UITER_ZERO);
169     if(pos2!=1) {
170         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
171         return;
172     }
173 
174     /* iterate backward from the end */
175     pos1=iter1->move(iter1, 0, UITER_LIMIT);
176     if(pos1<0) {
177         log_err("%s->move(limit) failed\n", n1);
178         return;
179     }
180     if(!iter1->hasPrevious(iter1)) {
181         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
182         return;
183     }
184 
185     pos2=iter2->move(iter2, 0, UITER_LIMIT);
186     if(pos2<0) {
187         log_err("%s->move(limit) failed\n", n2);
188         return;
189     }
190     if(!iter2->hasPrevious(iter2)) {
191         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
192         return;
193     }
194 
195     do {
196         c1=iter1->previous(iter1);
197         c2=iter2->previous(iter2);
198         if(c1!=c2) {
199             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
200             return;
201         }
202     } while(c1>=0);
203 
204     if(iter1->hasPrevious(iter1)) {
205         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
206         return;
207     }
208     if(iter2->hasPrevious(iter2)) {
209         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
210         return;
211     }
212 }
213 
214 /*
215  * Test the iterator's getState() and setState() functions.
216  * iter1 and iter2 must be set up for the same iterator type and the same string
217  * but may be physically different structs (different addresses).
218  *
219  * Assume that the text is not empty and that
220  * iteration start==0 and iteration limit==length.
221  * It must be 2<=middle<=length-2.
222  */
223 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)224 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
225     UChar32 u[4];
226 
227     UErrorCode errorCode;
228     UChar32 c;
229     uint32_t state;
230     int32_t i, j;
231 
232     /* get four UChars from the middle of the string */
233     iter1->move(iter1, middle-2, UITER_ZERO);
234     for(i=0; i<4; ++i) {
235         c=iter1->next(iter1);
236         if(c<0) {
237             /* the test violates the assumptions, see comment above */
238             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
239             return;
240         }
241         u[i]=c;
242     }
243 
244     /* move to the middle and get the state */
245     iter1->move(iter1, -2, UITER_CURRENT);
246     state=uiter_getState(iter1);
247 
248     /* set the state into the second iterator and compare the results */
249     errorCode=U_ZERO_ERROR;
250     uiter_setState(iter2, state, &errorCode);
251     if(U_FAILURE(errorCode)) {
252         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
253         return;
254     }
255 
256     c=iter2->current(iter2);
257     if(c!=u[2]) {
258         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
259     }
260 
261     c=iter2->previous(iter2);
262     if(c!=u[1]) {
263         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
264     }
265 
266     iter2->move(iter2, 2, UITER_CURRENT);
267     c=iter2->next(iter2);
268     if(c!=u[3]) {
269         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
270     }
271 
272     iter2->move(iter2, -3, UITER_CURRENT);
273     c=iter2->previous(iter2);
274     if(c!=u[0]) {
275         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
276     }
277 
278     /* move the second iterator back to the middle */
279     iter2->move(iter2, 1, UITER_CURRENT);
280     iter2->next(iter2);
281 
282     /* check that both are in the middle */
283     i=iter1->getIndex(iter1, UITER_CURRENT);
284     j=iter2->getIndex(iter2, UITER_CURRENT);
285     if(i!=middle) {
286         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
287     }
288     if(i!=j) {
289         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
290     }
291 
292     /* compare lengths */
293     i=iter1->getIndex(iter1, UITER_LENGTH);
294     j=iter2->getIndex(iter2, UITER_LENGTH);
295     if(i!=j) {
296         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
297     }
298 }
299 
300 static void
TestLenient8Iterator()301 TestLenient8Iterator() {
302     static const UChar text[]={
303         0x61, 0x62, 0x63,
304         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
305         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
306         0x78, 0x79, 0x7a, 0
307     };
308     static const uint8_t bytes[]={
309         0x61, 0x62, 0x63,
310         /* dffd            107fd                    d801               dffd - mixture */
311         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
312         0x78, 0x79, 0x7a, 0
313     };
314 
315     UCharIterator iter1, iter2;
316     UChar32 c1, c2;
317     int32_t length;
318 
319     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
320 
321     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
322     uiter_setString(&iter1, text, -1);
323     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
324     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
325 
326     /* try again with length=-1 */
327     uiter_setLenient8(&iter2, (const char *)bytes, -1);
328     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
329 
330     /* test get/set state */
331     length=UPRV_LENGTHOF(text)-1;
332     uiter_setLenient8(&iter1, (const char*)bytes, -1);
333     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
334     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
335 
336     /* ---------------------------------------------------------------------- */
337 
338     puts("no output so far means that the lenient-8 iterator works fine");
339 
340     puts("iterate forward:\nUTF-16\tlenient-8");
341     uiter_setString(&iter1, text, -1);
342     iter1.move(&iter1, 0, UITER_START);
343     iter2.move(&iter2, 0, UITER_START);
344     for(;;) {
345         c1=iter1.next(&iter1);
346         c2=iter2.next(&iter2);
347         if(c1<0 && c2<0) {
348             break;
349         }
350         if(c1<0) {
351             printf("\t%04x\n", c2);
352         } else if(c2<0) {
353             printf("%04x\n", c1);
354         } else {
355             printf("%04x\t%04x\n", c1, c2);
356         }
357     }
358 }
359 
/*
 * Program entry point: runs the lenient-8 iterator test.
 * The command-line arguments are not used, and the exit status is always 0.
 */
extern int
main(int argc, const char *argv[]) {
    /* explicitly discard the unused parameters */
    (void)argc;
    (void)argv;

    TestLenient8Iterator();

    return 0;
}
365