1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 2003-2014, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uciter8.c
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2003jan10
14 *   created by: Markus W. Scherer
15 *
16 *   This file contains sample code that illustrates reading
17 *   8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
18 *   and also accepting single surrogates.
19 */
20 
21 #include <stdio.h>
22 #include <string.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uiter.h"
25 #include "uit_len8.h"
26 
/* The test code below reports failures through log_err(); here it is simply printf(). */
#define log_err printf
28 
29 /* UCharIterator test ------------------------------------------------------- */
30 
31 /*
32  * The following code is a copy of the UCharIterator test code in
33  * source/test/cintltst/custrtst.c,
34  * testing the lenient-8 iterator instead of the UTF-8 one.
35  */
36 
37 /*
38  * Compare results from two iterators, should be same.
39  * Assume that the text is not empty and that
40  * iteration start==0 and iteration limit==length.
41  */
42 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)43 compareIterators(UCharIterator *iter1, const char *n1,
44                  UCharIterator *iter2, const char *n2) {
45     int32_t i, pos1, pos2, middle, length;
46     UChar32 c1, c2;
47 
48     /* compare lengths */
49     length=iter1->getIndex(iter1, UITER_LENGTH);
50     pos2=iter2->getIndex(iter2, UITER_LENGTH);
51     if(length!=pos2) {
52         log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
53         return;
54     }
55 
56     /* set into the middle */
57     middle=length/2;
58 
59     pos1=iter1->move(iter1, middle, UITER_ZERO);
60     if(pos1!=middle) {
61         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
62         return;
63     }
64 
65     pos2=iter2->move(iter2, middle, UITER_ZERO);
66     if(pos2!=middle) {
67         log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
68         return;
69     }
70 
71     /* test current() */
72     c1=iter1->current(iter1);
73     c2=iter2->current(iter2);
74     if(c1!=c2) {
75         log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
76         return;
77     }
78 
79     /* move forward 3 UChars */
80     for(i=0; i<3; ++i) {
81         c1=iter1->next(iter1);
82         c2=iter2->next(iter2);
83         if(c1!=c2) {
84             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
85             return;
86         }
87     }
88 
89     /* move backward 5 UChars */
90     for(i=0; i<5; ++i) {
91         c1=iter1->previous(iter1);
92         c2=iter2->previous(iter2);
93         if(c1!=c2) {
94             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
95             return;
96         }
97     }
98 
99     /* iterate forward from the beginning */
100     pos1=iter1->move(iter1, 0, UITER_START);
101     if(pos1<0) {
102         log_err("%s->move(start) failed\n", n1);
103         return;
104     }
105     if(!iter1->hasNext(iter1)) {
106         log_err("%s->hasNext() at the start returns FALSE\n", n1);
107         return;
108     }
109 
110     pos2=iter2->move(iter2, 0, UITER_START);
111     if(pos2<0) {
112         log_err("%s->move(start) failed\n", n2);
113         return;
114     }
115     if(!iter2->hasNext(iter2)) {
116         log_err("%s->hasNext() at the start returns FALSE\n", n2);
117         return;
118     }
119 
120     do {
121         c1=iter1->next(iter1);
122         c2=iter2->next(iter2);
123         if(c1!=c2) {
124             log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
125             return;
126         }
127     } while(c1>=0);
128 
129     if(iter1->hasNext(iter1)) {
130         log_err("%s->hasNext() at the end returns TRUE\n", n1);
131         return;
132     }
133     if(iter2->hasNext(iter2)) {
134         log_err("%s->hasNext() at the end returns TRUE\n", n2);
135         return;
136     }
137 
138     /* back to the middle */
139     pos1=iter1->move(iter1, middle, UITER_ZERO);
140     if(pos1!=middle) {
141         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
142         return;
143     }
144 
145     pos2=iter2->move(iter2, middle, UITER_ZERO);
146     if(pos2!=middle) {
147         log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
148         return;
149     }
150 
151     /* move to index 1 */
152     pos1=iter1->move(iter1, 1, UITER_ZERO);
153     if(pos1!=1) {
154         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
155         return;
156     }
157 
158     pos2=iter2->move(iter2, 1, UITER_ZERO);
159     if(pos2!=1) {
160         log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
161         return;
162     }
163 
164     /* iterate backward from the end */
165     pos1=iter1->move(iter1, 0, UITER_LIMIT);
166     if(pos1<0) {
167         log_err("%s->move(limit) failed\n", n1);
168         return;
169     }
170     if(!iter1->hasPrevious(iter1)) {
171         log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
172         return;
173     }
174 
175     pos2=iter2->move(iter2, 0, UITER_LIMIT);
176     if(pos2<0) {
177         log_err("%s->move(limit) failed\n", n2);
178         return;
179     }
180     if(!iter2->hasPrevious(iter2)) {
181         log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
182         return;
183     }
184 
185     do {
186         c1=iter1->previous(iter1);
187         c2=iter2->previous(iter2);
188         if(c1!=c2) {
189             log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
190             return;
191         }
192     } while(c1>=0);
193 
194     if(iter1->hasPrevious(iter1)) {
195         log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
196         return;
197     }
198     if(iter2->hasPrevious(iter2)) {
199         log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
200         return;
201     }
202 }
203 
204 /*
205  * Test the iterator's getState() and setState() functions.
206  * iter1 and iter2 must be set up for the same iterator type and the same string
207  * but may be physically different structs (different addresses).
208  *
209  * Assume that the text is not empty and that
210  * iteration start==0 and iteration limit==length.
211  * It must be 2<=middle<=length-2.
212  */
213 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)214 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
215     UChar32 u[4];
216 
217     UErrorCode errorCode;
218     UChar32 c;
219     uint32_t state;
220     int32_t i, j;
221 
222     /* get four UChars from the middle of the string */
223     iter1->move(iter1, middle-2, UITER_ZERO);
224     for(i=0; i<4; ++i) {
225         c=iter1->next(iter1);
226         if(c<0) {
227             /* the test violates the assumptions, see comment above */
228             log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
229             return;
230         }
231         u[i]=c;
232     }
233 
234     /* move to the middle and get the state */
235     iter1->move(iter1, -2, UITER_CURRENT);
236     state=uiter_getState(iter1);
237 
238     /* set the state into the second iterator and compare the results */
239     errorCode=U_ZERO_ERROR;
240     uiter_setState(iter2, state, &errorCode);
241     if(U_FAILURE(errorCode)) {
242         log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
243         return;
244     }
245 
246     c=iter2->current(iter2);
247     if(c!=u[2]) {
248         log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
249     }
250 
251     c=iter2->previous(iter2);
252     if(c!=u[1]) {
253         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
254     }
255 
256     iter2->move(iter2, 2, UITER_CURRENT);
257     c=iter2->next(iter2);
258     if(c!=u[3]) {
259         log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
260     }
261 
262     iter2->move(iter2, -3, UITER_CURRENT);
263     c=iter2->previous(iter2);
264     if(c!=u[0]) {
265         log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
266     }
267 
268     /* move the second iterator back to the middle */
269     iter2->move(iter2, 1, UITER_CURRENT);
270     iter2->next(iter2);
271 
272     /* check that both are in the middle */
273     i=iter1->getIndex(iter1, UITER_CURRENT);
274     j=iter2->getIndex(iter2, UITER_CURRENT);
275     if(i!=middle) {
276         log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
277     }
278     if(i!=j) {
279         log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
280     }
281 
282     /* compare lengths */
283     i=iter1->getIndex(iter1, UITER_LENGTH);
284     j=iter2->getIndex(iter2, UITER_LENGTH);
285     if(i!=j) {
286         log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
287     }
288 }
289 
290 static void
TestLenient8Iterator()291 TestLenient8Iterator() {
292     static const UChar text[]={
293         0x61, 0x62, 0x63,
294         /* dffd 107fd             d801    dffd - in UTF-16, U+107fd=<d801 dffd> */
295         0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
296         0x78, 0x79, 0x7a, 0
297     };
298     static const uint8_t bytes[]={
299         0x61, 0x62, 0x63,
300         /* dffd            107fd                    d801               dffd - mixture */
301         0xed, 0xbf, 0xbd,  0xf0, 0x90, 0x9f, 0xbd,  0xed, 0xa0, 0x81,  0xed, 0xbf, 0xbd,
302         0x78, 0x79, 0x7a, 0
303     };
304 
305     UCharIterator iter1, iter2;
306     UChar32 c1, c2;
307     int32_t length;
308 
309     puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
310 
311     /* compare the same string between UTF-16 and lenient-8 UCharIterators */
312     uiter_setString(&iter1, text, -1);
313     uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
314     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
315 
316     /* try again with length=-1 */
317     uiter_setLenient8(&iter2, (const char *)bytes, -1);
318     compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
319 
320     /* test get/set state */
321     length=UPRV_LENGTHOF(text)-1;
322     uiter_setLenient8(&iter1, (const char*)bytes, -1);
323     testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
324     testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
325 
326     /* ---------------------------------------------------------------------- */
327 
328     puts("no output so far means that the lenient-8 iterator works fine");
329 
330     puts("iterate forward:\nUTF-16\tlenient-8");
331     uiter_setString(&iter1, text, -1);
332     iter1.move(&iter1, 0, UITER_START);
333     iter2.move(&iter2, 0, UITER_START);
334     for(;;) {
335         c1=iter1.next(&iter1);
336         c2=iter2.next(&iter2);
337         if(c1<0 && c2<0) {
338             break;
339         }
340         if(c1<0) {
341             printf("\t%04x\n", c2);
342         } else if(c2<0) {
343             printf("%04x\n", c1);
344         } else {
345             printf("%04x\t%04x\n", c1, c2);
346         }
347     }
348 }
349 
/*
 * Program entry point: runs the lenient-8 iterator test and returns 0.
 * The command-line arguments are not used.
 */
extern int
main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;

    TestLenient8Iterator();
    return 0;
}
355