1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003-2014, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uciter8.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003jan10
14 * created by: Markus W. Scherer
15 *
16 * This file contains sample code that illustrates reading
17 * 8-bit Unicode text leniently, accepting a mix of UTF-8 and CESU-8
18 * and also accepting single surrogates.
19 */
20
21 #include <stdio.h>
22 #include <string.h>
23 #include "unicode/utypes.h"
24 #include "unicode/uiter.h"
25 #include "uit_len8.h"
26
27 #define log_err printf
28
29 /* UCharIterator test ------------------------------------------------------- */
30
31 /*
32 * The following code is a copy of the UCharIterator test code in
33 * source/test/cintltst/custrtst.c,
34 * testing the lenient-8 iterator instead of the UTF-8 one.
35 */
36
37 /*
38 * Compare results from two iterators, should be same.
39 * Assume that the text is not empty and that
40 * iteration start==0 and iteration limit==length.
41 */
42 static void
compareIterators(UCharIterator * iter1,const char * n1,UCharIterator * iter2,const char * n2)43 compareIterators(UCharIterator *iter1, const char *n1,
44 UCharIterator *iter2, const char *n2) {
45 int32_t i, pos1, pos2, middle, length;
46 UChar32 c1, c2;
47
48 /* compare lengths */
49 length=iter1->getIndex(iter1, UITER_LENGTH);
50 pos2=iter2->getIndex(iter2, UITER_LENGTH);
51 if(length!=pos2) {
52 log_err("%s->getIndex(length)=%d != %d=%s->getIndex(length)\n", n1, length, pos2, n2);
53 return;
54 }
55
56 /* set into the middle */
57 middle=length/2;
58
59 pos1=iter1->move(iter1, middle, UITER_ZERO);
60 if(pos1!=middle) {
61 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
62 return;
63 }
64
65 pos2=iter2->move(iter2, middle, UITER_ZERO);
66 if(pos2!=middle) {
67 log_err("%s->move(from 0 to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
68 return;
69 }
70
71 /* test current() */
72 c1=iter1->current(iter1);
73 c2=iter2->current(iter2);
74 if(c1!=c2) {
75 log_err("%s->current()=U+%04x != U+%04x=%s->current() at middle=%d\n", n1, c1, c2, n2, middle);
76 return;
77 }
78
79 /* move forward 3 UChars */
80 for(i=0; i<3; ++i) {
81 c1=iter1->next(iter1);
82 c2=iter2->next(iter2);
83 if(c1!=c2) {
84 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
85 return;
86 }
87 }
88
89 /* move backward 5 UChars */
90 for(i=0; i<5; ++i) {
91 c1=iter1->previous(iter1);
92 c2=iter2->previous(iter2);
93 if(c1!=c2) {
94 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d (started in middle)\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
95 return;
96 }
97 }
98
99 /* iterate forward from the beginning */
100 pos1=iter1->move(iter1, 0, UITER_START);
101 if(pos1<0) {
102 log_err("%s->move(start) failed\n", n1);
103 return;
104 }
105 if(!iter1->hasNext(iter1)) {
106 log_err("%s->hasNext() at the start returns FALSE\n", n1);
107 return;
108 }
109
110 pos2=iter2->move(iter2, 0, UITER_START);
111 if(pos2<0) {
112 log_err("%s->move(start) failed\n", n2);
113 return;
114 }
115 if(!iter2->hasNext(iter2)) {
116 log_err("%s->hasNext() at the start returns FALSE\n", n2);
117 return;
118 }
119
120 do {
121 c1=iter1->next(iter1);
122 c2=iter2->next(iter2);
123 if(c1!=c2) {
124 log_err("%s->next()=U+%04x != U+%04x=%s->next() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
125 return;
126 }
127 } while(c1>=0);
128
129 if(iter1->hasNext(iter1)) {
130 log_err("%s->hasNext() at the end returns TRUE\n", n1);
131 return;
132 }
133 if(iter2->hasNext(iter2)) {
134 log_err("%s->hasNext() at the end returns TRUE\n", n2);
135 return;
136 }
137
138 /* back to the middle */
139 pos1=iter1->move(iter1, middle, UITER_ZERO);
140 if(pos1!=middle) {
141 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n1, middle, pos1);
142 return;
143 }
144
145 pos2=iter2->move(iter2, middle, UITER_ZERO);
146 if(pos2!=middle) {
147 log_err("%s->move(from end to middle %d)=%d does not move to the middle\n", n2, middle, pos2);
148 return;
149 }
150
151 /* move to index 1 */
152 pos1=iter1->move(iter1, 1, UITER_ZERO);
153 if(pos1!=1) {
154 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n1, middle, pos1);
155 return;
156 }
157
158 pos2=iter2->move(iter2, 1, UITER_ZERO);
159 if(pos2!=1) {
160 log_err("%s->move(from middle %d to 1)=%d does not move to 1\n", n2, middle, pos2);
161 return;
162 }
163
164 /* iterate backward from the end */
165 pos1=iter1->move(iter1, 0, UITER_LIMIT);
166 if(pos1<0) {
167 log_err("%s->move(limit) failed\n", n1);
168 return;
169 }
170 if(!iter1->hasPrevious(iter1)) {
171 log_err("%s->hasPrevious() at the end returns FALSE\n", n1);
172 return;
173 }
174
175 pos2=iter2->move(iter2, 0, UITER_LIMIT);
176 if(pos2<0) {
177 log_err("%s->move(limit) failed\n", n2);
178 return;
179 }
180 if(!iter2->hasPrevious(iter2)) {
181 log_err("%s->hasPrevious() at the end returns FALSE\n", n2);
182 return;
183 }
184
185 do {
186 c1=iter1->previous(iter1);
187 c2=iter2->previous(iter2);
188 if(c1!=c2) {
189 log_err("%s->previous()=U+%04x != U+%04x=%s->previous() at %d\n", n1, c1, c2, n2, iter1->getIndex(iter1, UITER_CURRENT));
190 return;
191 }
192 } while(c1>=0);
193
194 if(iter1->hasPrevious(iter1)) {
195 log_err("%s->hasPrevious() at the start returns TRUE\n", n1);
196 return;
197 }
198 if(iter2->hasPrevious(iter2)) {
199 log_err("%s->hasPrevious() at the start returns TRUE\n", n2);
200 return;
201 }
202 }
203
204 /*
205 * Test the iterator's getState() and setState() functions.
206 * iter1 and iter2 must be set up for the same iterator type and the same string
207 * but may be physically different structs (different addresses).
208 *
209 * Assume that the text is not empty and that
210 * iteration start==0 and iteration limit==length.
211 * It must be 2<=middle<=length-2.
212 */
213 static void
testIteratorState(UCharIterator * iter1,UCharIterator * iter2,const char * n,int32_t middle)214 testIteratorState(UCharIterator *iter1, UCharIterator *iter2, const char *n, int32_t middle) {
215 UChar32 u[4];
216
217 UErrorCode errorCode;
218 UChar32 c;
219 uint32_t state;
220 int32_t i, j;
221
222 /* get four UChars from the middle of the string */
223 iter1->move(iter1, middle-2, UITER_ZERO);
224 for(i=0; i<4; ++i) {
225 c=iter1->next(iter1);
226 if(c<0) {
227 /* the test violates the assumptions, see comment above */
228 log_err("test error: %s[%d]=%d\n", n, middle-2+i, c);
229 return;
230 }
231 u[i]=c;
232 }
233
234 /* move to the middle and get the state */
235 iter1->move(iter1, -2, UITER_CURRENT);
236 state=uiter_getState(iter1);
237
238 /* set the state into the second iterator and compare the results */
239 errorCode=U_ZERO_ERROR;
240 uiter_setState(iter2, state, &errorCode);
241 if(U_FAILURE(errorCode)) {
242 log_err("%s->setState(0x%x) failed: %s\n", n, state, u_errorName(errorCode));
243 return;
244 }
245
246 c=iter2->current(iter2);
247 if(c!=u[2]) {
248 log_err("%s->current(at %d)=U+%04x!=U+%04x\n", n, middle, c, u[2]);
249 }
250
251 c=iter2->previous(iter2);
252 if(c!=u[1]) {
253 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-1, c, u[1]);
254 }
255
256 iter2->move(iter2, 2, UITER_CURRENT);
257 c=iter2->next(iter2);
258 if(c!=u[3]) {
259 log_err("%s->next(at %d)=U+%04x!=U+%04x\n", n, middle+1, c, u[3]);
260 }
261
262 iter2->move(iter2, -3, UITER_CURRENT);
263 c=iter2->previous(iter2);
264 if(c!=u[0]) {
265 log_err("%s->previous(at %d)=U+%04x!=U+%04x\n", n, middle-2, c, u[0]);
266 }
267
268 /* move the second iterator back to the middle */
269 iter2->move(iter2, 1, UITER_CURRENT);
270 iter2->next(iter2);
271
272 /* check that both are in the middle */
273 i=iter1->getIndex(iter1, UITER_CURRENT);
274 j=iter2->getIndex(iter2, UITER_CURRENT);
275 if(i!=middle) {
276 log_err("%s->getIndex(current)=%d!=%d as expected\n", n, i, middle);
277 }
278 if(i!=j) {
279 log_err("%s->getIndex(current)=%d!=%d after setState()\n", n, j, i);
280 }
281
282 /* compare lengths */
283 i=iter1->getIndex(iter1, UITER_LENGTH);
284 j=iter2->getIndex(iter2, UITER_LENGTH);
285 if(i!=j) {
286 log_err("%s->getIndex(length)=%d!=%d before/after setState()\n", n, i, j);
287 }
288 }
289
290 static void
TestLenient8Iterator()291 TestLenient8Iterator() {
292 static const UChar text[]={
293 0x61, 0x62, 0x63,
294 /* dffd 107fd d801 dffd - in UTF-16, U+107fd=<d801 dffd> */
295 0xdffd, 0xd801, 0xdffd, 0xd801, 0xdffd,
296 0x78, 0x79, 0x7a, 0
297 };
298 static const uint8_t bytes[]={
299 0x61, 0x62, 0x63,
300 /* dffd 107fd d801 dffd - mixture */
301 0xed, 0xbf, 0xbd, 0xf0, 0x90, 0x9f, 0xbd, 0xed, 0xa0, 0x81, 0xed, 0xbf, 0xbd,
302 0x78, 0x79, 0x7a, 0
303 };
304
305 UCharIterator iter1, iter2;
306 UChar32 c1, c2;
307 int32_t length;
308
309 puts("test a UCharIterator for lenient 8-bit Unicode (accept single surrogates)");
310
311 /* compare the same string between UTF-16 and lenient-8 UCharIterators */
312 uiter_setString(&iter1, text, -1);
313 uiter_setLenient8(&iter2, (const char *)bytes, sizeof(bytes)-1);
314 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator");
315
316 /* try again with length=-1 */
317 uiter_setLenient8(&iter2, (const char *)bytes, -1);
318 compareIterators(&iter1, "UTF16Iterator", &iter2, "Lenient8Iterator_1");
319
320 /* test get/set state */
321 length=UPRV_LENGTHOF(text)-1;
322 uiter_setLenient8(&iter1, (const char*)bytes, -1);
323 testIteratorState(&iter1, &iter2, "Lenient8IteratorState", length/2);
324 testIteratorState(&iter1, &iter2, "Lenient8IteratorStatePlus1", length/2+1);
325
326 /* ---------------------------------------------------------------------- */
327
328 puts("no output so far means that the lenient-8 iterator works fine");
329
330 puts("iterate forward:\nUTF-16\tlenient-8");
331 uiter_setString(&iter1, text, -1);
332 iter1.move(&iter1, 0, UITER_START);
333 iter2.move(&iter2, 0, UITER_START);
334 for(;;) {
335 c1=iter1.next(&iter1);
336 c2=iter2.next(&iter2);
337 if(c1<0 && c2<0) {
338 break;
339 }
340 if(c1<0) {
341 printf("\t%04x\n", c2);
342 } else if(c2<0) {
343 printf("%04x\n", c1);
344 } else {
345 printf("%04x\t%04x\n", c1, c2);
346 }
347 }
348 }
349
/*
 * Program entry point: runs the lenient-8 iterator test once.
 * The command line is not used. Always returns 0; problems are only
 * visible in the printed output.
 */
extern int
main(int argc, const char *argv[]) {
    /* the arguments are intentionally ignored */
    (void)argc;
    (void)argv;

    TestLenient8Iterator();

    return 0;
}
355