1 /*
2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 /**
17  * @file picotok.c
18  *
19  * tokenizer
20  *
21  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
22  * All rights reserved.
23  *
24  * History:
25  * - 2009-04-20 -- initial version
26  *
27  */
28 
29 
30 /* ************************************************************/
31 /* tokenisation and markup handling */
32 /* ************************************************************/
33 
34 /** @addtogroup picotok
35   @b tokenisation_overview
36 
37   markup handling overview:
38 
39   The following markups are recognized
40      - ignore
41      - speed
42      - pitch
43      - volume
44      - voice
45      - preproccontext
46      - mark
47      - play
48      - usesig
49      - genfile
50      - sentence
51      - s
52      - paragraph
53      - p
54      - break
55      - spell            (pauses between letters)
56      - phoneme
57 
58   All markups which are recognized but are not yet implemented in pico
59   system have the mark.
60 */
61 
62 
63 #include "picodefs.h"
64 #include "picoos.h"
65 #include "picobase.h"
66 #include "picodbg.h"
67 #include "picodata.h"
68 #include "picotok.h"
69 #include "picoktab.h"
70 
71 #ifdef __cplusplus
72 extern "C" {
73 #endif
74 #if 0
75 }
76 #endif
77 
78 /* *****************************************************************************/
79 
/* Sizes of the tokenizer's working buffers and basic mode constants. */
#define IN_BUF_SIZE   255
/* The replacement list is parenthesized so that the sum is always treated
   as a single operand, e.g. in "2 * OUT_BUF_SIZE" or "x - OUT_BUF_SIZE". */
#define OUT_BUF_SIZE  (IN_BUF_SIZE + 3 * PICODATA_ITEM_HEADSIZE + 3)

#define MARKUP_STRING_BUF_SIZE (IN_BUF_SIZE*5)
#define MAX_NR_MARKUP_PARAMS 6
#define MARKUP_HANDLING_DISABLED  0
#define MARKUP_HANDLING_ENABLED 1
#define EOL '\n'
88 
89 
90 typedef picoos_int8 pico_tokenSubType;
91 typedef picoos_uint8 pico_tokenType;
92 
93 /** @todo : consider adding these specialized exception codes: */
94 
/* Until dedicated codes exist, all three map onto the generic PICO_ERR_OTHER. */
#define PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE  PICO_ERR_OTHER
#define PICO_ERR_INVALID_MARKUP_TAG         PICO_ERR_OTHER
#define PICO_ERR_INTERNAL_LIMIT             PICO_ERR_OTHER
98 
/* Identifiers of the markup tags handled by the tokenizer.
   MIDummyStart and MIDummyEnd bracket the real identifiers; MIDummyEnd is
   also what tok_markupTagId() returns for a tag name it does not know. */
typedef enum {
    MIDummyStart,
    MIIgnore,
    MIPitch,
    MISpeed,
    MIVolume,
    MIVoice,
    MIPreprocContext,
    MIMarker,
    MIPlay,
    MIUseSig,
    MIGenFile,
    MIParagraph,
    MISentence,
    MIBreak,
    MISpell,
    MIPhoneme,
    MIItem,
    MISpeaker,
    MIDummyEnd
} MarkupId;
/* States of the markup scanner, one enumerator per line.
   The last three enumerators are the error states. */
typedef enum {
    MSNotInMarkup,
    MSGotStart,
    MSExpectingmarkupTagName,
    MSInmarkupTagName,
    MSGotmarkupTagName,
    MSInAttrName,
    MSGotAttrName,
    MSGotEqual,
    MSInAttrValue,
    MSInAttrValueEscaped,
    MSGotAttrValue,
    MSGotEndSlash,
    MSGotEnd,
    MSError,
    MSErrorTooLong,
    MSErrorSyntax
} MarkupState;
/* Error kinds recorded while parsing a markup tag; MENone means no error. */
typedef enum {
    MENone,
    MEMissingStart,
    MEUnknownTag,
    MEIdent,
    MEMissingEqual,
    MEMissingQuote,
    MEMissingEnd,
    MEUnexpectedChar,
    MEInterprete
} MarkupParseError;
113 
/* Kind of markup tag: none, start tag, end tag or empty tag. */
typedef enum {
    MTNone,
    MTStart,
    MTEnd,
    MTEmpty
} MarkupTagType;
115 
/* Result codes of tok_putToUtf(), listed in ascending order. */
#define UTF_CHAR_MALFORMED  0
#define UTF_CHAR_INCOMPLETE 1
#define UTF_CHAR_COMPLETE   2
119 
/* Tag names of the recognized markups (compared by tok_markupTagId). */

/* prosody and voice control */
#define TOK_MARKUP_KW_SPEED      (picoos_uchar*)"speed"
#define TOK_MARKUP_KW_PITCH      (picoos_uchar*)"pitch"
#define TOK_MARKUP_KW_VOLUME     (picoos_uchar*)"volume"
#define TOK_MARKUP_KW_SPEAKER    (picoos_uchar*)"speaker"
#define TOK_MARKUP_KW_VOICE      (picoos_uchar*)"voice"

/* text structure; "s" and "p" are short forms of "sentence" and "paragraph" */
#define TOK_MARKUP_KW_SENTENCE   (picoos_uchar*)"sentence"
#define TOK_MARKUP_KW_S          (picoos_uchar*)"s"
#define TOK_MARKUP_KW_PARAGRAPH  (picoos_uchar*)"paragraph"
#define TOK_MARKUP_KW_P          (picoos_uchar*)"p"
#define TOK_MARKUP_KW_BREAK      (picoos_uchar*)"break"

/* remaining tags */
#define TOK_MARKUP_KW_IGNORE     (picoos_uchar*)"ignore"
#define TOK_MARKUP_KW_CONTEXT    (picoos_uchar*)"preproccontext"
#define TOK_MARKUP_KW_MARK       (picoos_uchar*)"mark"
#define TOK_MARKUP_KW_PLAY       (picoos_uchar*)"play"
#define TOK_MARKUP_KW_USESIG     (picoos_uchar*)"usesig"
#define TOK_MARKUP_KW_GENFILE    (picoos_uchar*)"genfile"
#define TOK_MARKUP_KW_SPELL      (picoos_uchar*)"spell"
#define TOK_MARKUP_KW_PHONEME    (picoos_uchar*)"phoneme"
#define TOK_MARKUP_KW_ITEM       (picoos_uchar*)"item"
139 
/* Attribute names that may appear inside a markup tag. */
#define KWLevel       (picoos_uchar *)"level"
#define KWName        (picoos_uchar *)"name"
#define KWProsDomain  (picoos_uchar *)"prosodydomain"
#define KWTime        (picoos_uchar *)"time"
#define KWMode        (picoos_uchar *)"mode"
#define KWSB          (picoos_uchar *)"sb"
#define KWPB          (picoos_uchar *)"pb"
#define KWFile        (picoos_uchar *)"file"
#define KWType        (picoos_uchar *)"type"
#define KWF0Beg       (picoos_uchar *)"f0beg"
#define KWF0End       (picoos_uchar *)"f0end"
#define KWXFadeBeg    (picoos_uchar *)"xfadebeg"
#define KWXFadeEnd    (picoos_uchar *)"xfadeend"
#define KWAlphabet    (picoos_uchar *)"alphabet"
#define KWPH          (picoos_uchar *)"ph"
#define KWOrthMode    (picoos_uchar *)"orthmode"
#define KWIgnorePunct (picoos_uchar *)"ignorepunct"
#define KWInfo1       (picoos_uchar *)"info1"
#define KWInfo2       (picoos_uchar *)"info2"
#define KWDATA        (picoos_uchar *)"data"
160 
/* Limits and defaults used when interpreting markup values.
   *_MIN / *_MAX bound absolute levels, *_FACTOR_MIN / *_FACTOR_MAX bound
   relative ("...%") values as produced by tok_isRelative(). */

/* speed */
#define PICO_SPEED_MIN            20
#define PICO_SPEED_MAX            500
#define PICO_SPEED_DEFAULT        100
#define PICO_SPEED_FACTOR_MIN     500
#define PICO_SPEED_FACTOR_MAX     2000

/* pitch */
#define PICO_PITCH_MIN            50
#define PICO_PITCH_MAX            200
#define PICO_PITCH_DEFAULT        100
#define PICO_PITCH_FACTOR_MIN     500
#define PICO_PITCH_FACTOR_MAX     2000
#define PICO_PITCH_ADD_MIN        -100
#define PICO_PITCH_ADD_MAX        100
#define PICO_PITCH_ADD_DEFAULT    0

/* volume */
#define PICO_VOLUME_MIN           0
#define PICO_VOLUME_MAX           500
#define PICO_VOLUME_DEFAULT       100
#define PICO_VOLUME_FACTOR_MIN    500
#define PICO_VOLUME_FACTOR_MAX    2000

/* speaker */
#define PICO_SPEAKER_MIN          20
#define PICO_SPEAKER_MAX          180
#define PICO_SPEAKER_DEFAULT      100
#define PICO_SPEAKER_FACTOR_MIN   500
#define PICO_SPEAKER_FACTOR_MAX   2000

/* name of the preprocessing context restored by a closing context tag */
#define PICO_CONTEXT_DEFAULT      (picoos_uchar*)"DEFAULT"

#define PARAGRAPH_PAUSE_DUR       500
#define SPELL_WITH_PHRASE_BREAK   1
#define SPELL_WITH_SENTENCE_BREAK 2
193 
194 /* *****************************************************************************/
195 
/* The NUL character, typed as picoos_char. */
#define TOK_PUNC_FLUSH  (picoos_char) '\0'
197 
198 typedef picoos_uchar Word[MARKUP_STRING_BUF_SIZE];
199 
200 
201 struct MarkupParam {
202     Word paramId;
203     Word paramVal;
204 };
205 
206 typedef struct MarkupParam MarkupParams[MAX_NR_MARKUP_PARAMS];
207 
208 typedef picoos_uchar utf8char0c[5]; /* one more than needed so it is ended always with 0c*/
209 
210 /** subobject : TokenizeUnit
211  *  shortcut  : tok
212  */
213 typedef struct tok_subobj
214 {
215     picoos_int32 ignLevel;
216 
217     utf8char0c   utf;
218     picoos_int32 utfpos;
219     picoos_int32 utflen;
220 
221     MarkupParams markupParams;
222     picoos_int32 nrMarkupParams;
223     MarkupState markupState;
224     picoos_uchar markupStr[MARKUP_STRING_BUF_SIZE];
225     picoos_int32 markupPos;
226     picoos_int32 markupLevel[MIDummyEnd+1];
227     picoos_uchar markupTagName[IN_BUF_SIZE];
228     MarkupTagType markupTagType;
229     MarkupParseError markupTagErr;
230 
231     picoos_int32 strPos;
232     picoos_uchar strDelim;
233     picoos_bool isFileAttr;
234 
235     pico_tokenType tokenType;
236     pico_tokenSubType tokenSubType;
237 
238     picoos_int32 tokenPos;
239     picoos_uchar tokenStr[IN_BUF_SIZE];
240 
241     picoos_int32 nrEOL;
242 
243     picoos_bool markupHandlingMode;       /* to be moved ??? */
244     picoos_bool aborted;                  /* to be moved ??? */
245 
246     picoos_bool start;
247 
248     picoos_uint8 outBuf[OUT_BUF_SIZE]; /* internal output buffer */
249     picoos_uint16 outReadPos; /* next pos to read from outBuf */
250     picoos_uint16 outWritePos; /* next pos to write to outBuf */
251 
252     picoos_uchar saveFile[IN_BUF_SIZE];
253     Word phonemes;
254 
255     picotrns_SimpleTransducer transducer;
256 
257     /* kbs */
258 
259     picoktab_Graphs graphTab;
260     picokfst_FST xsampa_parser;
261     picokfst_FST svoxpa_parser;
262     picokfst_FST xsampa2svoxpa_mapper;
263 
264 
265 
266 } tok_subobj_t;
267 
268 /* *****************************************************************************/
269 
270 static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok);
271 static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling);
272 static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok);
273 static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[]);
274 static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok);
275 static MarkupId tok_markupTagId (picoos_uchar tagId[]);
276 
277 /* *****************************************************************************/
278 
tok_strEqual(picoos_uchar * str1,picoos_uchar * str2)279 static picoos_bool tok_strEqual(picoos_uchar * str1, picoos_uchar * str2)
280 {
281    return (picoos_strcmp((picoos_char*)str1, (picoos_char*)str2) == 0);
282 }
283 
/* Normalizes the blanks of 'str' in place: leading and trailing blanks are
   removed and every run of blanks inside the string becomes a single blank.
   Only the character ' ' counts as a blank. */
static void tok_reduceBlanks(picoos_uchar * str)
{
    const picoos_uchar blank = (picoos_uchar)' ';
    int rd;       /* read position */
    int wr = 0;   /* write position, never ahead of rd */

    for (rd = 0; str[rd] != 0; ) {
        if (str[rd] != blank) {
            str[wr++] = str[rd++];
            continue;
        }
        /* first blank of a run: emit one blank unless at string start */
        if (wr > 0) {
            str[wr++] = blank;
        }
        /* skip the whole run */
        do {
            rd++;
        } while (str[rd] == blank);
    }

    /* a run at the very end left one blank behind; drop it */
    if ((wr > 0) && (str[wr - 1] == blank)) {
        wr--;
    }
    str[wr] = 0;
}
316 
317 
/* Enters one more level of ignored text. */
static void tok_startIgnore (tok_subobj_t * tok)
{
    tok->ignLevel += 1;
}
322 
323 
/* Leaves one level of ignored text; the level never drops below zero. */
static void tok_endIgnore (tok_subobj_t * tok)
{
    if (tok->ignLevel <= 0) {
        return;
    }
    tok->ignLevel -= 1;
}
330 
331 
/* Looks up the attribute named 'paramId' in 'params'.
   Found:     *paramVal receives picoos_atoi() of the attribute value and
              *paramFound is TRUE.
   Not found: *paramVal is -1 and *paramFound is FALSE.
   The first slot whose name matches wins. */
static void tok_getParamIntVal (MarkupParams params, picoos_uchar paramId[], picoos_int32 * paramVal, picoos_bool * paramFound)
{
    int slot;

    for (slot = 0; slot < MAX_NR_MARKUP_PARAMS; slot++) {
        if (tok_strEqual(paramId, params[slot].paramId)) {
            (*paramVal) = picoos_atoi((picoos_char*)params[slot].paramVal);
            (*paramFound) = TRUE;
            return;
        }
    }
    (*paramVal) = -1;
    (*paramFound) = FALSE;
}
347 
348 
349 
/* Looks up the attribute named 'paramId' in 'params'.
   Found:     the attribute value is copied to 'paramStrVal' and *paramFound
              is TRUE.
   Not found: 'paramStrVal' becomes the empty string and *paramFound is FALSE.
   NOTE(review): no destination size is passed to picoos_strcpy, so
   'paramStrVal' must be able to hold a complete attribute value (a Word);
   callers handing in a shorter buffer should be checked. */
static void tok_getParamStrVal (MarkupParams params, picoos_uchar paramId[], picoos_uchar paramStrVal[], picoos_bool * paramFound)
{
    int slot;

    for (slot = 0; slot < MAX_NR_MARKUP_PARAMS; slot++) {
        if (tok_strEqual(paramId, params[slot].paramId)) {
            picoos_strcpy((picoos_char*)paramStrVal, (picoos_char*)params[slot].paramVal);
            (*paramFound) = TRUE;
            return;
        }
    }
    paramStrVal[0] = 0;
    (*paramFound) = FALSE;
}
365 
366 
/* Fetches the phoneme string stored in the attribute named 'paramId'.
   *paramFound tells whether the attribute exists. 'phones' (capacity
   'phoneslen') receives the attribute value only if the attribute exists and
   'alphabet' is PICODATA_XSAMPA or empty; in every other case 'phones'
   becomes the empty string. */
static void tok_getParamPhonesStr (MarkupParams params, picoos_uchar paramId[], picoos_uchar alphabet[], picoos_uchar phones[], picoos_int32 phoneslen, picoos_bool * paramFound)
{
    int slot = 0;
    picoos_bool found;
    picoos_bool copied = FALSE;

    while ((slot < MAX_NR_MARKUP_PARAMS) && !tok_strEqual(paramId, params[slot].paramId)) {
        slot++;
    }
    found = (slot < MAX_NR_MARKUP_PARAMS) ? TRUE : FALSE;

    if (found
        && (tok_strEqual(alphabet, PICODATA_XSAMPA) || tok_strEqual(alphabet, (picoos_uchar*)""))) {
        picoos_strlcpy((picoos_char*)phones, (picoos_char*)params[slot].paramVal, phoneslen);
        copied = TRUE;
    }
    if (!copied) {
        phones[0] = 0;
    }
    (*paramFound) = found;
}
393 
394 
/* Marks every attribute slot as unused by making its name and its value
   the empty string. */
static void tok_clearMarkupParams (MarkupParams params)
{
    int slot = MAX_NR_MARKUP_PARAMS;

    while (slot-- > 0) {
        params[slot].paramVal[0] = 0;
        params[slot].paramId[0] = 0;
    }
}
404 
405 
/* Parses a duration of the form "<digits>s" or "<digits>ms" (blanks around
   and between number and unit are tolerated).
   durStr : zero-terminated input, not modified
   dur    : receives the duration in milliseconds, or 0 on failure
   done   : TRUE if the string was a valid duration, FALSE otherwise (unknown
            unit, or a number too large for picoos_uint32 milliseconds)
   The digits are read from the local, size-limited and blank-reduced copy
   rather than from durStr itself. That keeps the index inside tmpWord even
   for digit runs longer than IN_BUF_SIZE, and lets input with leading blanks
   parse. */
static void tok_getDur (picoos_uchar durStr[], picoos_uint32 * dur, picoos_bool * done)
{
    const picoos_uint32 maxDur = (picoos_uint32)-1;
    picoos_uint32 num = 0;
    picoos_uint32 digit;
    picoos_bool overflow = FALSE;
    int i = 0;
    picoos_uchar tmpWord[IN_BUF_SIZE];

    picoos_strlcpy((picoos_char*)tmpWord, (picoos_char*)durStr, sizeof(tmpWord));
    /* defensive: the scan below relies on a terminated buffer */
    tmpWord[sizeof(tmpWord) - 1] = 0;
    tok_reduceBlanks(tmpWord);

    /* consume the leading digits and blank them out so that only the unit
       remains after the second blank reduction */
    while ((tmpWord[i] >= '0') && (tmpWord[i] <= '9')) {
        digit = (picoos_uint32)(tmpWord[i] - '0');
        if (num > (maxDur - digit) / 10) {
            overflow = TRUE;
        } else {
            num = 10 * num + digit;
        }
        tmpWord[i] = ' ';
        i++;
    }
    tok_reduceBlanks(tmpWord);

    if (!overflow && tok_strEqual(tmpWord, (picoos_uchar*)"s") && (num <= maxDur / 1000)) {
        (*dur) = (1000 * num);
        (*done) = TRUE;
    } else if (!overflow && tok_strEqual(tmpWord,(picoos_uchar*)"ms")) {
        (*dur) = num;
        (*done) = TRUE;
    } else {
        (*dur) = 0;
        (*done) = FALSE;
    }
}
432 
433 
tok_putToUtf(tok_subobj_t * tok,picoos_uchar ch)434 static picoos_int32 tok_putToUtf (tok_subobj_t * tok, picoos_uchar ch)
435 {
436     if (tok->utfpos < PICOBASE_UTF8_MAXLEN) {
437         tok->utf[tok->utfpos] = ch;
438         if (tok->utfpos == 0) {
439             tok->utflen = picobase_det_utf8_length(ch);
440         } else if (((ch < (picoos_uchar)'\200') || (ch >= (picoos_uchar)'\300'))) {
441             tok->utflen = 0;
442         }
443         (tok->utfpos)++;
444         if ((tok->utfpos == tok->utflen)) {
445             if ((tok->utfpos < PICOBASE_UTF8_MAXLEN)) {
446                 tok->utf[tok->utfpos] = 0;
447             }
448             return UTF_CHAR_COMPLETE;
449         } else if (tok->utfpos < tok->utflen) {
450             return UTF_CHAR_INCOMPLETE;
451         } else {
452             return UTF_CHAR_MALFORMED;
453         }
454     } else {
455         return UTF_CHAR_MALFORMED;
456     }
457 }
458 
459 
tok_isRelative(picoos_uchar strval[],picoos_uint32 * val)460 static picoos_bool tok_isRelative (picoos_uchar strval[], picoos_uint32 * val)
461 {
462     picoos_int32 len;
463     picoos_bool rel;
464 
465     rel = FALSE;
466     len = picoos_strlen((picoos_char*)strval);
467     if (len > 0) {
468         if (strval[len - 1] == '%') {
469             strval[len - 1] = 0;
470             if ((strval[0] == '+') || (strval[0] == '-')) {
471                 (*val) = 1000 + (picoos_atoi((picoos_char*)strval) * 10);
472             } else {
473                 (*val) = picoos_atoi((picoos_char*)strval) * 10;
474             }
475             rel = TRUE;
476         }
477     }
478     return rel;
479 }
480 
481 
/* Writes one item (4-byte head followed by 'len' content bytes) to
   tok->outBuf and advances tok->outWritePos. The caller has verified that
   the item fits into the buffer. */
static void tok_putItemBytes (tok_subobj_t * tok,
                              picoos_uint8 itemType, picoos_uint8 info1, picoos_uint8 info2,
                              picoos_uint8 len, const picoos_uchar * payload)
{
    picoos_int32 i;

    tok->outBuf[tok->outWritePos++] = itemType;
    tok->outBuf[tok->outWritePos++] = info1;
    tok->outBuf[tok->outWritePos++] = info2;
    tok->outBuf[tok->outWritePos++] = len;
    for (i = 0; i < len; i++) {
        tok->outBuf[tok->outWritePos++] = payload[i];
    }
}


/* Appends an item to the internal output buffer.
   itemType, info1, info2 : head of the item
   val : content of the numeric commands (speed, pitch, volume, spell, sil,
         speaker), stored as two bytes, low byte first
   str : content of the string commands and of tokens (zero-terminated)
   A flush command is written regardless of the ignore level; every other
   item is dropped while tok->ignLevel > 0. Items of unknown type, items that
   do not fit into the buffer and items whose content is longer than the 255
   bytes the length byte can express are dropped with a debug warning; the
   buffer is never written beyond its end. */
static void tok_putItem (picodata_ProcessingUnit this,  tok_subobj_t * tok,
                         picoos_uint8 itemType, picoos_uint8 info1, picoos_uint8 info2,
                         picoos_uint16 val,
                         picoos_uchar str[])
{
    picoos_int32 len = 0;
    picoos_uchar valBytes[2];
    const picoos_uchar * payload = str;

    if ((itemType == PICODATA_ITEM_CMD) && (info1 == PICODATA_ITEMINFO1_CMD_FLUSH)) {
        if (tok->outWritePos + 4 <= OUT_BUF_SIZE) {
            tok_putItemBytes(tok, itemType, info1, info2, 0, str);
        } else {
            PICODBG_WARN(("tok_putItem: output buffer too small"));
        }
        return;
    }
    if (tok->ignLevel > 0) {
        return;
    }

    /* determine content and content length for this kind of item */
    switch (itemType) {
    case PICODATA_ITEM_CMD:
        switch (info1) {
        case PICODATA_ITEMINFO1_CMD_CONTEXT:
        case PICODATA_ITEMINFO1_CMD_VOICE:
        case PICODATA_ITEMINFO1_CMD_MARKER:
        case PICODATA_ITEMINFO1_CMD_PLAY:
        case PICODATA_ITEMINFO1_CMD_SAVE:
        case PICODATA_ITEMINFO1_CMD_UNSAVE:
        case PICODATA_ITEMINFO1_CMD_PROSDOMAIN:
        case PICODATA_ITEMINFO1_CMD_PHONEME:
            len = picoos_strlen((picoos_char*)str);
            break;
        case PICODATA_ITEMINFO1_CMD_IGNSIG:
        case PICODATA_ITEMINFO1_CMD_IGNORE:
            len = 0;
            break;
        case PICODATA_ITEMINFO1_CMD_SPEED:
        case PICODATA_ITEMINFO1_CMD_PITCH:
        case PICODATA_ITEMINFO1_CMD_VOLUME:
        case PICODATA_ITEMINFO1_CMD_SPELL:
        case PICODATA_ITEMINFO1_CMD_SIL:
        case PICODATA_ITEMINFO1_CMD_SPEAKER:
            valBytes[0] = (picoos_uchar)(val % 256);
            valBytes[1] = (picoos_uchar)(val / 256);
            payload = valBytes;
            len = 2;
            break;
        default:
            PICODBG_WARN(("tok_putItem: unknown command type"));
            return;
        }
        break;
    case PICODATA_ITEM_TOKEN:
        len = picoos_strlen((picoos_char*)str);
        break;
    default:
        PICODBG_WARN(("tok_putItem: unknown item type"));
        return;
    }

    if (len > 255) {
        /* the length byte of the item head cannot express this */
        PICODBG_WARN(("tok_putItem: item content too long"));
    } else if (tok->outWritePos + 4 + len < OUT_BUF_SIZE) {
        tok_putItemBytes(tok, itemType, info1, info2, (picoos_uint8)len, payload);
    } else {
        PICODBG_WARN(("tok_putItem: output buffer too small"));
    }
}
575 
576 
/* Appends an item given as head fields plus 'len' raw content bytes to the
   internal output buffer. Nothing is written if 'type' is not a valid item
   type or if the item would not fit into the remaining space of tok->outBuf
   (the latter is reported with a debug warning). The ignore level is not
   consulted here. */
static void tok_putItem2 (picodata_ProcessingUnit this,  tok_subobj_t * tok,
                          picoos_uint8 type,
                          picoos_uint8 info1, picoos_uint8 info2,
                          picoos_uint8 len,
                          picoos_uint8 data[])
{
    picoos_int32 i;

    if (!is_valid_itemtype(type)) {
        return;
    }
    if (tok->outWritePos + 4 + len <= OUT_BUF_SIZE) {
        tok->outBuf[tok->outWritePos++] = type;
        tok->outBuf[tok->outWritePos++] = info1;
        tok->outBuf[tok->outWritePos++] = info2;
        tok->outBuf[tok->outWritePos++] = len;
        for (i=0; i<len; i++) {
            tok->outBuf[tok->outWritePos++] = data[i];
        }
    } else {
        PICODBG_WARN(("tok_putItem2: output buffer too small"));
    }
}
595 
596 
tok_markupTagId(picoos_uchar tagId[])597 static MarkupId tok_markupTagId (picoos_uchar tagId[])
598 {
599     if (picoos_strstr(tagId,(picoos_char *)"svox:") == (picoos_char *)tagId) {
600         tagId+=5;
601     }
602     if (tok_strEqual(tagId, TOK_MARKUP_KW_IGNORE)) {
603         return MIIgnore;
604     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEED)) {
605         return MISpeed;
606     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PITCH)) {
607         return MIPitch;
608     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOLUME)) {
609         return MIVolume;
610     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPEAKER)) {
611         return MISpeaker;
612     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_VOICE)) {
613         return MIVoice;
614     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_CONTEXT)) {
615         return MIPreprocContext;
616     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_MARK)) {
617         return MIMarker;
618     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PLAY)) {
619         return MIPlay;
620     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_USESIG)) {
621         return MIUseSig;
622     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_GENFILE)) {
623         return MIGenFile;
624     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SENTENCE) || tok_strEqual(tagId, TOK_MARKUP_KW_S)) {
625         return MISentence;
626     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PARAGRAPH) || tok_strEqual(tagId, TOK_MARKUP_KW_P)) {
627         return MIParagraph;
628     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_BREAK)) {
629         return MIBreak;
630     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_SPELL)) {
631         return MISpell;
632     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_PHONEME)) {
633         return MIPhoneme;
634     } else if (tok_strEqual(tagId, TOK_MARKUP_KW_ITEM)) {
635         return MIItem;
636     } else {
637         return MIDummyEnd;
638     }
639 }
640 
641 
/* Forces *value into the range [min, max]. If it lies outside, a warning
   naming the offending value and 'valueType' is raised first, then the value
   is replaced by the nearer limit. */
static void tok_checkLimits (picodata_ProcessingUnit this, picoos_uint32 * value, picoos_uint32 min, picoos_uint32 max, picoos_uchar valueType[])
{
    const picoos_bool tooSmall = ((*value) < min);
    const picoos_bool tooLarge = ((*value) > max);

    if (!tooSmall && !tooLarge) {
        return;
    }
    picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %i for %s", *value, valueType);
    /* the lower limit takes precedence, as it is tested first */
    (*value) = tooSmall ? min : max;
}
653 
654 
655 
656 /*
657 
658 static void tok_checkRealLimits (picodata_ProcessingUnit this, picoos_single * value, picoos_single min, picoos_single max, picoos_uchar valueType[])
659 {
660     if ((((*value) < min) || ((*value) > max))) {
661           picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE, (picoos_char*)"", (picoos_char*)"attempt to set illegal value %f for %s", *value, valueType);
662         if (((*value) < min)) {
663             (*value) = min;
664         } else if (((*value) > max)) {
665             (*value) = max;
666         }
667     }
668 }
669 */
670 
671 #define VAL_STR_LEN 21
672 
tok_interpretMarkup(picodata_ProcessingUnit this,tok_subobj_t * tok,picoos_bool isStartTag,MarkupId mId)673 static void tok_interpretMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_bool isStartTag, MarkupId mId)
674 {
675     picoos_bool done;
676     picoos_int32 ival;
677     picoos_uint32 uval;
678     picoos_int32 ival2;
679     picoos_uchar valStr[VAL_STR_LEN];
680     picoos_uchar valStr2[VAL_STR_LEN];
681     picoos_uchar valStr3[VAL_STR_LEN];
682     picoos_int32 i2;
683     picoos_uint32 dur;
684     picoos_bool done1;
685     picoos_bool paramFound;
686     picoos_uint8 type, info1, info2;
687     picoos_uint8 data[256];
688     picoos_int32 pos, n, len;
689     picoos_uchar part[10];
690 
691     done = FALSE;
692     switch (mId) {
693         case MIIgnore:
694             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
695                 tok_startIgnore(tok);
696                 done = TRUE;
697             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
698                 tok_endIgnore(tok);
699                 done = TRUE;
700             }
701             break;
702         case MISpeed:
703             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
704                 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
705                     tok_checkLimits(this, & uval, PICO_SPEED_FACTOR_MIN, PICO_SPEED_FACTOR_MAX,(picoos_uchar*)"relative speed factor");
706                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
707                 } else {
708                     uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
709                     tok_checkLimits(this, & uval, PICO_SPEED_MIN, PICO_SPEED_MAX,(picoos_uchar*)"speed");
710                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
711                 }
712                 done = TRUE;
713             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
714                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEED, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEED_DEFAULT, (picoos_uchar*)"");
715                 done = TRUE;
716             }
717             break;
718         case MIPitch:
719             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
720                 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
721                     tok_checkLimits(this, & uval,PICO_PITCH_FACTOR_MIN,PICO_PITCH_FACTOR_MAX, (picoos_uchar*)"relative pitch factor");
722                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
723                 } else {
724                     uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
725                     tok_checkLimits(this, & uval,PICO_PITCH_MIN,PICO_PITCH_MAX, (picoos_uchar*)"pitch");
726                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
727                 }
728                 done = TRUE;
729             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
730                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PITCH,PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_PITCH_DEFAULT, (picoos_uchar*)"");
731                 done = TRUE;
732             }
733             break;
734         case MIVolume:
735             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
736                 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
737                     tok_checkLimits(this, & uval, PICO_VOLUME_FACTOR_MIN, PICO_VOLUME_FACTOR_MAX, (picoos_uchar*)"relative volume factor");
738                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
739                 } else {
740                     uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
741                     tok_checkLimits(this, & uval, PICO_VOLUME_MIN, PICO_VOLUME_MAX, (picoos_uchar*)"volume");
742                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
743                 }
744                 done = TRUE;
745             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
746                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOLUME, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_VOLUME_DEFAULT, (picoos_uchar*)"");
747                 done = TRUE;
748             }
749             break;
750         case MISpeaker:
751             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWLevel)) {
752                 if (tok_isRelative(tok->markupParams[0].paramVal, & uval)) {
753                     tok_checkLimits(this, & uval, PICO_SPEAKER_FACTOR_MIN, PICO_SPEAKER_FACTOR_MAX, (picoos_uchar*)"relative speaker factor");
754                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_RELATIVE, uval, (picoos_uchar*)"");
755                 } else {
756                     uval = picoos_atoi((picoos_char*)tok->markupParams[0].paramVal);
757                     tok_checkLimits(this, & uval, PICO_SPEAKER_MIN, PICO_SPEAKER_MAX, (picoos_uchar*)"volume");
758                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, uval, (picoos_uchar*)"");
759                 }
760                 done = TRUE;
761             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
762                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPEAKER, PICODATA_ITEMINFO2_CMD_ABSOLUTE, PICO_SPEAKER_DEFAULT, (picoos_uchar*)"");
763                 done = TRUE;
764             }
765             break;
766 
767         case MIVoice:
768             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
769                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
770                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
771                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)"");
772                 done = TRUE;
773             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
774                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_VOICE, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
775                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
776                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 0, 0, (picoos_uchar*)"");
777                 done = TRUE;
778             }
779             break;
780         case MIPreprocContext:
781             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
782                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
783                 done = TRUE;
784             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
785                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_CONTEXT, PICODATA_ITEMINFO2_NA, 0, PICO_CONTEXT_DEFAULT);
786                 done = TRUE;
787             }
788             break;
789         case MIMarker:
790             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWName)) {
791                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_MARKER, PICODATA_ITEMINFO2_NA, 0, tok->markupParams[0].paramVal);
792                 done = TRUE;
793             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
794                 done = TRUE;
795             }
796             break;
797         case MISentence:
798             if (isStartTag) {
799                 tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound);
800                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
801                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, valStr);
802                 done = TRUE;
803             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
804                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
805                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 2, 0, (picoos_uchar*)"");
806                 done = TRUE;
807             }
808             break;
809         case MIParagraph:
810             if (isStartTag) {
811                 tok_getParamStrVal(tok->markupParams, KWProsDomain, (picoos_uchar*)valStr, & paramFound);
812                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
813                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, valStr);
814                 done = TRUE;
815             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
816                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
817                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, PARAGRAPH_PAUSE_DUR, (picoos_uchar*)"");
818                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PROSDOMAIN, 1, 0, (picoos_uchar*)"");
819                 done = TRUE;
820             }
821             break;
822         case MIBreak:
823             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWTime)) {
824                 tok_getDur(tok->markupParams[0].paramVal, & dur, & done1);
825                 tok_checkLimits (this, &dur, 0, 65535, (picoos_uchar*)"time");
826                 if (done1) {
827                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SIL, PICODATA_ITEMINFO2_NA, dur, (picoos_uchar*)"");
828                     done = TRUE;
829                 }
830             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
831                 done = TRUE;
832             }
833             break;
834         case MISpell:
835             if (isStartTag) {
836                 if (tok_strEqual(tok->markupParams[0].paramId, KWMode)) {
837                     if (tok_strEqual(tok->markupParams[0].paramVal, KWPB)) {
838                         uval = SPELL_WITH_PHRASE_BREAK;
839                     } else if (tok_strEqual(tok->markupParams[0].paramVal, KWSB)) {
840                         uval = SPELL_WITH_SENTENCE_BREAK;
841                     } else {
842                         tok_getDur(tok->markupParams[0].paramVal, & uval, & done1);
843                         tok_checkLimits (this, & uval, 0, 65535, (picoos_uchar*)"time");
844                         if (done1) {
845                             done = TRUE;
846                         }
847                     }
848                 } else {
849                     uval = SPELL_WITH_PHRASE_BREAK;
850                 }
851                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_START, uval, (picoos_uchar*)"");
852                 done = TRUE;
853             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
854                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SPELL, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)"");
855                 done = TRUE;
856             }
857             break;
858         case MIGenFile:
859             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
860                 if (tok->saveFile[0] != 0) {
861                    tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE,
862                                picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, tok->saveFile);
863                    tok->saveFile[0] = 0;
864                 }
865                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_SAVE,
866                             picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal,  /*input*/FALSE), 0, tok->markupParams[0].paramVal);
867                 picoos_strcpy((picoos_char*)tok->saveFile, (picoos_char*)tok->markupParams[0].paramVal);
868                 done = TRUE;
869             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
870                 if (tok->saveFile[0] != 0) {
871                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_UNSAVE,
872                                 picodata_getPuTypeFromExtension(tok->saveFile, /*input*/FALSE), 0, (picoos_uchar*)"");
873                     tok->saveFile[0] = 0;
874                 }
875                 done = TRUE;
876             }
877             break;
878         case MIPlay:
879             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
880                 if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) {
881                     tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound);
882                     tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound);
883                     tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3,& paramFound);
884                     tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound);
885                     tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound);
886                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY,
887                                 picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal);
888                     tok_startIgnore(tok);
889                 } else {
890                     if (tok->ignLevel > 0) {
891                         tok_startIgnore(tok);
892                     } else {
893                        picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead\n", tok->markupParams[0].paramVal);
894                     }
895                 }
896                 done = TRUE;
897             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
898                 tok_endIgnore(tok);
899                 done = TRUE;
900             }
901             break;
902         case MIUseSig:
903             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWFile)) {
904                 if (picoos_FileExists(this->common, (picoos_char*)tok->markupParams[0].paramVal)) {
905                     tok_getParamIntVal(tok->markupParams,KWF0Beg,& ival,& paramFound);
906                     tok_getParamIntVal(tok->markupParams,KWF0End,& ival2,& paramFound);
907                     tok_getParamStrVal(tok->markupParams,KWAlphabet,valStr3, & paramFound);
908                     tok_getParamPhonesStr(tok->markupParams,KWXFadeBeg,valStr3,valStr,VAL_STR_LEN,& paramFound);
909                     tok_getParamPhonesStr(tok->markupParams,KWXFadeEnd,valStr3,valStr2,VAL_STR_LEN,& paramFound);
910                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PLAY,
911                                 picodata_getPuTypeFromExtension(tok->markupParams[0].paramVal, /*input*/TRUE), 0, tok->markupParams[0].paramVal);
912                     tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_START, 0, (picoos_uchar*)"");
913                 } else {
914                     if (tok->ignLevel <= 0) {
915                         picoos_emRaiseWarning(this->common->em, PICO_EXC_CANT_OPEN_FILE, (picoos_char*)"", (picoos_char*)"file '%s' not found; synthesizing enclosed text instead", tok->markupParams[0].paramVal);
916                     }
917                 }
918                 done = TRUE;
919             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
920                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_IGNSIG, PICODATA_ITEMINFO2_CMD_END, 0, (picoos_uchar*)"");
921                 done = TRUE;
922             }
923             break;
924         case MIPhoneme:
925             i2 = 0;
926             if (isStartTag) {
927                 if (tok_strEqual(tok->markupParams[0].paramId, KWAlphabet) && tok_strEqual(tok->markupParams[1].paramId, KWPH)) {
928                     if (tok_strEqual(tok->markupParams[2].paramId, KWOrthMode)
929                         && tok_strEqual(tok->markupParams[2].paramVal, KWIgnorePunct)) {
930                         i2 = 1;
931                     }
932                     if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[1].paramVal, tok->markupParams[0].paramVal, tok->phonemes, sizeof(tok->phonemes)-1) == PICO_OK) {
933                         tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
934                             PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes);
935                         done = TRUE;
936                     } else {
937                         PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal));
938                         picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal);
939                         done = TRUE;
940                     }
941                 } else if (tok_strEqual(tok->markupParams[0].paramId, KWPH)) {
942                     if (tok_strEqual(tok->markupParams[1].paramId, KWOrthMode)
943                         && tok_strEqual(tok->markupParams[1].paramVal, KWIgnorePunct)) {
944                         i2 = 1;
945                     }
946                     if (picodata_mapPAStrToPAIds(tok->transducer, this->common, tok->xsampa_parser, tok->svoxpa_parser, tok->xsampa2svoxpa_mapper, tok->markupParams[0].paramVal, PICODATA_XSAMPA, tok->phonemes, sizeof(tok->phonemes)) == PICO_OK) {
947                         tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
948                             PICODATA_ITEMINFO2_CMD_START, i2, tok->phonemes);
949                         done = TRUE;
950                     }
951                     else {
952                         PICODBG_WARN(("cannot map phonetic string '%s'; synthesizeing text instead", tok->markupParams[1].paramVal));
953                         picoos_emRaiseWarning(this->common->em, PICO_ERR_MARKUP_VALUE_OUT_OF_RANGE,(picoos_char*)"", (picoos_char*)"cannot map phonetic string '%s'; synthesizing text instead", tok->markupParams[0].paramVal);
954                         done = TRUE;
955                     }
956                 }
957             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId, (picoos_uchar*)"")) {
958                 tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_PHONEME,
959                     PICODATA_ITEMINFO2_CMD_END, i2, (picoos_uchar*)"");
960                 done = TRUE;
961             }
962             break;
963         case MIItem:
964             if (isStartTag && tok_strEqual(tok->markupParams[0].paramId, KWType) &&
965                               tok_strEqual(tok->markupParams[1].paramId, KWInfo1)&&
966                               tok_strEqual(tok->markupParams[2].paramId, KWInfo2)&&
967                               tok_strEqual(tok->markupParams[3].paramId, KWDATA)) {
968                   picoos_int32 len2, n2;
969                   type = picoos_atoi(tok->markupParams[0].paramVal);
970                   info1 = picoos_atoi(tok->markupParams[1].paramVal);
971                   info2 = picoos_atoi(tok->markupParams[2].paramVal);
972                   n = 0; n2 = 0;
973                   len2 = (picoos_int32)picoos_strlen(tok->markupParams[3].paramVal);
974                   while (n<len2) {
975                       while ((tok->markupParams[3].paramVal[n] != 0) && (tok->markupParams[3].paramVal[n] <= 32)) {
976                           n++;
977                       }
978                       tok->markupParams[3].paramVal[n2] = tok->markupParams[3].paramVal[n];
979                       n++;
980                       n2++;
981                   }
982                   if (is_valid_itemtype(type)) {
983                       done = TRUE;
984                       len = 0;
985                       pos = 0;
986                       picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal),
987                                           &pos, ',', part, 10, &done1);
988                       while (done && done1) {
989                           n = picoos_atoi(part);
990                           if ((n>=0) && (n<256) && (len<256)) {
991                               data[len++] = n;
992                           }
993                           else {
994                               done = FALSE;
995                           }
996                           picoos_get_sep_part_str(tok->markupParams[3].paramVal, picoos_strlen(tok->markupParams[3].paramVal),
997                                           &pos, ',', part, 10, &done1);
998                       }
999                       if (done) {
1000                           tok_putItem2(this, tok, type, info1, info2, len, data);
1001                       }
1002                   }
1003                   else {
1004                       done = FALSE;
1005                   }
1006             } else if (!isStartTag && tok_strEqual(tok->markupParams[0].paramId,(picoos_uchar*)"")) {
1007                 done = TRUE;
1008             }
1009             break;
1010     default:
1011         break;
1012     }
1013     if (!done) {
1014         tok->markupTagErr = MEInterprete;
1015     }
1016     if (isStartTag) {
1017         tok->markupLevel[mId]++;
1018     } else if ((tok->markupLevel[mId] > 0)) {
1019         tok->markupLevel[mId]--;
1020     }
1021 }
1022 
1023 
/**
 * Tells whether a byte may appear in a markup attribute name.
 *
 * ASCII letters are accepted at every position; ASCII digits are accepted
 * only when the byte is not the first character of the name.
 *
 * @param ch     byte to classify
 * @param first  non-zero if ch is tested as the first character of the name
 * @return non-zero if ch is acceptable at that position, zero otherwise
 */
static picoos_bool tok_attrChar (picoos_uchar ch, picoos_bool first)
{
    const picoos_bool isUpper = (ch >= (picoos_uchar)'A') && (ch <= (picoos_uchar)'Z');
    const picoos_bool isLower = (ch >= (picoos_uchar)'a') && (ch <= (picoos_uchar)'z');
    const picoos_bool isDigit = (ch >= (picoos_uchar)'0') && (ch <= (picoos_uchar)'9');

    return (isUpper || isLower) || (!first && isDigit);
}
1030 
1031 
1032 
/**
 * Tells whether a byte may appear in a markup tag name.
 *
 * Accepts everything tok_attrChar() accepts and, additionally, a colon at
 * any position other than the first.
 *
 * @param ch     byte to classify
 * @param first  non-zero if ch is tested as the first character of the name
 * @return non-zero if ch is acceptable at that position, zero otherwise
 */
static picoos_bool tok_idChar (picoos_uchar ch, picoos_bool first)
{
    picoos_bool accepted = tok_attrChar(ch, first);

    if (!accepted) {
        /* the colon is the only extra character, and never as first one */
        accepted = (!first) && (ch == (picoos_uchar)':');
    }
    return accepted;
}
1037 
1038 
/**
 * Stores in *isFile whether the given attribute name equals KWFile.
 *
 * @param name    NUL-terminated attribute name to compare
 * @param isFile  receives the result of tok_strEqual(name, KWFile)
 */
static void tok_setIsFileAttr (picoos_uchar name[], picoos_bool * isFile)
{
    const picoos_bool nameIsFile = tok_strEqual(name, KWFile);

    *isFile = nameIsFile;
}
1043 
1044 /* *****************************************************************************/
1045 
/**
 * Appends the bytes of str to the simple token under construction and
 * stamps the token with the given type and subtype.
 *
 * Before each byte is stored, tok->tokenPos is compared with IN_BUF_SIZE;
 * once that limit is reached a warning is raised and tok_treatSimpleToken()
 * is called (presumably this emits and empties the pending token -- confirm
 * against its definition). The type and subtype are written even when str
 * is empty.
 *
 * @param this     processing unit, used for raising warnings
 * @param tok      tokenizer sub-object holding the token buffer
 * @param str      NUL-terminated bytes to append
 * @param type     token type to record in tok->tokenType
 * @param subtype  token subtype to record in tok->tokenSubType
 */
static void tok_putToSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[], pico_tokenType type, pico_tokenSubType subtype)
{
    const picoos_uchar * src = str;
    int remaining = 0;

    /* the length is taken once, before any byte is copied */
    if (str[0] != 0) {
        remaining = picoos_strlen((picoos_char*)str);
    }
    for (; remaining > 0; remaining--, src++) {
        if (tok->tokenPos >= IN_BUF_SIZE) {
            picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT, (picoos_char*)"", (picoos_char*)"simple token too long; forced treatment");
            tok_treatSimpleToken(this, tok);
        }
        tok->tokenStr[tok->tokenPos++] = *src;
    }
    tok->tokenSubType = subtype;
    tok->tokenType = type;
}
1064 
1065 
/**
 * Finishes the attribute name that is currently being collected: the name
 * is lower-cased in place and tok->isFileAttr is set according to whether
 * it is the file attribute.
 *
 * When the tag already holds MAX_NR_MARKUP_PARAMS attributes no name was
 * stored for the surplus one, so there is no slot to look at; the
 * parameter array is left untouched and the attribute is treated as a
 * non-file attribute.
 *
 * @param tok  tokenizer sub-object holding the markup parser state
 */
static void tok_finishAttrName (tok_subobj_t * tok)
{
    picoos_uint8 ok;

    if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
        picobase_lowercase_utf8_str(tok->markupParams[tok->nrMarkupParams].paramId, (picoos_char*)tok->markupParams[tok->nrMarkupParams].paramId, IN_BUF_SIZE, &ok);
        tok_setIsFileAttr(tok->markupParams[tok->nrMarkupParams].paramId, & tok->isFileAttr);
    } else {
        /* surplus attribute: never index past the parameter array */
        tok->isFileAttr = FALSE;
    }
}


/**
 * Feeds the bytes of str to the markup tag parser.
 *
 * tok->markupTagErr is reset to MENone on entry. Every byte then drives the
 * parser held in tok->markupState:
 *   - if tok->markupPos has reached MARKUP_STRING_BUF_SIZE - 1 the state
 *     becomes MSErrorTooLong (a warning is raised the first time, unless
 *     tok_markupTagId() maps the tag name to MIDummyEnd);
 *   - a blank between the syntactic elements of a tag is skipped;
 *   - '>' after a tag name or an attribute value completes the tag
 *     (MSGotEnd);
 *   - '/' at the same places marks an empty tag, or is an error inside an
 *     end tag;
 *   - any other byte is handled according to the current state: tag name,
 *     attribute name, '=', quoted attribute value (with backslash escapes
 *     except for file attributes).
 * As long as no parse error is flagged the byte is appended to
 * tok->markupStr, which is kept NUL-terminated.
 *
 * @param this  processing unit, used for raising warnings
 * @param tok   tokenizer sub-object holding the markup parser state
 * @param str   NUL-terminated bytes to parse
 */
static void tok_putToMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar str[])
{
    picoos_int32 i, len;
    picoos_uint8 ok;

    tok->markupTagErr = MENone;
    len = picoos_strlen((picoos_char*)str);
    for (i = 0; i< len; i++) {
        if (tok->markupPos >= (MARKUP_STRING_BUF_SIZE - 1)) {
            /* warn only once, when the limit is hit for the first time */
            if ((tok->markupPos == (MARKUP_STRING_BUF_SIZE - 1)) && (tok_markupTagId(tok->markupTagName) != MIDummyEnd)) {
                picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"markup tag too long");
            }
            tok->markupState = MSErrorTooLong;
            /* NOTE(review): markupTagErr is still MENone here, so the byte is
               appended to markupStr below at index >= MARKUP_STRING_BUF_SIZE - 1
               and the terminator is written one position further; verify that
               markupStr is large enough for this. */
        } else if ((str[i] == (picoos_uchar)' ') && ((tok->markupState == MSExpectingmarkupTagName) || (tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSGotAttrName) || (tok->markupState == MSGotEqual) || (tok->markupState == MSGotAttrValue))) {
            /* blanks between the elements of a tag do not change the state */
        } else if ((str[i] == (picoos_uchar)'>') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) {
            tok->markupState = MSGotEnd;
        } else if ((str[i] == (picoos_uchar)'/') && ((tok->markupState == MSGotmarkupTagName) || (tok->markupState == MSInmarkupTagName) || (tok->markupState == MSGotAttrValue))) {
            if (tok->markupTagType == MTEnd) {
                /* "</tag/" is not allowed */
                tok->markupTagErr = MEUnexpectedChar;
                tok->markupState = MSError;
            } else {
                tok->markupTagType = MTEmpty;
                tok->markupState = MSGotEndSlash;
            }
        } else {
            switch (tok->markupState) {
                case MSNotInMarkup:
                    if (str[i] == (picoos_uchar)'<') {
                        /* a new tag starts: forget everything about the previous one */
                        tok_clearMarkupParams(tok->markupParams);
                        tok->nrMarkupParams = 0;
                        tok->strPos = 0;
                        tok->markupTagType = MTStart;
                        tok->markupState = MSGotStart;
                    } else {
                        tok->markupTagErr = MEMissingStart;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotStart:
                    if (str[i] == (picoos_uchar)'/') {
                        tok->markupTagType = MTEnd;
                        tok->markupState = MSExpectingmarkupTagName;
                    } else if (str[i] == (picoos_uchar)' ') {
                        tok->markupState = MSExpectingmarkupTagName;
                    } else if (tok_idChar(str[i],TRUE)) {
                        tok->markupTagType = MTStart;
                        tok->markupTagName[tok->strPos] = str[i];
                        tok->strPos++;
                        tok->markupTagName[tok->strPos] = 0;
                        tok->markupState = MSInmarkupTagName;
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInmarkupTagName:   case MSExpectingmarkupTagName:
                    if (tok_idChar(str[i],tok->markupState == MSExpectingmarkupTagName)) {
                        tok->markupTagName[tok->strPos] = str[i];
                        tok->strPos++;
                        tok->markupTagName[(tok->strPos)] = 0;
                        tok->markupState = MSInmarkupTagName;
                    } else if ((tok->markupState == MSInmarkupTagName) && (str[i] == (picoos_uchar)' ')) {
                        /* a blank ends the tag name */
                        tok->markupState = MSGotmarkupTagName;
                        picobase_lowercase_utf8_str(tok->markupTagName, (picoos_char*)tok->markupTagName, IN_BUF_SIZE, &ok);
                        tok->strPos = 0;
                    } else {
                        tok->markupTagErr = MEIdent;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotmarkupTagName:   case MSGotAttrValue:
                    if (tok_attrChar(str[i], TRUE)) {
                        if (tok->markupTagType == MTEnd) {
                            /* end tags must not carry attributes */
                            tok->markupTagErr = MEUnexpectedChar;
                            tok->markupState = MSError;
                        } else {
                            if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                                tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i];
                                tok->strPos++;
                                tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0;
                            } else {
                                picoos_emRaiseWarning(this->common->em, PICO_ERR_INTERNAL_LIMIT ,(picoos_char*)"", (picoos_char*)"too many attributes in markup; ignoring");
                            }
                            tok->markupState = MSInAttrName;
                        }
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInAttrName:
                    if (tok_attrChar(str[i], FALSE)) {
                        if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                            tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = str[i];
                            tok->strPos++;
                            tok->markupParams[tok->nrMarkupParams].paramId[tok->strPos] = 0;
                        }
                        tok->markupState = MSInAttrName;
                    } else if (str[i] == (picoos_uchar)' ') {
                        /* guarded: a surplus attribute has no slot in markupParams */
                        tok_finishAttrName(tok);
                        tok->markupState = MSGotAttrName;
                    } else if (str[i] == (picoos_uchar)'=') {
                        /* guarded: a surplus attribute has no slot in markupParams */
                        tok_finishAttrName(tok);
                        tok->markupState = MSGotEqual;
                    } else {
                        tok->markupTagErr = MEMissingEqual;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotAttrName:
                    if (str[i] == (picoos_uchar)'=') {
                        tok->markupState = MSGotEqual;
                    } else {
                        tok->markupTagErr = MEMissingEqual;
                        tok->markupState = MSError;
                    }
                    break;
                case MSGotEqual:
                    if ((str[i] == (picoos_uchar)'"') || (str[i] == (picoos_uchar)'\'')) {
                        /* remember which quote has to close the value */
                        tok->strDelim = str[i];
                        tok->strPos = 0;
                        tok->markupState = MSInAttrValue;
                    } else {
                        tok->markupTagErr = MEMissingQuote;
                        tok->markupState = MSError;
                    }
                    break;
                case MSInAttrValue:
                    if (!(tok->isFileAttr) && (str[i] == (picoos_uchar)'\\')) {
                        /* a backslash is an escape, except in file attributes */
                        tok->markupState = MSInAttrValueEscaped;
                    } else if (str[i] == tok->strDelim) {
                        /* closing quote: the attribute is complete */
                        if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                            tok->nrMarkupParams++;
                        }
                        tok->strPos = 0;
                        tok->markupState = MSGotAttrValue;
                    } else {
                        if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                            tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = str[i];
                            tok->strPos++;
                            tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = 0;
                        }
                        tok->markupState = MSInAttrValue;
                    }
                    break;
                case MSInAttrValueEscaped:
                    /* the byte following a backslash is taken literally */
                    if (tok->nrMarkupParams < MAX_NR_MARKUP_PARAMS) {
                        tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = str[i];
                        tok->strPos++;
                        tok->markupParams[tok->nrMarkupParams].paramVal[tok->strPos] = 0;
                    }
                    tok->markupState = MSInAttrValue;
                    break;
                case MSGotEndSlash:
                    if (str[i] == (picoos_uchar)'>') {
                        tok->markupState = MSGotEnd;
                    } else {
                        tok->markupTagErr = MEUnexpectedChar;
                        tok->markupState = MSError;
                    }
                    break;
                default:
                    tok->markupTagErr = MEUnexpectedChar;
                    tok->markupState = MSError;
                    break;
            }
        }
        if (tok->markupTagErr == MENone) {
            tok->markupStr[tok->markupPos] = str[i];
            tok->markupPos++;
        } /* else restart parsing at current char */
        tok->markupStr[tok->markupPos] = 0;
    }
    /*
    PICODBG_DEBUG(("putToMarkup %s", tok->markupStr));
    */
}
1245 
1246 /* *****************************************************************************/
1247 
/**
 * Re-processes the text collected in tok->markupStr as ordinary input.
 *
 * The markup parser is put back into MSNotInMarkup, the UTF-8 assembly
 * counters are cleared and every collected byte is passed to
 * tok_treatChar() with markup handling switched off. Afterwards the markup
 * buffer position and the string position are reset to zero.
 *
 * @param this  processing unit handed through to tok_treatChar()
 * @param tok   tokenizer sub-object holding the collected markup text
 */
static void tok_treatMarkupAsSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    picoos_int32 idx = 0;

    tok->markupState = MSNotInMarkup;
    tok->utflen = 0;
    tok->utfpos = 0;
    while (idx < tok->markupPos) {
        tok_treatChar(this, tok, tok->markupStr[idx], FALSE);
        idx++;
    }
    tok->strPos = 0;
    tok->markupPos = 0;
}
1261 
1262 
/**
 * Acts on a completely parsed markup tag.
 *
 * If tok_markupTagId() maps the tag name to MIDummyEnd the collected text
 * is handed to tok_treatMarkupAsSimpleToken(). Otherwise, when no parse
 * error is pending, a pending simple token (anything but space/undefined)
 * is treated first, a single blank is put as space token, and the tag is
 * interpreted: as start tag for MTStart and MTEmpty, and as end tag (with
 * the parameters cleared beforehand) for MTEnd and MTEmpty. If an error is
 * pending afterwards -- from parsing or from interpretation -- a warning is
 * raised unless tok->aborted is set, and the collected text is treated as
 * a simple token. In all cases the markup parser state and positions are
 * reset at the end.
 *
 * @param this  processing unit, used for raising warnings and item output
 * @param tok   tokenizer sub-object holding the parsed tag
 */
static void tok_treatMarkup (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    MarkupId tagId;

    if (tok_markupTagId(tok->markupTagName) == MIDummyEnd) {
        tok_treatMarkupAsSimpleToken(this, tok);
    } else {
        if (MENone == tok->markupTagErr) {
            tok->markupState = MSNotInMarkup;
            /* finish a pending token unless it is only space or nothing */
            if (!((tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_SPACE) || (tok->tokenType == PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED))) {
                tok_treatSimpleToken(this, tok);
            }
            tok_putToSimpleToken(this, tok, (picoos_uchar*)" ", PICODATA_ITEMINFO1_TOKTYPE_SPACE, -1);
            tagId = tok_markupTagId(tok->markupTagName);
            /* an empty tag is interpreted as start tag followed by end tag */
            if ((MTStart == tok->markupTagType) || (MTEmpty == tok->markupTagType)) {
                tok_interpretMarkup(this, tok, TRUE, tagId);
            }
            if ((MTEnd == tok->markupTagType) || (MTEmpty == tok->markupTagType)) {
                tok_clearMarkupParams(tok->markupParams);
                tok->nrMarkupParams = 0;
                tok_interpretMarkup(this, tok, FALSE, tagId);
            }
        }
        if (MENone != tok->markupTagErr) {
            if (!tok->aborted) {
                picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"syntax error in markup token '%s'",tok->markupStr);
            }
            tok_treatMarkupAsSimpleToken(this, tok);
        }
    }
    tok->strPos = 0;
    tok->markupPos = 0;
    tok->markupState = MSNotInMarkup;
}
1297 
1298 
1299 
/**
 * Feeds one input byte into the tokenizer.
 *
 * A NULLC byte finishes the pending simple token and emits a FLUSH command
 * item. Every other byte is handed to tok_putToUtf(); nothing else happens
 * until that call reports a complete character. A complete character is
 * classified with the grapheme table and is then either passed on to the
 * markup parser or appended to the simple token currently being collected.
 *
 * The function calls itself when the markup parser reports an error: the text
 * collected as markup is turned into simple token(s) and the bytes of the
 * current character are fed in once more.
 *
 * @param this            processing unit the tokenizer belongs to
 * @param tok             tokenizer sub-object holding the parsing state
 * @param ch              next input byte
 * @param markupHandling  TRUE if the caller allows markup parsing; markup is
 *                        parsed only if tok->markupHandlingMode equals
 *                        MARKUP_HANDLING_ENABLED as well
 */
static void tok_treatChar (picodata_ProcessingUnit this, tok_subobj_t * tok, picoos_uchar ch, picoos_bool markupHandling)
{
    picoos_int32 i, id;
    picoos_uint8 uval8;
    pico_tokenType type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
    pico_tokenSubType subtype = -1;
    utf8char0c utf2;
    picoos_int32 utf2pos;

    if (ch == NULLC) {
        /* finish whatever is pending and request a flush */
        tok_treatSimpleToken(this, tok);
        tok_putItem(this, tok, PICODATA_ITEM_CMD, PICODATA_ITEMINFO1_CMD_FLUSH, PICODATA_ITEMINFO2_NA, 0, (picoos_uchar*)"");
        return;
    }

    switch (tok_putToUtf(tok, ch)) {
        case UTF_CHAR_MALFORMED:
            /* discard the bytes collected so far */
            tok->utfpos = 0;
            tok->utflen = 0;
            break;
        case UTF_CHAR_INCOMPLETE:
            /* more bytes are needed before anything can be done */
            break;
        case UTF_CHAR_COMPLETE:
            markupHandling = (markupHandling && (tok->markupHandlingMode == MARKUP_HANDLING_ENABLED));

            /* classify the character */
            id = picoktab_graphOffset(tok->graphTab, tok->utf);
            if (id > 0) {
                if (picoktab_getIntPropTokenType(tok->graphTab, id, &uval8)) {
                    type = (pico_tokenType)uval8;
                    if (type == PICODATA_ITEMINFO1_TOKTYPE_LETTERV) {
                        type = PICODATA_ITEMINFO1_TOKTYPE_LETTER;
                    }
                }
                /* the result is deliberately ignored; subtype keeps its
                   initial value -1 unless the call stores something */
                (void) picoktab_getIntPropTokenSubType(tok->graphTab, id, &subtype);
            } else if (tok->utf[tok->utfpos-1] <= (picoos_uchar)' ') {
                type = PICODATA_ITEMINFO1_TOKTYPE_SPACE;
                subtype =  -1;
            } else {
                type = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
                subtype =  -1;
            }

            /* count line ends that are separated by nothing but characters
               not greater than ' ' */
            if ((tok->utf[tok->utfpos-1] > (picoos_uchar)' ')) {
                tok->nrEOL = 0;
            } else if ((tok->utf[tok->utfpos-1] == EOL)) {
                tok->nrEOL++;
            }

            if (markupHandling && (tok->markupState != MSNotInMarkup)) {
                /* already inside a markup tag */
                tok_putToMarkup(this, tok, tok->utf);
                if (tok->markupState >= MSError) {
                    /* keep a copy of the current character: the calls below
                       reset the utf state in tok */
                    picoos_strlcpy(utf2, tok->utf, 5);
                    utf2pos = tok->utfpos;
                    /* treat string up to (but not including) current char as simple
                       token and restart markup tag parsing with current char */
                    tok_treatMarkupAsSimpleToken(this, tok);
                    for (i = 0; i < utf2pos; i++) {
                        tok_treatChar(this, tok, utf2[i], markupHandling);
                    }
                } else if (tok->markupState == MSGotEnd) {
                    tok_treatMarkup(this, tok);
                }
            } else if ((markupHandling && (tok->utf[tok->utfpos-1] == (picoos_uchar)'<'))) {
                /* '<' opens a new markup tag */
                tok_putToMarkup(this, tok, tok->utf);
            } else if (type != PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED) {
                if ((type != tok->tokenType) || (type == PICODATA_ITEMINFO1_TOKTYPE_CHAR) || (subtype != tok->tokenSubType)) {
                    /* type or subtype changes, or the type is CHAR: the
                       pending token ends here */
                    tok_treatSimpleToken(this, tok);
                } else if ((tok->utf[tok->utfpos-1] == EOL) && (tok->nrEOL == 2)) {
                    /* second line end in a row: insert a "." token */
                    tok_treatSimpleToken(this, tok);
                    tok_putToSimpleToken(this, tok, (picoos_uchar*)".", PICODATA_ITEMINFO1_TOKTYPE_CHAR, -1);
                    tok_treatSimpleToken(this, tok);
                }
                tok_putToSimpleToken(this, tok, tok->utf, type, subtype);
            } else {
                /* character of undefined type: it only ends the pending token */
                tok_treatSimpleToken(this, tok);
            }
            tok->utfpos = 0;
            tok->utflen = 0;
            break;
        default:
            /* any other result of tok_putToUtf() is ignored */
            break;
    }
}
1379 
1380 
/**
 * Finishes the simple token collected so far and resets the token state.
 *
 * While a markup tag is still being parsed, the collected markup text is
 * first handed to tok_treatMarkupAsSimpleToken() and this function then
 * calls itself once more; a warning about the unfinished tag is raised
 * beforehand unless processing was aborted, the tag name has not been read
 * yet, or the tag id is MIDummyEnd.
 *
 * Outside of markup a token item is written only if the token is not empty
 * and either tok->ignLevel is not positive or the token is of type SPACE.
 *
 * @param this  processing unit the tokenizer belongs to
 * @param tok   tokenizer sub-object holding the parsing state
 */
static void tok_treatSimpleToken (picodata_ProcessingUnit this, tok_subobj_t * tok)
{
    /* terminate the token text, provided the terminator still fits */
    if (IN_BUF_SIZE > tok->tokenPos) {
        tok->tokenStr[tok->tokenPos] = 0;
    }

    if (MSNotInMarkup == tok->markupState) {
        if (tok->tokenPos > 0) {
            if ((tok->ignLevel <= 0) || (PICODATA_ITEMINFO1_TOKTYPE_SPACE == tok->tokenType)) {
                tok_putItem(this, tok, PICODATA_ITEM_TOKEN, tok->tokenType, (picoos_uint8)tok->tokenSubType, 0, tok->tokenStr);
            }
        }
    } else {
        if (!(tok->aborted) && (tok->markupState >= MSGotmarkupTagName)) {
            if (MIDummyEnd != tok_markupTagId(tok->markupTagName)) {
                picoos_emRaiseWarning(this->common->em, PICO_ERR_INVALID_MARKUP_TAG, (picoos_char*)"", (picoos_char*)"unfinished markup tag '%s'",tok->markupStr);
            }
        }
        /* presumably leaves tok->markupState at MSNotInMarkup, which ends the
           recursion below -- the helper is defined outside this section */
        tok_treatMarkupAsSimpleToken(this, tok);
        tok_treatSimpleToken(this, tok);
    }

    /* start over with an empty token */
    tok->tokenSubType =  -1;
    tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
    tok->tokenPos = 0;
}
1399 
1400 /* *****************************************************************************/
1401 
tokReset(register picodata_ProcessingUnit this,picoos_int32 resetMode)1402 static pico_status_t tokReset(register picodata_ProcessingUnit this, picoos_int32 resetMode)
1403 {
1404     tok_subobj_t * tok;
1405     MarkupId mId;
1406 
1407     if (NULL == this || NULL == this->subObj) {
1408         return PICO_ERR_OTHER;
1409     }
1410     tok = (tok_subobj_t *) this->subObj;
1411 
1412     tok->ignLevel = 0;
1413 
1414     tok->utfpos = 0;
1415     tok->utflen = 0;
1416 
1417     tok_clearMarkupParams(tok->markupParams);
1418     tok->nrMarkupParams = 0;
1419     tok->markupState = MSNotInMarkup;
1420     tok->markupPos = 0;
1421     for (mId = MIDummyStart; mId <= MIDummyEnd; mId++) {
1422         tok->markupLevel[mId] = 0;
1423     }
1424     tok->markupTagName[0] = 0;
1425     tok->markupTagType = MTNone;
1426     tok->markupTagErr = MENone;
1427 
1428     tok->strPos = 0;
1429     tok->strDelim = 0;
1430     tok->isFileAttr = FALSE;
1431 
1432     tok->tokenType = PICODATA_ITEMINFO1_TOKTYPE_UNDEFINED;
1433     tok->tokenSubType =  -1;
1434     tok->tokenPos = 0;
1435 
1436     tok->nrEOL = 0;
1437 
1438 
1439     tok->markupHandlingMode = TRUE;
1440     tok->aborted = FALSE;
1441 
1442     tok->start = TRUE;
1443 
1444     tok->outReadPos = 0;
1445     tok->outWritePos = 0;
1446 
1447     tok->saveFile[0] = 0;
1448 
1449 
1450     tok->graphTab = picoktab_getGraphs(this->voice->kbArray[PICOKNOW_KBID_TAB_GRAPHS]);
1451 
1452     tok->xsampa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA_PARSE]);
1453     PICODBG_TRACE(("got xsampa_parser @ %i",tok->xsampa_parser));
1454 
1455     tok->svoxpa_parser = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_SVOXPA_PARSE]);
1456     PICODBG_TRACE(("got svoxpa_parser @ %i",tok->svoxpa_parser));
1457 
1458     tok->xsampa2svoxpa_mapper = picokfst_getFST(this->voice->kbArray[PICOKNOW_KBID_FST_XSAMPA2SVOXPA]);
1459     PICODBG_TRACE(("got xsampa2svoxpa_mapper @ %i",tok->xsampa2svoxpa_mapper));
1460 
1461 
1462 
1463     return PICO_OK;
1464 }
1465 
tokInitialize(register picodata_ProcessingUnit this,picoos_int32 resetMode)1466 static pico_status_t tokInitialize(register picodata_ProcessingUnit this, picoos_int32 resetMode)
1467 {
1468 /*
1469 
1470     tok_subobj_t * tok;
1471 
1472     if (NULL == this || NULL == this->subObj) {
1473         return PICO_ERR_OTHER;
1474     }
1475     tok = (tok_subobj_t *) this->subObj;
1476 */
1477     return tokReset(this, resetMode);
1478 }
1479 
1480 
tokTerminate(register picodata_ProcessingUnit this)1481 static pico_status_t tokTerminate(register picodata_ProcessingUnit this)
1482 {
1483     return PICO_OK;
1484 }
1485 
1486 static picodata_step_result_t tokStep(register picodata_ProcessingUnit this, picoos_int16 mode, picoos_uint16 * numBytesOutput);
1487 
tokSubObjDeallocate(register picodata_ProcessingUnit this,picoos_MemoryManager mm)1488 static pico_status_t tokSubObjDeallocate(register picodata_ProcessingUnit this,
1489         picoos_MemoryManager mm)
1490 {
1491 
1492     if (NULL != this) {
1493         picoos_deallocate(this->common->mm, (void *) &this->subObj);
1494     }
1495     mm = mm;        /* avoid warning "var not used in this function"*/
1496     return PICO_OK;
1497 }
1498 
picotok_newTokenizeUnit(picoos_MemoryManager mm,picoos_Common common,picodata_CharBuffer cbIn,picodata_CharBuffer cbOut,picorsrc_Voice voice)1499 picodata_ProcessingUnit picotok_newTokenizeUnit(picoos_MemoryManager mm, picoos_Common common,
1500         picodata_CharBuffer cbIn, picodata_CharBuffer cbOut,
1501         picorsrc_Voice voice)
1502 {
1503     tok_subobj_t * tok;
1504     picodata_ProcessingUnit this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);
1505     if (this == NULL) {
1506         return NULL;
1507     }
1508     this->initialize = tokInitialize;
1509     PICODBG_DEBUG(("set this->step to tokStep"));
1510     this->step = tokStep;
1511     this->terminate = tokTerminate;
1512     this->subDeallocate = tokSubObjDeallocate;
1513     this->subObj = picoos_allocate(mm, sizeof(tok_subobj_t));
1514     if (this->subObj == NULL) {
1515         picoos_deallocate(mm, (void *)&this);
1516         return NULL;
1517     }
1518     tok = (tok_subobj_t *) this->subObj;
1519     tok->transducer = picotrns_newSimpleTransducer(mm, common, 10*(PICOTRNS_MAX_NUM_POSSYM+2));
1520     if (NULL == tok->transducer) {
1521         tokSubObjDeallocate(this,mm);
1522         picoos_deallocate(mm, (void *)&this);
1523         return NULL;
1524     }
1525     tokInitialize(this, PICO_RESET_FULL);
1526     return this;
1527 }
1528 
1529 /**
1530  * fill up internal buffer, try to locate token, write token to output
1531  */
tokStep(register picodata_ProcessingUnit this,picoos_int16 mode,picoos_uint16 * numBytesOutput)1532 picodata_step_result_t tokStep(register picodata_ProcessingUnit this,
1533         picoos_int16 mode, picoos_uint16 * numBytesOutput)
1534 {
1535     register tok_subobj_t * tok;
1536 
1537     if (NULL == this || NULL == this->subObj) {
1538         return PICODATA_PU_ERROR;
1539     }
1540     tok = (tok_subobj_t *) this->subObj;
1541 
1542     mode = mode;        /* avoid warning "var not used in this function"*/
1543 
1544     *numBytesOutput = 0;
1545     while (1) { /* exit via return */
1546         picoos_int16 ch;
1547 
1548         if ((tok->outWritePos - tok->outReadPos) > 0) {
1549             if (picodata_cbPutItem(this->cbOut, &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos, numBytesOutput) == PICO_OK) {
1550                 PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
1551                     (picoos_uint8 *)"tok:", &tok->outBuf[tok->outReadPos], tok->outWritePos - tok->outReadPos);
1552                 tok->outReadPos += *numBytesOutput;
1553                 if (tok->outWritePos == tok->outReadPos) {
1554                     tok->outWritePos = 0;
1555                     tok->outReadPos = 0;
1556                 }
1557             }
1558             else {
1559                 return PICODATA_PU_OUT_FULL;
1560             }
1561 
1562         }
1563         else if (PICO_EOF != (ch = picodata_cbGetCh(this->cbIn))) {
1564             PICODBG_DEBUG(("read in %c", (picoos_char) ch));
1565             tok_treatChar(this, tok, (picoos_uchar) ch, /*markupHandling*/TRUE);
1566         }
1567         else {
1568             return PICODATA_PU_IDLE;
1569         }
1570     }
1571 }
1572 
1573 #ifdef __cplusplus
1574 }
1575 #endif
1576 
1577 /* end */
1578