1 /*
2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 /**
17  * @file picotok.h
18  *
19  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
20  * All rights reserved.
21  *
22  * History:
23  * - 2009-04-20 -- initial version
24  *
25  */
26 
27 
28 /** @addtogroup picotok
29 Notation: an item with itemtype, iteminfo1, iteminfo2 and content is written as
30 TYPE(INFO1,INFO2)content in the following.
31 
32 input
33 =====
34 
35 - UTF8 text
36 
37 limitations: currently only German umlauts are supported in addition to ASCII
38 
39 
40 minimal input size (before processing starts)
41 ==================
42 
43 processing (i.e. tokenization) starts when
44 - 'PICO_EOF' char received (which happens whenever the cbIn buffer is empty)
45 - tok-internal buffer is full
46 
47 
48 items output
49 ============
50 
51 processing the character stream can result in one of the
52 following items:
53 -> WORDGRAPH(NA,NA)graph    <- mapped to lower case; incl. 1-2 digit numbers (0-99)
54 -> OTHER(NA,NA)string       <- skip or spell
55 -> PUNC(PUNCtype,PUNCsubtype)
56 -> CMD(CMDtype,CMDsubtype)args
57 
58 with
59 - PUNCtype %d
60     PICODATA_ITEMINFO1_PUNC_SENTEND
61     PICODATA_ITEMINFO1_PUNC_PHRASEEND
62 - PUNCsubtype %d
63     PICODATA_ITEMINFO2_PUNC_SENT_T
64     PICODATA_ITEMINFO2_PUNC_SENT_Q
65     PICODATA_ITEMINFO2_PUNC_SENT_E
66     PICODATA_ITEMINFO2_PUNC_PHRASE
67     (used later: PICODATA_ITEMINFO2_PUNC_PHRASE_FORCED)
68 - CMDtype %d
69     PICODATA_ITEMINFO1_CMD_FLUSH    (no args)
70     ? PICODATA_ITEMINFO1_CMD_PLAY ? (not yet)
71 - CMDsubtype %d
72     PICODATA_ITEMINFO2_NA
73     ? PICODATA_ITEMINFO2_CMD_PLAY_G2P ? (not yet)
74 - graph, len>0, utf8 graphemes, %s
75 - string, len>0, can be any string with printable ascii characters, %s
76 
77 
78 other limitations
79 =================
80 
81 - item size: header plus len=256 (valid for Pico in general)
82  */
83 
84 
85 #ifndef PICOTOK_H_
86 #define PICOTOK_H_
87 
88 #include "picoos.h"
89 #include "picodata.h"
90 #include "picorsrc.h"
91 
/* When this header is compiled as C++, give the declarations that follow
 * C linkage by opening an extern "C" block. */
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
/* The "#if 0" section below is never compiled. Its lone closing brace pairs
 * up textually with the brace opened by extern "C" above.
 * NOTE(review): presumably this is there so that brace-matching editors or
 * indenters do not indent the declarations that follow -- the intent is not
 * stated in this file. */
95 #if 0
96 }
97 #endif
98 
99 
100 
/**
 * Creates a tokenizer processing unit.
 *
 * According to the module description at the top of this header, the
 * tokenizer consumes UTF8 text and produces WORDGRAPH, OTHER, PUNC and CMD
 * items.
 *
 * @param mm     memory manager handle
 *               (NOTE(review): presumably used to allocate the unit --
 *               confirm in the implementation)
 * @param common shared picoos_Common handle
 * @param cbIn   character buffer the tokenizer takes its input from
 * @param cbOut  character buffer for the tokenizer's output
 *               (NOTE(review): direction inferred from the In/Out naming --
 *               confirm in the implementation)
 * @param voice  voice resource handle
 *               (NOTE(review): how the tokenizer uses it is not visible in
 *               this header)
 * @return the newly created processing unit.
 *         NOTE(review): the failure behaviour (e.g. a NULL return) and the
 *         ownership of the returned unit are not visible in this header --
 *         confirm in the implementation.
 */
101 picodata_ProcessingUnit picotok_newTokenizeUnit(
102         picoos_MemoryManager mm,
103         picoos_Common common,
104         picodata_CharBuffer cbIn,
105         picodata_CharBuffer cbOut,
106         picorsrc_Voice voice);
107 
/**
 * Size constant for the tokenizer's output buffer, as the name indicates.
 * NOTE(review): the value equals the "len=256" item-size limit mentioned in
 * the module description; the unit (presumably bytes) and the buffer this
 * constant dimensions are not visible in this header -- confirm in the
 * implementation.
 */
108 #define PICOTOK_OUTBUF_SIZE 256
109 
110 #ifdef __cplusplus
111 }
112 #endif
113 
114 
115 #endif /*PICOTOK_H_*/
116