1 /*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /**
17 * @file picowa.c
18 *
19 * word analysis PU - lexicon lookup and POS prediction
20 *
21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
22 * All rights reserved.
23 *
24 * History:
25 * - 2009-04-20 -- initial version
26 *
27 */
28
29 #include "picoos.h"
30 #include "picodbg.h"
31 #include "picodata.h"
32 #include "picowa.h"
33 #include "picoklex.h"
34 #include "picokdt.h"
35 #include "picoktab.h"
36
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 #if 0
41 }
42 #endif
43
/* PU waStep states; waStep() dispatches on wa_subobj_t.procState, which
 * holds one of these values */
#define WA_STEPSTATE_COLLECT 0 /* get one item from cbIn into inBuf */
#define WA_STEPSTATE_PROCESS 1 /* process item in inBuf, result to outBuf */
#define WA_STEPSTATE_FEED 2 /* put item in outBuf to cbOut */
48
49
/* subobject    : WordAnaUnit
 * shortcut     : wa
 * context size : one item
 *
 * Private state of the word analysis PU. It is allocated in
 * picowa_newWordAnaUnit(), stored in the processing unit's subObj pointer
 * and set up by waInitialize().
 */
typedef struct wa_subobj {
    picoos_uint8 procState; /* for next processing step decision,
                               one of WA_STEPSTATE_* */

    /* one item only */
    picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */
    picoos_uint16 inBufSize; /* actually allocated size
                                (set to PICOWA_MAXITEMSIZE in waInitialize) */
    picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */

    picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */
    picoos_uint16 outBufSize; /* actually allocated size
                                 (set to PICOWA_MAXITEMSIZE in waInitialize) */
    picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */

    /* lex knowledge base (main lexicon, looked up after the user lexica) */
    picoklex_Lex lex;

    /* ulex knowledge bases; only the first numUlex entries of ulex[] are
       valid, they are filled without gaps in waInitialize */
    picoos_uint8 numUlex;
    picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX];

    /* tab knowledge base (used to map several POS to one POS group) */
    picoktab_Pos tabpos;

    /* dtposp knowledge base (decision tree used for POS prediction) */
    picokdt_DtPosP dtposp;
} wa_subobj_t;
79
80
waInitialize(register picodata_ProcessingUnit this,picoos_int32 resetMode)81 static pico_status_t waInitialize(register picodata_ProcessingUnit this, picoos_int32 resetMode) {
82 picoos_uint8 i;
83 picoklex_Lex ulex;
84 wa_subobj_t * wa;
85
86 picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY;
87
88 PICODBG_DEBUG(("calling"));
89
90 if (NULL == this || NULL == this->subObj) {
91 return (picodata_step_result_t) picoos_emRaiseException(this->common->em,
92 PICO_ERR_NULLPTR_ACCESS, NULL, NULL);
93 }
94 wa = (wa_subobj_t *) this->subObj;
95 wa->procState = WA_STEPSTATE_COLLECT;
96 wa->inBufSize = PICOWA_MAXITEMSIZE;
97 wa->inLen = 0;
98 wa->outBufSize = PICOWA_MAXITEMSIZE;
99 wa->outLen = 0;
100
101 if (resetMode == PICO_RESET_SOFT) {
102 /*following initializations needed only at startup or after a full reset*/
103 return PICO_OK;
104 }
105 /* kb lex */
106 wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]);
107 if (wa->lex == NULL) {
108 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
109 NULL, NULL);
110 }
111 PICODBG_DEBUG(("got lex"));
112
113 /* kb ulex[] */
114 wa->numUlex = 0;
115 for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) {
116 ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]);
117 if (NULL != ulex) {
118 wa->ulex[wa->numUlex++] = ulex;
119 }
120 }
121 PICODBG_DEBUG(("got %i user lexica", wa->numUlex));
122
123 /* kb tabpos */
124 wa->tabpos =
125 picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]);
126 if (wa->tabpos == NULL) {
127 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
128 NULL, NULL);
129 }
130 PICODBG_DEBUG(("got tabpos"));
131
132 /* kb dtposp */
133 wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]);
134 if (wa->dtposp == NULL) {
135 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
136 NULL, NULL);
137 }
138 PICODBG_DEBUG(("got dtposp"));
139 return PICO_OK;
140 }
141
/* forward declaration: waStep is defined at the end of this file but is
   already referenced in picowa_newWordAnaUnit */
static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
                                     picoos_int16 mode,
                                     picoos_uint16 *numBytesOutput);
145
waTerminate(register picodata_ProcessingUnit this)146 static pico_status_t waTerminate(register picodata_ProcessingUnit this) {
147 return PICO_OK;
148 }
149
waSubObjDeallocate(register picodata_ProcessingUnit this,picoos_MemoryManager mm)150 static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this,
151 picoos_MemoryManager mm) {
152 if (NULL != this) {
153 picoos_deallocate(this->common->mm, (void *) &this->subObj);
154 }
155 mm = mm; /* avoid warning "var not used in this function"*/
156 return PICO_OK;
157 }
158
159
picowa_newWordAnaUnit(picoos_MemoryManager mm,picoos_Common common,picodata_CharBuffer cbIn,picodata_CharBuffer cbOut,picorsrc_Voice voice)160 picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm,
161 picoos_Common common,
162 picodata_CharBuffer cbIn,
163 picodata_CharBuffer cbOut,
164 picorsrc_Voice voice) {
165 picodata_ProcessingUnit this;
166
167 this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);
168 if (this == NULL) {
169 return NULL;
170 }
171
172 this->initialize = waInitialize;
173 PICODBG_DEBUG(("set this->step to waStep"));
174 this->step = waStep;
175 this->terminate = waTerminate;
176 this->subDeallocate = waSubObjDeallocate;
177 this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t));
178 if (this->subObj == NULL) {
179 picoos_deallocate(mm, (void *)&this);
180 picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
181 return NULL;
182 }
183
184 waInitialize(this, PICO_RESET_FULL);
185 return this;
186 }
187
188 /* ***********************************************************************/
189 /* WORDGRAPH proc functions */
190 /* ***********************************************************************/
191
waClassifyPos(register picodata_ProcessingUnit this,register wa_subobj_t * wa,const picoos_uint8 * graph,const picoos_uint16 graphlen)192 static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this,
193 register wa_subobj_t *wa,
194 const picoos_uint8 *graph,
195 const picoos_uint16 graphlen) {
196 picokdt_classify_result_t dtres;
197 picoos_uint8 specchar;
198 picoos_uint16 i;
199
200 PICODBG_DEBUG(("graphlen %d", graphlen));
201
202 /* check existence of special char (e.g. hyphen) in graph:
203 for now, check existence of hard-coded ascii hyphen,
204 ie. preproc needs to match all UTF8 hyphens to the ascii
205 hyphen. */
206 /* @todo : consider specifying special char(s) in lingware. */
207 specchar = FALSE;
208 i = 0;
209 while ((i < graphlen) && (!specchar)) {
210 if (graph[i++] == '-') {
211 specchar = TRUE;
212 }
213 }
214
215 /* construct input vector, which is set in dtposp */
216 if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen, specchar)) {
217 /* error constructing invec */
218 PICODBG_WARN(("problem with invec"));
219 picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL);
220 return PICODATA_ITEMINFO1_ERR;
221 }
222
223 /* classify */
224 if (!picokdt_dtPosPclassify(wa->dtposp)) {
225 /* error doing classification */
226 PICODBG_WARN(("problem classifying"));
227 picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
228 NULL, NULL);
229 return PICODATA_ITEMINFO1_ERR;
230 }
231
232 /* decompose */
233 if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) {
234 /* error decomposing */
235 PICODBG_WARN(("problem decomposing"));
236 picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR,
237 NULL, NULL);
238 return PICODATA_ITEMINFO1_ERR;
239 }
240
241 if (dtres.set) {
242 PICODBG_DEBUG(("class %d", dtres.class));
243 return (picoos_uint8)dtres.class;
244 } else {
245 PICODBG_WARN(("result not set"));
246 picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
247 NULL, NULL);
248 return PICODATA_ITEMINFO1_ERR;
249 }
250 }
251
252
waProcessWordgraph(register picodata_ProcessingUnit this,register wa_subobj_t * wa,picodata_itemhead_t * head,const picoos_uint8 * content)253 static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this,
254 register wa_subobj_t *wa /*inout*/,
255 picodata_itemhead_t *head /*inout*/,
256 const picoos_uint8 *content) {
257 pico_status_t status;
258 picoklex_lexl_result_t lexres;
259 picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB];
260 picoos_uint8 i;
261 picoos_uint8 foundIndex;
262 picoos_bool found;
263
264
265 PICODBG_DEBUG(("type %c, len %d", head->type, head->len));
266
267 /* do lookup
268 if no entry found:
269 do POS prediction: -> WORDGRAPH(POSes,NA)graph
270 else:
271 if incl-phone:
272 N entries possible -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN
273 (N in {1,...,PICOKLEX_MAX_NRRES}, now up to 4)
274 else:
275 no phone, one entry -> WORDGRAPH(POS,NA)graph
276 */
277
278 found = FALSE;
279 i = 0;
280 while (!found && (i < wa->numUlex)) {
281 found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres);
282 i++;
283 }
284 /* note that if found, i will be incremented nevertheless, so i >= 1 */
285 if (found) {
286 foundIndex = i;
287 } else {
288 foundIndex = 0;
289 }
290 if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) {
291 /* no lex entry found, WORDGRAPH(POS,NA)graph */
292 if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
293 wa->outBuf, wa->outBufSize,
294 &wa->outLen)) {
295 wa->inLen = 0;
296 /* predict and modify pos in info1 */
297 if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
298 waClassifyPos(this, wa, content, head->len))) {
299 return picoos_emRaiseException(this->common->em,
300 PICO_EXC_BUF_OVERFLOW,NULL,NULL);
301 }
302 }
303
304 } else { /* at least one entry found */
305 PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex));
306 if (lexres.phonfound) { /* incl. ind-phone and possibly multi-ent. */
307 if (lexres.nrres > PICOKLEX_MAX_NRRES) {
308 /* not possible with system lexicon, needs to be
309 ensured for user lex too */
310 picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK,NULL,
311 (picoos_char *)"using %d lexicon lookup results",
312 PICOKLEX_MAX_NRRES);
313 lexres.nrres = PICOKLEX_MAX_NRRES;
314 }
315 head->type = PICODATA_ITEM_WORDINDEX;
316 if (lexres.nrres == 1) {
317 head->info1 = lexres.posind[0];
318 } else {
319 /* more than one result, POSgroup info needs to be
320 determined for later POS disambiguation */
321 for (i = 0; i < lexres.nrres; i++) {
322 posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE];
323 }
324 head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf,
325 lexres.nrres);
326 }
327 head->info2 = foundIndex;
328 head->len = lexres.posindlen;
329 if ((status = picodata_put_itemparts(head, lexres.posind,
330 lexres.posindlen,
331 wa->outBuf, wa->outBufSize,
332 &wa->outLen)) == PICO_OK) {
333 wa->inLen = 0;
334 } else {
335 return picoos_emRaiseException(this->common->em, status,
336 NULL, NULL);
337 }
338
339 } else { /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */
340 if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
341 wa->outBuf, wa->outBufSize,
342 &wa->outLen)) {
343 wa->inLen = 0;
344 /* set lex pos in info1 */
345 if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
346 lexres.posind[0])) {
347 return picoos_emRaiseException(this->common->em,
348 PICO_EXC_BUF_OVERFLOW,
349 NULL, NULL);
350 }
351 }
352 }
353 }
354 return PICO_OK;
355 }
356
357
358 /* ***********************************************************************/
359 /* waStep function */
360 /* ***********************************************************************/
361
/*
   collect into internal buffer, process, and then feed to output buffer

   columns: ext in  = external input buffer  (cbIn)
            IN      = internal input buffer  (inBuf)
            OUT     = internal output buffer (outBuf)
            ext out = external output buffer (cbOut)

   init state: COLLECT
                            ext            ext
   state transitions:       in   IN   OUT  out
   COLLECT | getOneItem  -> -1   +1    0    0  | (ATOMIC) -> PROCESS (got item)
   COLLECT | getOneItem  ->  0    0    0    0  | IDLE                (got no item)

   PROCESS | procOneItem ->  0   -1   +1    0  | (ATOMIC) -> FEED    (proc'ed item)
   PROCESS | procOneItem ->  0   -1    0    0  | BUSY     -> COLLECT (item skipped)

   FEED    | putOneItem  ->  0    0   -1   +1  | BUSY     -> COLLECT (put item)
   FEED    | putOneItem  ->  0    0    1    0  | OUT_FULL            (put no item)
*/
376
waStep(register picodata_ProcessingUnit this,picoos_int16 mode,picoos_uint16 * numBytesOutput)377 static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
378 picoos_int16 mode,
379 picoos_uint16 * numBytesOutput) {
380 register wa_subobj_t *wa;
381 pico_status_t rv = PICO_OK;
382
383 if (NULL == this || NULL == this->subObj) {
384 return PICODATA_PU_ERROR;
385 }
386 wa = (wa_subobj_t *) this->subObj;
387 mode = mode; /* avoid warning "var not used in this function"*/
388 *numBytesOutput = 0;
389 while (1) { /* exit via return */
390 PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d",
391 wa->procState, wa->inLen, wa->outLen));
392
393 switch (wa->procState) {
394 /* collect state: get item from charBuf and store in
395 * internal inBuf
396 */
397 case WA_STEPSTATE_COLLECT:
398 if (wa->inLen == 0) { /* is input buffer empty? */
399 picoos_uint16 blen;
400 /* try to get one item */
401 rv = picodata_cbGetItem(this->cbIn, wa->inBuf,
402 wa->inBufSize, &blen);
403 PICODBG_DEBUG(("after getting item, status: %d", rv));
404 if (PICO_OK == rv) {
405 /* we now have one item */
406 wa->inLen = blen;
407 wa->procState = WA_STEPSTATE_PROCESS;
408 /* uncomment next line to split into two steps */
409 /* return PICODATA_PU_ATOMIC; */
410 } else if (PICO_EOF == rv) {
411 /* there was no item in the char buffer */
412 return PICODATA_PU_IDLE;
413 } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
414 || (PICO_EXC_BUF_OVERFLOW == rv)) {
415 PICODBG_ERROR(("problem getting item"));
416 picoos_emRaiseException(this->common->em, rv,
417 NULL, NULL);
418 return PICODATA_PU_ERROR;
419 } else {
420 PICODBG_ERROR(("problem getting item, unhandled"));
421 picoos_emRaiseException(this->common->em, rv,
422 NULL, NULL);
423 return PICODATA_PU_ERROR;
424 }
425 } else { /* there already is an item in the input buffer */
426 PICODBG_WARN(("item already in input buffer"));
427 picoos_emRaiseWarning(this->common->em,
428 PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
429 wa->procState = WA_STEPSTATE_PROCESS;
430 /* uncomment next to split into two steps */
431 /* return PICODATA_PU_ATOMIC; */
432 }
433 break;
434
435
436 /* process state: process item in internal inBuf and put
437 * result in internal outBuf
438 */
439 case WA_STEPSTATE_PROCESS:
440
441 /* ensure there is an item in inBuf and it is valid */
442 if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf,
443 wa->inLen)) {
444 picodata_itemhead_t ihead;
445 picoos_uint8 *icontent;
446 pico_status_t rvP = PICO_OK;
447
448 rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead,
449 &icontent);
450 if (PICO_OK == rv) {
451
452 switch (ihead.type) {
453 case PICODATA_ITEM_WORDGRAPH:
454
455 if (0 < ihead.len) {
456 rvP = waProcessWordgraph(this, wa, &ihead,
457 icontent);
458 } else {
459 /* else ignore empty WORDGRAPH */
460 wa->inLen = 0;
461 wa->procState = WA_STEPSTATE_COLLECT;
462 return PICODATA_PU_BUSY;
463 }
464 break;
465 case PICODATA_ITEM_OTHER:
466 /* skip item */
467 rvP = PICO_WARN_PU_DISCARD_BUF;
468 break;
469 default:
470 /* copy item unmodified */
471 rvP = picodata_copy_item(wa->inBuf,
472 wa->inLen, wa->outBuf,
473 wa->outBufSize, &wa->outLen);
474 break;
475 }
476
477 if (PICO_OK == rvP) {
478 wa->inLen = 0;
479 wa->procState = WA_STEPSTATE_FEED;
480 /* uncomment next to split into two steps */
481 /* return PICODATA_PU_ATOMIC; */
482 } else if (PICO_WARN_PU_DISCARD_BUF == rvP) {
483 /* discard input buffer and get a new item */
484 PICODBG_INFO(("skipping OTHER item"));
485 /* picoos_emRaiseWarning(this->common->em,
486 PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
487 */
488 wa->inLen = 0;
489 wa->procState = WA_STEPSTATE_COLLECT;
490 return PICODATA_PU_BUSY;
491 } else {
492 /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf
493 PICO_ERR_OTHER <- no valid item in inbuf
494 or return from processWordgraph
495 */
496 PICODBG_ERROR(("problem processing item", rvP));
497 picoos_emRaiseException(this->common->em, rvP,
498 NULL, NULL);
499 return PICODATA_PU_ERROR;
500 }
501
502 } else { /* could not get iteminfo */
503 /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf
504 PICO_ERR_OTHER <- no valid item in inbuf
505 */
506 PICODBG_ERROR(("problem getting item info, "
507 "discard buffer content"));
508 wa->inLen = 0;
509 wa->procState = WA_STEPSTATE_COLLECT;
510 picoos_emRaiseException(this->common->em, rv,
511 NULL, NULL);
512 return PICODATA_PU_ERROR;
513 }
514
515 } else if (wa->inLen == 0) { /* no item in inBuf */
516 PICODBG_INFO(("no item in inBuf"));
517 /* wa->inLen = 0;*/
518 wa->procState = WA_STEPSTATE_COLLECT;
519 return PICODATA_PU_BUSY;
520
521 } else { /* no valid item in inBuf */
522 /* bad state/item, discard buffer content */
523 PICODBG_WARN(("no valid item, discard buffer content"));
524 picoos_emRaiseWarning(this->common->em,
525 PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
526 picoos_emRaiseWarning(this->common->em,
527 PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
528 wa->inLen = 0;
529 wa->procState = WA_STEPSTATE_COLLECT;
530 return PICODATA_PU_BUSY;
531 }
532 break;
533
534
535 /* feed state: copy item in internal outBuf to output charBuf */
536 case WA_STEPSTATE_FEED:
537
538 /* check that item fits in cb should not be needed */
539 rv = picodata_cbPutItem(this->cbOut, wa->outBuf,
540 wa->outLen, numBytesOutput);
541
542 PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
543 (picoos_uint8 *)"wana: ", wa->outBuf,
544 wa->outLen);
545
546 PICODBG_DEBUG(("put item, status: %d", rv));
547 if (PICO_OK == rv) {
548 wa->outLen = 0;
549 wa->procState = WA_STEPSTATE_COLLECT;
550 return PICODATA_PU_BUSY;
551 } else if (PICO_EXC_BUF_OVERFLOW == rv) {
552 PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL"));
553 return PICODATA_PU_OUT_FULL;
554 } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
555 || (PICO_ERR_OTHER == rv)) {
556 PICODBG_WARN(("feeding problem, discarding item"));
557 wa->outLen = 0;
558 wa->procState = WA_STEPSTATE_COLLECT;
559 picoos_emRaiseWarning(this->common->em, rv, NULL,NULL);
560 return PICODATA_PU_BUSY;
561 }
562 break;
563
564 default:
565 break;
566
567 } /* switch */
568
569 } /* while */
570
571 /* should be never reached */
572 PICODBG_ERROR(("reached end of function"));
573 picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL);
574 return PICODATA_PU_ERROR;
575 }
576
577 #ifdef __cplusplus
578 }
579 #endif
580
581
582 /* end */
583