1 /*
2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 /**
17  * @file picoklex.c
18  *
19  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
20  * All rights reserved.
21  *
22  * History:
23  * - 2009-04-20 -- initial version
24  *
25  */
26 #include "picoos.h"
27 #include "picodbg.h"
28 #include "picodata.h"
29 #include "picoknow.h"
30 #include "picoklex.h"
31 
32 #ifdef __cplusplus
33 extern "C" {
34 #endif
35 #if 0
36 }
37 #endif
38 
39 /* ************************************************************/
40 /* lexicon */
41 /* ************************************************************/
42 
43 /**
44  * @addtogroup picolex
45  *
46   overview:
47   - lex consists of optional searchindex and a non-empty list of lexblocks
48   - lexblocks are fixed size, at the start of a block there is also the
49     start of an entry
50   - using the searchindex a unambiguous lexblock can be determined which
51     contains the entry (or there is no entry)
52   - one lex entry has POS GRAPH PHON, all mandatory, but
53     - PHON can be empty string -> no pronunciation in the resulting TTS output
54     - PHON can be :G2P -> use G2P later to add pronunciation
55   - (POS,GRAPH) is a uniq key (only one entry allowed)
56   - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
57     differing POS and differing PHON possible)
58     - for one graph we can have two or three solutions from the lex
59        which all need to be passed on the the next PU
60     - in this case GRAPH, POS, and PHON all must be available in lex
61 
62   sizing:
63     - 3 bytes entry index -> 16MB addressable
64     - 2 bytes searchindex nr -> 64K blocks possible
65     - 5 bytes per searchindex entry
66       - 3 bytes for graph-prefix
67       - 2 bytes blockadr in searchindex -> 64K blocks possible
68     - lexblock size 512B:
69       - 32M possible
70       - with ~20 bytes per entry
71         -> max. average of ~26 entries to be searched per lookup
72     - overhead of ~10 bytes per block to sync with
73       block boundaries
74     - examples:
75       - 500KB lex -> 1000 blocks,
76         1000 entries in searchindex, ~25.6K lex-entries,
77         - ~5KB searchindex
78            ~10KB overhead for block sync
79       - 100KB lex -> 200 blocks,
80         200 entries in searchindex, ~5.1K lex-entries,
81         - ~1KB searchindex
82            ~2KB overhead for block sync
83 
84   pil-file: lexicon knowledge base in binary form
85 
86     lex-kb = content
87 
88     content = searchindex {lexblock}1:NRBLOCKS2
89 
90     lexblock = {lexentry}1:        (lexblock size is fixed 512Bytes)
91 
92     searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
93 
94     lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
95                LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
96 
97     - special cases:
98       - PHON is empty string (no pronunciation in the resulting TTS output):
99         lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  2 POS1
100       - PHON can be :G2P -> use G2P later to add pronunciation:
101         lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  3 POS1 <reserved-phon-val=5>
102     - multi-byte values always little endian
103 */
104 
105 
106 /* ************************************************************/
107 /* lexicon data defines */
108 /* may not be changed with current implementation */
109 /* ************************************************************/
110 
111 /* nr bytes of nrblocks info */
112 #define PICOKLEX_LEX_NRBLOCKS_SIZE 2
113 
114 /* search index entry: - nr graphs
115                        - nr bytes of block index
116                        - nr bytes per entry, NRGRAPHS*INDSIZE */
117 #define PICOKLEX_LEX_SIE_NRGRAPHS  3
118 #define PICOKLEX_LEX_SIE_INDSIZE   2
119 #define PICOKLEX_LEX_SIE_SIZE      5
120 
121 /* nr of bytes per lexblock */
122 #define PICOKLEX_LEXBLOCK_SIZE   512
123 
124 
125 /* reserved values in klex to indicate :G2P needed for a lexentry */
126 #define PICOKLEX_NEEDS_G2P   5
127 
128 
129 /* ************************************************************/
130 /* lexicon type and loading */
131 /* ************************************************************/
132 
133 /** object       : LexKnowledgeBase
134  *  shortcut     : klex
135  *  derived from : picoknow_KnowledgeBase
136  */
137 
138 typedef struct klex_subobj *klex_SubObj;
139 
140 typedef struct klex_subobj
141 {
142     picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
143     picoos_uint8 *searchind;
144     picoos_uint8 *lexblocks;
145 } klex_subobj_t;
146 
147 
klexInitialize(register picoknow_KnowledgeBase this,picoos_Common common)148 static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
149                                     picoos_Common common)
150 {
151     picoos_uint32 curpos = 0;
152     klex_subobj_t *klex;
153 
154     PICODBG_DEBUG(("start"));
155 
156     /* check whether (this->size != 0) done before calling this function */
157 
158     if (NULL == this || NULL == this->subObj) {
159         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
160                                        NULL, NULL);
161     }
162     klex = (klex_subobj_t *) this->subObj;
163 
164     if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos,
165                                              &(klex->nrblocks))) {
166         if (klex->nrblocks > 0) {
167             PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
168             klex->searchind = this->base + curpos;
169         } else {
170             klex->searchind = NULL;
171         }
172         klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
173                              (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
174         return PICO_OK;
175     } else {
176         return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
177                                        NULL, NULL);
178     }
179 }
180 
181 
klexSubObjDeallocate(register picoknow_KnowledgeBase this,picoos_MemoryManager mm)182 static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
183                                           picoos_MemoryManager mm)
184 {
185     if (NULL != this) {
186         picoos_deallocate(mm, (void *) &this->subObj);
187     }
188     return PICO_OK;
189 }
190 
191 
192 /* we don't offer a specialized constructor for a LexKnowledgeBase but
193  * instead a "specializer" of an allready existing generic
194  * picoknow_KnowledgeBase */
195 
picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,picoos_Common common)196 pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
197                                                   picoos_Common common)
198 {
199     if (NULL == this) {
200         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
201                                        NULL, NULL);
202     }
203     if (this->size > 0) {
204         this->subDeallocate = klexSubObjDeallocate;
205         this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
206         if (NULL == this->subObj) {
207             return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
208                                            NULL, NULL);
209         }
210         return klexInitialize(this, common);
211     } else {
212         /* some dummy klex */
213         return PICO_OK;
214     }
215 }
216 
217 /* for now we don't need to do anything special for the main lex */
218 /*
219 pico_status_t picoklex_specializeMainLexKnowledgeBase(
220         picoknow_KnowledgeBase this,
221         picoos_Common common)
222 {
223     return picoklex_specializeLexKnowledgeBase(this,common);
224 }
225 */
226 
227 
228 /* ************************************************************/
229 /* lexicon getLex */
230 /* ************************************************************/
231 
picoklex_getLex(picoknow_KnowledgeBase this)232 picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
233 {
234     if (NULL == this) {
235         return NULL;
236     } else {
237         return (picoklex_Lex) this->subObj;
238     }
239 }
240 
241 
242 /* ************************************************************/
243 /* functions on searchindex */
244 /* ************************************************************/
245 
246 
klex_getSearchIndexVal(const klex_SubObj this,picoos_uint16 index)247 static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
248                                             picoos_uint16 index)
249 {
250     picoos_uint32 pos, val;
251     pos = index * PICOKLEX_LEX_SIE_SIZE;
252     val = this->searchind[pos];
253     val = (val << 8) + this->searchind[pos + 1];
254     val = (val << 8) + this->searchind[pos + 2];
255     return val;
256 }
257 
258 
259 /* Determine first lexblock containing entries for specified
260    grapheme. */
261 
klex_getLexblockNr(const klex_SubObj this,const picoos_uint8 * graphsi)262 static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
263                                         const picoos_uint8 *graphsi) {
264     /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */
265     picoos_int32 low, mid, high;
266     picoos_uint32 searchval, indval;
267 
268     /* PICOKLEX_LEX_SIE_NRGRAPHS */
269 
270     /* convert graph-prefix to number with 'lexicographic' ordering */
271     searchval = graphsi[0];
272     searchval = (searchval << 8) + graphsi[1];
273     searchval = (searchval << 8) + graphsi[2];
274 
275     low = 0;
276     high = this->nrblocks;
277 
278     /* do binary search */
279     while (low < high) {
280         mid = (low + high) / 2;
281         indval = klex_getSearchIndexVal(this, mid);
282         if (indval < searchval) {
283             low = mid + 1;
284         } else {
285             high = mid;
286         }
287     }
288     PICODBG_ASSERT(high == low);
289     /* low points to the first entry greater than or equal to searchval */
290 
291     if (low < this->nrblocks) {
292         indval = klex_getSearchIndexVal(this, low);
293         if (indval > searchval) {
294             low--;
295             /* if there are identical elements in the search index we have
296                to move to the first one */
297             if (low > 0) {
298                 indval = klex_getSearchIndexVal(this, low);
299                 while (indval == klex_getSearchIndexVal(this, low-1)) {
300                     low--;
301                 }
302             }
303         }
304     } else {
305         low = this->nrblocks - 1;
306     }
307 
308 #if defined(PICO_DEBUG)
309     {
310         picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
311         PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
312                        this->searchind[pos], this->searchind[pos + 1],
313                        this->searchind[pos + 2], low));
314     }
315 #endif
316 
317     return (picoos_uint16) low;
318 }
319 
320 
321 /* Determine number of adjacent lexblocks containing entries for
322    the same grapheme search prefix (identified by search index). */
323 
klex_getLexblockRange(const klex_SubObj this,picoos_uint16 index)324 static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
325                                            picoos_uint16 index)
326 {
327     picoos_uint16 count;
328     picoos_uint32 sval1, sval2;
329 
330     sval1 = klex_getSearchIndexVal(this, index);
331 
332 #if defined(PICO_DEBUG)
333     /* 'index' must point to first lexblock of its kind */
334     if (index > 0) {
335         sval2 = klex_getSearchIndexVal(this, index - 1);
336         PICODBG_ASSERT(sval1 != sval2);
337     }
338 #endif
339 
340     index++;
341     sval2 = klex_getSearchIndexVal(this, index);
342 
343     count = 1;
344     while (sval1 == sval2) {
345         count++;
346         index++;
347         sval2 = klex_getSearchIndexVal(this, index);
348     }
349 
350     return count;
351 }
352 
353 
354 /* ************************************************************/
355 /* functions on single lexblock */
356 /* ************************************************************/
357 
klex_lexMatch(picoos_uint8 * lexentry,const picoos_uint8 * graph,const picoos_uint16 graphlen)358 static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
359                                  const picoos_uint8 *graph,
360                                  const picoos_uint16 graphlen) {
361     picoos_uint8 i;
362     picoos_uint8 lexlen;
363     picoos_uint8 *lexgraph;
364 
365     lexlen = lexentry[0] - 1;
366     lexgraph = &(lexentry[1]);
367     for (i=0; (i<graphlen) && (i<lexlen); i++) {
368         PICODBG_TRACE(("%d|%d  graph|lex: %c|%c", graphlen, lexlen,
369                        graph[i], lexgraph[i]));
370         if (lexgraph[i] < graph[i]) {
371             return -1;
372         } else if (lexgraph[i] > graph[i]) {
373             return 1;
374         }
375     }
376     if (graphlen == lexlen) {
377         return 0;
378     } else if (lexlen < graphlen) {
379         return -1;
380     } else {
381         return 1;
382     }
383 }
384 
385 
klex_setLexResult(const picoos_uint8 * lexentry,const picoos_uint32 lexpos,picoklex_lexl_result_t * lexres)386 static void klex_setLexResult(const picoos_uint8 *lexentry,
387                               const picoos_uint32 lexpos,
388                               picoklex_lexl_result_t *lexres) {
389     picoos_uint8 i;
390 
391     /* check if :G2P */
392     if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) {
393         /* set pos */
394         lexres->posind[0] = lexentry[lexentry[0] + 1];
395         /* set rest */
396         lexres->phonfound = FALSE;
397         lexres->posindlen = 1;
398         lexres->nrres = 1;
399         PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
400     } else {
401         i = lexres->nrres * (PICOKLEX_POSIND_SIZE);
402         lexres->posindlen += PICOKLEX_POSIND_SIZE;
403         lexres->phonfound = TRUE;
404         /* set pos */
405         lexres->posind[i++] = lexentry[lexentry[0] + 1];
406         /* set ind, PICOKLEX_IND_SIZE */
407         lexres->posind[i++] = 0x000000ff & (lexpos);
408         lexres->posind[i++] = 0x000000ff & (lexpos >>  8);
409         lexres->posind[i]   = 0x000000ff & (lexpos >> 16);
410         lexres->nrres++;
411         PICODBG_DEBUG(("result %d", lexres->nrres));
412     }
413 }
414 
415 
klex_lexblockLookup(klex_SubObj this,const picoos_uint32 lexposStart,const picoos_uint32 lexposEnd,const picoos_uint8 * graph,const picoos_uint16 graphlen,picoklex_lexl_result_t * lexres)416 static void klex_lexblockLookup(klex_SubObj this,
417                                 const picoos_uint32 lexposStart,
418                                 const picoos_uint32 lexposEnd,
419                                 const picoos_uint8 *graph,
420                                 const picoos_uint16 graphlen,
421                                 picoklex_lexl_result_t *lexres) {
422     picoos_uint32 lexpos;
423     picoos_int8 rv;
424 
425     lexres->nrres = 0;
426 
427     lexpos = lexposStart;
428     rv = -1;
429     while ((rv < 0) && (lexpos < lexposEnd)) {
430 
431         rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);
432 
433         if (rv == 0) { /* found */
434             klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
435             if (lexres->phonfound) {
436                 /* look for more results, up to MAX_NRRES, don't even
437                    check if more results would be available */
438                 while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
439                        (lexpos < lexposEnd)) {
440                     lexpos += this->lexblocks[lexpos];
441                     lexpos += this->lexblocks[lexpos];
442                     /* if there are no more entries in this block, advance
443                        to next block by skipping all zeros */
444                     while ((this->lexblocks[lexpos] == 0) &&
445                            (lexpos < lexposEnd)) {
446                         lexpos++;
447                     }
448                     if (lexpos < lexposEnd) {
449                         if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
450                                           graphlen) == 0) {
451                             klex_setLexResult(&(this->lexblocks[lexpos]),
452                                               lexpos, lexres);
453                         } else {
454                             /* no more results, quit loop */
455                             lexpos = lexposEnd;
456                         }
457                     }
458                 }
459             } else {
460                 /* :G2P mark */
461             }
462         } else if (rv < 0) {
463             /* not found, goto next entry */
464             lexpos += this->lexblocks[lexpos];
465             lexpos += this->lexblocks[lexpos];
466             /* if there are no more entries in this block, advance
467                to next block by skipping all zeros */
468             while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) {
469                 lexpos++;
470             }
471         } else {
472             /* rv > 0, not found, won't show up later in block */
473         }
474     }
475 }
476 
477 
478 /* ************************************************************/
479 /* lexicon lookup functions */
480 /* ************************************************************/
481 
picoklex_lexLookup(const picoklex_Lex this,const picoos_uint8 * graph,const picoos_uint16 graphlen,picoklex_lexl_result_t * lexres)482 picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
483                                 const picoos_uint8 *graph,
484                                 const picoos_uint16 graphlen,
485                                 picoklex_lexl_result_t *lexres) {
486     picoos_uint16 lbnr, lbc;
487     picoos_uint32 lexposStart, lexposEnd;
488     picoos_uint8 i;
489     picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
490     klex_SubObj klex = (klex_SubObj) this;
491 
492     if (NULL == klex) {
493         PICODBG_ERROR(("no lexicon loaded"));
494         /* no exception here needed, already checked at initialization */
495         return FALSE;
496     }
497 
498     lexres->nrres = 0;
499     lexres->posindlen = 0;
500     lexres->phonfound = FALSE;
501 
502     for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
503         if (i < graphlen) {
504             tgraph[i] = graph[i];
505         } else {
506             tgraph[i] = '\0';
507         }
508     }
509     PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));
510 
511     if ((klex->nrblocks) == 0) {
512         /* no searchindex, no lexblock */
513         PICODBG_WARN(("no searchindex, no lexblock"));
514         return FALSE;
515     } else {
516         lbnr = klex_getLexblockNr(klex, tgraph);
517         PICODBG_ASSERT(lbnr < klex->nrblocks);
518         lbc = klex_getLexblockRange(klex, lbnr);
519         PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
520     }
521     PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));
522 
523     lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
524     lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;
525 
526     PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
527     klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
528     PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));
529 
530     return (lexres->nrres > 0);
531 }
532 
533 
picoklex_lexIndLookup(const picoklex_Lex this,const picoos_uint8 * ind,const picoos_uint8 indlen,picoos_uint8 * pos,picoos_uint8 ** phon,picoos_uint8 * phonlen)534 picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
535                                    const picoos_uint8 *ind,
536                                    const picoos_uint8 indlen,
537                                    picoos_uint8 *pos,
538                                    picoos_uint8 **phon,
539                                    picoos_uint8 *phonlen) {
540     picoos_uint32 pentry;
541     klex_SubObj klex = (klex_SubObj) this;
542 
543     /* check indlen */
544     if (indlen != PICOKLEX_IND_SIZE) {
545         return FALSE;
546     }
547 
548     /* PICOKLEX_IND_SIZE */
549     pentry = 0x000000ff & (ind[0]);
550     pentry |= ((picoos_uint32)(ind[1]) <<  8);
551     pentry |= ((picoos_uint32)(ind[2]) << 16);
552 
553     /* check ind if it is within lexblocks byte stream, if not, return FALSE */
554     if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) {
555         return FALSE;
556     }
557 
558     pentry += (klex->lexblocks[pentry]);
559     *phonlen = (klex->lexblocks[pentry++]) - 2;
560     *pos = klex->lexblocks[pentry++];
561     *phon = &(klex->lexblocks[pentry]);
562 
563     PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
564     return TRUE;
565 }
566 
567 #ifdef __cplusplus
568 }
569 #endif
570 
571 
572 /* end */
573