1/*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/**
17 * @file picoklex.c
18 *
19 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
20 * All rights reserved.
21 *
22 * History:
23 * - 2009-04-20 -- initial version
24 *
25 */
26#include "picoos.h"
27#include "picodbg.h"
28#include "picodata.h"
29#include "picoknow.h"
30#include "picoklex.h"
31
32#ifdef __cplusplus
33extern "C" {
34#endif
35#if 0
36}
37#endif
38
39/* ************************************************************/
40/* lexicon */
41/* ************************************************************/
42
43/**
44 * @addtogroup picolex
45 *
46  overview:
47  - lex consists of optional searchindex and a non-empty list of lexblocks
48  - lexblocks are fixed size, at the start of a block there is also the
49    start of an entry
50  - using the searchindex a unambiguous lexblock can be determined which
51    contains the entry (or there is no entry)
52  - one lex entry has POS GRAPH PHON, all mandatory, but
53    - PHON can be empty string -> no pronunciation in the resulting TTS output
54    - PHON can be :G2P -> use G2P later to add pronunciation
55  - (POS,GRAPH) is a uniq key (only one entry allowed)
56  - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
57    differing POS and differing PHON possible)
58    - for one graph we can have two or three solutions from the lex
59       which all need to be passed on the the next PU
60    - in this case GRAPH, POS, and PHON all must be available in lex
61
62  sizing:
63    - 3 bytes entry index -> 16MB addressable
64    - 2 bytes searchindex nr -> 64K blocks possible
65    - 5 bytes per searchindex entry
66      - 3 bytes for graph-prefix
67      - 2 bytes blockadr in searchindex -> 64K blocks possible
68    - lexblock size 512B:
69      - 32M possible
70      - with ~20 bytes per entry
71        -> max. average of ~26 entries to be searched per lookup
72    - overhead of ~10 bytes per block to sync with
73      block boundaries
74    - examples:
75      - 500KB lex -> 1000 blocks,
76        1000 entries in searchindex, ~25.6K lex-entries,
77        - ~5KB searchindex
78           ~10KB overhead for block sync
79      - 100KB lex -> 200 blocks,
80        200 entries in searchindex, ~5.1K lex-entries,
81        - ~1KB searchindex
82           ~2KB overhead for block sync
83
84  pil-file: lexicon knowledge base in binary form
85
86    lex-kb = content
87
88    content = searchindex {lexblock}1:NRBLOCKS2
89
90    lexblock = {lexentry}1:        (lexblock size is fixed 512Bytes)
91
92    searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
93
94    lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
95               LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
96
97    - special cases:
98      - PHON is empty string (no pronunciation in the resulting TTS output):
99        lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  2 POS1
100      - PHON can be :G2P -> use G2P later to add pronunciation:
101        lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  3 POS1 <reserved-phon-val=5>
102    - multi-byte values always little endian
103*/
104
105
106/* ************************************************************/
107/* lexicon data defines */
108/* may not be changed with current implementation */
109/* ************************************************************/
110
111/* nr bytes of nrblocks info */
112#define PICOKLEX_LEX_NRBLOCKS_SIZE 2
113
114/* search index entry: - nr graphs
115                       - nr bytes of block index
116                       - nr bytes per entry, NRGRAPHS*INDSIZE */
117#define PICOKLEX_LEX_SIE_NRGRAPHS  3
118#define PICOKLEX_LEX_SIE_INDSIZE   2
119#define PICOKLEX_LEX_SIE_SIZE      5
120
121/* nr of bytes per lexblock */
122#define PICOKLEX_LEXBLOCK_SIZE   512
123
124
125/* reserved values in klex to indicate :G2P needed for a lexentry */
126#define PICOKLEX_NEEDS_G2P   5
127
128
129/* ************************************************************/
130/* lexicon type and loading */
131/* ************************************************************/
132
133/** object       : LexKnowledgeBase
134 *  shortcut     : klex
135 *  derived from : picoknow_KnowledgeBase
136 */
137
138typedef struct klex_subobj *klex_SubObj;
139
140typedef struct klex_subobj
141{
142    picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
143    picoos_uint8 *searchind;
144    picoos_uint8 *lexblocks;
145} klex_subobj_t;
146
147
148static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
149                                    picoos_Common common)
150{
151    picoos_uint32 curpos = 0;
152    klex_subobj_t *klex;
153
154    PICODBG_DEBUG(("start"));
155
156    /* check whether (this->size != 0) done before calling this function */
157
158    if (NULL == this || NULL == this->subObj) {
159        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
160                                       NULL, NULL);
161    }
162    klex = (klex_subobj_t *) this->subObj;
163
164    if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos,
165                                             &(klex->nrblocks))) {
166        if (klex->nrblocks > 0) {
167            PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
168            klex->searchind = this->base + curpos;
169        } else {
170            klex->searchind = NULL;
171        }
172        klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
173                             (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
174        return PICO_OK;
175    } else {
176        return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
177                                       NULL, NULL);
178    }
179}
180
181
182static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
183                                          picoos_MemoryManager mm)
184{
185    if (NULL != this) {
186        picoos_deallocate(mm, (void *) &this->subObj);
187    }
188    return PICO_OK;
189}
190
191
192/* we don't offer a specialized constructor for a LexKnowledgeBase but
193 * instead a "specializer" of an allready existing generic
194 * picoknow_KnowledgeBase */
195
196pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
197                                                  picoos_Common common)
198{
199    if (NULL == this) {
200        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
201                                       NULL, NULL);
202    }
203    if (this->size > 0) {
204        this->subDeallocate = klexSubObjDeallocate;
205        this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
206        if (NULL == this->subObj) {
207            return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
208                                           NULL, NULL);
209        }
210        return klexInitialize(this, common);
211    } else {
212        /* some dummy klex */
213        return PICO_OK;
214    }
215}
216
217/* for now we don't need to do anything special for the main lex */
218/*
219pico_status_t picoklex_specializeMainLexKnowledgeBase(
220        picoknow_KnowledgeBase this,
221        picoos_Common common)
222{
223    return picoklex_specializeLexKnowledgeBase(this,common);
224}
225*/
226
227
228/* ************************************************************/
229/* lexicon getLex */
230/* ************************************************************/
231
232picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
233{
234    if (NULL == this) {
235        return NULL;
236    } else {
237        return (picoklex_Lex) this->subObj;
238    }
239}
240
241
242/* ************************************************************/
243/* functions on searchindex */
244/* ************************************************************/
245
246
247static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
248                                            picoos_uint16 index)
249{
250    picoos_uint32 pos, val;
251    pos = index * PICOKLEX_LEX_SIE_SIZE;
252    val = this->searchind[pos];
253    val = (val << 8) + this->searchind[pos + 1];
254    val = (val << 8) + this->searchind[pos + 2];
255    return val;
256}
257
258
259/* Determine first lexblock containing entries for specified
260   grapheme. */
261
262static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
263                                        const picoos_uint8 *graphsi) {
264    /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */
265    picoos_int32 low, mid, high;
266    picoos_uint32 searchval, indval;
267
268    /* PICOKLEX_LEX_SIE_NRGRAPHS */
269
270    /* convert graph-prefix to number with 'lexicographic' ordering */
271    searchval = graphsi[0];
272    searchval = (searchval << 8) + graphsi[1];
273    searchval = (searchval << 8) + graphsi[2];
274
275    low = 0;
276    high = this->nrblocks;
277
278    /* do binary search */
279    while (low < high) {
280        mid = (low + high) / 2;
281        indval = klex_getSearchIndexVal(this, mid);
282        if (indval < searchval) {
283            low = mid + 1;
284        } else {
285            high = mid;
286        }
287    }
288    PICODBG_ASSERT(high == low);
289    /* low points to the first entry greater than or equal to searchval */
290
291    if (low < this->nrblocks) {
292        indval = klex_getSearchIndexVal(this, low);
293        if (indval > searchval) {
294            low--;
295            /* if there are identical elements in the search index we have
296               to move to the first one */
297            if (low > 0) {
298                indval = klex_getSearchIndexVal(this, low);
299                while (indval == klex_getSearchIndexVal(this, low-1)) {
300                    low--;
301                }
302            }
303        }
304    } else {
305        low = this->nrblocks - 1;
306    }
307
308#if defined(PICO_DEBUG)
309    {
310        picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
311        PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
312                       this->searchind[pos], this->searchind[pos + 1],
313                       this->searchind[pos + 2], low));
314    }
315#endif
316
317    return (picoos_uint16) low;
318}
319
320
321/* Determine number of adjacent lexblocks containing entries for
322   the same grapheme search prefix (identified by search index). */
323
324static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
325                                           picoos_uint16 index)
326{
327    picoos_uint16 count;
328    picoos_uint32 sval1, sval2;
329
330    sval1 = klex_getSearchIndexVal(this, index);
331
332#if defined(PICO_DEBUG)
333    /* 'index' must point to first lexblock of its kind */
334    if (index > 0) {
335        sval2 = klex_getSearchIndexVal(this, index - 1);
336        PICODBG_ASSERT(sval1 != sval2);
337    }
338#endif
339
340    index++;
341    sval2 = klex_getSearchIndexVal(this, index);
342
343    count = 1;
344    while (sval1 == sval2) {
345        count++;
346        index++;
347        sval2 = klex_getSearchIndexVal(this, index);
348    }
349
350    return count;
351}
352
353
354/* ************************************************************/
355/* functions on single lexblock */
356/* ************************************************************/
357
358static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
359                                 const picoos_uint8 *graph,
360                                 const picoos_uint16 graphlen) {
361    picoos_uint8 i;
362    picoos_uint8 lexlen;
363    picoos_uint8 *lexgraph;
364
365    lexlen = lexentry[0] - 1;
366    lexgraph = &(lexentry[1]);
367    for (i=0; (i<graphlen) && (i<lexlen); i++) {
368        PICODBG_TRACE(("%d|%d  graph|lex: %c|%c", graphlen, lexlen,
369                       graph[i], lexgraph[i]));
370        if (lexgraph[i] < graph[i]) {
371            return -1;
372        } else if (lexgraph[i] > graph[i]) {
373            return 1;
374        }
375    }
376    if (graphlen == lexlen) {
377        return 0;
378    } else if (lexlen < graphlen) {
379        return -1;
380    } else {
381        return 1;
382    }
383}
384
385
386static void klex_setLexResult(const picoos_uint8 *lexentry,
387                              const picoos_uint32 lexpos,
388                              picoklex_lexl_result_t *lexres) {
389    picoos_uint8 i;
390
391    /* check if :G2P */
392    if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) {
393        /* set pos */
394        lexres->posind[0] = lexentry[lexentry[0] + 1];
395        /* set rest */
396        lexres->phonfound = FALSE;
397        lexres->posindlen = 1;
398        lexres->nrres = 1;
399        PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
400    } else {
401        i = lexres->nrres * (PICOKLEX_POSIND_SIZE);
402        lexres->posindlen += PICOKLEX_POSIND_SIZE;
403        lexres->phonfound = TRUE;
404        /* set pos */
405        lexres->posind[i++] = lexentry[lexentry[0] + 1];
406        /* set ind, PICOKLEX_IND_SIZE */
407        lexres->posind[i++] = 0x000000ff & (lexpos);
408        lexres->posind[i++] = 0x000000ff & (lexpos >>  8);
409        lexres->posind[i]   = 0x000000ff & (lexpos >> 16);
410        lexres->nrres++;
411        PICODBG_DEBUG(("result %d", lexres->nrres));
412    }
413}
414
415
416static void klex_lexblockLookup(klex_SubObj this,
417                                const picoos_uint32 lexposStart,
418                                const picoos_uint32 lexposEnd,
419                                const picoos_uint8 *graph,
420                                const picoos_uint16 graphlen,
421                                picoklex_lexl_result_t *lexres) {
422    picoos_uint32 lexpos;
423    picoos_int8 rv;
424
425    lexres->nrres = 0;
426
427    lexpos = lexposStart;
428    rv = -1;
429    while ((rv < 0) && (lexpos < lexposEnd)) {
430
431        rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);
432
433        if (rv == 0) { /* found */
434            klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
435            if (lexres->phonfound) {
436                /* look for more results, up to MAX_NRRES, don't even
437                   check if more results would be available */
438                while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
439                       (lexpos < lexposEnd)) {
440                    lexpos += this->lexblocks[lexpos];
441                    lexpos += this->lexblocks[lexpos];
442                    /* if there are no more entries in this block, advance
443                       to next block by skipping all zeros */
444                    while ((this->lexblocks[lexpos] == 0) &&
445                           (lexpos < lexposEnd)) {
446                        lexpos++;
447                    }
448                    if (lexpos < lexposEnd) {
449                        if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
450                                          graphlen) == 0) {
451                            klex_setLexResult(&(this->lexblocks[lexpos]),
452                                              lexpos, lexres);
453                        } else {
454                            /* no more results, quit loop */
455                            lexpos = lexposEnd;
456                        }
457                    }
458                }
459            } else {
460                /* :G2P mark */
461            }
462        } else if (rv < 0) {
463            /* not found, goto next entry */
464            lexpos += this->lexblocks[lexpos];
465            lexpos += this->lexblocks[lexpos];
466            /* if there are no more entries in this block, advance
467               to next block by skipping all zeros */
468            while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) {
469                lexpos++;
470            }
471        } else {
472            /* rv > 0, not found, won't show up later in block */
473        }
474    }
475}
476
477
478/* ************************************************************/
479/* lexicon lookup functions */
480/* ************************************************************/
481
482picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
483                                const picoos_uint8 *graph,
484                                const picoos_uint16 graphlen,
485                                picoklex_lexl_result_t *lexres) {
486    picoos_uint16 lbnr, lbc;
487    picoos_uint32 lexposStart, lexposEnd;
488    picoos_uint8 i;
489    picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
490    klex_SubObj klex = (klex_SubObj) this;
491
492    if (NULL == klex) {
493        PICODBG_ERROR(("no lexicon loaded"));
494        /* no exception here needed, already checked at initialization */
495        return FALSE;
496    }
497
498    lexres->nrres = 0;
499    lexres->posindlen = 0;
500    lexres->phonfound = FALSE;
501
502    for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
503        if (i < graphlen) {
504            tgraph[i] = graph[i];
505        } else {
506            tgraph[i] = '\0';
507        }
508    }
509    PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));
510
511    if ((klex->nrblocks) == 0) {
512        /* no searchindex, no lexblock */
513        PICODBG_WARN(("no searchindex, no lexblock"));
514        return FALSE;
515    } else {
516        lbnr = klex_getLexblockNr(klex, tgraph);
517        PICODBG_ASSERT(lbnr < klex->nrblocks);
518        lbc = klex_getLexblockRange(klex, lbnr);
519        PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
520    }
521    PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));
522
523    lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
524    lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;
525
526    PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
527    klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
528    PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));
529
530    return (lexres->nrres > 0);
531}
532
533
534picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
535                                   const picoos_uint8 *ind,
536                                   const picoos_uint8 indlen,
537                                   picoos_uint8 *pos,
538                                   picoos_uint8 **phon,
539                                   picoos_uint8 *phonlen) {
540    picoos_uint32 pentry;
541    klex_SubObj klex = (klex_SubObj) this;
542
543    /* check indlen */
544    if (indlen != PICOKLEX_IND_SIZE) {
545        return FALSE;
546    }
547
548    /* PICOKLEX_IND_SIZE */
549    pentry = 0x000000ff & (ind[0]);
550    pentry |= ((picoos_uint32)(ind[1]) <<  8);
551    pentry |= ((picoos_uint32)(ind[2]) << 16);
552
553    /* check ind if it is within lexblocks byte stream, if not, return FALSE */
554    if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) {
555        return FALSE;
556    }
557
558    pentry += (klex->lexblocks[pentry]);
559    *phonlen = (klex->lexblocks[pentry++]) - 2;
560    *pos = klex->lexblocks[pentry++];
561    *phon = &(klex->lexblocks[pentry]);
562
563    PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
564    return TRUE;
565}
566
567#ifdef __cplusplus
568}
569#endif
570
571
572/* end */
573