1/*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/**
17 * @file picoktab.c
18 *
19 * symbol tables needed at runtime
20 *
21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
22 * All rights reserved.
23 *
24 * History:
25 * - 2009-04-20 -- initial version
26 *
27 */
28
29#include "picoos.h"
30#include "picodbg.h"
31#include "picoknow.h"
32#include "picobase.h"
33#include "picoktab.h"
34#include "picodata.h"
35
36#ifdef __cplusplus
37extern "C" {
38#endif
39#if 0
40}
41#endif
42
43
44/** @todo : the following would be better part of a knowledge base.
45 * Make sure it is consistent with the phoneme symbol table used in the lingware */
46
47/* PLANE_PHONEMES */
48
49/* PLANE_POS */
50
51/* PLANE_PB_STRENGTHS */
52
53/* PLANE_ACCENTS */
54
55/* PLANE_INTERN */
/* temporary fixed ids of the phoneme start/terminator symbols */
#define PICOKTAB_TMPID_PHONSTART      0x26    /* 38  '&' */
#define PICOKTAB_TMPID_PHONTERM       0x23    /* 35  '#' */
58
59
60/* ************************************************************/
61/* fixed ids */
62/* ************************************************************/
63
64
65static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this,
66                                       picoos_Common common)
67{
68    picoktab_FixedIds ids;
69
70    PICODBG_DEBUG(("start"));
71
72    if (NULL == this || NULL == this->subObj) {
73        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
74                                       NULL, NULL);
75    }
76    ids = (picoktab_FixedIds) this->subObj;
77
78    ids->phonStartId = PICOKTAB_TMPID_PHONSTART;
79    ids->phonTermId = PICOKTAB_TMPID_PHONTERM;
80    return PICO_OK;
81}
82
83
84static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this,
85                                             picoos_MemoryManager mm)
86{
87    if (NULL != this) {
88        picoos_deallocate(mm, (void *) &this->subObj);
89    }
90    return PICO_OK;
91}
92
93pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this,
94                                                  picoos_Common common)
95{
96    if (NULL == this) {
97        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
98                                       NULL, NULL);
99    }
100    this->subDeallocate = ktabIdsSubObjDeallocate;
101    this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t));
102    if (NULL == this->subObj) {
103        return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
104                                       NULL, NULL);
105    }
106    return ktabIdsInitialize(this, common);
107}
108
109picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this)
110{
111    return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj));
112}
113
114
115picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm)
116{
117    picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this));
118    if (NULL != this) {
119        /* initialize */
120    }
121    return this;
122}
123
124
125void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this)
126{
127    if (NULL != (*this)) {
128        /* terminate */
129        picoos_deallocate(mm,(void *)this);
130    }
131}
132
133
134
135/* ************************************************************/
136/* Graphs */
137/* ************************************************************/
138
139/* overview binary file format for graphs kb:
140
141    graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs
142
143    NROFSENTRIES  : 2 bytes, number of entries in offset table
144    SIZEOFSENTRY  : 1 byte,  size of one entry in offset table
145
146    ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET)
147
148    OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs
149
150    graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph)
151
    graph = PROPSET FROM [TO] [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] [PUNC]
153
154    FROM          : 1..4 unsigned bytes, UTF8 character without terminating 0
155    TO            : 1..4 unsigned bytes, UTF8 character without terminating 0
156    PROPSET       : 1 unsigned byte, least significant bit : has TO field
157                                                             next bit : has TOKENTYPE
158                                                             next bit : has TOKENSUBTYPE
159                                                             next bit : has VALUE
160                                                             next bit : has LOWERCASE
161                                                             next bit : has GRAPHSUBS1
162                                                             next bit : has GRAPHSUBS2
163                                                             next bit : has PUNC
164
165    TOKENTYPE    : 1 unsigned byte
166    TOKENSUBTYPE : 1 unsigned byte
167    VALUE        : 1 unsigned byte
168    LOWERCASE    : 1..4 unsigned bytes, UTF8 character without terminating 0
169    GRAPHSUBS1   : 1..4 unsigned bytes, UTF8 character without terminating 0
170    GRAPHSUBS2   : 1..4 unsigned bytes, UTF8 character without terminating 0
171    PUNC         : 1 unsigned byte
172*/
173
174static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop);
175
/* byte positions inside the graphs kb (relative to the kb base) */
#define KTAB_START_GRAPHS_NR_OFFSET     0   /* NROFSENTRIES, 2 bytes, low byte first */
#define KTAB_START_GRAPHS_SIZE_OFFSET   2   /* SIZEOFSENTRY, 1 byte */
#define KTAB_START_GRAPHS_OFFSET_TABLE  3   /* first entry of the offset table */
#define KTAB_START_GRAPHS_GRAPH_TABLE   0   /* graph offsets are relative to the kb base */

/* bitmasks to extract the grapheme properties info from the property set;
   written as integer constants so the value does not depend on how a
   multi-digit hex escape in a character constant is interpreted */
#define KTAB_GRAPH_PROPSET_TO            ((picoos_uint8)0x01)
#define KTAB_GRAPH_PROPSET_TOKENTYPE     ((picoos_uint8)0x02)
#define KTAB_GRAPH_PROPSET_TOKENSUBTYPE  ((picoos_uint8)0x04)
#define KTAB_GRAPH_PROPSET_VALUE         ((picoos_uint8)0x08)
#define KTAB_GRAPH_PROPSET_LOWERCASE     ((picoos_uint8)0x10)
#define KTAB_GRAPH_PROPSET_GRAPHSUBS1    ((picoos_uint8)0x20)
#define KTAB_GRAPH_PROPSET_GRAPHSUBS2    ((picoos_uint8)0x40)
#define KTAB_GRAPH_PROPSET_PUNCT         ((picoos_uint8)0x80)
190
191
192typedef struct ktabgraphs_subobj *ktabgraphs_SubObj;
193
194typedef struct ktabgraphs_subobj {
195    picoos_uint16 nrOffset;
196    picoos_uint16 sizeOffset;
197
198    picoos_uint8 * offsetTable;
199    picoos_uint8 * graphTable;
200} ktabgraphs_subobj_t;
201
202
203
204static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this,
205                                          picoos_Common common) {
206    ktabgraphs_subobj_t * ktabgraphs;
207
208    PICODBG_DEBUG(("start"));
209
210    if (NULL == this || NULL == this->subObj) {
211        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
212                                       NULL, NULL);
213    }
214    ktabgraphs = (ktabgraphs_subobj_t *) this->subObj;
215    ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]);
216    ktabgraphs->sizeOffset  = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]);
217    ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]);
218    ktabgraphs->graphTable  = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]);
219    return PICO_OK;
220}
221
222static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this,
223                                                picoos_MemoryManager mm) {
224    if (NULL != this) {
225        picoos_deallocate(mm, (void *) &this->subObj);
226    }
227    return PICO_OK;
228}
229
230
231pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this,
232                                                     picoos_Common common) {
233    if (NULL == this) {
234        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
235                                       NULL, NULL);
236    }
237    this->subDeallocate = ktabGraphsSubObjDeallocate;
238    this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t));
239    if (NULL == this->subObj) {
240        return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
241                                       NULL, NULL);
242    }
243    return ktabGraphsInitialize(this, common);
244}
245
246
247picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) {
248    if (NULL == this) {
249        return NULL;
250    } else {
251        return (picoktab_Graphs) this->subObj;
252    }
253}
254
255
256/* Graphs methods */
257
258picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this,
259                                       const picoos_uint8 *graph,
260                                       const picoos_uint8 graphlenmax) {
261
262  picoos_uint8 ui8App;
263  picoos_uint32 graphsOffset;
264  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
265
266  ui8App = graphlenmax;        /* avoid warning "var not used in this function"*/
267
268  graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph);
269  return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV;
270}
271
272
273static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str)
274{
275  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
276  picoos_uint32 i, l;
277
278  i = 0;
279  l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]);
280  while (i<l) {
281    str[i] = g->graphTable[graphsOffset+propOffset+i];
282    i++;
283  }
284  str[l] = 0;
285}
286
287
288static picoos_uint32 ktab_propOffset(const picoktab_Graphs this,
289        picoos_uint32 graphsOffset, picoos_uint32 prop)
290/* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table;
291 If the property is found, a value > 0 is returned otherwise 0 */
292{
293    picoos_uint32 n = 0;
294    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
295
296    if ((g->graphTable[graphsOffset] & prop) == prop) {
297        n = n + 1; /* overread PROPSET field */
298        n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */
299        if (prop > KTAB_GRAPH_PROPSET_TO) {
300            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO)
301                    == KTAB_GRAPH_PROPSET_TO) {
302                n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */
303            }
304        } else {
305            return n;
306        }
307        if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) {
308            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE)
309                    == KTAB_GRAPH_PROPSET_TOKENTYPE) {
310                n = n + 1; /* overread TOKENTYPE field */
311            }
312        } else {
313            return n;
314        }
315        if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
316            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE)
317                    == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
318                n = n + 1; /* overread stokentype field */
319            }
320        } else {
321            return n;
322        }
323        if (prop > KTAB_GRAPH_PROPSET_VALUE) {
324            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE)
325                    == KTAB_GRAPH_PROPSET_VALUE) {
326                n = n + 1; /* overread value field */
327            }
328        } else {
329            return n;
330        }
331        if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) {
332            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE)
333                    == KTAB_GRAPH_PROPSET_LOWERCASE) {
334                n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */
335            }
336        } else {
337            return n;
338        }
339        if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
340            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1)
341                    == KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
342                n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */
343            }
344        } else {
345            return n;
346        }
347        if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
348            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2)
349                    == KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
350                n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */
351            }
352        } else {
353            return n;
354        }
355        if (prop > KTAB_GRAPH_PROPSET_PUNCT) {
356            if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT)
357                    == KTAB_GRAPH_PROPSET_PUNCT) {
358                n = n + 1; /* overread value field */
359            }
360        } else {
361            return n;
362        }
363    }
364
365    return n;
366}
367
368
369picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph)
370{  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
371   picoos_int32 a, b, m;
372   picoos_uint32 graphsOffset;
373   picoos_uint32 propOffset;
374   picobase_utf8char from;
375   picobase_utf8char to;
376   picoos_bool utfGEfrom;
377   picoos_bool utfLEto;
378
379   if (g->nrOffset > 0) {
380     a = 0;
381     b = g->nrOffset-1;
382     do  {
383       m = (a+b) / 2;
384
385       /* get offset to graph[m] */
386       if (g->sizeOffset == 1) {
387         graphsOffset = g->offsetTable[g->sizeOffset*m];
388       }
389       else {
390         graphsOffset =     g->offsetTable[g->sizeOffset*m    ] +
391                        256*g->offsetTable[g->sizeOffset*m + 1];
392         /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset));
393         */
394       }
395
396       /* get FROM and TO field of graph[m] */
397       ktab_getStrProp(this, graphsOffset, 1, from);
398       propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO);
399       if (propOffset > 0) {
400         ktab_getStrProp(this, graphsOffset, propOffset, to);
401       }
402       else {
403         picoos_strcpy((picoos_char *)to, (picoos_char *)from);
404       }
405
406       /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to));
407       */
408       utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0;
409       utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0;
410
411       if (utfGEfrom && utfLEto) {
412         /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph));
413          */
414         return graphsOffset;
415       }
416       if (!utfGEfrom) {
417         b = m-1;
418       }
419       else if (!utfLEto) {
420         a = m+1;
421       }
422     } while (a<=b);
423   }
424   PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph));
425   return 0;
426}
427
428
429
430
431picoos_bool  picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType)
432{
433  picoos_uint32 propOffset;
434  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
435
436  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE);
437  if (propOffset > 0) {
438    *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]);
439    return TRUE;
440  }
441  else {
442    return FALSE;
443  }
444}
445
446
447picoos_bool  picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType)
448{
449  picoos_uint32 propOffset;
450  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
451
452  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE);
453  if (propOffset > 0) {
454    *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]);
455    return TRUE;
456  }
457  else {
458    return FALSE;
459  }
460}
461
462picoos_bool  picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value)
463{
464  picoos_uint32 propOffset;
465  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
466
467  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE);
468  if (propOffset > 0) {
469    *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]);
470    return TRUE;
471  }
472  else {
473    return FALSE;
474  }
475}
476
477
478picoos_bool  picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2)
479{
480  picoos_uint32 propOffset;
481  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
482
483  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT);
484  if (propOffset > 0) {
485      if (g->graphTable[graphsOffset+propOffset] == 2) {
486          *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND;
487      }
488      else {
489          *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND;
490      }
491    if (g->graphTable[graphsOffset+1] == '.') {
492        *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T;
493    }
494    else if (g->graphTable[graphsOffset+1] == '?') {
495        *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q;
496    }
497    else if (g->graphTable[graphsOffset+1] == '!') {
498        *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E;
499    }
500    else {
501        *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE;
502    }
503    return TRUE;
504  }
505  else {
506    return FALSE;
507  }
508}
509
510
511picoos_bool  picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase)
512{
513  picoos_uint32 propOffset;
514
515  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE);
516  if (propOffset > 0) {
517    ktab_getStrProp(this, graphsOffset, propOffset, lowercase);
518    return TRUE;
519  }
520  else {
521    return FALSE;
522  }
523}
524
525
526picoos_bool  picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1)
527{
528  picoos_uint32 propOffset;
529
530  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1);
531  if (propOffset > 0) {
532    ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1);
533    return TRUE;
534  }
535  else {
536    return FALSE;
537  }
538}
539
540
541picoos_bool  picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2)
542{
543  picoos_uint32 propOffset;
544
545  propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2);
546  if (propOffset > 0) {
547    ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2);
548    return TRUE;
549  }
550  else {
551    return FALSE;
552  }
553}
554/* *****************************************************************/
555/* used for tools */
556
557static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to)
558{
559  picoos_uint32 l;
560  l = picobase_det_utf8_length(**pos);
561  while (l>0) {
562    *(to++) = *((*pos)++);
563    l--;
564  }
565  *to = 0;
566}
567
568picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this)
569{
570    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
571    return g->nrOffset;
572}
573
574void picoktab_graphsGetGraphInfo(const picoktab_Graphs this,
575        picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to,
576        picoos_uint8 * propset,
577        picoos_uint8 * stokenType, picoos_uint8 * stokenSubType,
578        picoos_uint8 * value, picoos_uchar * lowercase,
579        picoos_uchar * graphsubs1, picoos_uchar * graphsubs2,
580        picoos_uint8 * punct) {
581    ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
582    picoos_uint32 graphsOffset;
583    picoos_uint8 * pos;
584
585    /* calculate offset of graph[graphIndex] */
586    if (g->sizeOffset == 1) {
587        graphsOffset = g->offsetTable[graphIndex];
588    } else {
589        graphsOffset = g->offsetTable[2 * graphIndex]
590                + (g->offsetTable[2 * graphIndex + 1] << 8);
591    }
592    pos = &(g->graphTable[graphsOffset]);
593    *propset = *pos;
594
595    pos++; /* advance to FROM */
596    ktab_getUtf8(&pos, from); /* get FROM and advance */
597    if ((*propset) & KTAB_GRAPH_PROPSET_TO) {
598        ktab_getUtf8(&pos, to); /* get TO and advance */
599    } else {
600        picoos_strcpy((picoos_char *)to, (picoos_char *)from);
601    }
602    if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) {
603        (*stokenType) = *(pos++); /* get TOKENTYPE and advance */
604    } else {
605        (*stokenType) = -1;
606    }
607    if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
608        (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */
609    } else {
610        (*stokenSubType) = -1;
611    }
612    if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) {
613        (*value) = *(pos++); /* get VALUE and advance */
614    } else {
615        (*value) = -1;
616    }
617    if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) {
618        ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */
619    } else {
620        lowercase[0] = NULLC;
621    }
622    if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
623        ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */
624    } else {
625        graphsubs1[0] = NULLC;
626    }
627    if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
628        ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */
629    } else {
630        graphsubs2[0] = NULLC;
631    }
632    if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) {
633        (*punct) = *(pos++); /* get PUNCT and advance */
634    } else {
635        (*punct) = -1;
636    }
637}
638
639/* ************************************************************/
640/* Phones */
641/* ************************************************************/
642
643/* overview binary file format for phones kb:
644
645    phones-kb = specids propertytable
646
647    specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1
648              RESERVE1 RESERVE1 RESERVE1
649
650    propertytable = {PHONEPROP2}=256
651
652    PRIMSTRESSID1: one byte, ID of primary stress
653    SECSTRESSID1: one byte, ID of secondary stress
654    SYLLBOUNDID1: one byte, ID of syllable boundary
    PAUSEID1: one byte, ID of pause
    WORDBOUNDID1: one byte, ID of word boundary
656    RESERVE1: reserved for future use
657
658    PHONEPROP2: one byte, max. of 256 phones directly access this table
659                to check a property for a phone; binary properties
660                encoded (1 bit per prop)
661       least significant bit: vowel
662                    next bit: diphth
663                    next bit: glott
664                    next bit: nonsyllvowel
665                    next bit: syllcons
666       3 bits spare
667 */
668
/* byte positions inside the phones kb (relative to the kb base) */
enum {
    KTAB_START_SPECIDS = 0,   /* start of the special ids */
    KTAB_START_PROPS   = 8    /* start of the property table */
};

/* indices of the special ids inside the specids area */
enum {
    KTAB_IND_PRIMSTRESS = 0,
    KTAB_IND_SECSTRESS  = 1,
    KTAB_IND_SYLLBOUND  = 2,
    KTAB_IND_PAUSE      = 3,
    KTAB_IND_WORDBOUND  = 4
};
677
678
679typedef struct ktabphones_subobj *ktabphones_SubObj;
680
681typedef struct ktabphones_subobj {
682    picoos_uint8 *specids;
683    picoos_uint8 *props;
684} ktabphones_subobj_t;
685
686
687/* bitmasks to extract the property info from props */
/* one bit per binary phone property, least significant bit first */
#define KTAB_PPROP_VOWEL        0x01
#define KTAB_PPROP_DIPHTH       0x02
#define KTAB_PPROP_GLOTT        0x04
#define KTAB_PPROP_NONSYLLVOWEL 0x08
#define KTAB_PPROP_SYLLCONS     0x10
693
694
695static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this,
696                                          picoos_Common common) {
697    ktabphones_subobj_t * ktabphones;
698
699    PICODBG_DEBUG(("start"));
700
701    if (NULL == this || NULL == this->subObj) {
702        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
703                                       NULL, NULL);
704    }
705    ktabphones = (ktabphones_subobj_t *) this->subObj;
706    ktabphones->specids = &(this->base[KTAB_START_SPECIDS]);
707    ktabphones->props   = &(this->base[KTAB_START_PROPS]);
708    return PICO_OK;
709}
710
711static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this,
712                                                picoos_MemoryManager mm) {
713    if (NULL != this) {
714        picoos_deallocate(mm, (void *) &this->subObj);
715    }
716    return PICO_OK;
717}
718
719pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this,
720                                                     picoos_Common common) {
721    if (NULL == this) {
722        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
723                                       NULL, NULL);
724    }
725    this->subDeallocate = ktabPhonesSubObjDeallocate;
726    this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t));
727    if (NULL == this->subObj) {
728        return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
729                                       NULL, NULL);
730    }
731    return ktabPhonesInitialize(this, common);
732}
733
734picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) {
735    if (NULL == this) {
736        return NULL;
737    } else {
738        return (picoktab_Phones) this->subObj;
739    }
740}
741
742
743/* Phones methods */
744
745picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this,
746                                   const picoos_uint8 ch) {
747    return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]);
748}
749picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this,
750                                    const picoos_uint8 ch) {
751    return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]);
752}
753picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this,
754                                   const picoos_uint8 ch) {
755    return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]);
756}
757picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this,
758                                          const picoos_uint8 ch) {
759    return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]);
760}
761picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this,
762                                      const picoos_uint8 ch) {
763    return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]);
764}
765
766picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this,
767                                    const picoos_uint8 ch) {
768    picoos_uint8 props;
769    props = ((ktabphones_SubObj)this)->props[ch];
770    return (((KTAB_PPROP_VOWEL & props) &&
771             !(KTAB_PPROP_NONSYLLVOWEL & props))
772            || (KTAB_PPROP_SYLLCONS & props));
773}
774
775picoos_bool picoktab_isPrimstress(const picoktab_Phones this,
776                                   const picoos_uint8 ch) {
777    return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]);
778}
779picoos_bool picoktab_isSecstress(const picoktab_Phones this,
780                                  const picoos_uint8 ch) {
781    return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]);
782}
783picoos_bool picoktab_isSyllbound(const picoktab_Phones this,
784                                  const picoos_uint8 ch) {
785    return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]);
786}
787picoos_bool picoktab_isWordbound(const picoktab_Phones this,
788                                  const picoos_uint8 ch) {
789    return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]);
790}
791picoos_bool picoktab_isPause(const picoktab_Phones this,
792                              const picoos_uint8 ch) {
793    return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]);
794}
795
796picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) {
797    return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS];
798}
799picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) {
800    return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS];
801}
802picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) {
803    return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND];
804}
805picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) {
806    return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND];
807}
808picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) {
809    return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE];
810}
811
812/* ************************************************************/
813/* Pos */
814/* ************************************************************/
815
816/* overview binary file format for pos kb:
817
818    pos-kb = header posids
819    header = {COUNT2 OFFS2}=8
820    posids = {POSID1 {PARTID1}0:8}1:
821
822    where POSID1 is the value of the (combined) part-of-speech symbol,
823    and {PARTID1} are the symbol values of its components (empty if it
824    is not a combined symbol). The {PARTID1} list is sorted.
825    Part-of-speech symbols with equal number of components are grouped
826    together.
827
828    The header contains information about these groups:
829
830    COUNT2 specifies the number of elements in the group, and OFFS2
831    specifies the offset (relative to the beginning of the kb) where
832    the group data starts, i.e.:
833
834    25   32  -> 25 not-combined elements, starting at offset 32
835    44   57  -> 44 elements composed of 2 symbols, starting at offset 57
836    23  189  -> 23 elements composed of 3 symbols, starting at offset 189
837    ...
838
839    Currently, each symbol may be composed of up to 8 other symbols.
840    Therefore, the header has 8 entries, too. The header starts with
841    the unique POS list, and then in increasing order, 2 symbols, 3
842    symbols,...
843
For illustration, the printf-ed version:
845
846 25   32
847 44   57
848 23  189
849 12  281
850  4  341
851  1  365
852  0    0
853  0    0
854 33 |
855 34 |
856 35 |
857 60 |
858 etc.
859 36 |  35  60
860 50 |  35  95
861 51 |  35  97
862 58 |  35 120
863 59 |  35 131
864 61 |  60  75
865 63 |  60  95
866 64 |  60  97
867 etc.
868 42 |  35  60 117
869 44 |  35  60 131
870 45 |  35  73  97
871 48 |  35  84  97
872 54 |  35  97 131
873 56 |  35 113 120
874 57 |  35 117 120
875 62 |  60  84 122
876 etc.
877 */
878
879typedef struct ktabpos_subobj *ktabpos_SubObj;
880
881typedef struct ktabpos_subobj {
882    picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB];
883    picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB];
884} ktabpos_subobj_t;
885
886
887static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this,
888                                       picoos_Common common) {
889    ktabpos_subobj_t *ktabpos;
890    picoos_uint16 osprev;
891    picoos_uint16 os, pos;
892    picoos_uint8 i;
893
894    PICODBG_DEBUG(("start"));
895
896    if (NULL == this || NULL == this->subObj) {
897        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
898                                       NULL, NULL);
899    }
900    ktabpos = (ktabpos_subobj_t *)this->subObj;
901
902    os = 0;
903    for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) {
904        ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 |
905            this->base[pos];
906        if (ktabpos->nrcomb[i] > 0) {
907            osprev = os;
908            os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2];
909            ktabpos->nrcombstart[i] = &(this->base[os]);
910            PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos,
911                           ktabpos->nrcomb[i], osprev, os));
912            if (osprev >= os) {
913                /* cannot be, in a valid kb */
914                return picoos_emRaiseException(common->em,
915                                               PICO_EXC_FILE_CORRUPT,
916                                               NULL, NULL);
917            }
918        } else {
919            if (i == 0) {
920                /* cannot be, in a valid kb */
921                return picoos_emRaiseException(common->em,
922                                               PICO_EXC_FILE_CORRUPT,
923                                               NULL, NULL);
924            }
925            ktabpos->nrcombstart[i] = NULL;
926        }
927    }
928    return PICO_OK;
929}
930
931static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this,
932                                             picoos_MemoryManager mm) {
933    if (NULL != this) {
934        picoos_deallocate(mm, (void *) &this->subObj);
935    }
936    return PICO_OK;
937}
938
939pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this,
940                                                  picoos_Common common) {
941    if (NULL == this) {
942        return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
943                                       NULL, NULL);
944    }
945    this->subDeallocate = ktabPosSubObjDeallocate;
946    this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t));
947    if (NULL == this->subObj) {
948        return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
949                                       NULL, NULL);
950    }
951    return ktabPosInitialize(this, common);
952}
953
954picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) {
955    if (NULL == this) {
956        return NULL;
957    } else {
958        return (picoktab_Pos) this->subObj;
959    }
960}
961
962
963/* Pos methods */
964
965static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1,
966                                         const picoos_uint8 *grp2,
967                                         picoos_uint8 len)
968{
969    /* if both, grp1 and grp2 would be sorted in ascending order
970       we could implement a function picoktab_comparePosGroup in
971       a similar manner as strcmp */
972
973    picoos_uint16 i, j, equal;
974
975    equal = 1;
976
977    i = 0;
978    while (equal && (i < len)) {
979        /* search grp1[i] in grp2 */
980        j = 0;
981        while ((j < len) && (grp1[i] != grp2[j])) {
982            j++;
983        }
984        equal = (j < len);
985        i++;
986    }
987
988    return equal;
989}
990
991
992picoos_bool picoktab_isUniquePos(const picoktab_Pos this,
993                                  const picoos_uint8 pos) {
994    ktabpos_subobj_t *ktabpos;
995    picoos_uint16 i;
996
997    /* speed-up possible with e.g. binary search */
998
999    ktabpos = (ktabpos_subobj_t *)this;
1000    PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0]));
1001    i = 0;
1002    while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) {
1003        PICODBG_TRACE(("compare with pos %d at position %d",
1004                       ktabpos->nrcombstart[0][i], pos, i));
1005        i++;
1006    }
1007    return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i]));
1008}
1009
1010
1011picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this,
1012                                       const picoos_uint8 pos,
1013                                       const picoos_uint8 posgroup)
1014{
1015    ktabpos_subobj_t *ktabpos;
1016    picoos_uint8 *grp;
1017    picoos_uint16 i, j, n, s, grplen;
1018    picoos_uint8 *e;
1019    picoos_uint8 found;
1020
1021    ktabpos = (ktabpos_subobj_t *) this;
1022
1023    grp = NULL;
1024    found = FALSE;
1025    grplen = 0;
1026
1027    /* currently, a linear search is required to find 'posgroup'; the
1028       knowledge base should be extended to allow for a faster search */
1029
1030    /* treat case i==0, grplen==0, ie. pos == posgroup */
1031    if (pos == posgroup) {
1032        found = TRUE;
1033    }
1034
1035    i = 1;
1036    while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) {
1037        n = ktabpos->nrcomb[i];       /* number of entries */
1038        e = ktabpos->nrcombstart[i];  /* ptr to first entry */
1039        s = i + 2;                    /* size of an entry in bytes */
1040        /* was with while starting at 0:
1041        s = i > 0 ? i + 2 : 1;
1042        */
1043        j = 0;
1044        while ((grp == NULL) && (j < n)) {
1045            if (posgroup == e[0]) {
1046                grp = e + 1;
1047                grplen = s - 1;
1048            }
1049            e += s;
1050            j++;
1051        }
1052        i++;
1053    }
1054
1055    /* test if 'pos' is contained in the components of 'posgroup' */
1056    if (grp != NULL) {
1057        for (i = 0; !found && (i < grplen); i++) {
1058            if (pos == grp[i]) {
1059                found = TRUE;
1060            }
1061        }
1062
1063        /* just a way to test picoktab_getPosGroup */
1064        /*
1065        PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup);
1066        */
1067    }
1068
1069    return found;
1070}
1071
1072
1073picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this,
1074                                  const picoos_uint8 *poslist,
1075                                  const picoos_uint8 poslistlen)
1076{
1077    picoos_uint8 poscomb;
1078    ktabpos_subobj_t *ktabpos;
1079    picoos_uint16 i, j, n, s;
1080    picoos_uint8 *e;
1081
1082    ktabpos = (ktabpos_subobj_t *) this;
1083    poscomb = 0;
1084
1085    if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) {
1086        i = poslistlen - 1;
1087        if (i > 0) {
1088            n = ktabpos->nrcomb[i];       /* number of entries */
1089            e = ktabpos->nrcombstart[i];  /* ptr to first entry */
1090            s = i + 2;                    /* size of an entry in bytes */
1091            j = 0;
1092            while (!poscomb && (j < n)) {
1093                if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) {
1094                    poscomb = *e;
1095                }
1096                e += s;
1097                j++;
1098            }
1099            if (!poscomb) {
1100                /* combination not found; shouldn't occur if lingware OK! */
1101                /* contingency solution: take first */
1102                PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0]));
1103                poscomb = poslist[0];
1104            }
1105        } else {  /* not a composed POS */
1106            poscomb = poslist[0];
1107        }
1108    }
1109
1110    return poscomb;
1111}
1112
1113#ifdef __cplusplus
1114}
1115#endif
1116
1117
1118/* end */
1119