1/*
2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16/**
17 * @file picowa.c
18 *
19 * word analysis PU - lexicon lookup and POS prediction
20 *
21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
22 * All rights reserved.
23 *
24 * History:
25 * - 2009-04-20 -- initial version
26 *
27 */
28
29#include "picoos.h"
30#include "picodbg.h"
31#include "picodata.h"
32#include "picowa.h"
33#include "picoklex.h"
34#include "picokdt.h"
35#include "picoktab.h"
36
37#ifdef __cplusplus
38extern "C" {
39#endif
40#if 0
41}
42#endif
43
/* States of the waStep state machine (stored in wa_subobj_t.procState) */
enum {
    WA_STEPSTATE_COLLECT = 0,   /* get one item into the internal input buffer */
    WA_STEPSTATE_PROCESS = 1,   /* turn the input item into an output item */
    WA_STEPSTATE_FEED    = 2    /* hand the output item to the output char buffer */
};
48
49
50/*  subobject    : WordAnaUnit
51 *  shortcut     : wa
52 *  context size : one item
53 */
54typedef struct wa_subobj {
55    picoos_uint8 procState; /* for next processing step decision */
56
57    /* one item only */
58    picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */
59    picoos_uint16 inBufSize; /* actually allocated size */
60    picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */
61
62    picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */
63    picoos_uint16 outBufSize; /* actually allocated size */
64    picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */
65
66    /* lex knowledge base */
67    picoklex_Lex lex;
68
69    /* ulex knowledge bases */
70    picoos_uint8 numUlex;
71    picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX];
72
73    /* tab knowledge base */
74    picoktab_Pos tabpos;
75
76    /* dtposp knowledge base */
77    picokdt_DtPosP dtposp;
78} wa_subobj_t;
79
80
81static pico_status_t waInitialize(register picodata_ProcessingUnit this, picoos_int32 resetMode) {
82    picoos_uint8 i;
83    picoklex_Lex ulex;
84    wa_subobj_t * wa;
85
86    picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY;
87
88    PICODBG_DEBUG(("calling"));
89
90    if (NULL == this || NULL == this->subObj) {
91        return (picodata_step_result_t) picoos_emRaiseException(this->common->em,
92                                       PICO_ERR_NULLPTR_ACCESS, NULL, NULL);
93    }
94    wa = (wa_subobj_t *) this->subObj;
95    wa->procState = WA_STEPSTATE_COLLECT;
96    wa->inBufSize = PICOWA_MAXITEMSIZE;
97    wa->inLen = 0;
98    wa->outBufSize = PICOWA_MAXITEMSIZE;
99    wa->outLen = 0;
100
101    if (resetMode == PICO_RESET_SOFT) {
102        /*following initializations needed only at startup or after a full reset*/
103        return PICO_OK;
104    }
105    /* kb lex */
106    wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]);
107    if (wa->lex == NULL) {
108        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
109                                       NULL, NULL);
110    }
111    PICODBG_DEBUG(("got lex"));
112
113    /* kb ulex[] */
114    wa->numUlex = 0;
115    for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) {
116        ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]);
117        if (NULL != ulex) {
118            wa->ulex[wa->numUlex++] = ulex;
119        }
120    }
121    PICODBG_DEBUG(("got %i user lexica", wa->numUlex));
122
123    /* kb tabpos */
124    wa->tabpos =
125        picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]);
126    if (wa->tabpos == NULL) {
127        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
128                                       NULL, NULL);
129    }
130    PICODBG_DEBUG(("got tabpos"));
131
132    /* kb dtposp */
133    wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]);
134    if (wa->dtposp == NULL) {
135        return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING,
136                                       NULL, NULL);
137    }
138    PICODBG_DEBUG(("got dtposp"));
139    return PICO_OK;
140}
141
142static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
143                                     picoos_int16 mode,
144                                     picoos_uint16 *numBytesOutput);
145
146static pico_status_t waTerminate(register picodata_ProcessingUnit this) {
147    return PICO_OK;
148}
149
150static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this,
151                                        picoos_MemoryManager mm) {
152    if (NULL != this) {
153        picoos_deallocate(this->common->mm, (void *) &this->subObj);
154    }
155    mm = mm;        /* avoid warning "var not used in this function"*/
156    return PICO_OK;
157}
158
159
160picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm,
161                                              picoos_Common common,
162                                              picodata_CharBuffer cbIn,
163                                              picodata_CharBuffer cbOut,
164                                              picorsrc_Voice voice) {
165    picodata_ProcessingUnit this;
166
167    this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice);
168    if (this == NULL) {
169        return NULL;
170    }
171
172    this->initialize = waInitialize;
173    PICODBG_DEBUG(("set this->step to waStep"));
174    this->step = waStep;
175    this->terminate = waTerminate;
176    this->subDeallocate = waSubObjDeallocate;
177    this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t));
178    if (this->subObj == NULL) {
179        picoos_deallocate(mm, (void *)&this);
180        picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
181        return NULL;
182    }
183
184    waInitialize(this, PICO_RESET_FULL);
185    return this;
186}
187
188/* ***********************************************************************/
189/*                       WORDGRAPH proc functions                        */
190/* ***********************************************************************/
191
192static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this,
193                                  register wa_subobj_t *wa,
194                                  const picoos_uint8 *graph,
195                                  const picoos_uint16 graphlen) {
196    picokdt_classify_result_t dtres;
197    picoos_uint8 specchar;
198    picoos_uint16 i;
199
200    PICODBG_DEBUG(("graphlen %d", graphlen));
201
202    /* check existence of special char (e.g. hyphen) in graph:
203       for now, check existence of hard-coded ascii hyphen,
204       ie. preproc needs to match all UTF8 hyphens to the ascii
205       hyphen. */
206    /*  @todo : consider specifying special char(s) in lingware. */
207    specchar = FALSE;
208    i = 0;
209    while ((i < graphlen) && (!specchar)) {
210        if (graph[i++] == '-') {
211            specchar = TRUE;
212        }
213    }
214
215    /* construct input vector, which is set in dtposp */
216    if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen, specchar)) {
217        /* error constructing invec */
218        PICODBG_WARN(("problem with invec"));
219        picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL);
220        return PICODATA_ITEMINFO1_ERR;
221    }
222
223    /* classify */
224    if (!picokdt_dtPosPclassify(wa->dtposp)) {
225        /* error doing classification */
226        PICODBG_WARN(("problem classifying"));
227        picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
228                              NULL, NULL);
229        return PICODATA_ITEMINFO1_ERR;
230    }
231
232    /* decompose */
233    if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) {
234        /* error decomposing */
235        PICODBG_WARN(("problem decomposing"));
236        picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR,
237                              NULL, NULL);
238        return PICODATA_ITEMINFO1_ERR;
239    }
240
241    if (dtres.set) {
242        PICODBG_DEBUG(("class %d", dtres.class));
243        return (picoos_uint8)dtres.class;
244    } else {
245        PICODBG_WARN(("result not set"));
246        picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION,
247                              NULL, NULL);
248        return PICODATA_ITEMINFO1_ERR;
249    }
250}
251
252
253static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this,
254                                        register wa_subobj_t *wa /*inout*/,
255                                        picodata_itemhead_t *head /*inout*/,
256                                        const picoos_uint8 *content) {
257    pico_status_t status;
258    picoklex_lexl_result_t lexres;
259    picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB];
260    picoos_uint8 i;
261    picoos_uint8 foundIndex;
262    picoos_bool found;
263
264
265    PICODBG_DEBUG(("type %c, len %d", head->type, head->len));
266
267    /* do lookup
268       if no entry found:
269         do POS prediction:     -> WORDGRAPH(POSes,NA)graph
270       else:
271         if incl-phone:
272           N entries possible  -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN
273           (N in {1,...,PICOKLEX_MAX_NRRES}, now up to 4)
274         else:
275           no phone, one entry  -> WORDGRAPH(POS,NA)graph
276    */
277
278    found = FALSE;
279    i = 0;
280    while (!found && (i < wa->numUlex)) {
281        found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres);
282        i++;
283    }
284    /* note that if found, i will be incremented nevertheless, so i >= 1 */
285    if (found) {
286        foundIndex = i;
287    } else {
288        foundIndex = 0;
289    }
290    if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) {
291        /* no lex entry found, WORDGRAPH(POS,NA)graph */
292        if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
293                                          wa->outBuf, wa->outBufSize,
294                                          &wa->outLen)) {
295            wa->inLen = 0;
296            /* predict and modify pos in info1 */
297            if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
298                                   waClassifyPos(this, wa, content, head->len))) {
299                return picoos_emRaiseException(this->common->em,
300                                               PICO_EXC_BUF_OVERFLOW,NULL,NULL);
301            }
302        }
303
304    } else {    /* at least one entry found */
305        PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex));
306        if (lexres.phonfound) {    /* incl. ind-phone and possibly multi-ent. */
307            if (lexres.nrres > PICOKLEX_MAX_NRRES) {
308                /* not possible with system lexicon, needs to be
309                   ensured for user lex too */
310                picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK,NULL,
311                        (picoos_char *)"using %d lexicon lookup results",
312                        PICOKLEX_MAX_NRRES);
313                lexres.nrres = PICOKLEX_MAX_NRRES;
314            }
315            head->type = PICODATA_ITEM_WORDINDEX;
316            if (lexres.nrres == 1) {
317                head->info1 = lexres.posind[0];
318            } else {
319                /* more than one result, POSgroup info needs to be
320                   determined for later POS disambiguation */
321                for (i = 0; i < lexres.nrres; i++) {
322                    posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE];
323                }
324                head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf,
325                                                   lexres.nrres);
326            }
327            head->info2 = foundIndex;
328            head->len = lexres.posindlen;
329            if ((status = picodata_put_itemparts(head, lexres.posind,
330                                                 lexres.posindlen,
331                                                 wa->outBuf, wa->outBufSize,
332                                                 &wa->outLen)) == PICO_OK) {
333                wa->inLen = 0;
334            } else {
335                return picoos_emRaiseException(this->common->em, status,
336                                               NULL, NULL);
337            }
338
339        } else {    /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */
340            if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen,
341                                              wa->outBuf, wa->outBufSize,
342                                              &wa->outLen)) {
343                wa->inLen = 0;
344                /* set lex pos in info1 */
345                if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen,
346                                                      lexres.posind[0])) {
347                    return picoos_emRaiseException(this->common->em,
348                                                   PICO_EXC_BUF_OVERFLOW,
349                                                   NULL, NULL);
350                }
351            }
352        }
353    }
354    return PICO_OK;
355}
356
357
358/* ***********************************************************************/
359/*                          waStep function                              */
360/* ***********************************************************************/
361
362/*
363   collect into internal buffer, process, and then feed to output buffer
364
365   init state: COLLECT      ext      ext
366   state transitions:       in IN OUTout
367   COLLECT | getOneItem  ->-1 +1  0  0   | (ATOMIC) -> PROCESS (got item)
368   COLLECT | getOneItem  -> 0  0  0  0   | IDLE                (got no item)
369
370   PROCESS | procOneItem -> 0 -1 +1  0   | (ATOMIC) -> FEED    (proc'ed item)
371   PROCESS | procOneItem -> 0 -1  0  0   | BUSY     -> COLLECT (item skipped)
372
373   FEED    | putOneItem  -> 0  0 -1 +1   | BUSY     -> COLLECT (put item)
374   FEED    | putOneItem  -> 0  0  1  0   | OUT_FULL            (put no item)
375*/
376
377static picodata_step_result_t waStep(register picodata_ProcessingUnit this,
378                                     picoos_int16 mode,
379                                     picoos_uint16 * numBytesOutput) {
380    register wa_subobj_t *wa;
381    pico_status_t rv = PICO_OK;
382
383    if (NULL == this || NULL == this->subObj) {
384        return PICODATA_PU_ERROR;
385    }
386    wa = (wa_subobj_t *) this->subObj;
387    mode = mode;        /* avoid warning "var not used in this function"*/
388    *numBytesOutput = 0;
389    while (1) { /* exit via return */
390        PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d",
391                       wa->procState, wa->inLen, wa->outLen));
392
393        switch (wa->procState) {
394            /* collect state: get item from charBuf and store in
395             * internal inBuf
396             */
397            case WA_STEPSTATE_COLLECT:
398                if (wa->inLen == 0) { /* is input buffer empty? */
399                    picoos_uint16 blen;
400                    /* try to get one item */
401                    rv = picodata_cbGetItem(this->cbIn, wa->inBuf,
402                                            wa->inBufSize, &blen);
403                    PICODBG_DEBUG(("after getting item, status: %d", rv));
404                    if (PICO_OK == rv) {
405                        /* we now have one item */
406                        wa->inLen = blen;
407                        wa->procState = WA_STEPSTATE_PROCESS;
408                        /* uncomment next line to split into two steps */
409                        /* return PICODATA_PU_ATOMIC; */
410                    } else if (PICO_EOF == rv) {
411                        /* there was no item in the char buffer */
412                        return PICODATA_PU_IDLE;
413                    } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
414                               || (PICO_EXC_BUF_OVERFLOW == rv)) {
415                        PICODBG_ERROR(("problem getting item"));
416                        picoos_emRaiseException(this->common->em, rv,
417                                                NULL, NULL);
418                        return PICODATA_PU_ERROR;
419                    } else {
420                        PICODBG_ERROR(("problem getting item, unhandled"));
421                        picoos_emRaiseException(this->common->em, rv,
422                                                NULL, NULL);
423                        return PICODATA_PU_ERROR;
424                    }
425                } else { /* there already is an item in the input buffer */
426                    PICODBG_WARN(("item already in input buffer"));
427                    picoos_emRaiseWarning(this->common->em,
428                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
429                    wa->procState = WA_STEPSTATE_PROCESS;
430                    /* uncomment next to split into two steps */
431                    /* return PICODATA_PU_ATOMIC; */
432                }
433                break;
434
435
436            /* process state: process item in internal inBuf and put
437             * result in internal outBuf
438             */
439            case WA_STEPSTATE_PROCESS:
440
441                /* ensure there is an item in inBuf and it is valid */
442                if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf,
443                                                              wa->inLen)) {
444                    picodata_itemhead_t ihead;
445                    picoos_uint8 *icontent;
446                    pico_status_t rvP = PICO_OK;
447
448                    rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead,
449                                               &icontent);
450                    if (PICO_OK == rv) {
451
452                        switch (ihead.type) {
453                            case PICODATA_ITEM_WORDGRAPH:
454
455                                if (0 < ihead.len) {
456                                    rvP = waProcessWordgraph(this, wa, &ihead,
457                                                             icontent);
458                                } else {
459                                    /* else ignore empty WORDGRAPH */
460                                    wa->inLen = 0;
461                                    wa->procState = WA_STEPSTATE_COLLECT;
462                                    return PICODATA_PU_BUSY;
463                                }
464                                break;
465                            case PICODATA_ITEM_OTHER:
466                                /* skip item */
467                                rvP = PICO_WARN_PU_DISCARD_BUF;
468                                break;
469                            default:
470                                /* copy item unmodified */
471                                rvP = picodata_copy_item(wa->inBuf,
472                                                         wa->inLen, wa->outBuf,
473                                                         wa->outBufSize, &wa->outLen);
474                                break;
475                        }
476
477                        if (PICO_OK == rvP) {
478                            wa->inLen = 0;
479                            wa->procState = WA_STEPSTATE_FEED;
480                            /* uncomment next to split into two steps */
481                            /* return PICODATA_PU_ATOMIC; */
482                        } else if (PICO_WARN_PU_DISCARD_BUF == rvP) {
483                            /* discard input buffer and get a new item */
484                            PICODBG_INFO(("skipping OTHER item"));
485/*                            picoos_emRaiseWarning(this->common->em,
486                                                  PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
487*/
488                            wa->inLen = 0;
489                            wa->procState = WA_STEPSTATE_COLLECT;
490                            return PICODATA_PU_BUSY;
491                        } else {
492                            /* PICO_EXC_BUF_OVERFLOW   <- overflow in outbuf
493                               PICO_ERR_OTHER          <- no valid item in inbuf
494                               or return from processWordgraph
495                            */
496                            PICODBG_ERROR(("problem processing item", rvP));
497                            picoos_emRaiseException(this->common->em, rvP,
498                                                    NULL, NULL);
499                            return PICODATA_PU_ERROR;
500                        }
501
502                    } else {    /* could not get iteminfo */
503                        /* PICO_EXC_BUF_OVERFLOW   <- overflow in outbuf
504                           PICO_ERR_OTHER          <- no valid item in inbuf
505                        */
506                        PICODBG_ERROR(("problem getting item info, "
507                                       "discard buffer content"));
508                        wa->inLen = 0;
509                        wa->procState = WA_STEPSTATE_COLLECT;
510                        picoos_emRaiseException(this->common->em, rv,
511                                                NULL, NULL);
512                        return PICODATA_PU_ERROR;
513                    }
514
515                } else if (wa->inLen == 0) {    /* no item in inBuf */
516                    PICODBG_INFO(("no item in inBuf"));
517                    /* wa->inLen = 0;*/
518                    wa->procState = WA_STEPSTATE_COLLECT;
519                    return PICODATA_PU_BUSY;
520
521                } else {    /* no valid item in inBuf */
522                    /* bad state/item, discard buffer content */
523                    PICODBG_WARN(("no valid item, discard buffer content"));
524                    picoos_emRaiseWarning(this->common->em,
525                                          PICO_WARN_PU_IRREG_ITEM, NULL, NULL);
526                    picoos_emRaiseWarning(this->common->em,
527                                          PICO_WARN_PU_DISCARD_BUF, NULL, NULL);
528                    wa->inLen = 0;
529                    wa->procState = WA_STEPSTATE_COLLECT;
530                    return PICODATA_PU_BUSY;
531                }
532                break;
533
534
535            /* feed state: copy item in internal outBuf to output charBuf */
536            case WA_STEPSTATE_FEED:
537
538                /* check that item fits in cb should not be needed */
539                rv = picodata_cbPutItem(this->cbOut, wa->outBuf,
540                                        wa->outLen, numBytesOutput);
541
542                PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG],
543                                   (picoos_uint8 *)"wana: ", wa->outBuf,
544                                   wa->outLen);
545
546                PICODBG_DEBUG(("put item, status: %d", rv));
547                if (PICO_OK == rv) {
548                    wa->outLen = 0;
549                    wa->procState = WA_STEPSTATE_COLLECT;
550                    return PICODATA_PU_BUSY;
551                } else if (PICO_EXC_BUF_OVERFLOW == rv) {
552                    PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL"));
553                    return PICODATA_PU_OUT_FULL;
554                } else if ((PICO_EXC_BUF_UNDERFLOW == rv)
555                           || (PICO_ERR_OTHER == rv)) {
556                    PICODBG_WARN(("feeding problem, discarding item"));
557                    wa->outLen = 0;
558                    wa->procState = WA_STEPSTATE_COLLECT;
559                    picoos_emRaiseWarning(this->common->em, rv, NULL,NULL);
560                    return PICODATA_PU_BUSY;
561                }
562                break;
563
564            default:
565                break;
566
567        } /* switch */
568
569    } /* while */
570
571    /* should be never reached */
572    PICODBG_ERROR(("reached end of function"));
573    picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL);
574    return PICODATA_PU_ERROR;
575}
576
577#ifdef __cplusplus
578}
579#endif
580
581
582/* end */
583