1/*---------------------------------------------------------------------------*
2 *  text_parser.c  *
3 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
5 *                                                                           *
6 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7 *  you may not use this file except in compliance with the License.         *
8 *                                                                           *
9 *  You may obtain a copy of the License at                                  *
10 *      http://www.apache.org/licenses/LICENSE-2.0                           *
11 *                                                                           *
12 *  Unless required by applicable law or agreed to in writing, software      *
13 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 *  See the License for the specific language governing permissions and      *
16 *  limitations under the License.                                           *
17 *                                                                           *
18 *---------------------------------------------------------------------------*/
19
20#include"pstdio.h"
21#include"srec_context.h"
22#include"astar.h"
23
24#include "passert.h"
25#include "portable.h"
26
27
/* Size, in bytes, of the fixed stack buffers in this file that hold a copy of
   a transcription or of a single word (terminator included). */
#define MAX_LOCAL_LEN 256
/* Result codes returned by the path-checking functions in this file. */
#define PARSE_PASS 0
#define PARSE_FAIL 1
31
32
33static int check_word_path(srec_context* context, arc_token* atok,
34                           const char* transcription, int tlen)
35{
36  const char    *wd, *p;
37  char          *q;
38  arc_token*    next_atok;
39  wordID        wdID;
40  int           q_position;
41
42  if ( strlen ( transcription ) >= MAX_LOCAL_LEN - 1)
43  {
44    PLogError("Transcription too long [%s]\n", transcription);
45    return PARSE_FAIL;
46  }
47
48  while (1) {
49    char copy_of_word[MAX_LOCAL_LEN]; /* save heap on recursive function */
50
51    /* wd points to the first char of last word */
52    wd = transcription;
53    if (tlen > 0)
54    {
55      for (wd = transcription + tlen - 1; wd > transcription; wd--)
56      {
57        if (*wd == ' ')
58        {
59          wd++;
60          break;
61        }
62      }
63    }
64    for (p = wd, q = copy_of_word; ; p++, q++)
65    {
66      q_position = q - copy_of_word;
67      if (q_position < 0 || (size_t)q_position >= MAX_LOCAL_LEN)
68      {
69        PLogError("Word too long in transcription [%s]\n", transcription);
70        return PARSE_FAIL;
71      }
72      *q = *p;
73      if (*p == ' ' || *p == '\0')
74      {
75        *q = 0;
76        break;
77      }
78    }
79    wdID = wordmap_find_index(context->olabels, copy_of_word);
80
81    if (wdID < MAXwordID)
82    {
83      next_atok = get_arc_for_word(atok, wdID, context, context->beg_silence_word);
84    }
85    else
86    {
87      next_atok = get_arc_for_word_without_slot_annotation(atok, wd, context, context->beg_silence_word);
88      if (!next_atok) return PARSE_FAIL;
89    }
90
91    if (!next_atok) return PARSE_FAIL;
92
93    int whether_final_atok = 0;
94    arc_token* tmp;
95    for (tmp = ARC_TOKEN_PTR(context->arc_token_list, next_atok->first_next_arc); tmp != NULL;
96         tmp = ARC_TOKEN_PTR(context->arc_token_list, tmp->next_token_index))
97    {
98      if (tmp->ilabel == MAXwordID) whether_final_atok = 1;
99    }
100
101    if (wd == transcription && whether_final_atok) return PARSE_PASS;
102    if (wd == transcription) return PARSE_FAIL;
103    tlen--;
104    while (transcription[tlen] != ' ' && tlen > 0) tlen--;
105
106    atok = next_atok;
107  }
108}
109
110int FST_CheckPath_Simple(srec_context* context, const char* transcription)
111{
112  arc_token* atok = &context->arc_token_list[0];
113  int transcription_len = strlen(transcription);
114  int rc;
115
116  for (; transcription_len > 0; transcription_len--)
117    if (transcription[transcription_len-1] != ' ') break;
118  rc = check_word_path(context, atok, transcription, transcription_len);
119  return rc;
120}
121
122int FST_CheckPath_Complex(srec_context* context, const char* transcription,
123                          char* literal, size_t max_literal_len)
124{
125  int i, j, rc;
126  int num_spaces;
127  char copy_of_transcription[MAX_LOCAL_LEN];
128  char* spaces[24], *p; /* can't go too high here!! */
129  ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
130
131  strcpy(copy_of_transcription, transcription);
132  for (num_spaces = 0, p = copy_of_transcription; *p; p++)
133  {
134    if (*p == ' ')
135    {
136      if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
137      {
138        PLogError("FST_CheckPath_Complex() failed on too many words\n");
139        return PARSE_FAIL;
140      }
141      spaces[num_spaces++] = p;
142    }
143  }
144
145  if (num_spaces == 0)
146  {
147    rc = FST_CheckPath_Simple(context, transcription);
148    if (rc == PARSE_PASS)
149    {
150      ASSERT(strlen(copy_of_transcription) < max_literal_len);
151      strcpy(literal, copy_of_transcription);
152    }
153    return rc;
154  }
155
156  for (i = 0; i < (1 << num_spaces); i++)
157  {
158    /* find the space pointers */
159    for (j = 0; j < num_spaces; j++)
160      *spaces[j] = i & (1 << j) ? '_' : ' ';
161    /* check each word, potentially within a rule! */
162    for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
163    {
164      wordID k, wdid = wordmap_find_index(context->olabels, p);
165      if (wdid < MAXwordID) continue;
166      for (k = 1; k < context->olabels->num_slots; k++)
167      {
168        wdid = wordmap_find_index_in_rule(context->olabels, p, k);
169        if (wdid < MAXwordID) break;
170      }
171      if (wdid == MAXwordID)
172        goto next_i;
173    }
174    /* fix the nulls back */
175    for (j = 0; j < num_spaces; j++)
176      *spaces[j] = i & (1 << j) ? '_' : ' ';
177    rc = FST_CheckPath_Simple(context, copy_of_transcription);
178    if (rc == PARSE_PASS)
179    {
180      ASSERT(strlen(copy_of_transcription) < max_literal_len);
181      strcpy(literal, copy_of_transcription);
182      return rc;
183    }
184next_i:
185    continue;
186  }
187  return PARSE_FAIL;
188}
189
190static void clean_up_sentence(char* s);
191
192int FST_CheckPath(srec_context* context, const char* transcription,
193                  char* literal, size_t max_literal_len)
194{
195  char mytranscription[256];
196  passert(strlen(transcription) < sizeof(mytranscription));
197  strcpy(mytranscription, transcription);
198  clean_up_sentence(mytranscription);
199  if (!context->arc_token_list)
200    return 2;
201  else
202    return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
203}
204
/*
 * Normalize a transcription in place:
 *   - a speech code, i.e. everything from a '[' up to and including the next
 *     ']', is turned into blanks; a ']' outside a code is blanked as well,
 *     and a '[' without a closing ']' blanks the rest of the string;
 *   - leading and trailing spaces are removed;
 *   - each run of spaces between words is reduced to one space.
 *
 * Only the space character is treated as a separator.  The result is never
 * longer than the input and nothing at or beyond the input's terminating NUL
 * is read past or written.
 *
 * s  NUL-terminated, writable string; rewritten in place
 */
static void clean_up_sentence(char* s)
{
  const char* src;
  char*       dst = s;     /* next output position, never ahead of src */
  int         in_code = 0; /* non-zero while between '[' and ']' */
  int         gap = 0;     /* non-zero when spaces were skipped since the last kept char */

  for (src = s; *src != '\0'; src++)
  {
    char c = *src;

    /* change speech codes to spaces */
    if (in_code)
    {
      if (c == ']') in_code = 0;
      c = ' ';
    }
    else if (c == '[')
    {
      in_code = 1;
      c = ' ';
    }
    else if (c == ']')
    {
      c = ' ';
    }

    if (c == ' ')
    {
      gap = 1;
      continue;
    }

    /* emit one separator per gap, but none before the first word; a gap at
       the end of the string is never followed by a word, so it is dropped */
    if (gap && dst != s) *dst++ = ' ';
    gap = 0;
    *dst++ = c;
  }
  *dst = '\0';
}
237
238
239
240