1/*---------------------------------------------------------------------------*
2 *  voc_read.c  *
3 *                                                                           *
4 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
5 *                                                                           *
6 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
7 *  you may not use this file except in compliance with the License.         *
8 *                                                                           *
9 *  You may obtain a copy of the License at                                  *
10 *      http://www.apache.org/licenses/LICENSE-2.0                           *
11 *                                                                           *
12 *  Unless required by applicable law or agreed to in writing, software      *
13 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
14 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
15 *  See the License for the specific language governing permissions and      *
16 *  limitations under the License.                                           *
17 *                                                                           *
18 *---------------------------------------------------------------------------*/
19
20
21#ifndef _RTT
22#include <stdio.h>
23#endif
24#include <stdlib.h>
25#include <math.h>
26#include <assert.h>
27
28#if defined(__cplusplus) && defined(_MSC_VER)
29extern "C"
30{
31#include <string.h>
32}
33#else
34#include <string.h>
35#endif
36
37#include <sys/types.h>
38#include <sys/stat.h>
39#ifdef _WIN32
40#define stat _stat
41#else
42#include <unistd.h>
43#endif
44
45
46#include <fcntl.h>
47#include <sys/mman.h>
48
49#include <zipfile/zipfile.h>
50
51
52#include "hmmlib.h"
53#include "duk_io.h"
54#include "LCHAR.h"
55#include "portable.h"
56
57#include "memmove.h"
58
59static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";
60
61
/* non-zero when 'ch' is a line terminator (carriage return or line feed);
   note that 'ch' is evaluated twice */
#define cr_or_nl(ch) ((ch) == '\r' || (ch) == '\n')
63
64
65#ifndef _RTT
66
67/**
68 *  Read word models and their phoneme transcriptions from .ok or .voc files.
69 *  returns -1 on error
70 */
71int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
72{
73  const char *ok;
74  ESR_ReturnCode rc;
75  int result;
76  int i;
77  char token[256];
78
79  ASSERT(voc);
80
81  if (basename == NULL || strlen(basename) == 0) {
82    PLogError("Error: invalid arg to read_word_transcription()\n");
83    goto CLEANUP;
84  }
85
86  if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
87    PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
88    goto CLEANUP;
89  }
90
91  /* this assumption eliminates simplifies bounds checking when parsing */
92  if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
93    PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
94    goto CLEANUP;
95  }
96
97  /* set up point to walk the data */
98  ok = voc->ok_file_data;
99
100  /* verify the header */
101  i = 0;
102  while (*ok != '=') {
103    if (cr_or_nl(*ok)) {
104      PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
105      goto CLEANUP;
106    }
107    token[i++] = *ok++;
108  }
109  token[i] = 0;
110  ok++;
111  CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
112  if (result != 0)
113  {
114    PLogError(L("%s was missing #LANG=en-us header"), basename);
115    goto CLEANUP;
116  }
117  i = 0;
118  while (!cr_or_nl(*ok)) token[i++] = *ok++;
119  token[i] = 0;
120  ok++;
121  CHKLOG(rc, ESR_str2locale(token, locale));
122
123  /* set up first and last entries */
124  voc->first_entry = strchr(voc->ok_file_data, '\n') + 1;
125  voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
126  while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
127  voc->last_entry++;
128
129  /* determine if there are any upper case entries */
130  voc->hasUpper = 1;
131  while (ok < voc->ok_file_data + voc->ok_file_data_length) {
132    int ch = *ok;
133    if ('A' <= ch && ch <= 'Z') {
134      voc->hasUpper = 1;
135      break;
136    }
137    else if ('Z' < ch) {
138      voc->hasUpper = 0;
139      break;
140    }
141    /* scan to the next entry */
142    while (*ok++ != '\n') ;
143  }
144
145  return 0;
146
147CLEANUP:
148  delete_word_transcription(voc);
149
150  PLogError(L("read_word_transcription: failed to read '%s'"), basename);
151
152  return -1;
153}
154#endif
155
/* Compare 'label' (terminated with 0) against the label that starts 'entry'
   (terminated with ' ').  Returns 0 when they match; otherwise the difference
   between the first mismatching characters, with the end of 'label' treated
   as ' '. */
static int kompare(const char* label, const char* entry) {
  /* stop at the end of 'label' so a 0 byte in 'entry' at the same position
     cannot carry the scan past the end of both strings */
  while (*label && *label == *entry) {
    label++;
    entry++;
  }
  return (*label ? *label : ' ') - *entry;
}
164
165int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
166  int num_prons;
167  const char* low;
168  const char* middle;
169  const char* high;
170
171  //PLogError(L("get_prons '%s'"), label);
172
173  /* dictionaries are usually lower case, so do this for speed */
174  if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;
175
176  /* binary search to find matching entry */
177  low = voc->first_entry;
178  high = voc->last_entry;
179  while (1) {
180    /* pick a point in the middle and align to next entry */
181    middle = low + ((high - low) >> 1) - 1;
182    while (*middle++ != '\n') ;
183
184    /* compare 'label' to 'middle' */
185    int diff = kompare(label, middle);
186    if (diff == 0) break;
187
188    /* nothing found */
189    if (low == high) return 0;
190
191    /* 'middle' aligned to 'high', so move 'high' down */
192    if (middle == high) {
193      high -= 2;
194      while (*high != '\n') high--;
195      high++;
196      continue;
197    }
198
199    if (diff > 0) low = middle;
200    else high = middle;
201  }
202
203  /* back up to find the first entry equal to 'label' */
204  low = middle;
205  while (voc->first_entry < low) {
206    const char* lo;
207    for (lo = low - 2; *lo != '\n'; lo--) ;
208    lo++;
209    if (kompare(label, lo)) break;
210    low = lo;
211  }
212
213  /* move forward to the last entry equal to 'label' */
214  high = middle;
215  while (high < voc->last_entry) {
216    const char* hi;
217    for (hi = high; *hi != '\n'; hi++) ;
218    hi++;
219    if (kompare(label, hi)) break;
220    high = hi;
221  }
222
223  /* loop over all the entries */
224  num_prons = 0;
225  while (low <= high) {
226    /* scan over the label */
227    while (*low++ != ' ') ;
228
229    /* skip the whitespace */
230    while (*low == ' ') low++;
231
232    /* copy the pron */
233    while (*low != '\n') {
234      if (--prons_len <= 2) return -1;
235      *prons++ = *low++;
236    }
237    *prons++ = 0;
238    low++;
239    num_prons++;
240  }
241  *prons++ = 0;
242
243  return num_prons;
244}
245
246void delete_word_transcription(vocab_info* voc)
247{
248  ASSERT(voc);
249
250  voc->first_entry = 0;
251  voc->last_entry = 0;
252  if (voc->ok_file_data) munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
253  voc->ok_file_data = NULL;
254  voc->ok_file_data_length = 0;
255}
256
257
258/**************************************************/
259/* may want to move these functions to 'portable' */
260/**************************************************/
261
/* Returns 1 when 'string' ends with 'end', 0 otherwise. */
static int endeql(const char* string, const char* end) {
  const size_t string_len = strlen(string);
  const size_t end_len = strlen(end);

  /* a suffix longer than the string can never match */
  if (end_len > string_len) return 0;

  return strcmp(string + (string_len - end_len), end) == 0;
}
265
/* decompress_entry requires an oversize destination buffer, so every mapping
   is made 0.1% (rounded down) plus one byte larger than the data it holds */
static size_t inflateSize(size_t size) {
  const size_t slack = size / 1000 + 1;
  return size + slack;
}
270
271int mmap_zip(const char* fname, void** buf, size_t* size) {
272    int fd = -1;
273    struct stat statbuf;
274    zipfile_t zf = 0;
275    zipentry_t ze = 0;
276    char entryname[FILENAME_MAX];
277    size_t size2 = 0;
278    void* buf2 = 0;
279
280    /* open data file, determine size, map it, and close fd */
281    fd = open(fname, O_RDONLY);
282    if (fd < 0) goto FAILED;
283
284    /* determine length */
285    if (fstat(fd, &statbuf) < 0) goto FAILED;
286
287    /* mmap it */
288    *size = statbuf.st_size;
289    *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
290    if (*buf == MAP_FAILED) goto FAILED;
291
292    /* close fd, since we can */
293    close(fd);
294    fd = -1;
295
296    /* if not a zip file, we are done! */
297    if (!endeql(fname, ".zip")) return 0;
298
299    /* set up zipfiler */
300    zf = init_zipfile(*buf, *size);
301    if (!zf) goto FAILED;
302
303    /* get entry */
304    strcpy(entryname, strrchr(fname, '/') ? strrchr(fname, '/') + 1 : fname);
305    entryname[strlen(entryname) - strlen(".zip")] = 0;
306    ze = lookup_zipentry(zf, entryname);
307    if (!ze) goto FAILED;
308
309    /* mmap anon memory to hold unzipped entry */
310    size2 = get_zipentry_size(ze);
311    buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
312    if (buf2 == (void*)-1) goto FAILED;
313
314    /* unzip entry */
315    if (decompress_zipentry(ze, buf2, size2)) goto FAILED;
316
317    /* release unzipper */
318    release_zipfile(zf);
319    zf = 0;
320
321    /* release mmapped file */
322    munmap(*buf, inflateSize(*size));
323
324    /* set return values */
325    *buf = buf2;
326    *size = size2;
327
328    return 0;
329
330FAILED:
331    if (fd != -1) close(fd);
332    if (zf) release_zipfile(zf);
333    if (buf2) munmap(buf2, inflateSize(size2));
334    if (*buf && *buf != (void*)-1) munmap(*buf, inflateSize(*size));
335    *buf = 0;
336    *size = 0;
337    return -1;
338}
339
/* Unmap a buffer obtained from mmap_zip(); 'size' is the data size that
   mmap_zip() reported, and the result is that of munmap(). */
int munmap_zip(void* buf, size_t size) {
    /* mappings are created inflateSize(size) bytes long, so unmap that much */
    const size_t mapped_len = inflateSize(size);

    return munmap(buf, mapped_len);
}
343
344