srec/clib/voc_read.c

/*---------------------------------------------------------------------------*
 *  voc_read.c  *
 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
 *                                                                           *
 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
 *  you may not use this file except in compliance with the License.         *
 *                                                                           *
 *  You may obtain a copy of the License at                                  *
 *      http://www.apache.org/licenses/LICENSE-2.0                           *
 *                                                                           *
 *  Unless required by applicable law or agreed to in writing, software      *
 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
 *  See the License for the specific language governing permissions and      *
 *  limitations under the License.                                           *
 *                                                                           *
 *---------------------------------------------------------------------------*/


#ifndef _RTT
#include <stdio.h>
#endif
#include <stdlib.h>
#include <math.h>
#include <assert.h>

#if defined(__cplusplus) && defined(_MSC_VER)
extern "C"
{
#include <string.h>
}
#else
#include <string.h>
#endif

#include <sys/types.h>
#include <sys/stat.h>
#ifdef _WIN32
#define stat _stat
#else
#include <unistd.h>
#endif


#include <fcntl.h>
#include <sys/mman.h>

#include <zipfile/zipfile.h>


#include "hmmlib.h"
#include "duk_io.h"
#include "LCHAR.h"
#include "portable.h"

#include "memmove.h"

static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";


#define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')


#ifndef _RTT

/**
 *  Read word models and their phoneme transcriptions from .ok or .voc files.
 *
 *  The file named by 'basename' is mapped with mmap_zip() and left mapped in
 *  'voc'.  Its first line must be a "#LANG=<locale>" header; the locale text
 *  is converted with ESR_str2locale() into '*locale'.  Every following line
 *  is treated as one entry, and voc->first_entry / voc->last_entry are set to
 *  the start of the first and of the last entry line.  voc->hasUpper is set
 *  to 0 only when the first decisive entry character is greater than 'Z'.
 *
 *  The file must be non-empty, must end in '\n' or '\r', must contain a '\n'
 *  terminated header and at least one entry line.
 *
 *  @param basename  path of the vocabulary file (must be non-empty)
 *  @param voc       vocabulary descriptor to fill in
 *  @param locale    receives the locale named in the header
 *  @return 0 on success; -1 on error, in which case 'voc' has been reset
 *          with delete_word_transcription()
 */
int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
{
  const char *ok;
  const char *end;
  const char *nl;
  ESR_ReturnCode rc;
  int result;
  int i;
  char token[256];

  ASSERT(voc);

  if (basename == NULL || strlen(basename) == 0) {
    PLogError("Error: invalid arg to read_word_transcription()\n");
    goto CLEANUP;
  }

  if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
    PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
    goto CLEANUP;
  }

  /* an empty file has no last character to inspect */
  if (voc->ok_file_data_length == 0) {
    PLogError(L("read_word_transcription: %s is empty\n"), basename);
    goto CLEANUP;
  }

  /* this assumption simplifies bounds checking when parsing */
  if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
    PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
    goto CLEANUP;
  }

  /* set up pointers to walk the data */
  ok = voc->ok_file_data;
  end = voc->ok_file_data + voc->ok_file_data_length;

  /* verify the header: everything up to '=' must be "#lang" */
  i = 0;
  while (*ok != '=') {
    if (cr_or_nl(*ok)) {
      PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    /* the header comes from the file, so never trust its length */
    if (i >= (int)sizeof(token) - 1) {
      PLogError(L("%s has an overlong #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    token[i++] = *ok++;
  }
  token[i] = 0;
  ok++;
  /* NOTE(review): CHKLOG presumably logs and jumps to CLEANUP on failure */
  CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
  if (result != 0)
  {
    PLogError(L("%s was missing #LANG=en-us header"), basename);
    goto CLEANUP;
  }

  /* the rest of the header line is the locale name */
  i = 0;
  while (!cr_or_nl(*ok)) {
    if (i >= (int)sizeof(token) - 1) {
      PLogError(L("%s has an overlong #LANG=en-us header"), basename);
      goto CLEANUP;
    }
    token[i++] = *ok++;
  }
  token[i] = 0;
  ok++;
  CHKLOG(rc, ESR_str2locale(token, locale));

  /* the first entry starts right after the first '\n'; the search is bounded
     by the file length because the mapped data is not a C string */
  nl = (const char*)memchr(voc->ok_file_data, '\n', voc->ok_file_data_length);
  if (nl == NULL) {
    PLogError(L("read_word_transcription: no line feed found in %s\n"), basename);
    goto CLEANUP;
  }
  voc->first_entry = voc->ok_file_data + (nl - voc->ok_file_data) + 1;
  if (voc->first_entry >= voc->ok_file_data + voc->ok_file_data_length) {
    PLogError(L("read_word_transcription: %s contains no entries\n"), basename);
    goto CLEANUP;
  }

  /* the last entry starts right after the last '\n' that precedes the final
     character; the header's '\n' guarantees the scan stops at first_entry */
  voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 1;
  while (voc->last_entry > voc->first_entry && voc->last_entry[-1] != '\n') voc->last_entry--;

  /* determine if there are any upper case entries */
  voc->hasUpper = 1;
  while (ok < end) {
    int ch = *ok;
    if ('A' <= ch && ch <= 'Z') {
      voc->hasUpper = 1;
      break;
    }
    else if ('Z' < ch) {
      voc->hasUpper = 0;
      break;
    }
    /* scan to the next entry, never past the end of the data */
    while (ok < end && *ok++ != '\n') ;
  }

  return 0;

CLEANUP:
  delete_word_transcription(voc);

  PLogError(L("read_word_transcription: failed to read '%s'"),
            basename != NULL ? basename : L("(null)"));

  return -1;
}
#endif

/*
 * Compare a query label with the label at the start of a vocabulary entry.
 *
 * 'label' is terminated with 0 and the entry's label is terminated with ' '.
 * Returns 0 when they match, a negative value when 'label' sorts before the
 * entry and a positive value when it sorts after.  The end of 'label' is
 * treated as if it were the ' ' that terminates an entry label.
 */
static int kompare(const char* label, const char* entry) {
  /* stop at the end of 'label' so that an entry holding a 0 at the same
     position cannot make the scan run past both strings */
  while (*label && *label == *entry) {
    label++;
    entry++;
  }
  return (*label ? *label : ' ') - *entry;
}

/*
 * Look up every pronunciation of 'label' in the loaded vocabulary.
 *
 * The vocabulary is the text between voc->first_entry and voc->last_entry,
 * one "label<spaces>pron\n" entry per line, searched with a binary search
 * and therefore expected to be sorted by label.
 *
 * On success the prons are written to 'prons' back to back, each terminated
 * with 0, and the whole list is terminated with one additional 0.
 *
 * voc        vocabulary to search
 * label      0 terminated word to look up
 * prons      output buffer
 * prons_len  size of 'prons' in bytes; every byte written, terminators
 *            included, is counted against it
 *
 * Returns the number of prons found, 0 when the label is not present (or no
 * vocabulary is loaded; 'prons' is left untouched), or -1 when 'prons' is too
 * small (its contents are then incomplete).
 */
int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
  int num_prons;
  const char* low;
  const char* middle;
  const char* high;

  /* nothing to search if the vocabulary was never loaded or was deleted */
  if (!voc->first_entry || !voc->last_entry) return 0;

  /* dictionaries are usually lower case, so do this for speed */
  if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;

  /* binary search to find matching entry */
  low = voc->first_entry;
  high = voc->last_entry;
  while (1) {
    /* pick a point in the middle and align to next entry */
    middle = low + ((high - low) >> 1) - 1;
    while (*middle++ != '\n') ;

    /* compare 'label' to 'middle' */
    int diff = kompare(label, middle);
    if (diff == 0) break;

    /* nothing found */
    if (low == high) return 0;

    /* 'middle' aligned to 'high', so move 'high' down */
    if (middle == high) {
      high -= 2;
      while (*high != '\n') high--;
      high++;
      continue;
    }

    if (diff > 0) low = middle;
    else high = middle;
  }

  /* back up to find the first entry equal to 'label' */
  low = middle;
  while (voc->first_entry < low) {
    const char* lo;
    for (lo = low - 2; *lo != '\n'; lo--) ;
    lo++;
    if (kompare(label, lo)) break;
    low = lo;
  }

  /* move forward to the last entry equal to 'label' */
  high = middle;
  while (high < voc->last_entry) {
    const char* hi;
    for (hi = high; *hi != '\n'; hi++) ;
    hi++;
    if (kompare(label, hi)) break;
    high = hi;
  }

  /* loop over all the entries */
  num_prons = 0;
  while (low <= high) {
    /* scan over the label */
    while (*low++ != ' ') ;

    /* skip the whitespace */
    while (*low == ' ') low++;

    /* copy the pron; keep room for this character, for the terminator of
       this pron and for the terminator of the whole list */
    while (*low != '\n') {
      if (prons_len < 3) return -1;
      *prons++ = *low++;
      prons_len--;
    }

    /* terminate the pron, still keeping room for the list terminator */
    if (prons_len < 2) return -1;
    *prons++ = 0;
    prons_len--;
    low++;
    num_prons++;
  }

  /* terminate the list; the checks above reserved this byte */
  *prons = 0;

  return num_prons;
}

/*
 * Release the vocabulary data held by 'voc' and reset its fields.
 *
 * Any mapped file data is handed to munmap_zip(); afterwards the data
 * pointer, its length and both entry pointers are cleared.
 */
void delete_word_transcription(vocab_info* voc)
{
  ASSERT(voc);

  /* unmap while the pointer and the length are still intact */
  if (voc->ok_file_data != NULL) {
    munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
  }

  voc->ok_file_data = NULL;
  voc->ok_file_data_length = 0;
  voc->first_entry = NULL;
  voc->last_entry = NULL;
}


/**************************************************/
/* may want to move these functions to 'portable' */
/**************************************************/

/* Return 1 when 'string' ends with the suffix 'end', otherwise 0. */
static int endeql(const char* string, const char* end) {
  const size_t string_len = strlen(string);
  const size_t end_len = strlen(end);

  /* a suffix longer than the string can never match */
  if (end_len > string_len) {
    return 0;
  }

  return strcmp(string + (string_len - end_len), end) == 0;
}

/*
 * decompress_entry requires an oversize destination buffer, so pad 'size'
 * with one extra byte per 1000 bytes (rounded down) plus one more byte.
 */
static size_t inflateSize(size_t size) {
  const size_t slack = size / 1000 + 1;

  return size + slack;
}

/*
 * Map the contents of 'fname' into memory.
 *
 * A plain file is mapped privately (read/write) as is.  When 'fname' ends in
 * ".zip" the archive is mapped, the entry named like the file without its
 * directory and ".zip" suffix is decompressed into anonymous memory, and the
 * archive mapping is released again.
 *
 * fname  path of the file to map
 * buf    receives the address of the data; set to 0 on failure
 * size   receives the length of the data in bytes; set to 0 on failure
 *
 * Returns 0 on success and -1 on failure.  A successful mapping is
 * inflateSize(*size) bytes long and must be released with munmap_zip().
 */
int mmap_zip(const char* fname, void** buf, size_t* size) {
    int fd = -1;
    struct stat statbuf;
    zipfile_t zf = 0;
    zipentry_t ze = 0;
    char entryname[FILENAME_MAX];
    const char* base;
    size_t baselen;
    /* mappings are tracked locally so the failure path only ever releases
       what this call created, never whatever the caller left in *buf */
    void* map = MAP_FAILED;
    size_t mapsize = 0;
    void* buf2 = MAP_FAILED;
    size_t size2 = 0;

    if (!buf || !size) return -1;
    if (!fname) goto FAILED;

    /* open data file, determine size, map it, and close fd */
    fd = open(fname, O_RDONLY);
    if (fd < 0) goto FAILED;

    /* determine length */
    if (fstat(fd, &statbuf) < 0) goto FAILED;

    /* mmap it */
    mapsize = statbuf.st_size;
    map = mmap(0, inflateSize(mapsize), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) goto FAILED;

    /* close fd, since we can */
    close(fd);
    fd = -1;

    /* if not a zip file, we are done! */
    if (!endeql(fname, ".zip")) {
        *buf = map;
        *size = mapsize;
        return 0;
    }

    /* set up zipfiler */
    zf = init_zipfile(map, mapsize);
    if (!zf) goto FAILED;

    /* entry name is the file name without directory and ".zip"; 'fname' is
       known to end in ".zip", so the subtraction cannot underflow */
    base = strrchr(fname, '/');
    base = base ? base + 1 : fname;
    baselen = strlen(base) - strlen(".zip");
    if (baselen >= sizeof(entryname)) goto FAILED;
    memcpy(entryname, base, baselen);
    entryname[baselen] = 0;
    ze = lookup_zipentry(zf, entryname);
    if (!ze) goto FAILED;

    /* mmap anon memory to hold unzipped entry */
    size2 = get_zipentry_size(ze);
    buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
    if (buf2 == MAP_FAILED) goto FAILED;

    /* unzip entry */
    if (decompress_zipentry(ze, buf2, size2)) goto FAILED;

    /* release unzipper */
    release_zipfile(zf);
    zf = 0;

    /* release mmapped file */
    munmap(map, inflateSize(mapsize));

    /* set return values */
    *buf = buf2;
    *size = size2;

    return 0;

FAILED:
    if (fd != -1) close(fd);
    if (zf) release_zipfile(zf);
    if (buf2 != MAP_FAILED) munmap(buf2, inflateSize(size2));
    if (map != MAP_FAILED) munmap(map, inflateSize(mapsize));
    *buf = 0;
    *size = 0;
    return -1;
}

/*
 * Release a mapping of 'size' data bytes at 'buf'.
 *
 * The length passed to munmap() is inflateSize(size).  Returns whatever
 * munmap() returns.
 */
int munmap_zip(void* buf, size_t size) {
    const size_t mapped_len = inflateSize(size);

    return munmap(buf, mapped_len);
}