1/*---------------------------------------------------------------------------* 2 * utteranc.h * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21 22#ifndef _h_utteranc_ 23#define _h_utteranc_ 24 25#ifdef SET_RCSID 26static const char utteranc_h[] = "$Id: utteranc.h,v 1.3.6.7 2007/08/31 17:44:53 dahan Exp $"; 27#endif 28 29 30 31#include "all_defs.h" 32#include "hmm_type.h" 33#include "fpi_tgt.h" 34#include "voicing.h" 35#include "specnorm.h" 36#include "channorm.h" 37#include "swicms.h" 38#ifndef _RTT 39#include "duk_io.h" 40#endif 41 42#define DEFAULT_BUFFER_SIZE 100 /* in frames */ 43#define KEEP_FRAMES 40 /* in frames, past frames kept */ 44 45/* Functions supported are 46** new, delete (by source) 47** open file/device, close file/device 48** attach and detach sink 49** read/store samples - including the header 50*/ 51 52/** 53 * @todo document 54 */ 55typedef struct 56{ /* label structure */ 57 char *label; 58 long begin; 59 long end; 60 char *extra; 61 unsigned char flag; 62} 63annotate; 64 65 66/** 67 * @todo document 68 */ 69typedef struct 70{ 71 int utt_type; 72 int dim; 73 fepFramePkt *frame; 74 int num_chan; 75 int do_channorm; 76 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 77 norm_info *channorm; /* Mirrored from the Wave object */ 78 swicms_norm_info *swicms; /* copy of wave obj pointer */ 79 spect_dist_info *backchan[MAX_CHAN_DIM]; 80 featdata *last_push; 81 int voice_duration; 82 int quiet_duration; 83 int unsure_duration; 84 int start_windback; 85} 86utt_generic_info; 87 88#ifndef _RTT 89/** 90 * @todo document 91 */ 92typedef struct 93{ 94 char typ; /* s (16 bit), c (8 bit), u (newton .utb) */ 95 int endian; /* 0 is little 1 is big */ 96 int do_skip; /* skip every other frame */ 97 unsigned long len; /* length of file/utterance */ 98 PFile* file; /* pointer to file */ 99 char name[MAX_LABEL]; /* file name */ 100 /* int op; read or write */ 101 int num_utts; /* no. of utterances in utb file */ 102 annotate *utb_table; /* utb file header information */ 103} 104utt_file_info; 105 106/** 107 * @todo document 108 */ 109typedef struct 110{ 111 int utt_type; 112 int dim; 113 fepFramePkt *frame; 114 int num_chan; 115 int do_channorm; 116 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 117 norm_info *channorm; /* Mirrored from the Wave object */ 118 swicms_norm_info *swicms; /* copy of wave obj pointer */ 119 spect_dist_info *backchan[MAX_CHAN_DIM]; 120 featdata *last_push; 121 int voice_duration; 122 int quiet_duration; 123 int unsure_duration; 124 int start_windback; 125 /* voicing_info voice; */ 126 utt_file_info file; 127} 128file_utterance_info; 129#endif 130 131/** 132 * @todo document 133 */ 134typedef struct 135{ 136 int utt_type; 137 int dim; 138 fepFramePkt *frame; 139 int num_chan; 140 int do_channorm; 141 spect_dist_info **spchchan; /* Mirrored from the Wave object */ 142 norm_info *channorm; /* Mirrored from the Wave object */ 143 swicms_norm_info *swicms; /* copy of wave obj pointer */ 144 spect_dist_info *backchan[MAX_CHAN_DIM]; 145 featdata *last_push; 146 int voice_duration; 147 int quiet_duration; 148 int unsure_duration; 149 int start_windback; 150} 151live_utterance_info; 152 153/** 154 * @todo document 155 */ 156typedef union 157{ 158 int utt_type; /* live or from file */ 159 utt_generic_info gen_utt; /* generic one */ 160#ifndef _RTT 161 file_utterance_info file_utt; 162#endif 163 live_utterance_info live_utt; 164} utterance_info; 165 166 167/* 168** Size of the utb file headers and details 169*/ 170 171#ifndef _RTT 172#define UTT_VERSION 2 173#define UTT_HEADER_SIZE 16 /*Size on disk*/ 174#define UTB_HEADER_SIZE 32 /*Size on disk*/ 175#define UTB_HEADER_USED 16 /*Size on disk*/ /* SAL */ 176 177/** 178 * UTB file header. 179 */ 180typedef struct _UttHeader 181{ 182 /** 183 * The size of the header in bytes. 184 */ 185 unsigned short headerSize; 186 /** 187 * The version of the file format. 188 */ 189 unsigned short version; 190 /** 191 * The size of the payload in bytes. 192 */ 193 unsigned long nBytes; 194 /** 195 * The number of parameters per frame. 196 */ 197 unsigned short nParametersPerFrame; 198 /** 199 * 0=unknown, 1=none, 2=amp-based, 3=harmonicity-based, 4=mrec style 200 */ 201 unsigned short channelNormalization; 202 /** 203 * 0=unknown, 1=no, 2=yes 204 */ 205 unsigned short speakerNormalization; 206 /** 207 * 0=unknown, 1=no, 2=yes 208 */ 209 unsigned short imeldaization; 210 /** 211 * Before imelda truncation. 212 */ 213 unsigned short nOriginalParameters; 214 /** 215 * The number of samples per frame. 216 */ 217 unsigned short samplesPerFrame; 218 /** 219 * The audio sample rate. 220 */ 221 unsigned long sampleRate; 222 /** 223 * not used in version 5. 224 */ 225 unsigned long checksum; 226} 227UttHeader; 228 229int update_utb_header(file_utterance_info *utt, int frames, int samplerate, 230 int framerate); 231void init_utt_v5_header(UttHeader *uhead, int dim, int samplerate, int framerate); 232int init_data_file(char *filename, file_utterance_info *utt, int dimen, 233 char typ, int endian, int do_skip); 234int new_data_file(char *filename, file_utterance_info *utt, int dimen, 235 char typ, int endian); 236int set_data_frame(file_utterance_info *utt, long begin); 237int buffer_data_frames(file_utterance_info *utt, long f_begin, long f_end); 238void more_data_frames(file_utterance_info *utt); 239int save_data_frames(file_utterance_info *utt); 240void close_data_stream(file_utterance_info *utt); 241int init_utb_file(file_utterance_info *utt, annotate **table); 242int position_utb_file(file_utterance_info *utt, long position, annotate *table); 243int load_utb_data(file_utterance_info *utt, int num_frames, int do_skip); 244int load_short_data(file_utterance_info *utt, int num_frames, int do_skip); 245int save_utb_data(file_utterance_info *utt, int num_frames); 246int save_short_data(file_utterance_info *utt, int num_frames); 247int read_utt_head(UttHeader *head, PFile* datafile); 248int write_utt_head(UttHeader *head, PFile* datafile); 249int check_for_utb(char* filename); 250 251/* TCP reading routines 252*/ 253int read_tcp(char *filename, annotate **tag_base); 254int read_lst(char *filename, annotate *tag_base, int ntags); 255int read_utb_table(char *filename, annotate **tag_base); 256void save_tcp(char *tcpnam, annotate *tag, int ntags); 257void compose_tcp_name_of_utt(char* uttname , char* tcpname); 258 259#endif 260 261void init_utterance(utterance_info *utt, int utt_type, int dimen, 262 int buffer_size, int keep_frames, int num_chan, int do_voicing); 263void set_voicing_durations(utterance_info *utt, int voice_duration, 264 int quiet_duration, int unsure_duration, 265 int start_windback); 266void free_utterance(utterance_info *utt); 267int utterance_started(utterance_info *utt); 268int utterance_ended(utterance_info *utt); 269int load_utterance_frame(utterance_info *utt, unsigned char* pUttFrame, int voicing); 270int copy_utterance_frame(utterance_info *oututt, utterance_info *inutt); 271 272#endif /* _h_utteranc_ */ 273