1/*
2 * Copyright (C) 2007 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5 * use this file except in compliance with the License. You may obtain a copy of
6 * the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13 * License for the specific language governing permissions and limitations under
14 * the License.
15 */
16
17
//
// This converts the data found at http://www.speech.cs.cmu.edu/cgi-bin/cmudict
// into the *.ok format used by Nuance.
// We use the file c0.6, which corresponds to version 0.6 of the dictionary.
//
// To run: make cmu2nuance && ./cmu2nuance <c0.6 >c0.6.ok
//
// TODO: look at generation of 'L', ')', and ','
//
27
28#include <stdio.h>
29#include <string.h>
30#include <ctype.h>
31
32
/*
 * Try to match one CMU phoneme (or a fixed sequence such as "AA1 R") at the
 * start of `phone` and, on success, write its replacement to stdout.
 *
 *   phone   NUL-terminated text positioned at the start of a phoneme.
 *   cmu     CMU spelling to look for.  The match only counts when it is
 *           followed by whitespace or by the end of the string, so that
 *           "D" does not match the front of "DH".
 *   nuance  Text written to stdout when the match succeeds.
 *
 * Returns a pointer just past the matched text inside `phone`, or NULL
 * (with nothing written) when `phone` does not start with `cmu`.
 */
static const char* xlate(const char* phone, const char* cmu, const char* nuance) {
  const size_t ncmu = strlen(cmu);
  if (strncmp(phone, cmu, ncmu) != 0) return NULL;

  // The first ncmu characters matched, so phone[ncmu] is still inside the
  // string (at worst it is the terminator).  <ctype.h> functions need a
  // value representable as unsigned char.
  const unsigned char after = (unsigned char)phone[ncmu];

  // End of string is a valid delimiter too: the final phoneme of input that
  // lacks a trailing newline has no whitespace after it.
  if (after != '\0' && !isspace(after)) return NULL;

  fputs(nuance, stdout);
  return phone + ncmu;
}
39
40
// One CMU -> Nuance phoneme substitution.
struct phoneme_map {
  const char* cmu;     // CMU spelling, possibly several space-separated phonemes
  const char* nuance;  // text emitted in its place
};

// Substitutions are tried in table order and the first match wins, so the
// multi-phoneme entries ("AA1 R", "AH0 N", "AH0 M", "EH1 R") must stay ahead
// of the single-phoneme entries they start with.
// The trailing comments give an example word and its CMU pronunciation.
static const struct phoneme_map phoneme_table[] = {
  { "AA1 R", ")r" },  // AA1 followed by R
  { "AA0", "o" },     // odd     AA D
  { "AA1", "o" },     // odd     AA D
  { "AA2", "o" },     // odd     AA D

  { "AE0", "a" },     // at      AE T
  { "AE1", "a" },     // at      AE T
  { "AE2", "a" },     // at      AE T

  // "AH0 L" -> "L" is intentionally absent: it drops accuracy by 1%.
  { "AH0 N", "~" },   // AH0 followed by N - from jean
  { "AH0 M", "}" },   // AH0 followed by M - from jean
  { "AH0", "@" },     // hut     HH AH T - from jean
  { "AH1", "u" },     // hut     HH AH T
  { "AH2", "u" },     // hut     HH AH T

  { "AO0", "{" },     // ought   AO T
  { "AO1", "{" },     // ought   AO T
  { "AO2", "{" },     // ought   AO T

  { "AW0", "?" },     // cow     K AW
  { "AW1", "?" },     // cow     K AW
  { "AW2", "?" },     // cow     K AW

  { "AY0", "I" },     // hide    HH AY D
  { "AY1", "I" },     // hide    HH AY D
  { "AY2", "I" },     // hide    HH AY D

  { "B", "b" },       // be      B IY
  { "CH", "C" },      // cheese  CH IY Z
  { "D", "d" },       // dee     D IY
  { "DH", "D" },      // thee    DH IY

  { "EH1 R", ",r" },  // EH1 followed by R
  { "EH0", "c" },     // Ed      EH D - from jean
  { "EH1", "e" },     // Ed      EH D
  { "EH2", "e" },     // Ed      EH D

  { "ER0", "P" },     // hurt    HH ER T
  { "ER1", "V" },     // hurt    HH ER T
  { "ER2", "V" },     // hurt    HH ER T

  { "EY0", "A" },     // ate     EY T
  { "EY1", "A" },     // ate     EY T
  { "EY2", "A" },     // ate     EY T

  { "F", "f" },       // fee     F IY
  { "G", "g" },       // green   G R IY N
  { "HH", "h" },      // he      HH IY

  { "IH0", "6" },     // it      IH T
  { "IH1", "i" },     // it      IH T
  { "IH2", "i" },     // it      IH T

  { "IY0", "/" },     // eat     IY T - from jean
  { "IY1", "E" },     // eat     IY T
  { "IY2", "E" },     // eat     IY T

  { "JH", "j" },      // gee     JH IY
  { "K", "k" },       // key     K IY
  { "L", "l" },       // lee     L IY
  { "M", "m" },       // me      M IY
  { "N", "n" },       // knee    N IY
  { "NG", "N" },      // ping    P IH NG

  { "OW0", "]" },     // oat     OW T
  { "OW1", "O" },     // oat     OW T
  { "OW2", "O" },     // oat     OW T

  { "OY0", "<" },     // toy     T OY
  { "OY1", "<" },     // toy     T OY
  { "OY2", "<" },     // toy     T OY

  { "P", "p" },       // pee     P IY
  { "R", "r" },       // read    R IY D
  { "S", "s" },       // sea     S IY
  { "SH", "S" },      // she     SH IY
  { "T", "t" },       // tea     T IY
  { "TH", "T" },      // theta   TH EY T AH

  { "UH0", "q" },     // hood    HH UH D
  { "UH1", "q" },     // hood    HH UH D
  { "UH2", "q" },     // hood    HH UH D

  { "UW0", "U" },     // two     T UW
  { "UW1", "U" },     // two     T UW
  { "UW2", "U" },     // two     T UW

  { "V", "v" },       // vee     V IY
  { "W", "w" },       // we      W IY
  { "Y", "y" },       // yield   Y IY L D
  { "Z", "z" },       // zee     Z IY
  { "ZH", "Z" },      // seizure S IY ZH ER
};

/*
 * fgets() splits a line that does not fit in its buffer.  When `line` holds
 * such a fragment, read and discard the rest of that input line from `in` so
 * the tail is not later mistaken for a separate dictionary entry.
 *
 * Returns 1 when part of the line had to be discarded, 0 when `line` already
 * holds the whole line (a lone unread newline is consumed and counts as 0).
 */
static int drop_line_tail(const char* line, size_t cap, FILE* in) {
  const size_t len = strlen(line);
  if (len == 0 || len + 1 < cap || line[len - 1] == '\n') return 0;

  // The buffer is full and does not end in a newline: see what follows.
  int c = getc(in);
  if (c == EOF || c == '\n') return 0;

  while (c != EOF && c != '\n') c = getc(in);
  return 1;
}

/*
 * Read CMU dictionary entries from stdin and write them to stdout in the
 * Nuance *.ok format: a "#LANG=EN-US" header, then one line per entry made of
 * the lower-cased word, a space, and the translated phonemes.
 *
 * Lines starting with '#' and empty lines are skipped silently; other lines
 * that do not start with a letter or digit, and lines too long for the
 * buffer, are skipped with a warning on stderr.  Always returns 0.
 */
int main(int argc, const char* argv[]) {
  (void)argc;
  (void)argv;

  char line[200];

  fputs("#LANG=EN-US\n", stdout);

  for (int lineno = 1; NULL != fgets(line, sizeof(line), stdin); lineno++)
  {
    // Always consume an over-long line completely, even if it is a comment.
    const int truncated = drop_line_tail(line, sizeof(line), stdin);

    if (line[0] == '#') continue;
    if (line[0] == 0) continue;
    if (truncated) {
      fprintf(stderr, "warning: ignoring line %d - too long\n", lineno);
      continue;
    }
    // <ctype.h> functions are only defined for unsigned char values and EOF,
    // hence the casts on every call below.
    if (!isalnum((unsigned char)line[0])) {
      fprintf(stderr, "warning: ignoring line %d - %s", lineno, line);
      continue;
    }

    const char* p = line;

    // parse name, echoing in lower case and skipping (2) suffix
    while (!isspace((unsigned char)*p)) {
      if (*p == 0) {
        fprintf(stderr, "can't read name at line %d\n", lineno);
        break;
      }
      // Alternate pronunciations are tagged "(digit)"; drop the tag.
      if (p[0] == '(' && isdigit((unsigned char)p[1]) && p[2] == ')' &&
          isspace((unsigned char)p[3])) {
        p += 3;
        break;
      }
      fputc(tolower((unsigned char)*p), stdout);
      p++;
    }
    fputc(' ', stdout);

    // loop over whitespace delimited phonemes
    while (1) {
      // skip leading whitespace
      while (isspace((unsigned char)*p)) p++;
      if (*p == 0) break;

      // xlate() prints the replacement itself when it matches, so stop at
      // the first entry that succeeds.
      const char* next = NULL;
      const size_t count = sizeof(phoneme_table) / sizeof(phoneme_table[0]);
      for (size_t i = 0; i < count && next == NULL; i++) {
        next = xlate(p, phoneme_table[i].cmu, phoneme_table[i].nuance);
      }

      if (next == NULL) {
        // Whatever was translated so far stays on the output line.
        fprintf(stderr, "can't pronounce line %d: %s", lineno, p);
        break;
      }
      p = next;
    }

    fputc('\n', stdout);
  }

  return 0;
}
187