1/* Copyright 2013 Google Inc. All Rights Reserved.
2
3   Licensed under the Apache License, Version 2.0 (the "License");
4   you may not use this file except in compliance with the License.
5   You may obtain a copy of the License at
6
7   http://www.apache.org/licenses/LICENSE-2.0
8
9   Unless required by applicable law or agreed to in writing, software
10   distributed under the License is distributed on an "AS IS" BASIS,
11   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   See the License for the specific language governing permissions and
13   limitations under the License.
14
15   Transformations on dictionary words.
16*/
17
18#ifndef BROTLI_DEC_TRANSFORM_H_
19#define BROTLI_DEC_TRANSFORM_H_
20
21#include <stdio.h>
22#include <ctype.h>
23#include "./types.h"
24
25#if defined(__cplusplus) || defined(c_plusplus)
26extern "C" {
27#endif
28
/* Kinds of edit that can be applied to a dictionary word.

   The numeric values are significant: TransformDictionaryWord uses the value
   of a kOmitLastN enumerator directly as the number of bytes to drop, and
   derives the kOmitFirstN count from the distance to kOmitFirst1. Do not
   renumber or reorder. */
enum WordTransformType {
  /* Copy the word unchanged. */
  kIdentity = 0,

  /* Drop the last N bytes of the word; the value equals N. */
  kOmitLast1 = 1,
  kOmitLast2 = 2,
  kOmitLast3 = 3,
  kOmitLast4 = 4,
  kOmitLast5 = 5,
  kOmitLast6 = 6,
  kOmitLast7 = 7,
  kOmitLast8 = 8,
  kOmitLast9 = 9,

  /* Run ToUpperCase on the first character / on every character. */
  kUppercaseFirst = 10,
  kUppercaseAll = 11,

  /* Drop the first N bytes of the word; N = value - (kOmitFirst1 - 1). */
  kOmitFirst1 = 12,
  kOmitFirst2 = 13,
  kOmitFirst3 = 14,
  kOmitFirst4 = 15,
  kOmitFirst5 = 16,
  kOmitFirst6 = 17,
  kOmitFirst7 = 18,
  kOmitFirst8 = 19,
  kOmitFirst9 = 20
};
52
/* One entry of the word-transform table. TransformDictionaryWord emits
   prefix, then the word edited according to `transform`, then suffix.
   The field order must not change: kTransforms initializes its entries
   positionally. */
typedef struct {
  /* NUL-terminated bytes written before the word. */
  const char* prefix;
  /* Edit applied to the word itself. */
  enum WordTransformType transform;
  /* NUL-terminated bytes written after the word. */
  const char* suffix;
} Transform;
58
59static const Transform kTransforms[] = {
60     {         "", kIdentity,       ""           },
61     {         "", kIdentity,       " "          },
62     {        " ", kIdentity,       " "          },
63     {         "", kOmitFirst1,     ""           },
64     {         "", kUppercaseFirst, " "          },
65     {         "", kIdentity,       " the "      },
66     {        " ", kIdentity,       ""           },
67     {       "s ", kIdentity,       " "          },
68     {         "", kIdentity,       " of "       },
69     {         "", kUppercaseFirst, ""           },
70     {         "", kIdentity,       " and "      },
71     {         "", kOmitFirst2,     ""           },
72     {         "", kOmitLast1,      ""           },
73     {       ", ", kIdentity,       " "          },
74     {         "", kIdentity,       ", "         },
75     {        " ", kUppercaseFirst, " "          },
76     {         "", kIdentity,       " in "       },
77     {         "", kIdentity,       " to "       },
78     {       "e ", kIdentity,       " "          },
79     {         "", kIdentity,       "\""         },
80     {         "", kIdentity,       "."          },
81     {         "", kIdentity,       "\">"        },
82     {         "", kIdentity,       "\n"         },
83     {         "", kOmitLast3,      ""           },
84     {         "", kIdentity,       "]"          },
85     {         "", kIdentity,       " for "      },
86     {         "", kOmitFirst3,     ""           },
87     {         "", kOmitLast2,      ""           },
88     {         "", kIdentity,       " a "        },
89     {         "", kIdentity,       " that "     },
90     {        " ", kUppercaseFirst, ""           },
91     {         "", kIdentity,       ". "         },
92     {        ".", kIdentity,       ""           },
93     {        " ", kIdentity,       ", "         },
94     {         "", kOmitFirst4,     ""           },
95     {         "", kIdentity,       " with "     },
96     {         "", kIdentity,       "'"          },
97     {         "", kIdentity,       " from "     },
98     {         "", kIdentity,       " by "       },
99     {         "", kOmitFirst5,     ""           },
100     {         "", kOmitFirst6,     ""           },
101     {    " the ", kIdentity,       ""           },
102     {         "", kOmitLast4,      ""           },
103     {         "", kIdentity,       ". The "     },
104     {         "", kUppercaseAll,   ""           },
105     {         "", kIdentity,       " on "       },
106     {         "", kIdentity,       " as "       },
107     {         "", kIdentity,       " is "       },
108     {         "", kOmitLast7,      ""           },
109     {         "", kOmitLast1,      "ing "       },
110     {         "", kIdentity,       "\n\t"       },
111     {         "", kIdentity,       ":"          },
112     {        " ", kIdentity,       ". "         },
113     {         "", kIdentity,       "ed "        },
114     {         "", kOmitFirst9,     ""           },
115     {         "", kOmitFirst7,     ""           },
116     {         "", kOmitLast6,      ""           },
117     {         "", kIdentity,       "("          },
118     {         "", kUppercaseFirst, ", "         },
119     {         "", kOmitLast8,      ""           },
120     {         "", kIdentity,       " at "       },
121     {         "", kIdentity,       "ly "        },
122     {    " the ", kIdentity,       " of "       },
123     {         "", kOmitLast5,      ""           },
124     {         "", kOmitLast9,      ""           },
125     {        " ", kUppercaseFirst, ", "         },
126     {         "", kUppercaseFirst, "\""         },
127     {        ".", kIdentity,       "("          },
128     {         "", kUppercaseAll,   " "          },
129     {         "", kUppercaseFirst, "\">"        },
130     {         "", kIdentity,       "=\""        },
131     {        " ", kIdentity,       "."          },
132     {    ".com/", kIdentity,       ""           },
133     {    " the ", kIdentity,       " of the "   },
134     {         "", kUppercaseFirst, "'"          },
135     {         "", kIdentity,       ". This "    },
136     {         "", kIdentity,       ","          },
137     {        ".", kIdentity,       " "          },
138     {         "", kUppercaseFirst, "("          },
139     {         "", kUppercaseFirst, "."          },
140     {         "", kIdentity,       " not "      },
141     {        " ", kIdentity,       "=\""        },
142     {         "", kIdentity,       "er "        },
143     {        " ", kUppercaseAll,   " "          },
144     {         "", kIdentity,       "al "        },
145     {        " ", kUppercaseAll,   ""           },
146     {         "", kIdentity,       "='"         },
147     {         "", kUppercaseAll,   "\""         },
148     {         "", kUppercaseFirst, ". "         },
149     {        " ", kIdentity,       "("          },
150     {         "", kIdentity,       "ful "       },
151     {        " ", kUppercaseFirst, ". "         },
152     {         "", kIdentity,       "ive "       },
153     {         "", kIdentity,       "less "      },
154     {         "", kUppercaseAll,   "'"          },
155     {         "", kIdentity,       "est "       },
156     {        " ", kUppercaseFirst, "."          },
157     {         "", kUppercaseAll,   "\">"        },
158     {        " ", kIdentity,       "='"         },
159     {         "", kUppercaseFirst, ","          },
160     {         "", kIdentity,       "ize "       },
161     {         "", kUppercaseAll,   "."          },
162     { "\xc2\xa0", kIdentity,       ""           },
163     {        " ", kIdentity,       ","          },
164     {         "", kUppercaseFirst, "=\""        },
165     {         "", kUppercaseAll,   "=\""        },
166     {         "", kIdentity,       "ous "       },
167     {         "", kUppercaseAll,   ", "         },
168     {         "", kUppercaseFirst, "='"         },
169     {        " ", kUppercaseFirst, ","          },
170     {        " ", kUppercaseAll,   "=\""        },
171     {        " ", kUppercaseAll,   ", "         },
172     {         "", kUppercaseAll,   ","          },
173     {         "", kUppercaseAll,   "("          },
174     {         "", kUppercaseAll,   ". "         },
175     {        " ", kUppercaseAll,   "."          },
176     {         "", kUppercaseAll,   "='"         },
177     {        " ", kUppercaseAll,   ". "         },
178     {        " ", kUppercaseFirst, "=\""        },
179     {        " ", kUppercaseAll,   "='"         },
180     {        " ", kUppercaseFirst, "='"         },
181};
182
183static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
184
/* Applies the simplified "uppercase" step to the single character that
   starts at p[0] and returns the number of bytes that character is taken to
   occupy. The size is chosen from the lead byte alone:

     - lead byte < 0xc0: one byte; 'a'..'z' becomes 'A'..'Z', any other value
       is left as is. Returns 1.
     - lead byte < 0xe0: two bytes; the second byte is XOR-ed with 32.
       Returns 2.
     - otherwise: three bytes; the third byte is XOR-ed with 5. Returns 3.

   p   - first byte of the character; modified in place.
   len - number of bytes of the word that are available starting at p.

   Only bytes inside those len bytes are ever read or written. When a
   multi-byte sequence is cut off by the end of the word, nothing is modified
   but the nominal size (2 or 3) is still returned, so a caller that
   subtracts the result from its remaining length stops at the end of the
   word. For len < 1 nothing is touched and 1 is returned. */
static int ToUpperCase(uint8_t *p, int len) {
  if (len < 1) {
    /* Nothing belongs to the word: do not even look at p[0]. */
    return 1;
  }
  if (p[0] < 0xc0) {
    if (p[0] >= 'a' && p[0] <= 'z') {
      p[0] ^= 32;
    }
    return 1;
  }
  /* An overly simplified uppercasing model for utf-8. */
  if (p[0] < 0xe0) {
    if (len > 1) {
      p[1] ^= 32;
    }
    return 2;
  }
  /* An arbitrary transform for three byte characters. */
  if (len > 2) {
    p[2] ^= 5;
  }
  return 3;
}
201
202static BROTLI_INLINE int TransformDictionaryWord(
203    uint8_t* dst, const uint8_t* word, int len, int transform) {
204  const char* prefix = kTransforms[transform].prefix;
205  const char* suffix = kTransforms[transform].suffix;
206  const int t = kTransforms[transform].transform;
207  int skip = t < kOmitFirst1 ? 0 : t - (kOmitFirst1 - 1);
208  int idx = 0;
209  int i = 0;
210  uint8_t* uppercase;
211  if (skip > len) {
212    skip = len;
213  }
214  while (*prefix) { dst[idx++] = (uint8_t)*prefix++; }
215  word += skip;
216  len -= skip;
217  if (t <= kOmitLast9) {
218    len -= t;
219  }
220  while (i < len) { dst[idx++] = word[i++]; }
221  uppercase = &dst[idx - len];
222  if (t == kUppercaseFirst) {
223    ToUpperCase(uppercase, len);
224  } else if (t == kUppercaseAll) {
225    while (len > 0) {
226      int step = ToUpperCase(uppercase, len);
227      uppercase += step;
228      len -= step;
229    }
230  }
231  while (*suffix) { dst[idx++] = (uint8_t)*suffix++; }
232  return idx;
233}
234
235#if defined(__cplusplus) || defined(c_plusplus)
236}    /* extern "C" */
237#endif
238
239#endif  /* BROTLI_DEC_TRANSFORM_H_ */
240