1#!/usr/bin/env python
2# Copyright (c) 2010 Google Inc. All rights reserved.
3#
4# Redistribution and use in source and binary forms, with or without
5# modification, are permitted provided that the following conditions are
6# met:
7# 
8#     * Redistributions of source code must retain the above copyright
9# notice, this list of conditions and the following disclaimer.
10#     * Redistributions in binary form must reproduce the above
11# copyright notice, this list of conditions and the following disclaimer
12# in the documentation and/or other materials provided with the
13# distribution.
14#     * Neither the name of Google Inc. nor the names of its
15# contributors may be used to endorse or promote products derived from
16# this software without specific prior written permission.
17# 
18# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30"""This python script creates the raw data that is our entity
31database. The representation is one string database containing all
32strings we could need, and then a mapping from offset+length -> entity
33data. That is compact, easy to use and efficient."""
34
35import csv
36import os.path
37import string
38import sys
39
40ENTITY = 0
41VALUE = 1
42
def convert_value_to_int(value):
    """Convert a "U+XXXX" code point string into a C hexadecimal literal.

    Args:
        value: A code point written as "U+" followed by hexadecimal digits,
            or an empty value when the slot holds no code point.

    Returns:
        The string "0" for an empty value, otherwise "0x" followed by the
        hexadecimal digits of value.

    Raises:
        ValueError: If value is non-empty but is not "U+" followed by at
            least one hexadecimal digit.
    """
    if not value:
        return "0"
    hex_digits = value[2:]
    # Validate explicitly rather than with assert: asserts are stripped under
    # "python -O", and a malformed value would otherwise be written into the
    # generated C++ table as an invalid literal.
    well_formed = (
        value[:2] == "U+"
        and hex_digits
        and all(digit in "0123456789abcdefABCDEF" for digit in hex_digits)
    )
    if not well_formed:
        raise ValueError(
            "Malformed code point %r; expected the form U+XXXX" % (value,))
    return "0x" + hex_digits
49
50
def offset_table_entry(offset):
    """Return one indented C++ initializer line taking the address of
    staticEntityTable[offset], with a trailing comma."""
    template = "    &staticEntityTable[%s],"
    return template % offset
53
54
# Command line handling. The script must be invoked as:
#     <script> -o OUTPUT_FILE INPUT_FILE
program_name = os.path.basename(__file__)
if len(sys.argv) < 4 or sys.argv[1] != "-o":
    # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program_name, file=sys.stderr)
    sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
    # Use sys.exit() rather than the bare exit(): the latter is installed by
    # the site module for interactive sessions and is not available when the
    # interpreter runs with -S.
    sys.exit(1)

# Path of the C++ file to generate and of the CSV file listing the entities.
output_path = sys.argv[2]
input_path = sys.argv[3]
63
# Read every CSV row and order the rows by entity name (column ENTITY).
# The sorted order is what later allows a "first entry per starting letter"
# index to be built over the table.
with open(input_path) as html_entity_names_file:
    entries = sorted(csv.reader(html_entity_names_file),
                     key=lambda row: row[ENTITY])

entity_count = len(entries)
69
# Reject an empty entity list before touching the output path, so that a
# failed run does not truncate an existing output file or leave behind a file
# that contains only the header. An explicit check is used because asserts
# are stripped under "python -O".
if not entries:
    sys.exit("Code assumes a non-empty entity array.")

# The handle stays open at module scope; the remaining sections of the script
# write the rest of the generated C++ file through it.
output_file = open(output_path, "w")

# Emit the fixed preamble of the generated file: license comment, "generated
# file" notice, includes, and the opening of the blink namespace plus an
# unnamed namespace.
output_file.write("""/*
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
// DO NOT EDIT (unless you are a ninja)!

#include "config.h"
#include "core/html/parser/HTMLEntityTable.h"

namespace blink {

namespace {
""")

def check_ascii(entity_string):
    """Assert that every character of entity_string is 7-bit ASCII.

    The assertion message explains that non-ASCII entity names would require
    switching the generated storage type from LChar to UChar.
    """
    for character in entity_string:
        assert ord(character) < 128, (
            character + " is not ASCII. Need to change type "
            "of storage from LChar to UChar to support "
            "this entity.")
115
# Emit the shared character storage. Every entity name is located inside one
# long character array; each entry records the offset at which its name
# starts. The array is written as a sequence of C character literals.
output_file.write("static const LChar staticEntityStringStorage[] = {\n'")
all_data = ""
entity_offset = 0
first_output = True
saved_by_reusing = 0
for entry in entries:
    entity = entry[ENTITY]
    check_ascii(entity)
    # Space saving, step 1: if the whole name already occurs somewhere in the
    # data emitted so far, point at that occurrence and emit nothing. This is
    # a simple greedy O(n^2) scheme; the optimal layout would require solving
    # the "Shortest Common Superstring" problem.
    #
    # Storing the trailing semi-colon as a bit in the entry instead of in the
    # array would save even more.
    this_offset = all_data.find(entity)
    if this_offset >= 0:
        saved_by_reusing += len(entity)
    else:
        if first_output:
            first_output = False
        else:
            output_file.write(",\n'")

        # Space saving, step 2: find the longest proper prefix of the name
        # that the emitted data already ends with, and emit only the rest.
        overlap = next(
            (prefix_len for prefix_len in range(len(entity) - 1, 0, -1)
             if all_data.endswith(entity[:prefix_len])),
            0)
        data_to_add = entity[overlap:]
        this_offset = entity_offset - overlap
        saved_by_reusing += overlap

        output_file.write("', '".join(data_to_add) + "'")
        all_data += data_to_add
        entity_offset += len(data_to_add)
    assert len(entry) == 2, "We will use slot [2] in the list for the offset."
    assert this_offset < 32768 # Stored in a 16 bit short.
    entry.append(this_offset)

output_file.write("};\n")
162
# Map each starting letter to the position of the first entry (in sorted
# order) whose entity name begins with that letter.
index = {}
for position, entry in enumerate(entries):
    # setdefault keeps the earliest position seen for a letter.
    index.setdefault(entry[ENTITY][0], position)
168
# Emit the entity table: one initializer row per entity, in sorted order.
output_file.write(
    "\nstatic const HTMLEntityTableEntry staticEntityTable[%s] = {\n"
    % entity_count)

for entry in entries:
    # The VALUE column holds one or two space-separated "U+XXXX" code points.
    values = entry[VALUE].split(' ')
    assert len(values) <= 2, values
    first_value = convert_value_to_int(values[0])
    second_value = convert_value_to_int(values[1] if len(values) > 1 else "")
    # Row layout: first value, second value, storage offset (slot [2] of the
    # entry), name length, and the entity name as a trailing comment.
    row = (first_value, second_value, entry[2], len(entry[ENTITY]),
           entry[ENTITY])
    output_file.write('    { %s, %s, %s, %s }, // &%s\n' % row)

# Close the table initializer.
output_file.write("};\n\n")

# Close the unnamed namespace opened in the generated file's preamble.
output_file.write("\n}\n")
190
# Emit the per-letter offset tables used by the generated lookup functions.
# Each table has 27 values: the index of the first entry for each of the 26
# letters, followed by a sentinel so that "offset[letter + 1] - 1" is valid
# for the last letter as well.
output_file.write("static const short uppercaseOffset[] = {\n")
for letter in string.ascii_uppercase:
    output_file.write("%d,\n" % index[letter])
# Sentinel for 'Z': the first lowercase entry. This relies on the entries
# being sorted so that uppercase names precede lowercase ones.
output_file.write("%d\n" % index['a'])
output_file.write("""};

static const short lowercaseOffset[] = {\n""")
for letter in string.ascii_lowercase:
    output_file.write("%d,\n" % index[letter])
# Sentinel for 'z': one past the last entry of the table.
output_file.write("%d\n" % entity_count)

# Emit the accessor functions and close the blink namespace. The single %s
# placeholder in lastEntry() is filled with the number of entries.
output_file.write("""};

const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
{
    return staticEntityStringStorage + entry.entityOffset;
}

LChar HTMLEntityTableEntry::lastCharacter() const
{
    return HTMLEntityTable::entityString(*this)[length - 1];
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A']];
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a']];
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
{
    return &staticEntityTable[0];
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
{
    return &staticEntityTable[%s - 1];
}

}
""" % entity_count)

# Flush and release the output file explicitly instead of relying on
# interpreter shutdown to do it.
output_file.close()
243