#!/usr/bin/env python
# Copyright (c) 2010 Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""This python script creates the raw data that is our entity
database. The representation is one string database containing all
strings we could need, and then a mapping from offset+length -> entity
data. That is compact, easy to use and efficient."""

import csv
import os.path
import string
import sys

# Column positions inside one row of the input CSV.
ENTITY = 0
VALUE = 1
# Slot appended to every row by _pack_entity_strings(): the offset of the
# entity name inside the shared character storage.
_OFFSET = 2

# Offsets are emitted into a table of C "short", so they must stay below 2^15.
_MAX_OFFSET = 32768

_GENERATED_FILE_HEADER = """/*
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
// DO NOT EDIT (unless you are a ninja)!

#include "config.h"
#include "core/html/parser/HTMLEntityTable.h"

namespace blink {

namespace {
"""

# Tail of the generated file. It first closes the lowercaseOffset array and
# takes the entity count as its single "%s" argument.
_ACCESSORS_TEMPLATE = """};

const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
{
    return staticEntityStringStorage + entry.entityOffset;
}

LChar HTMLEntityTableEntry::lastCharacter() const
{
    return HTMLEntityTable::entityString(*this)[length - 1];
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A']];
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a']];
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
{
    return &staticEntityTable[0];
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
{
    return &staticEntityTable[%s - 1];
}

}
"""


def convert_value_to_int(value):
    """Turn a "U+XXXX" code point string into a C hexadecimal literal.

    An empty value (no code point in that slot) becomes the literal "0".
    Anything else must start with "U+"; the hex digits after the prefix are
    copied through unchanged.
    """
    if not value:
        return "0"
    assert value.startswith("U+"), value
    return "0x" + value[2:]


def offset_table_entry(offset):
    """Format one "&staticEntityTable[offset]," line (kept for compatibility;
    nothing in this script calls it)."""
    return "    &staticEntityTable[%s]," % offset


def check_ascii(entity_string):
    """Assert that every character of entity_string is 7-bit ASCII.

    The generated storage is an LChar array, so a wider character would
    require changing the storage type.
    """
    for c in entity_string:
        code = ord(c)
        assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +
                                  "of storage from LChar to UChar to support " +
                                  "this entity.")


def _pack_entity_strings(entries):
    """Lay the entity names out in one shared string and record each offset.

    Appends the computed offset to every row of entries (slot _OFFSET) and
    returns the list of string chunks that actually had to be stored, in
    storage order. Concatenating the chunks gives the full storage string.
    """
    chunks = []
    all_data = ""
    for entry in entries:
        entity = entry[ENTITY]
        check_ascii(entity)
        assert len(entry) == 2, "We will use slot [2] in the list for the offset."
        # Reuse substrings from earlier entries. This saves 1-2000
        # characters, but it's O(n^2) and not very smart. The optimal
        # solution has to solve the "Shortest Common Superstring" problem
        # and that is NP-Complete or worse.
        #
        # This would be even more efficient if we didn't store the
        # semi-colon in the array but as a bit in the entry.
        this_offset = all_data.find(entity)
        if this_offset == -1:
            # Not stored yet. Try the end of the string and see if we can
            # reuse that to fit the start of the new entity, longest overlap
            # first.
            this_offset = len(all_data)
            data_to_add = entity
            for truncated_len in range(len(entity) - 1, 0, -1):
                if all_data.endswith(entity[:truncated_len]):
                    data_to_add = entity[truncated_len:]
                    this_offset -= truncated_len
                    break
            chunks.append(data_to_add)
            all_data += data_to_add
        assert this_offset < _MAX_OFFSET  # Stored in a 16 bit short.
        entry.append(this_offset)
    return chunks


def _first_index_by_letter(entries):
    """Map each starting letter to the position of its first sorted entry."""
    index = {}
    for position, entry in enumerate(entries):
        index.setdefault(entry[ENTITY][0], position)
    return index


def _offset_for_letter(index, letter):
    """Look up a letter's first-entry position, failing with a clear message.

    The generated lookup tables need one slot per ASCII letter, so an input
    with no entity for some letter cannot be represented.
    """
    if letter not in index:
        raise KeyError("no entity name starts with %r; the per-letter offset "
                       "tables need at least one entity for every ASCII letter"
                       % letter)
    return index[letter]


def render_entity_table(entries):
    """Return the complete generated C++ source for the given CSV rows.

    entries is an iterable of [entity, value] rows, where value holds one or
    two space-separated "U+XXXX" code points. The rows are copied and sorted
    by entity name; the caller's rows are not modified. Raises AssertionError
    on an empty table, a malformed row or a non-ASCII entity name, and
    KeyError when some ASCII letter starts no entity.
    """
    entries = sorted((list(row) for row in entries),
                     key=lambda row: row[ENTITY])
    assert len(entries) > 0, "Code assumes a non-empty entity array."
    entity_count = len(entries)

    chunks = _pack_entity_strings(entries)
    index = _first_index_by_letter(entries)

    parts = [_GENERATED_FILE_HEADER]

    # One line of quoted characters per stored chunk.
    parts.append("static const LChar staticEntityStringStorage[] = {\n")
    parts.append(",\n".join("'" + "', '".join(chunk) + "'" for chunk in chunks))
    parts.append("};\n")

    parts.append("\nstatic const HTMLEntityTableEntry staticEntityTable[%s] = {\n"
                 % entity_count)
    for entry in entries:
        values = entry[VALUE].split(' ')
        assert len(values) <= 2, values
        parts.append('    { %s, %s, %s, %s }, // &%s\n' % (
            convert_value_to_int(values[0]),
            convert_value_to_int(values[1] if len(values) >= 2 else ""),
            entry[_OFFSET],
            len(entry[ENTITY]),
            entry[ENTITY],
        ))
    parts.append("};\n\n")

    # Close the anonymous namespace.
    parts.append("\n}\n")

    # Each offset table carries one extra trailing slot so the generated
    # lastEntryStartingWith() can read "offset of the next letter" for 'Z'
    # (the first lowercase entry) and for 'z' (one past the last entry).
    parts.append("static const short uppercaseOffset[] = {\n")
    for letter in string.ascii_uppercase:
        parts.append("%d,\n" % _offset_for_letter(index, letter))
    parts.append("%d\n" % _offset_for_letter(index, 'a'))
    parts.append("};\n\nstatic const short lowercaseOffset[] = {\n")
    for letter in string.ascii_lowercase:
        parts.append("%d,\n" % _offset_for_letter(index, letter))
    parts.append("%d\n" % entity_count)

    parts.append(_ACCESSORS_TEMPLATE % entity_count)
    return "".join(parts)


def main(argv):
    """Command-line entry point: <prog> -o OUTPUT_FILE INPUT_FILE.

    Returns the process exit status: 1 after printing usage to stderr when
    the arguments are wrong, 0 on success.
    """
    program_name = os.path.basename(__file__)
    if len(argv) < 4 or argv[1] != "-o":
        sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
        return 1

    output_path = argv[2]
    input_path = argv[3]

    with open(input_path) as html_entity_names_file:
        entries = list(csv.reader(html_entity_names_file))

    # Render everything before touching the output file, so a failed
    # validation never leaves a truncated or half-written generated source.
    generated_source = render_entity_table(entries)

    with open(output_path, "w") as output_file:
        output_file.write(generated_source)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))