get_search_engines.py revision d26706538834e0ed58bf28f08d9a2885c0e7efcb
#!/usr/bin/python2.4
#
# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Creates the list of search engines

The created list is placed in the res/values-<locale> directory. Also updates
res/values/all_search_engines.xml if required with new data.

Usage: get_search_engines.py

Copyright (C) 2010 The Android Open Source Project
"""

import os
import re
import sys
from xml.dom import minidom

# urllib.urlopen moved to urllib.request.urlopen in Python 3; support both
# interpreters so the script keeps working beyond the legacy shebang.
try:
  from urllib.request import urlopen
except ImportError:
  from urllib import urlopen

# Locales to generate search engine lists for
locales = ["cs-CZ", "da-DK", "de-AT", "de-CH", "de-DE", "el-GR", "en-AU",
    "en-GB", "en-IE", "en-NZ", "en-SG", "en-ZA", "es-ES", "fr-BE", "fr-FR",
    "it-IT", "ja-JP", "ko-KR", "nb-NO", "nl-BE", "nl-NL", "pl-PL", "pt-PT",
    "pt-BR", "ru-RU", "sv-SE", "tr-TR", "zh-CN", "zh-HK", "zh-MO", "zh-TW"]


class SearchEngineManager(object):
  """Manages list of search engines and creates locale specific lists.

  The main method useful for the caller is generateListForLocale(), which
  creates a locale specific search_engines.xml file suitable for use by the
  Android WebSearchProvider implementation.
  """

  def __init__(self):
    """Inits SearchEngineManager with relevant search engine data.

    The search engine data is downloaded from the Chrome source repository.
    Exits the process with status 2 if the data cannot be fetched.
    """
    raw = urlopen(
        'http://src.chromium.org/viewvc/chrome/trunk/src/chrome/'
        'browser/search_engines/template_url_prepopulate_data.cc').read()
    # Python 3 urlopen().read() returns bytes; normalise to text so the
    # string searches below work on either interpreter.
    if not isinstance(raw, str):
      raw = raw.decode('utf-8', 'replace')
    self.chrome_data = raw
    if self.chrome_data.lower().find('repository not found') != -1:
      print('Unable to get Chrome source data for search engine list.\nExiting.')
      sys.exit(2)

    # Output resource directory, resolved relative to this script's location.
    self.resdir = os.path.normpath(os.path.join(sys.path[0], '../res'))

    # Internal names of every engine referenced by any generated list; filled
    # by writeEngineList() and consumed by writeAllEngines().
    self.all_engines = set()

  def getXmlString(self, str):
    """Returns an XML-safe string for the given string.

    Given a string from the search engine data structure, convert it to a
    string suitable to write to our XML data file by stripping away NULLs,
    unwanted quotes, wide-string declarations (L"") and replacing C-style
    unicode escapes with XML numeric character references.
    """
    # Parameter name 'str' is kept for backward compatibility; work on a
    # local alias so the builtin is not shadowed throughout the body.
    value = str.strip()
    if value.upper() == 'NULL':
      return ''

    if value.startswith('L"'):
      value = value[2:]
    if value.startswith('@') or value.startswith('?'):
      # '@' and '?' are reference markers in Android resource files; a
      # leading backslash makes them literal.
      value = '\\' + value

    value = value.strip('"')
    # Escape the five predefined XML entities. '&' MUST be replaced first so
    # the entities introduced below are not themselves re-escaped.
    value = value.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    value = value.replace('"', '&quot;').replace('\'', '&apos;')
    # C-style \xNNNN escapes become XML numeric character references.
    value = re.sub(r'\\x([a-fA-F0-9]+)', r'&#x\1;', value)

    return value

  def getEngineData(self, name):
    """Returns a list of strings describing the specified search engine.

    The returned strings are in the same order as in the Chrome source data
    file except that the internal name of the search engine is inserted at
    the beginning of the list. Returns None if the engine cannot be found.
    """
    # Find the first occurrence of this search engine name in the form
    # " <name> =" in the chrome data file. re.escape guards against any
    # regex metacharacters sneaking into the name.
    search_obj = re.search(r'\s' + re.escape(name) + r'\s*=', self.chrome_data)
    if not search_obj:
      print('Unable to find data for search engine ' + name +
            '. Please check the chrome data file for format changes.')
      return None

    # Extract the struct declaration between the curly braces.
    start_pos = self.chrome_data.find('{', search_obj.start()) + 1
    end_pos = self.chrome_data.find('};', start_pos)
    engine_data_str = self.chrome_data[start_pos:end_pos]

    # Remove C++ style '//' comments at the ends of each line.
    uncommented_lines = []
    for line in engine_data_str.split('\n'):
      comment_pos = line.find(' // ')
      if comment_pos != -1:
        line = line[:comment_pos]
      uncommented_lines.append(line)
    engine_data_str = '\n'.join(uncommented_lines) + '\n'

    # Join multi-line string literals into a single string.
    engine_data_str = re.sub(r'\"\s+\"', '', engine_data_str)
    engine_data_str = re.sub(r'\"\s+L\"', '', engine_data_str)
    engine_data_str = engine_data_str.replace('"L"', '')

    engine_data = [self.getXmlString(field)
                   for field in engine_data_str.split(',')]

    # If the last element was an empty string (due to an extra comma at the
    # end), ignore it.
    if engine_data and not engine_data[-1]:
      engine_data.pop()

    engine_data.insert(0, name)

    return engine_data

  def getSearchEnginesForCountry(self, country):
    """Returns the list of search engine names for the given country.

    The data comes from the Chrome data file. Returns None when the country
    has no engine array.
    """
    # The Chrome data file has an array defined with the name 'engines_XX'
    # where XX = country.
    pos = self.chrome_data.find('engines_' + country)
    if pos == -1:
      print('Unable to find search engine data for country ' + country + '.')
      return None

    # Extract the text between the curly braces for this array declaration.
    engines_start = self.chrome_data.find('{', pos) + 1
    engines_end = self.chrome_data.find('}', engines_start)
    engines_str = self.chrome_data[engines_start:engines_end]

    # Remove embedded /* */ comments (non-greedy, so two comments on one
    # line cannot swallow the engine names between them), whitespace,
    # address-of operators and the trailing comma if any.
    engines_str = re.sub(r'/\*.+?\*/', '', engines_str)
    engines_str = re.sub(r'\s+', '', engines_str)
    engines_str = engines_str.replace('&', '')
    engines_str = engines_str.rstrip(',')

    # Split the array into its elements.
    return engines_str.split(',')

  def writeAllEngines(self):
    """Writes all search engines seen so far to all_search_engines.xml."""
    all_search_engines_path = os.path.join(self.resdir,
                                           'values/all_search_engines.xml')

    text = []

    # Sort for a deterministic file ordering; set iteration order is not.
    for engine_name in sorted(self.all_engines):
      engine_data = self.getEngineData(engine_name)
      if not engine_data:
        # Failure already reported by getEngineData; keep going so one bad
        # entry does not abort the whole file.
        continue
      text.append('  <string-array name="%s" translatable="false">\n'
                  % (engine_data[0]))
      for i in range(1, 7):
        text.append('    <item>%s</item>\n' % (engine_data[i]))
      text.append('  </string-array>\n')
      print(engine_data[1] + ' added to all_search_engines.xml')

    self.generateXmlFromTemplate(
        os.path.join(sys.path[0], 'all_search_engines.template.xml'),
        all_search_engines_path, text)

  def generateDefaultList(self):
    """Creates the fallback search_engines.xml in res/values."""
    self.writeEngineList(os.path.join(self.resdir, 'values'), "default")

  def generateListForLocale(self, locale):
    """Creates a new locale specific search_engines.xml file.

    The new file contains search engines specific to that country. If
    required this function updates all_search_engines.xml file with any new
    search engine data necessary.
    """
    separator_pos = locale.find('-')
    if separator_pos == -1:
      print('Locale must be of format <language>-<country>. For e.g.'
            ' "es-US" or "en-GB"')
      return

    language = locale[0:separator_pos]
    country = locale[separator_pos + 1:].upper()
    dir_path = os.path.join(self.resdir, 'values-' + language + '-r' + country)

    self.writeEngineList(dir_path, country)

  def writeEngineList(self, dir_path, country):
    """Writes dir_path/search_engines.xml for the given country code."""
    if os.path.exists(dir_path) and not os.path.isdir(dir_path):
      print("File exists in output directory path " + dir_path +
            ". Please remove it and try again.")
      return

    engines = self.getSearchEnginesForCountry(country)
    if not engines:
      return
    # Remember every engine so writeAllEngines() can emit its data later.
    self.all_engines.update(engines)

    # Create the locale specific search_engines.xml file. Each
    # search_engines.xml file has a hardcoded list of 7 items. If there are
    # less than 7 search engines for this country, the remaining items are
    # marked as enabled=false.
    text = ['  <string-array name="search_engines" translatable="false">\n']
    for engine in engines:
      engine_data = self.getEngineData(engine)
      if not engine_data:
        continue
      text.append('    <item>%s</item>\n' % (engine_data[0]))
    text.append('  </string-array>\n')

    self.generateXmlFromTemplate(
        os.path.join(sys.path[0], 'search_engines.template.xml'),
        os.path.join(dir_path, 'search_engines.xml'),
        text)

  def generateXmlFromTemplate(self, template_path, out_path, text):
    """Splices 'text' into the template and writes the result to out_path.

    The template's last line (the closing tag) stays last; the script
    terminates with an exception if the result is not well-formed XML.
    """
    # Load the template file and insert the new contents before the last
    # line. try/finally instead of 'with' keeps python2.4 compatibility.
    template_file = open(template_path)
    try:
      template_text = template_file.read()
    finally:
      template_file.close()
    pos = template_text.rfind('\n', 0, -2) + 1
    contents = template_text[0:pos] + ''.join(text) + template_text[pos:]

    # Make sure what we have created is valid XML :) No need to check for
    # errors as the script will terminate with an exception if the XML was
    # malformed.
    minidom.parseString(contents)

    dir_path = os.path.dirname(out_path)
    if not os.path.exists(dir_path):
      os.makedirs(dir_path)
      print('Created directory ' + dir_path)
    out_file = open(out_path, 'w')
    try:
      out_file.write(contents)
    finally:
      out_file.close()
    print('Wrote ' + out_path)


if __name__ == "__main__":
  manager = SearchEngineManager()
  manager.generateDefaultList()
  for locale in locales:
    manager.generateListForLocale(locale)
  manager.writeAllEngines()