#!/usr/bin/env python
#
# Copyright (C) 2014 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

""" Converts a ServiceProviders XML database into protobuf text format. """

import logging
import os
import re
import sys
import tempfile
import textwrap
import uuid

from xml.etree import ElementTree
from xml.dom import minidom

# We are dealing with unicode data. It is extremely important to choose one
# storage type for strings and stick to it.
#   - All strings except filenames and such are unicode text.
#   - Under Python 2 the xml.etree.ElementTree.parse function may return plain
#     |str| objects. These are implicitly coerced to |unicode| as needed.
#   - Text is encoded with |FILE_ENCODING| only at the point where it is
#     written to a file; every file written by this script is a byte stream.
#   - Text that goes to the log or into an exception message is first
#     marshalled to ASCII, see |_ToAsciiStr|.
FILE_ENCODING = 'utf-8'

# ElementTree expands the reserved |xml:| prefix while parsing, so an
# |xml:lang| attribute is stored under this fully qualified key. Looking the
# attribute up as u'xml:lang' on a parsed node always yields None.
XML_LANG_ATTR = u'{http://www.w3.org/XML/1998/namespace}lang'


def _ToAsciiStr(text):
    """
    Marshalls |text|, as best as we can, to an ASCII-only native string.

    Characters that can not be represented in ASCII are replaced by '?'.

    @param text: The text to marshall.
    @return The ASCII-only text as the native |str| type of the interpreter.

    """
    encoded = text.encode('ascii', 'replace')
    # Under Python 2 the encoded value already is a |str|. Under Python 3 it
    # is |bytes| and has to be turned back into text.
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')


class ConverterError(Exception):
    """ Raised when the XML can not be converted or a sanity check fails. """
    pass


class ServiceProvidersConverter(object):
    """ Convert the ServiceProviders XML into protobuf format. """
    def __init__(self, file_path, out_file_path=None):
        """
        @param file_path: Absolute path to the XML file to read
        @param out_file_path: Absolute path to the file to which the output
                should be written. If None, the output goes to stdout.

        """
        self._file_path = file_path
        self._out_file_path = out_file_path

        self._gsm_nodes_no_mccmnc = set()
        self._gsm_nodes_by_mccmnc = {}
        self._mcc_mnc_by_mccmnc = {}

        # Book-keeping to sanity check the total number of providers converted,
        # and detailed information about the conversion.
        self._xml_cdma_nodes = 0
        self._xml_gsm_nodes = 0
        self._protobuf_mnos_dumped = 0
        self._protobuf_mvnos_dumped = 0
        self._protobuf_gsm_mnos = 0
        self._protobuf_cdma_mnos = 0
        self._protobuf_gsm_mvnos = 0
        self._protobuf_gsm_unique_mvnos = 0
        # Turns out some MVNOs are MNOs using a different MCCMNC.
        self._protobuf_gsm_mvnos_mnos = 0
        # Remember nodes that we decide to drop at any point.
        self._dropped_nodes = set()

        # Related to the actual protobuf output:
        self._indent = 0


    def Convert(self):
        """
        Top level function for the conversion.

        Parses the input XML, annotates and groups the provider nodes, writes
        the protobuf text output and finally cross-checks the statistics.

        @raises ConverterError if the data is inconsistent or if the final
                sanity checks fail.

        """
        parser = ElementTree.XMLParser(encoding=FILE_ENCODING)
        element_tree = ElementTree.parse(self._file_path, parser=parser)
        self._root = element_tree.getroot()
        logging.info('Dumping parsed XML')
        self._DumpXMLToTempFile()
        self._xml_cdma_nodes = len(self._root.findall(u'.//cdma'))
        self._xml_gsm_nodes = len(self._root.findall(u'.//gsm'))

        self._TransformXML()
        logging.info('Dumping transformed XML.')
        self._DumpXMLToTempFile()

        self._GroupGSMNodesByMCCMNC()
        self._FindPrimaryNodes()

        # |_SpewLine| writes encoded bytes, so the output has to be a byte
        # stream on every Python version.
        if self._out_file_path is not None:
            with open(self._out_file_path, 'wb') as self._out_file:
                self._SpewProtobuf()
        else:
            # Python 3 exposes the byte stream of stdout as |buffer|. The
            # Python 2 stdout already accepts bytes.
            self._out_file = getattr(sys.stdout, 'buffer', sys.stdout)
            self._SpewProtobuf()
            self._out_file.flush()

        self._RunStatsDiagnostics()


    def _CheckStatsEqual(self, lhs, lhs_name, rhs, rhs_name):
        """
        Test that |lhs| == |rhs| and log appropriate message.

        @param lhs: One value to compare.
        @param lhs_name: str name to be used for |lhs| for logging.
        @param rhs: Other value to compare.
        @param rhs_name: str name to be used for |rhs| for logging.
        @return True if check passes, False otherwise.

        """
        result = (lhs == rhs)
        logger = logging.info if result else logging.error
        message = 'PASS' if result else 'FAIL'
        logger('Sanity check: (%s) == (%s) (%d == %d) **%s**',
               lhs_name, rhs_name, lhs, rhs, message)
        return result


    def _RunStatsDiagnostics(self):
        """
        Checks that the stats about nodes found / dumped tally.

        @raises ConverterError if any of the sanity checks fails.

        """
        # First dump dropped nodes.
        if self._dropped_nodes:
            logging.warning('Following nodes were dropped:')
            for node in self._dropped_nodes:
                self._PPrintLogXML(logging.info, node)

        logging.info('######################')
        logging.info('Conversion diagnostics')
        logging.info('######################')

        logging.info('Total number of XML CDMA nodes read [xml_cdma_nodes]: %d',
                     self._xml_cdma_nodes)
        logging.info('Total number of XML GSM nodes read [xml_gsm_nodes]: %d',
                     self._xml_gsm_nodes)
        logging.info('Total number of XML nodes read '
                     '[xml_nodes = xml_cdma_nodes + xml_gsm_nodes]: %d',
                     self._xml_cdma_nodes + self._xml_gsm_nodes)

        logging.info('Total number of protobuf MNOs dumped '
                     '[protobuf_mnos_dumped]: %d',
                     self._protobuf_mnos_dumped)
        logging.info('Total number of protobuf MVNOs dumped '
                     '[protobuf_mvnos_dumped]: %d',
                     self._protobuf_mvnos_dumped)
        logging.info('Total number of protobuf nodes dropped '
                     '[protobuf_dropped_nodes]: %d',
                     len(self._dropped_nodes))
        logging.info(' (See above for the exact nodes dropped)')

        logging.info('Total number of protobuf CDMA MNOs '
                     '[protobuf_cdma_mnos]: %d',
                     self._protobuf_cdma_mnos)
        logging.info('Total number of protobuf GSM MNOs '
                     '[protobuf_gsm_mnos]: %d',
                     self._protobuf_gsm_mnos)
        logging.info('Total number of protobuf GSM MVNOs '
                     '[protobuf_gsm_mvnos]: %d',
                     self._protobuf_gsm_mvnos)
        logging.info('Total number of protobuf unique GSM MVNOs. '
                     '[protobuf_gsm_unique_mvnos]: %d',
                     self._protobuf_gsm_unique_mvnos)
        logging.info(' (Some MVNOs may appear in multiple MNOs)')
        logging.info('Total number of protobuf GSM MVNOs that are also MNOs. '
                     '[protobuf_gsm_mvnos_mnos]: %d',
                     self._protobuf_gsm_mvnos_mnos)

        # Run every check before deciding, so that all results get logged.
        check_results = [
            self._CheckStatsEqual(
                self._protobuf_mnos_dumped,
                'protobuf_mnos_dumped',
                self._protobuf_cdma_mnos + self._protobuf_gsm_mnos,
                'protobuf_cdma_mnos + protobuf_gsm_mnos'),
            self._CheckStatsEqual(
                self._protobuf_mnos_dumped + self._protobuf_mvnos_dumped,
                'protobuf_mnos_dumped + protobuf_mvnos_dumped',
                (self._protobuf_cdma_mnos +
                 self._protobuf_gsm_mnos +
                 self._protobuf_gsm_mvnos),
                'protobuf_cdma_mnos + protobuf_gsm_mnos + protobuf_gsm_mvnos'),
            self._CheckStatsEqual(
                self._xml_cdma_nodes + self._xml_gsm_nodes,
                'xml_cdma_nodes + xml_gsm_nodes',
                (len(self._dropped_nodes) +
                 self._protobuf_gsm_mnos +
                 self._protobuf_cdma_mnos +
                 self._protobuf_gsm_unique_mvnos -
                 self._protobuf_gsm_mvnos_mnos),
                ('protobuf_dropped_nodes + '
                 'protobuf_gsm_mnos + protobuf_cdma_mnos + '
                 'protobuf_gsm_unique_mvnos - protobuf_gsm_mvnos_mnos')),
        ]

        if not all(check_results):
            self._LogAndRaise('StatsDiagnostics failed.')


    def _DumpXMLToTempFile(self):
        """ Dumps the parsed XML to a temp file for debugging. """
        fd, fname = tempfile.mkstemp(prefix='converter_')
        logging.info('Dumping XML to file %s', fname)
        # Binary mode, since the pretty-printed text is encoded before writing.
        with os.fdopen(fd, 'wb') as fout:
            fout.write(self._PPrintXML(self._root).encode(FILE_ENCODING))


    def _EnrichNode(self, node, country_code, primary, roaming_required, names,
                    provider_type):
        """
        Adds the information passed in as children of |node|.

        @param node: The XML node to enrich.
        @param country_code: The country code for node. Type: str.
        @param primary: Is this node a primary provider. Type: str or None.
        @param roaming_required: Does this provider requires roaming.
                Type: str or None.
        @param names: List of (name, language) pairs for this provider. The
                language may be None. Type: [(str, str)].
        @param provider_type: Is this node 'gsm'/'cdma'. Type: str.

        """
        ElementTree.SubElement(node, u'country', {u'code': country_code})
        provider_map = {u'type': provider_type}
        if primary is not None:
            provider_map[u'primary'] = primary
        if roaming_required is not None:
            provider_map[u'roaming-required'] = roaming_required
        ElementTree.SubElement(node, u'provider', provider_map)
        for name, lang in names:
            name_map = {}
            if lang is not None:
                # Use the qualified key so that the copied node looks exactly
                # like a parsed one. It is serialized as |xml:lang|.
                name_map[XML_LANG_ATTR] = lang
            name_node = ElementTree.SubElement(node, u'name', name_map)
            name_node.text = name


    def _TransformXML(self):
        """
        Store the country, provider, name, type (gsm/cdma) under the
        |gsm|/|cdma| nodes. This allows us to directly deal with these nodes
        instead of going down the tree.

        """
        # First find all nodes to be modified, since we can't iterate the tree
        # while modifying it.
        pending = []
        for country_node in self._root.findall(u'country'):
            cur_country = country_node.get(u'code')
            for provider_node in country_node.findall(u'provider'):
                primary = provider_node.get(u'primary')
                roaming_required = provider_node.get(u'roaming-required')
                names = [(name_node.text, name_node.get(XML_LANG_ATTR))
                         for name_node in provider_node.findall(u'name')]

                for gsm_node in provider_node.findall(u'gsm'):
                    pending.append((gsm_node,
                                    (cur_country,
                                     primary,
                                     roaming_required,
                                     names,
                                     u'gsm')))
                for cdma_node in provider_node.findall(u'cdma'):
                    # Some CDMA providers have a special name under the <cdma>
                    # node. This name should *override* the names given outside.
                    # The override is decided per node, so that it does not
                    # leak to other <cdma> nodes of the same provider.
                    if cdma_node.find(u'name') is not None:
                        cdma_names = []
                    else:
                        cdma_names = names
                    pending.append((cdma_node,
                                    (cur_country,
                                     primary,
                                     roaming_required,
                                     cdma_names,
                                     u'cdma')))

        # Now, iterate through all those nodes and update the tree.
        for node, args in pending:
            self._EnrichNode(node, *args)


    def _CheckAmbiguousMCCMNC(self, mcc, mnc):
        """
        Ensure that no two mcc, mnc pairs concat to the same MCCMNC.

        Remembers the pair, so that later pairs are checked against it.

        @param mcc: The mcc to check.
        @param mnc: The mnc to check.
        @raises ConverterError if a different pair with the same concatenation
                has been seen before.

        """
        mccmnc = mcc + mnc
        known_pair = self._mcc_mnc_by_mccmnc.get(mccmnc)
        if known_pair is not None and known_pair != (mcc, mnc):
            old_mcc, old_mnc = known_pair
            self._LogAndRaise(u'Ambiguous MCCMNC pairs detected: '
                              u'(%s, %s) vs. (%s, %s)',
                              old_mcc, old_mnc, mcc, mnc)

        self._mcc_mnc_by_mccmnc[mccmnc] = (mcc, mnc)


    def _GroupGSMNodesByMCCMNC(self):
        """ Map all GSM nodes with same MCCMNC together. """
        for gsm_node in self._root.findall(u'.//gsm'):
            network_id_nodes = gsm_node.findall(u'network-id')
            if not network_id_nodes:
                logging.warning('Found a GSM node with no MCCMNC. ')
                self._gsm_nodes_no_mccmnc.add(gsm_node)
                continue

            for network_id_node in network_id_nodes:
                mcc = network_id_node.get(u'mcc')
                mnc = network_id_node.get(u'mnc')
                self._CheckAmbiguousMCCMNC(mcc, mnc)
                self._gsm_nodes_by_mccmnc.setdefault(
                        mcc + mnc, []).append(gsm_node)


    def _FindPrimaryNodes(self):
        """
        Finds nodes that correspond to MNOs as opposed to MVNOs.

        All CDMA nodes are primary, all GSM nodes that have a unique MCCMNC are
        primary, GSM nodes with non-unique MCCMNC that explicitly claim to be
        primary are primary.

        @raises ConverterError if two nodes sharing an MCCMNC both claim to be
                primary.

        """
        unique_mvnos = set()
        self._mvnos = {}

        # All cdma nodes are primary.
        self._primary_cdma_nodes = set(self._root.findall(u'.//cdma'))
        self._protobuf_cdma_mnos = len(self._primary_cdma_nodes)

        # Start by marking all nodes with no MCCMNC primary. Work on a copy,
        # so that the set of nodes without MCCMNC is left untouched.
        self._primary_gsm_nodes = set(self._gsm_nodes_no_mccmnc)
        for mccmnc, nodes in self._gsm_nodes_by_mccmnc.items():
            if len(nodes) == 1:
                self._primary_gsm_nodes.add(nodes[0])
                continue

            # Exactly one node in the list should claim to be primary.
            primary = None
            mvnos = set()
            for node in nodes:
                provider_node = node.find(u'provider')
                if provider_node.get(u'primary') == u'true':
                    if primary is not None:
                        self._LogAndRaise(
                                u'Found two primary gsm nodes with MCCMNC['
                                u'%s]: \n%s\n%s',
                                mccmnc, self._PPrintXML(primary),
                                self._PPrintXML(node))

                    primary = node
                    self._primary_gsm_nodes.add(node)
                else:
                    mvnos.add(node)
            if primary is None:
                logging.warning('Failed to find primary node with '
                                'MCCMNC[%s]. Will make all of them '
                                'distinct MNOs', mccmnc)
                logging.info('Nodes found:')
                for node in nodes:
                    self._PPrintLogXML(logging.info, node)
                self._primary_gsm_nodes.update(nodes)
                continue

            # This primary may already have MVNOs due to another MCCMNC.
            existing_mvnos = self._mvnos.get(primary, set())
            self._mvnos[primary] = existing_mvnos | mvnos
            # Only add to the MVNO count the *new* MVNOs added.
            self._protobuf_gsm_mvnos += (len(self._mvnos[primary]) -
                                         len(existing_mvnos))
            unique_mvnos |= mvnos

        self._primary_nodes = (self._primary_cdma_nodes |
                               self._primary_gsm_nodes)
        self._protobuf_gsm_mnos = len(self._primary_gsm_nodes)
        self._protobuf_gsm_unique_mvnos = len(unique_mvnos)
        self._protobuf_gsm_mvnos_mnos = len(
                self._primary_gsm_nodes & unique_mvnos)


    def _SortOperators(self, node_list):
        """
        Sort operators, in place, by country and then by name.

        @param node_list: List of enriched |gsm|/|cdma| nodes to sort.

        """
        def _SortKey(node):
            # A missing country code or name sorts like an empty string, since
            # None can not be compared with strings under Python 3.
            country = node.find(u'country').get(u'code') or u''
            names = sorted(name_node.text or u''
                           for name_node in node.findall(u'name'))
            return (country, names)

        node_list.sort(key=_SortKey)


    def _SpewProtobuf(self):
        """ Entry function for dumping to prototext format. """
        _, fname = os.path.split(__file__)
        self._SpewComment("!!! DO NOT EDIT THIS FILE BY HAND !!!")
        self._SpewComment("This file is generated by the script %s" % fname)
        self._SpewComment("This file was generated from serviceproviders.xml, "
                          "a public domain database of cellular network "
                          "operators around the globe.")

        primaries = list(self._primary_nodes)
        self._SortOperators(primaries)
        for node in primaries:
            self._protobuf_mnos_dumped += 1
            self._SpewMessageBegin(u'mno')
            self._SpewData(node)
            if node in self._mvnos:
                mvnos = list(self._mvnos[node])
                self._SortOperators(mvnos)
                for mvno_node in mvnos:
                    self._protobuf_mvnos_dumped += 1
                    self._SpewMessageBegin(u'mvno')
                    self._SpewNameFilter(mvno_node)
                    self._SpewData(mvno_node)
                    self._SpewMessageEnd(u'mvno')
            self._SpewMessageEnd(u'mno')
            self._SpewLine()


    def _SpewNameFilter(self, node):
        """
        Dumps an |mvno_filter| message matching any of the names of |node|.

        @param node: The enriched MVNO node.
        @raises ConverterError if |node| has no non-empty name.

        """
        name_list = [name_node.text
                     for name_node in node.findall(u'name')
                     if name_node.text]
        if not name_list:
            self._LogAndRaise(
                    u'Did not find any name for MVNO. Can not create filter.\n'
                    u'%s', self._PPrintXML(node))

        # NOTE(review): the names are joined verbatim, so regex metacharacters
        # inside a name are not escaped -- confirm how the consumer of the
        # filter interprets them.
        name = u'|'.join(name_list)
        self._SpewMessageBegin(u'mvno_filter')
        self._SpewEnum(u'type', u'OPERATOR_NAME')
        self._SpewString(u'regex', name)
        self._SpewMessageEnd(u'mvno_filter')


    def _SpewData(self, node):
        """
        Dumps the |data| message for an enriched |gsm|/|cdma| node.

        Repeated children are dumped in document order. XML elements define no
        ordering of their own, so sorting the element objects themselves is
        not meaningful and fails under Python 3.

        @param node: The enriched node to dump.

        """
        self._SpewMessageBegin(u'data')

        self._SpewString(u'uuid', str(uuid.uuid4()))
        country_node = node.find(u'country')
        self._SpewString(u'country', country_node.get(u'code'))

        provider_node = node.find(u'provider')
        provider_type = provider_node.get(u'type')
        self._SpewEnum(u'provider_type', provider_type.upper())
        roaming_required = provider_node.get(u'roaming-required')
        if roaming_required is not None:
            self._SpewBool(u'requires_roaming', roaming_required)
        for name_node in node.findall(u'name'):
            self._SpewLocalizedNameNode(name_node)

        # GSM specific fields.
        for network_id_node in node.findall(u'network-id'):
            self._SpewString(u'mccmnc',
                             network_id_node.get(u'mcc') +
                             network_id_node.get(u'mnc'))

        for apn_node in node.findall(u'apn'):
            self._SpewMobileAPNNode(apn_node)

        # CDMA specific fields.
        for sid_node in node.findall(u'sid'):
            self._SpewString(u'sid', sid_node.get(u'value'))

        # CDMA networks have some extra username/password/dns information that
        # corresponds very well with the APN concept of 3GPP, so we map it to an
        # MobileAPN instead of storing it specially.
        if (node.find(u'username') is not None or
            node.find(u'password') is not None or
            node.find(u'dns') is not None):
            self._SpewMobileAPNNode(node)

        self._SpewMessageEnd(u'data')


    def _SpewMobileAPNNode(self, apn_node):
        """
        Dumps a |mobile_apn| message for |apn_node|.

        @param apn_node: An |apn| node, or a |cdma| node that carries
                username/password/dns children.

        """
        self._SpewMessageBegin(u'mobile_apn')
        apn = apn_node.get(u'value')
        # This may be None when converting a <cdma> node to MobileAPN node.
        if apn is None:
            apn = u''
        self._SpewString(u'apn', apn)
        for plan_node in apn_node.findall(u'plan'):
            self._SpewEnum(u'plan', plan_node.get(u'type').upper())
        for name_node in apn_node.findall(u'name'):
            self._SpewLocalizedNameNode(name_node)
        for gateway_node in apn_node.findall(u'gateway'):
            self._SpewString(u'gateway', gateway_node.text)
        for username_node in apn_node.findall(u'username'):
            self._SpewString(u'username', username_node.text)
        for password_node in apn_node.findall(u'password'):
            self._SpewString(u'password', password_node.text)
        for dns_node in apn_node.findall(u'dns'):
            self._SpewString(u'dns', dns_node.text)
        self._SpewMessageEnd(u'mobile_apn')


    def _SpewLocalizedNameNode(self, name_node):
        """
        Dumps a |localized_name| message for |name_node|.

        @param name_node: A |name| node. Its |xml:lang| attribute, if any, is
                dumped as the language of the name.

        """
        self._SpewMessageBegin(u'localized_name')
        self._SpewString(u'name', name_node.text)
        lang = name_node.get(XML_LANG_ATTR)
        if lang is not None:
            self._SpewString(u'language', lang)
        self._SpewMessageEnd(u'localized_name')


    def _SpewMessageBegin(self, message_name):
        """ Opens the message |message_name| and indents what follows. """
        self._SpewLine(message_name, u'{')
        self._indent += 1


    def _SpewMessageEnd(self, _):
        """ Closes the innermost message. The argument is only a reminder. """
        self._indent -= 1
        self._SpewLine(u'}')


    def _SpewString(self, key, value):
        """ Dumps a quoted string field. A None |value| is dumped as "". """
        # Treat None |value| as empty string.
        if value is None:
            value = u''
        self._SpewLine(key, u':', u'"' + value + u'"')


    def _SpewBool(self, key, value):
        """ Dumps a boolean field. |value| is the text to dump verbatim. """
        self._SpewLine(key, u':', value)


    def _SpewEnum(self, key, value):
        """ Dumps an enum field. |value| is the text to dump verbatim. """
        self._SpewLine(key, u':', value)


    def _SpewComment(self, comment):
        """ Dumps |comment| as comment lines wrapped to fit in 80 columns. """
        line_length = 78 - (2 * self._indent)
        for line in textwrap.wrap(comment, line_length):
            self._SpewLine(u'# ' + line)


    def _SpewLine(self, *args):
        """
        Writes one indented output line.

        @param *args: The pieces of the line. They are joined by single spaces.

        """
        indent = (2 * self._indent) * u' '
        line = indent + u' '.join(args) + u'\n'
        self._out_file.write(line.encode(FILE_ENCODING))


    def _PPrintXML(self, node):
        """ Returns a pretty-printed unicode string for the xml |node|. """
        rough_string_str = ElementTree.tostring(node, encoding=FILE_ENCODING)
        reparsed = minidom.parseString(rough_string_str)
        xml_data_str = reparsed.toprettyxml(indent=u' ',
                                            encoding=FILE_ENCODING)
        xml_data = xml_data_str.decode(FILE_ENCODING)
        # Pretty-printing already formatted XML leaves whitespace-only lines
        # behind. Drop them.
        lines = [line for line in xml_data.split(u'\n') if line.strip()]
        return u'\n'.join(lines)


    def _PPrintLogXML(self, logger, node):
        """
        Logs a given xml |node| to |logger| encoded in 'ascii' format.

        @param logger: A logging function, e.g. |logging.info|.
        @param node: The XML node to log.

        """
        # Marshall, as best as we can to ASCII.
        to_print_str_ascii = _ToAsciiStr(self._PPrintXML(node))
        logger('NODE:')
        for line_str_ascii in to_print_str_ascii.split('\n'):
            logger(line_str_ascii)


    def _LogAndRaise(self, fmt, *args):
        """
        Logs the error encoded in 'ascii' format and raises an error.

        @param fmt: The base formatted string for the error, using %-style
                conversion specifiers.
        @param *args: Arguments to format the string |fmt|.
        @raises ConverterError

        """
        # Without arguments |fmt| is taken literally.
        error_string = (fmt % args) if args else fmt
        # Marshall, as best as we can to ASCII.
        error_string_str_ascii = _ToAsciiStr(error_string)
        logging.error(error_string_str_ascii)
        raise ConverterError(error_string_str_ascii)


def main(prog_name, args):
    """
    Entry function to this script.

    @param prog_name: Name of the program to display.
    @param args: Command line arguments.

    """
    logging.basicConfig(level=logging.DEBUG)

    if not (1 <= len(args) <= 2):
        print("Usage: %s <in_file> [<out_file>]" % prog_name)
        sys.exit(1)

    in_file_path = args[0]
    out_file_path = args[1] if len(args) == 2 else None

    converter = ServiceProvidersConverter(in_file_path, out_file_path)
    converter.Convert()


if __name__ == '__main__':
    main(sys.argv[0], sys.argv[1:])