1#!/bin/env python
2#
3# Copyright (C) 2014 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18import logging
19import os
20import re
21import sys
22import tempfile
23import textwrap
24import uuid
25
26from xml.etree import ElementTree
27from xml.dom import minidom
28
# We are dealing with unicode data. It is extremely important to choose between
# the |unicode| type and the |str| type with unicode encoding as the default
# storage type for strings, and stick to it.
# - All strings except filenames and such are of type |unicode|
#   - Note that the xml.etree.ElementTree.parse function actually returns
#     strings in the |str| type. These will be implicitly coerced to |unicode|
#     as needed. If you don't like this, add a phase to explicitly cast these
#     strings.
# - Whenever using the |str| type, use the suffix |_str|
# - Moreover, whenever using the |str| type with |ascii| encoding, use the
#   |_str_ascii| suffix
#
# FILE_ENCODING is the encoding handed to the XML parser and used whenever
# |unicode| text is encoded for output.
FILE_ENCODING = 'utf-8'
41
class ConverterError(Exception):
    """ Error raised by the converter when a conversion step fails. """
44
45
46class ServiceProvidersConverter(object):
47    """ Convert the ServiceProviders XML into protobuf format. """
48    def __init__(self, file_path, out_file_path=None):
49        """
50        @param file_path: Absolute path to the XML file to read
51        @param out_file_path: Absolute path to the file to which the output
52                should be written.
53
54        """
55        self._file_path = file_path
56        self._out_file_path = out_file_path
57
58        self._gsm_nodes_no_mccmnc = set()
59        self._gsm_nodes_by_mccmnc = {}
60        self._mcc_mnc_by_mccmnc = {}
61
62        # Book-keeping to sanity check the total number of providers converted,
63        # and detailed information about the conversion.
64        self._xml_cdma_nodes = 0
65        self._xml_gsm_nodes = 0
66        self._protobuf_mnos_dumped = 0
67        self._protobuf_mvnos_dumped = 0
68        self._protobuf_gsm_mnos = 0
69        self._protobuf_cdma_mnos = 0
70        self._protobuf_gsm_mvnos = 0
71        self._protobuf_gsm_unique_mvnos = 0
72        # Turns out some MVNOs are MNOs using a different MCCMNC.
73        self._protobuf_gsm_mvnos_mnos = 0
74        # Remember nodes that we decide to drop at any point.
75        self._dropped_nodes = set()
76
77        # Related to the actual protobuf output:
78        self._indent = 0
79
80
81    def Convert(self):
82        """ Top level function for the conversion. """
83        parser = ElementTree.XMLParser(encoding=FILE_ENCODING)
84        element_tree = ElementTree.parse(self._file_path, parser=parser)
85        self._root = element_tree.getroot()
86        logging.info('Dumping parsed XML')
87        self._DumpXMLToTempFile()
88        self._xml_cdma_nodes = len(self._root.findall(u'.//cdma'))
89        self._xml_gsm_nodes = len(self._root.findall(u'.//gsm'))
90
91        self._TransformXML()
92        logging.info('Dumping transformed XML.')
93        self._DumpXMLToTempFile()
94
95        self._GroupGSMNodesByMCCMNC()
96        self._FindPrimaryNodes()
97
98        if self._out_file_path is not None:
99            with open(self._out_file_path, 'w') as self._out_file:
100                self._SpewProtobuf()
101        else:
102            self._out_file = sys.stdout
103            self._SpewProtobuf()
104
105        self._RunStatsDiagnostics()
106
107
108    def _CheckStatsEqual(self, lhs, lhs_name, rhs, rhs_name):
109        """
110        Test that |lhs| == |rhs| and log appropriate message.
111
112        @param lhs: One value to compare.
113        @param lhs_name: str name to be used for |lhs| for logging.
114        @param rhs: Other value to compare.
115        @param rhs_name: str name to be used for |rhs| for logging.
116        @return True if check passes, False otherwise.
117
118        """
119        result = (lhs == rhs)
120        logger = logging.info if result else logging.error
121        message = 'PASS' if result else 'FAIL'
122        logger('Sanity check: (%s) == (%s) (%d == %d) **%s**',
123               lhs_name, rhs_name, lhs, rhs, message)
124        return result
125
126
127    def _RunStatsDiagnostics(self):
128        """ Checks that the stats about nodes found / dumped tally. """
129        # First dump dropped nodes.
130        if len(self._dropped_nodes) > 0:
131            logging.warning('Following nodes were dropped:')
132            for node in self._dropped_nodes:
133                logging.info(self._PPrintXML(node).encode(FILE_ENCODING))
134
135        logging.info('######################')
136        logging.info('Conversion diagnostics')
137        logging.info('######################')
138
139        logging.info('Total number of XML CDMA nodes read [xml_cdma_nodes]: %d',
140                     self._xml_cdma_nodes)
141        logging.info('Total number of XML GSM nodes read [xml_gsm_nodes]: %d',
142                     self._xml_gsm_nodes)
143        logging.info('Total number of XML nodes read '
144                     '[xml_nodes = xml_cdma_nodes + xml_gsm_nodes]: %d',
145                     self._xml_cdma_nodes + self._xml_gsm_nodes)
146
147        logging.info('Total number of protobuf MNOs dumped '
148                     '[protobuf_mnos_dumped]: %d',
149                     self._protobuf_mnos_dumped)
150        logging.info('Total number of protobuf MVNOs dumped '
151                     '[protobuf_mvnos_dumped]: %d',
152                     self._protobuf_mvnos_dumped)
153        logging.info('Total number of protobuf nodes dropped '
154                     '[protobuf_dropped_nodes]: %d',
155                     len(self._dropped_nodes))
156        logging.info('  (See above for the exact nodes dropped)')
157
158        logging.info('Total number of protobuf CDMA MNOs '
159                     '[protobuf_cdma_mnos]: %d',
160                     self._protobuf_cdma_mnos)
161        logging.info('Total number of protobuf GSM MNOs '
162                     '[protobuf_gsm_mnos]: %d',
163                     self._protobuf_gsm_mnos)
164        logging.info('Total number of protobuf GSM MVNOs '
165                     '[protobuf_gsm_mvnos]: %d',
166                     self._protobuf_gsm_mvnos)
167        logging.info('Total number of protobuf unique GSM MVNOs. '
168                     '[protobuf_gsm_unique_mvnos]: %d',
169                     self._protobuf_gsm_unique_mvnos)
170        logging.info('  (Some MVNOs may appear in multiple MNOs)')
171        logging.info('Total number of protobuf GSM MVNOs that are also MNOs. '
172                     '[protobuf_gsm_mvnos_mnos]: %d',
173                     self._protobuf_gsm_mvnos_mnos)
174
175        check_results = []
176        check_results.append(self._CheckStatsEqual(
177                self._protobuf_mnos_dumped,
178                'protobuf_mnos_dumped',
179                self._protobuf_cdma_mnos + self._protobuf_gsm_mnos,
180                'protobuf_cdma_mnos + protobuf_gsm_mnos'))
181
182        check_results.append(self._CheckStatsEqual(
183                self._protobuf_mnos_dumped + self._protobuf_mvnos_dumped,
184                'protobuf_mnos_dumped + protobuf_mvnos_dumped',
185                (self._protobuf_cdma_mnos +
186                 self._protobuf_gsm_mnos +
187                 self._protobuf_gsm_mvnos),
188                'protobuf_cdma_mnos + protobuf_gsm_mnos + protobuf_gsm_mvnos'))
189
190        check_results.append(self._CheckStatsEqual(
191                self._xml_cdma_nodes + self._xml_gsm_nodes,
192                'xml_cdma_nodes + xml_gsm_nodes',
193                (len(self._dropped_nodes) +
194                 self._protobuf_gsm_mnos +
195                 self._protobuf_cdma_mnos +
196                 self._protobuf_gsm_unique_mvnos -
197                 self._protobuf_gsm_mvnos_mnos),
198                ('protobuf_dropped_nodes + '
199                 'protobuf_gsm_mnos + protobuf_cdma_mnos + '
200                 'protobuf_gsm_unique_mvnos - protobuf_gsm_mvnos_mnos')))
201
202        if False in check_results:
203            self._LogAndRaise('StatsDiagnostics failed.')
204
205
206    def _DumpXMLToTempFile(self):
207        """ Dumps the parsed XML to a temp file for debugging. """
208        fd, fname = tempfile.mkstemp(prefix='converter_')
209        logging.info('Dumping XML to file %s', fname)
210        with os.fdopen(fd, 'w') as fout:
211            fout.write(self._PPrintXML(self._root).encode(FILE_ENCODING))
212
213
214    def _EnrichNode(self, node, country_code, primary, roaming_required, names,
215                    provider_type):
216        """
217        Adds the information passed in as children of |node|.
218
219        @param node: The XML node to enrich.
220        @param country_code: The country code for node. Type: str.
221        @param primary: Is this node a primary provider. Type: str
222        @param roaming_required: Does this provider requires roaming. Type: str.
223        @param names: List of names for this provider. Type: [(str, str)].
224        @param provider_type: Is this node 'gsm'/'cdma'. Type: str.
225
226        """
227        ElementTree.SubElement(node, u'country', {u'code': country_code})
228        provider_map = {}
229        provider_map[u'type'] = provider_type
230        if primary is not None:
231            provider_map[u'primary'] = primary
232        if roaming_required is not None:
233            provider_map[u'roaming-required'] = roaming_required
234        ElementTree.SubElement(node, u'provider', provider_map)
235        for name, lang in names:
236            name_map = {}
237            if lang is not None:
238                name_map[u'xml:lang'] = lang
239            name_node = ElementTree.SubElement(node, u'name', name_map)
240            name_node.text = name
241
242
243    def _TransformXML(self):
244        """
245        Store the country, provider, name, type (gsm/cdma) under the
246        |gsm|/|cdma| nodes. This allows us to directly deal with these nodes
247        instead of going down the tree.
248
249        """
250        # First find all nodes to be modified, since we can't iterate the tree
251        # while modifying it.
252        nodes = {}
253        for country_node in self._root.findall(u'country'):
254            cur_country = country_node.get(u'code')
255            for provider_node in country_node.findall(u'provider'):
256                primary = provider_node.get(u'primary')
257                roaming_required = provider_node.get(u'roaming-required')
258                names = [(name_node.text, name_node.get(u'xml:lang')) for
259                         name_node in provider_node.findall(u'name')]
260
261                for gsm_node in provider_node.findall(u'gsm'):
262                    nodes[gsm_node] = (cur_country,
263                                       primary,
264                                       roaming_required,
265                                       names,
266                                       u'gsm')
267                for cdma_node in provider_node.findall(u'cdma'):
268                    # Some CDMA providers have a special name under the <cdma>
269                    # node. This name should *override* the names given outside.
270                    if cdma_node.find(u'name') is not None:
271                        names = []
272                    nodes[cdma_node] = (cur_country,
273                                        primary,
274                                        roaming_required,
275                                        names,
276                                        u'cdma')
277
278        # Now, iterate through all those nodes and update the tree.
279        for node, args in nodes.iteritems():
280            self._EnrichNode(node, *args)
281
282
283    def _CheckAmbiguousMCCMNC(self, mcc, mnc):
284        """
285        Ensure that no two mcc, mnc pairs concat to the same MCCMNC.
286
287        @param mcc: The mcc to check.
288        @param mnc: The mnc to check.
289
290        """
291        mccmnc = mcc + mnc
292        if mccmnc in self._mcc_mnc_by_mccmnc:
293            old_mcc, old_mnc = self._mcc_mnc_by_mccmnc(mccmnc)
294            if old_mcc != mcc or old_mnc != mnc:
295                self._LogAndRaise(u'Ambiguous MCCMNC pairs detected: '
296                                  u'(%s, %s) vs. (%s, %s)',
297                                  old_mcc, old_mnc, mcc, mnc)
298
299        self._mcc_mnc_by_mccmnc[u'mccmnc'] = (mcc, mnc)
300
301
302    def _GroupGSMNodesByMCCMNC(self):
303        """ Map all GSM nodes with same MCCMNC together. """
304        for gsm_node in self._root.findall(u'.//gsm'):
305            network_id_nodes = gsm_node.findall(u'network-id')
306            if not network_id_nodes:
307                logging.warning('Found a GSM node with no MCCMNC. ')
308                self._gsm_nodes_no_mccmnc.add(gsm_node)
309                continue
310
311            for network_id_node in gsm_node.findall(u'network-id'):
312                mcc = network_id_node.get(u'mcc')
313                mnc = network_id_node.get(u'mnc')
314                self._CheckAmbiguousMCCMNC(mcc, mnc)
315                mccmnc = mcc + mnc
316                if mccmnc in self._gsm_nodes_by_mccmnc:
317                    self._gsm_nodes_by_mccmnc[mccmnc].append(gsm_node)
318                else:
319                    self._gsm_nodes_by_mccmnc[mccmnc] = [gsm_node]
320
321
322    def _FindPrimaryNodes(self):
323        """
324        Finds nodes that correspond to MNOs as opposed to MVNOs.
325
326        All CDMA nodes are primary, all GSM nodes that have a unique MCCMNC are
327        primary, GSM nodes with non-unique MCCMNC that explicitly claim to be
328        primary are primary.
329
330        """
331        unique_mvnos = set()
332        self._mvnos = {}
333
334        # All cdma nodes are primary.
335        self._primary_cdma_nodes = set(self._root.findall(u'.//cdma'))
336
337        self._protobuf_cdma_mnos = len(self._primary_cdma_nodes)
338
339
340        # Start by marking all nodes with no MCCMNC primary.
341        self._primary_gsm_nodes = self._gsm_nodes_no_mccmnc
342        for mccmnc, nodes in self._gsm_nodes_by_mccmnc.iteritems():
343            mvnos = set()
344            if len(nodes) == 1:
345                self._primary_gsm_nodes.add(nodes[0])
346                continue
347
348            # Exactly one node in the list should claim to be primary.
349            primary = None
350            for node in nodes:
351                provider_node = node.find(u'provider')
352                if (provider_node.get(u'primary') and
353                    provider_node.get(u'primary') == u'true'):
354                    if primary is not None:
355                        self._LogAndRaise(
356                                u'Found two primary gsm nodes with MCCMNC['
357                                u'%s]: \n%s\n%s',
358                                mccmnc, self._PPrintXML(primary),
359                                self._PPrintXML(node))
360
361                    primary = node
362                    self._primary_gsm_nodes.add(node)
363                else:
364                    mvnos.add(node)
365            if primary is None:
366                logging.warning('Failed to find primary node with '
367                                'MCCMNC[%s]. Will make all of them '
368                                'distinct MNOs', mccmnc)
369                logging.info('Nodes found:')
370                for node in nodes:
371                    self._PPrintLogXML(logging.info, node)
372                self._primary_gsm_nodes = (self._primary_gsm_nodes | set(nodes))
373                continue
374
375            # This primary may already have MVNOs due to another MCCMNC.
376            existing_mvnos = self._mvnos.get(primary, set())
377            self._mvnos[primary] = existing_mvnos | mvnos
378            # Only add to the MVNO count the *new* MVNOs added.
379            self._protobuf_gsm_mvnos += (len(self._mvnos[primary]) -
380                                         len(existing_mvnos))
381            unique_mvnos = unique_mvnos | mvnos
382
383        self._primary_nodes = (self._primary_cdma_nodes |
384                               self._primary_gsm_nodes)
385        self._protobuf_gsm_mnos = len(self._primary_gsm_nodes)
386        self._protobuf_gsm_unique_mvnos = len(unique_mvnos)
387        self._protobuf_gsm_mvnos_mnos = len(
388                self._primary_gsm_nodes & unique_mvnos)
389
390
391    def _SortOperators(self, node_list):
392        """ Sort operators by country and name """
393        # First sort by name.
394        node_list.sort(cmp=lambda x, y:
395                          cmp(sorted([z.text for z in x.findall(u'name')]),
396                              sorted([z.text for z in y.findall(u'name')])))
397        # Now sort by country. Since list sort is stable, nodes with the same
398        # country remain sorted by name.
399        node_list.sort(cmp=lambda x, y: cmp(x.find(u'country').get(u'code'),
400                                            y.find(u'country').get(u'code')))
401
402
403    def _SpewProtobuf(self):
404        """ Entry function for dumping to prototext format. """
405        _, fname = os.path.split(__file__)
406        self._SpewComment("!!! DO NOT EDIT THIS FILE BY HAND !!!");
407        self._SpewComment("This file is generated by the script %s" % fname)
408        self._SpewComment("This file was generated from serviceproviders.xml, "
409                          "a public domain database of cellular network "
410                          "operators around the globe.")
411
412        primaries = list(self._primary_nodes)
413        self._SortOperators(primaries)
414        for node in primaries:
415            self._protobuf_mnos_dumped += 1
416            self._SpewMessageBegin(u'mno')
417            self._SpewData(node)
418            if node in self._mvnos:
419                mvnos = list(self._mvnos[node])
420                self._SortOperators(mvnos)
421                for mvno_node in mvnos:
422                    self._protobuf_mvnos_dumped += 1
423                    self._SpewMessageBegin(u'mvno')
424                    self._SpewNameFilter(mvno_node)
425                    self._SpewData(mvno_node)
426                    self._SpewMessageEnd(u'mvno')
427            self._SpewMessageEnd(u'mno')
428            self._SpewLine()
429
430
431    def _SpewNameFilter(self, node):
432        name_list = []
433        for name_node in node.findall(u'name'):
434            if name_node.text:
435                name_list.append(name_node.text)
436        if not name_list:
437            self._LogAndRaise(
438                    u'Did not find any name for MVNO. Can not create filter.\n'
439                    u'%s', self._PPrintXML(node))
440
441        name = u'|'.join(name_list)
442        self._SpewMessageBegin(u'mvno_filter')
443        self._SpewEnum(u'type', u'OPERATOR_NAME')
444        self._SpewString(u'regex', name)
445        self._SpewMessageEnd(u'mvno_filter')
446
447
448    def _SpewData(self, node):
449        self._SpewMessageBegin(u'data')
450
451        self._SpewString(u'uuid', str(uuid.uuid4()))
452        country_node = node.find(u'country')
453        self._SpewString(u'country', country_node.get(u'code'))
454
455        provider_node = node.find(u'provider')
456        provider_type = provider_node.get(u'type')
457        self._SpewEnum(u'provider_type', provider_type.upper())
458        roaming_required = provider_node.get(u'roaming-required')
459        if roaming_required is not None:
460            self._SpewBool(u'requires_roaming', roaming_required)
461        for name_node in sorted(node.findall(u'name')):
462            self._SpewLocalizedNameNode(name_node)
463
464        # GSM specific fields.
465        for network_id_node in sorted(node.findall(u'network-id')):
466            self._SpewString(u'mccmnc',
467                             network_id_node.get(u'mcc') +
468                             network_id_node.get(u'mnc'))
469
470        for apn_node in sorted(node.findall(u'apn')):
471            self._SpewMobileAPNNode(apn_node)
472
473        # CDMA specific fields.
474        for sid_node in sorted(node.findall(u'sid')):
475            self._SpewString(u'sid', sid_node.get(u'value'))
476
477        # CDMA networks have some extra username/password/dns information that
478        # corresponds very well with the APN concept of 3GPP, so we map it to an
479        # MobileAPN instead of storing it specially.
480        if (node.find(u'username') is not None or
481            node.find(u'password') is not None or
482            node.find(u'dns') is not None):
483            self._SpewMobileAPNNode(node)
484
485        self._SpewMessageEnd(u'Data')
486
487
488    def _SpewMobileAPNNode(self, apn_node):
489        self._SpewMessageBegin(u'mobile_apn')
490        apn = apn_node.get(u'value')
491        # This may be None when converting a <cdma> node to MobileAPN node.
492        if apn is None:
493            apn=''
494        self._SpewString(u'apn', apn)
495        for plan_node in sorted(apn_node.findall(u'plan')):
496            self._SpewEnum(u'plan', plan_node.get(u'type').upper())
497        for name_node in sorted(apn_node.findall(u'name')):
498            self._SpewLocalizedNameNode(name_node)
499        for gateway_node in apn_node.findall(u'gateway'):
500            self._SpewString(u'gateway', gateway_node.text)
501        for username_node in apn_node.findall(u'username'):
502            self._SpewString(u'username', username_node.text)
503        for password_node in apn_node.findall(u'password'):
504            self._SpewString(u'password', password_node.text)
505        for dns_node in sorted(apn_node.findall(u'dns')):
506            self._SpewString(u'dns', dns_node.text)
507        self._SpewMessageEnd(u'mobile_apn')
508
509
510    def _SpewLocalizedNameNode(self, name_node):
511        self._SpewMessageBegin(u'localized_name')
512        self._SpewString(u'name', name_node.text)
513        lang = name_node.get(u'xml:lang')
514        if lang is not None:
515            self._SpewString(u'language', lang)
516        self._SpewMessageEnd(u'localized_name')
517
518
519    def _SpewMessageBegin(self, message_name):
520        self._SpewLine(message_name, u'{')
521        self._indent += 1
522
523
524    def _SpewMessageEnd(self, _):
525        self._indent -= 1
526        self._SpewLine(u'}')
527
528
529    def _SpewString(self, key, value):
530        # Treat None |value| as empty string.
531        if value is None:
532            value = u''
533        self._SpewLine(key, u':', u'"' + value + u'"')
534
535
536    def _SpewBool(self, key, value):
537        self._SpewLine(key, u':', value)
538
539
540    def _SpewEnum(self, key, value):
541        self._SpewLine(key, u':', value)
542
543
544    def _SpewComment(self, comment):
545        line_length = 78 - (2 * self._indent)
546        comment_lines = textwrap.wrap(comment, line_length)
547        for line in comment_lines:
548            self._SpewLine(u'# ' + line)
549
550
551    def _SpewLine(self, *args):
552        indent = (2 * self._indent) * u' '
553        line = indent + u' '.join(args) + u'\n'
554        self._out_file.write(line.encode(FILE_ENCODING))
555
556
557    def _PPrintXML(self, node):
558        """ Returns a pretty-printed |unicode| string for the xml |node|. """
559        rough_string_str = ElementTree.tostring(node, encoding=FILE_ENCODING)
560        reparsed = minidom.parseString(rough_string_str)
561        xml_data_str = reparsed.toprettyxml(indent=u'  ',
562                                            encoding=FILE_ENCODING)
563        xml_data = unicode(xml_data_str, FILE_ENCODING)
564        lines = xml_data.split(u'\n')
565        lines = [line.strip(u'\n') for line in lines]
566        lines = [line for line in lines if not line.strip() == u'']
567        lines = [line.strip(u'\n') for line in lines if line.strip()]
568        retval = u'\n'.join(lines)
569        return retval
570
571
572    def _PPrintLogXML(self, logger, node):
573        """ Logs a given xml |node| to |logger| encoded in 'ascii' format. """
574        to_print = self._PPrintXML(node)
575        # Marshall, as best as we can to ASCII.
576        to_print_str_ascii = to_print.encode('ascii', errors='replace')
577        lines_str_ascii = to_print_str_ascii.split('\n')
578        logger('NODE:')
579        for line_str_ascii in lines_str_ascii:
580            logger(line_str_ascii)
581
582
583    def _LogAndRaise(self, fmt, *args):
584        """
585        Logs the error encoded in 'ascii' format and raises an error.
586
587        @param fmt: The base formatted string for the error.
588        @param *args: Arguments to format the string |fmt|.
589        @raises ConverterError
590
591        """
592        error_string = fmt.format(*args)
593        # Marshall, as best as we can to ASCII.
594        error_string_str_ascii = error_string.encode('ascii', errors='replace')
595        logging.error(error_string_str_ascii)
596        raise ConverterError(error_string_str_ascii)
597
598
def main(prog_name, args):
    """
    Entry function to this script.

    Expects one or two arguments: the input XML file and, optionally, the
    output file. With any other number of arguments the usage is printed and
    the script exits with status 1.

    @param prog_name: Name of the program to display.
    @param args: Command line arguments.

    """
    logging.basicConfig(level=logging.DEBUG)

    if len(args) not in (1, 2):
        print("Usage: %s <in_file> [<out_file>]" % prog_name)
        sys.exit(1)

    in_file_path = args[0]
    out_file_path = None
    if len(args) == 2:
        out_file_path = args[1]

    ServiceProvidersConverter(in_file_path, out_file_path).Convert()
618
619
# Run the conversion only when this file is executed as a script: the program
# name and the remaining command line arguments are handed to main().
if __name__ == '__main__':
    main(sys.argv[0], sys.argv[1:])
622