1#! /usr/bin/env python
2
3"""world -- Print mappings between country names and DNS country codes.
4
5Contact: Barry Warsaw
6Email:   barry@python.org
7Version: %(__version__)s
8
9This script will take a list of Internet addresses and print out where in the
10world those addresses originate from, based on the top-level domain country
11code found in the address.  Addresses can be in any of the following forms:
12
13    xx                -- just the country code or top-level domain identifier
14    host.domain.xx    -- any Internet host or network name
15    somebody@where.xx -- an Internet email address
16
17If no match is found, the address is interpreted as a regular expression and a
18reverse lookup is attempted.  This script will search the country names and
19print a list of matching entries.  You can force reverse mappings with the
20`-r' flag (see below).
21
22For example:
23
24    %% world tz us
25    tz originated from Tanzania, United Republic of
26    us originated from United States
27
28    %% world united
29    united matches 6 countries:
30        ae: United Arab Emirates
31        uk: United Kingdom (common practice)
32        um: United States Minor Outlying Islands
33        us: United States
34        tz: Tanzania, United Republic of
35        gb: United Kingdom
36
37Country codes are maintained by the RIPE Network Coordination Centre,
38in coordination with the ISO 3166 Maintenance Agency at DIN Berlin.  The
39authoritative source of country code mappings is:
40
41    <url:ftp://ftp.ripe.net/iso3166-countrycodes.txt>
42
43The latest known change to this information was:
44
45    Friday, 5 April 2002, 12.00 CET 2002
46
47This script also knows about non-geographic top-level domains, and the
48additional ccTLDs reserved by IANA.
49
Usage: %(PROGRAM)s [-d] [-p file] [-o] [-r] [-h] addr [addr ...]
51
52    --dump
53    -d
54        Print mapping of all top-level domains.
55
56    --parse file
57    -p file
58        Parse an iso3166-countrycodes file extracting the two letter country
59        code followed by the country name.  Note that the three letter country
60        codes and numbers, which are also provided in the standard format
61        file, are ignored.
62
63    --outputdict
64    -o
65        When used in conjunction with the `-p' option, output is in the form
66        of a Python dictionary, and country names are normalized
67        w.r.t. capitalization.  This makes it appropriate for cutting and
68        pasting back into this file.  Output is always to standard out.
69
70    --reverse
71    -r
        Force reverse lookup.  In this mode the address can be any Python
        regular expression; this is matched against all country names and a
        list of matching mappings is printed.  In normal mode (i.e. without
        this flag), reverse lookup is performed on addresses if no matching
        country code is found.
77
78    -h
79    --help
80        Print this message.
81"""
# Interpolated into the help text through the `%(__version__)s' placeholder in
# the module docstring.  The value looks like an unexpanded version-control
# keyword -- TODO confirm whether anything still expands it.
__version__ = '$Revision$'
83
84
85import sys
86import getopt
87import re
88
# Name the script was invoked as; interpolated into the help text through the
# `%(PROGRAM)s' placeholder in the module docstring.
PROGRAM = sys.argv[0]
90
91
92
def usage(code, msg=''):
    """Print the help text, an optional message, and exit.

    The module docstring doubles as the help text.  It is %-interpolated
    against the module globals, which is how `%(PROGRAM)s' and
    `%(__version__)s' get filled in.

    code -- exit status handed to sys.exit().
    msg  -- optional extra line printed after the help text; anything
            printable is accepted (main() passes a getopt error object).

    This function does not return: sys.exit() raises SystemExit.
    """
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(__doc__ % globals())
    if msg:
        print(msg)
    sys.exit(code)
98
99
100
def resolve(rawaddr):
    """Report the origin of rawaddr based on its top-level domain.

    rawaddr may be a bare code (`us'), a host name (`host.domain.us') or an
    email address (`somebody@where.us'); only the text after the last dot
    is looked at.  The lookup ignores case, since the tables are keyed by
    lower-case codes.

    Returns None when the address was resolved (a line describing it has
    been printed), otherwise returns rawaddr unchanged so the caller can
    bounce it to the next step (reverse lookup).
    """
    # str.split() always yields at least one element, so for an address
    # without a dot the "top-level domain" is simply the whole string.
    tld = rawaddr.split('.')[-1].lower()
    if tld in nameorgs:
        print('%s is in the %s top level domain' % (rawaddr, nameorgs[tld]))
        return None
    if tld in countries:
        print('%s originated from %s' % (rawaddr, countries[tld]))
        return None
    # Not resolved, bounce it to the next step
    return rawaddr
116
117
118
def reverse(regexp):
    """Search the descriptions in the combined table for regexp.

    regexp is compiled as a case-insensitive Python regular expression and
    searched for in every value of the module-level `all' mapping.  The
    matching codes are printed in sorted order.

    Returns None when at least one entry matched (the result has been
    printed), otherwise returns regexp unchanged so the caller can bounce
    it to the next step.  A string that is not a valid regular expression
    cannot match anything and is bounced the same way instead of raising.
    """
    try:
        cre = re.compile(regexp, re.IGNORECASE)
    except re.error:
        # Addresses that fell through from the forward lookup are arbitrary
        # user text and need not be well-formed patterns.
        return regexp
    matches = sorted(
        code for code, country in all.items() if cre.search(country))
    if not matches:
        # not resolved, bounce it to the next step
        return regexp
    if len(matches) == 1:
        code = matches[0]
        print("%s matches code `%s', %s" % (regexp, code, all[code]))
    else:
        print('%s matches %d countries:' % (regexp, len(matches)))
        for code in matches:
            print("    %s: %s" % (code, all[code]))
    return None
138
139
140
def _normalize_country(country):
    """Return the all-capitals country name from the codes file in mixed case.

    Each whitespace-separated word is capitalized, with a set of special
    cases for small words, apostrophes, parentheses and hyphens.
    """
    words = country.split()
    for i, w in enumerate(words):
        # XXX special cases
        if w in ('AND', 'OF', 'OF)', 'name:', 'METROPOLITAN'):
            words[i] = w.lower()
        elif w == 'THE' and i != 1:
            # A `THE' in second position (as in "CONGO, THE ...") is left to
            # the default branch below and so comes out as "The".
            words[i] = w.lower()
        elif len(w) > 3 and w[1] == "'":
            # e.g. D'IVOIRE -> D'Ivoire
            words[i] = w[0:3].upper() + w[3:].lower()
        elif w in ('(U.S.)', 'U.S.'):
            pass
        elif w[0] == '(' and w != '(local':
            words[i] = '(' + w[1:].capitalize()
        elif '-' in w:
            words[i] = '-'.join([s.capitalize() for s in w.split('-')])
        else:
            words[i] = w.capitalize()
    return ' '.join(words)


def parse(file, normalize):
    """Print the two-letter codes found in an iso3166-countrycodes file.

    file      -- path of the file to read.
    normalize -- when true, output is a Python dictionary literal named
                 `countries' with lower-case codes and mixed-case names;
                 when false, each entry is printed as "CODE COUNTRY".

    Entries are only read between the first line starting with `-' and the
    next such line.  Each entry line holds the country name, the two-letter
    code, a three-letter code and a three-digit number; the last two are
    ignored.  Non-blank lines in that region that do not fit are reported
    and skipped.

    If the file cannot be opened, the error is printed and nothing else is
    output.  Output always goes to standard out; nothing is returned.
    """
    try:
        fp = open(file)
    except IOError as err:
        print('%s : %s' % (err.strerror, file))
        # Without a file there is nothing to scan.
        return

    cre = re.compile('(.*?)[ \t]+([A-Z]{2})[ \t]+[A-Z]{3}[ \t]+[0-9]{3}')
    scanning = False

    with fp:
        if normalize:
            print('countries = {')

        for line in fp:
            if not scanning:
                # Skip the preamble up to the first dashed separator.
                if line[0] == '-':
                    scanning = True
                continue
            mo = cre.match(line)
            if not mo:
                line = line.strip()
                if not line:
                    continue
                if line[0] == '-':
                    # Closing separator: end of the table.
                    break
                print('Could not parse line: %s' % (line,))
                continue
            country, code = mo.group(1, 2)
            if normalize:
                print('    "%s": "%s",'
                      % (code.lower(), _normalize_country(country)))
            else:
                print('%s %s' % (code, country))

        if normalize:
            print('    }')
200
201
def main():
    """Command-line entry point: parse sys.argv and run the chosen mode.

    Modes, in order of precedence:
      -h/--help        print the help text and exit with status 0
      -d/--dump        print both code tables, sorted by code
      -p/--parse FILE  parse a country-codes file (-o/--outputdict selects
                       dictionary output)
      otherwise        look up every remaining argument, first by code
                       (skipped with -r/--reverse), then by regular
                       expression; arguments that match neither way are
                       reported as unknown.

    An invalid option prints the help text plus the getopt message and
    exits with status 1.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'p:rohd',
            ['parse=', 'reverse', 'outputdict', 'help', 'dump'])
    except getopt.error as msg:
        usage(1, msg)

    show_help = False
    dump = False
    parsefile = None
    normalize = False
    forcerev = False

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            show_help = True
        elif opt in ('-d', '--dump'):
            dump = True
        elif opt in ('-p', '--parse'):
            parsefile = arg
        elif opt in ('-o', '--outputdict'):
            normalize = True
        elif opt in ('-r', '--reverse'):
            forcerev = True

    if show_help:
        usage(0)

    if dump:
        print('Non-geographic domains:')
        for code in sorted(nameorgs):
            print('    %4s: %s' % (code, nameorgs[code]))

        print('\nCountry coded domains:')
        for code in sorted(countries):
            print('    %2s: %s' % (code, countries[code]))
    elif parsefile:
        parse(parsefile, normalize)
    else:
        # Each step prints what it resolves and hands back what it could
        # not.  list() forces a whole step to finish before the next one
        # starts, so the output stays grouped by step even where filter()
        # and map() are lazy.
        if not forcerev:
            args = list(filter(None, map(resolve, args)))
        args = list(filter(None, map(reverse, args)))
        for arg in args:
            print('Where in the world is %s?' % arg)
253
254
255
# The mappings
#
# `nameorgs' maps non-geographic top-level domains, plus a few country-style
# codes that are not part of ISO 3166, to a short description.  Keys are
# lower case.
nameorgs = {
    # New top level domains as described by ICANN
    # http://www.icann.org/tlds/
    "aero": "air-transport industry",
    "arpa": "Arpanet",
    "biz": "business",
    "com": "commercial",
    "coop": "cooperatives",
    "edu": "educational",
    "gov": "government",
    "info": "unrestricted `info'",
    "int": "international",
    "mil": "military",
    "museum": "museums",
    "name": "`name' (for registration by individuals)",
    "net": "networking",
    "org": "non-commercial",
    "pro": "professionals",
    # These additional ccTLDs are included here even though they are not part
    # of ISO 3166.  IANA has 5 reserved ccTLDs as described here:
    #
    # http://www.iso.org/iso/en/prods-services/iso3166ma/04background-on-iso-3166/iso3166-1-and-ccTLDs.html
    #
    # but I can't find an official list anywhere.
    #
    # Note that `uk' is the common practice country code for the United
    # Kingdom.  AFAICT, the official `gb' code is routinely ignored!
    #
    # <D.M.Pick@qmw.ac.uk> tells me that `uk' was long in use before ISO3166
    # was adopted for top-level DNS zone names (although in the reverse order
    # like uk.ac.qmw) and was carried forward (with the reversal) to avoid a
    # large-scale renaming process as the UK switched from their old `Coloured
    # Book' protocols over X.25 to Internet protocols over IP.
    #
    # See <url:ftp://ftp.ripe.net/ripe/docs/ripe-159.txt>
    #
    # Also, `su', while obsolete, is still in limited use.
    "ac": "Ascension Island",
    "gg": "Guernsey",
    "im": "Isle of Man",
    "je": "Jersey",
    "uk": "United Kingdom (common practice)",
    "su": "Soviet Union (still in limited use)",
    }
301
302
303
# Two-letter country codes (lower case) mapped to country names.  The layout
# matches what the `-p file -o' mode of this script prints, so the table can
# be regenerated and pasted back in.  Automatic capitalization gets a few
# names wrong (e.g. it yields "Mcdonald"); such entries are corrected by hand.
countries = {
    "af": "Afghanistan",
    "al": "Albania",
    "dz": "Algeria",
    "as": "American Samoa",
    "ad": "Andorra",
    "ao": "Angola",
    "ai": "Anguilla",
    "aq": "Antarctica",
    "ag": "Antigua and Barbuda",
    "ar": "Argentina",
    "am": "Armenia",
    "aw": "Aruba",
    "au": "Australia",
    "at": "Austria",
    "az": "Azerbaijan",
    "bs": "Bahamas",
    "bh": "Bahrain",
    "bd": "Bangladesh",
    "bb": "Barbados",
    "by": "Belarus",
    "be": "Belgium",
    "bz": "Belize",
    "bj": "Benin",
    "bm": "Bermuda",
    "bt": "Bhutan",
    "bo": "Bolivia",
    "ba": "Bosnia and Herzegowina",
    "bw": "Botswana",
    "bv": "Bouvet Island",
    "br": "Brazil",
    "io": "British Indian Ocean Territory",
    "bn": "Brunei Darussalam",
    "bg": "Bulgaria",
    "bf": "Burkina Faso",
    "bi": "Burundi",
    "kh": "Cambodia",
    "cm": "Cameroon",
    "ca": "Canada",
    "cv": "Cape Verde",
    "ky": "Cayman Islands",
    "cf": "Central African Republic",
    "td": "Chad",
    "cl": "Chile",
    "cn": "China",
    "cx": "Christmas Island",
    "cc": "Cocos (Keeling) Islands",
    "co": "Colombia",
    "km": "Comoros",
    "cg": "Congo",
    "cd": "Congo, The Democratic Republic of the",
    "ck": "Cook Islands",
    "cr": "Costa Rica",
    "ci": "Cote D'Ivoire",
    "hr": "Croatia",
    "cu": "Cuba",
    "cy": "Cyprus",
    "cz": "Czech Republic",
    "dk": "Denmark",
    "dj": "Djibouti",
    "dm": "Dominica",
    "do": "Dominican Republic",
    "tp": "East Timor",
    "ec": "Ecuador",
    "eg": "Egypt",
    "sv": "El Salvador",
    "gq": "Equatorial Guinea",
    "er": "Eritrea",
    "ee": "Estonia",
    "et": "Ethiopia",
    "fk": "Falkland Islands (Malvinas)",
    "fo": "Faroe Islands",
    "fj": "Fiji",
    "fi": "Finland",
    "fr": "France",
    "gf": "French Guiana",
    "pf": "French Polynesia",
    "tf": "French Southern Territories",
    "ga": "Gabon",
    "gm": "Gambia",
    "ge": "Georgia",
    "de": "Germany",
    "gh": "Ghana",
    "gi": "Gibraltar",
    "gr": "Greece",
    "gl": "Greenland",
    "gd": "Grenada",
    "gp": "Guadeloupe",
    "gu": "Guam",
    "gt": "Guatemala",
    "gn": "Guinea",
    "gw": "Guinea-Bissau",
    "gy": "Guyana",
    "ht": "Haiti",
    "hm": "Heard Island and McDonald Islands",
    "va": "Holy See (Vatican City State)",
    "hn": "Honduras",
    "hk": "Hong Kong",
    "hu": "Hungary",
    "is": "Iceland",
    "in": "India",
    "id": "Indonesia",
    "ir": "Iran, Islamic Republic of",
    "iq": "Iraq",
    "ie": "Ireland",
    "il": "Israel",
    "it": "Italy",
    "jm": "Jamaica",
    "jp": "Japan",
    "jo": "Jordan",
    "kz": "Kazakstan",
    "ke": "Kenya",
    "ki": "Kiribati",
    "kp": "Korea, Democratic People's Republic of",
    "kr": "Korea, Republic of",
    "kw": "Kuwait",
    "kg": "Kyrgyzstan",
    "la": "Lao People's Democratic Republic",
    "lv": "Latvia",
    "lb": "Lebanon",
    "ls": "Lesotho",
    "lr": "Liberia",
    "ly": "Libyan Arab Jamahiriya",
    "li": "Liechtenstein",
    "lt": "Lithuania",
    "lu": "Luxembourg",
    "mo": "Macau",
    "mk": "Macedonia, The Former Yugoslav Republic of",
    "mg": "Madagascar",
    "mw": "Malawi",
    "my": "Malaysia",
    "mv": "Maldives",
    "ml": "Mali",
    "mt": "Malta",
    "mh": "Marshall Islands",
    "mq": "Martinique",
    "mr": "Mauritania",
    "mu": "Mauritius",
    "yt": "Mayotte",
    "mx": "Mexico",
    "fm": "Micronesia, Federated States of",
    "md": "Moldova, Republic of",
    "mc": "Monaco",
    "mn": "Mongolia",
    "ms": "Montserrat",
    "ma": "Morocco",
    "mz": "Mozambique",
    "mm": "Myanmar",
    "na": "Namibia",
    "nr": "Nauru",
    "np": "Nepal",
    "nl": "Netherlands",
    "an": "Netherlands Antilles",
    "nc": "New Caledonia",
    "nz": "New Zealand",
    "ni": "Nicaragua",
    "ne": "Niger",
    "ng": "Nigeria",
    "nu": "Niue",
    "nf": "Norfolk Island",
    "mp": "Northern Mariana Islands",
    "no": "Norway",
    "om": "Oman",
    "pk": "Pakistan",
    "pw": "Palau",
    "ps": "Palestinian Territory, Occupied",
    "pa": "Panama",
    "pg": "Papua New Guinea",
    "py": "Paraguay",
    "pe": "Peru",
    "ph": "Philippines",
    "pn": "Pitcairn",
    "pl": "Poland",
    "pt": "Portugal",
    "pr": "Puerto Rico",
    "qa": "Qatar",
    "re": "Reunion",
    "ro": "Romania",
    "ru": "Russian Federation",
    "rw": "Rwanda",
    "sh": "Saint Helena",
    "kn": "Saint Kitts and Nevis",
    "lc": "Saint Lucia",
    "pm": "Saint Pierre and Miquelon",
    "vc": "Saint Vincent and the Grenadines",
    "ws": "Samoa",
    "sm": "San Marino",
    "st": "Sao Tome and Principe",
    "sa": "Saudi Arabia",
    "sn": "Senegal",
    "sc": "Seychelles",
    "sl": "Sierra Leone",
    "sg": "Singapore",
    "sk": "Slovakia",
    "si": "Slovenia",
    "sb": "Solomon Islands",
    "so": "Somalia",
    "za": "South Africa",
    "gs": "South Georgia and the South Sandwich Islands",
    "es": "Spain",
    "lk": "Sri Lanka",
    "sd": "Sudan",
    "sr": "Suriname",
    "sj": "Svalbard and Jan Mayen",
    "sz": "Swaziland",
    "se": "Sweden",
    "ch": "Switzerland",
    "sy": "Syrian Arab Republic",
    "tw": "Taiwan, Province of China",
    "tj": "Tajikistan",
    "tz": "Tanzania, United Republic of",
    "th": "Thailand",
    "tg": "Togo",
    "tk": "Tokelau",
    "to": "Tonga",
    "tt": "Trinidad and Tobago",
    "tn": "Tunisia",
    "tr": "Turkey",
    "tm": "Turkmenistan",
    "tc": "Turks and Caicos Islands",
    "tv": "Tuvalu",
    "ug": "Uganda",
    "ua": "Ukraine",
    "ae": "United Arab Emirates",
    "gb": "United Kingdom",
    "us": "United States",
    "um": "United States Minor Outlying Islands",
    "uy": "Uruguay",
    "uz": "Uzbekistan",
    "vu": "Vanuatu",
    "ve": "Venezuela",
    "vn": "Viet Nam",
    "vg": "Virgin Islands, British",
    "vi": "Virgin Islands, U.S.",
    "wf": "Wallis and Futuna",
    "eh": "Western Sahara",
    "ye": "Yemen",
    "yu": "Yugoslavia",
    "zm": "Zambia",
    "zw": "Zimbabwe",
    }
545
# Combined code -> description table searched by reverse lookups.  Country
# entries are applied last, so they take precedence should a code ever appear
# in both tables.  NOTE: this module-level name hides the `all' builtin
# throughout the module.
all = dict(nameorgs)
all.update(countries)
548
549
# Run the command-line interface only when executed as a script, so that
# importing the module just defines the tables and functions.
if __name__ == '__main__':
    main()
552