10e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader#!/usr/bin/env python
20e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
30e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport collections
45dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournaderimport copy
50e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport glob
60e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom os import path
70e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport sys
80e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom xml.etree import ElementTree
90e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
100e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom fontTools import ttLib
110e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
# Code point U+FE0F (VARIATION SELECTOR-16), the emoji-style variation
# selector.
EMOJI_VS = 0xFE0F
135dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# Maps lowercase language codes to the four-letter script code the language
# is written in. Consulted by lang_to_script() when a language tag carries
# no explicit script subtag.
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}

def lang_to_script(lang_code):
    """Return the four-letter script code for a hyphen-separated language tag.

    The tag is lowercased and looked up in LANG_TO_SCRIPT. While it is not
    found, the last hyphen-separated subtag is examined: a subtag of exactly
    four alphabetic characters is taken to be the script itself and returned
    title-cased (e.g. 'zh-Hans' -> 'Hans'); any other subtag is dropped and
    the lookup is retried on the shorter tag (e.g. 'en-US' -> 'en').

    Args:
        lang_code: language tag such as 'en', 'en-US' or 'zh-Hant-TW'.

    Returns:
        The script code as a string, e.g. 'Latn'.

    Raises:
        AssertionError: if no subtags are left to strip and the remaining
            language is not in LANG_TO_SCRIPT.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        if hyphen_idx == -1:
            # Raised explicitly instead of through an `assert` statement:
            # asserts are stripped under `python -O`, and without this guard
            # the loop would keep trimming the string and never terminate
            # once it became empty. The exception type is unchanged.
            raise AssertionError(
                'We do not know what script the "%s" language is written in.'
                % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]
630e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
640e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def printable(inp):
    """Format a character, character sequence, or set of sequences as text.

    A set is rendered as '{...}', a tuple as '<...>' (both with their members
    formatted recursively and joined by ', '), and anything else is formatted
    as a single code point in 'U+XXXX' form.
    """
    kind = type(inp)
    if kind is set:  # set of character sequences
        members = ', '.join(printable(seq) for seq in inp)
        return '{' + members + '}'
    if kind is tuple:  # character sequence
        members = ', '.join(printable(ch) for ch in inp)
        return '<' + members + '>'
    # single character
    return 'U+%04X' % inp
725dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
735dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def open_font(font):
    """Open a font with fontTools and return the resulting TTFont.

    Args:
        font: a (font_file, index) pair. font_file is joined onto the
            module-level _fonts_dir; index is passed as fontNumber unless it
            is None.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is None:
        return ttLib.TTFont(font_path)
    return ttLib.TTFont(font_path, fontNumber=index)
815dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
825dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def get_best_cmap(font):
    """Return the character-to-glyph mapping of the font's preferred cmap.

    Scans the font's cmap subtables for a (format 12, platform 3, encoding 10)
    table ("UCS-4") and a (format 4, platform 3, encoding 1) table ("BMP").
    The UCS-4 table's mapping is returned when one was found, otherwise the
    BMP table's. At most one table of each kind is allowed.

    Args:
        font: a (font_file, index) pair as accepted by open_font().

    Note: if the font has neither table, the final attribute access is made
    on None and fails.
    """
    ttfont = open_font(font)
    ucs4_table = None
    bmp_table = None
    for table in ttfont['cmap'].tables:
        key = (table.format, table.platformID, table.platEncID)
        if key == (4, 3, 1):
            assert bmp_table is None, 'More than one BMP cmap in %s' % (font, )
            bmp_table = table
        elif key == (12, 3, 10):
            assert ucs4_table is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            ucs4_table = table

    if ucs4_table:
        return ucs4_table.cmap
    return bmp_table.cmap
980e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
990e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def get_variation_sequences_cmap(font):
    """Return the font's variation-sequence cmap subtable, or None.

    The subtable is identified as (format 14, platform 0, encoding 5). Finding
    more than one such subtable trips an assertion.

    Args:
        font: a (font_file, index) pair as accepted by open_font().
    """
    found = None
    ttfont = open_font(font)
    for table in ttfont['cmap'].tables:
        if (table.format, table.platformID, table.platEncID) != (14, 0, 5):
            continue
        assert found is None, 'More than one VS cmap in %s' % (font, )
        found = table
    return found
1095dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1105dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def get_emoji_map(font):
    """Build a map from characters and character sequences to glyphs.

    The result contains three kinds of keys:
      * single code points, copied from the font's best cmap;
      * (base, variation_selector) pairs from the variation-sequence cmap,
        where an entry without its own glyph reuses the base character's;
      * tuples of code points for every ligature found in GSUB lookups of
        type 4.

    Args:
        font: a (font_file, index) pair as accepted by open_font().

    Returns:
        A dict from int or tuple-of-int keys to glyphs.
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    glyph_to_code = {glyph: code for code, glyph in emoji_map.items()}

    # Add variation sequences
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for selector in uvs_dict:
        for base, glyph in uvs_dict[selector]:
            # No explicit glyph: the sequence renders as the base character.
            emoji_map[(base, selector)] = (
                emoji_map[base] if glyph is None else glyph)

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        if lookup.LookupType != 4:
            # Other lookups are used in the emoji font for fallback.
            # We ignore them for now.
            continue
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        [glyph_to_code[name] for name in glyph_names])
                    # Neither the sequence itself nor any of its leading
                    # subsequences (of length two or more) may already be
                    # present in the map.
                    for prefix_len in range(2, len(sequence)+1):
                        prefix = sequence[:prefix_len]
                        assert prefix not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
1475dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1485dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of `chars`.

    Args:
        font: a (font_file, index) pair as accepted by open_font().
        chars: iterable of code points to look for in the font's best cmap.
    """
    supported = get_best_cmap(font)
    if any(char in supported for char in chars):
        return
    sys.exit('None of characters in %s were found in %s' % (chars, font))
1550e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
1560e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every code point in `chars` is in the font's best cmap.

    Args:
        font: a (font_file, index) pair as accepted by open_font().
        chars: iterable of code points that must all be present.
    """
    supported = get_best_cmap(font)
    for code_point in chars:
        is_present = code_point in supported
        assert is_present, (
            'U+%04X was not found in %s' % (code_point, font))
162fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
163fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no code point in `chars` is in the font's best cmap.

    Args:
        font: a (font_file, index) pair as accepted by open_font().
        chars: iterable of code points that must all be absent.
    """
    supported = get_best_cmap(font)
    for code_point in chars:
        is_absent = code_point not in supported
        assert is_absent, (
            'U+%04X was found in %s' % (code_point, font))
169fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
170fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def assert_font_supports_all_sequences(font, sequences):
    """Assert that the font lists every given variation sequence.

    Each (base, vs) pair, visited in sorted order, must appear in the font's
    variation-sequence cmap as a (base, None) entry under selector `vs`.

    Args:
        font: a (font_file, index) pair as accepted by open_font().
        sequences: iterable of (base, variation_selector) code point pairs.
    """
    uvs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        is_listed = vs in uvs_dict and (base, None) in uvs_dict[vs]
        assert is_listed, (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))
1765dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1775dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    The language of each 'hyph-<lang>.hyb' file in `hyphens_dir` is mapped to
    its script. Every font registered for such a script in
    _script_to_font_map must then support U+002D or U+2010.

    Args:
        hyphens_dir: directory containing the '*.hyb' hyphenation files.
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_path in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        file_name = path.basename(hyb_path)
        assert file_name.startswith('hyph-'), (
            'Unknown hyphenation file %s' % file_name)
        # The language code sits between the first '-' and the first '.'.
        lang_start = file_name.index('-') + 1
        lang_end = file_name.index('.')
        scripts.add(lang_to_script(file_name[lang_start:lang_end]))

    hyphens = {0x002D, 0x2010}
    for script in scripts:
        script_fonts = _script_to_font_map[script]
        assert script_fonts, 'No fonts found for the "%s" script' % script
        for font in script_fonts:
            assert_font_supports_any_of_chars(font, hyphens)
1940e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
1950e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
class FontRecord(object):
    """Plain record describing one font entry of the fallback chain.

    Attributes are stored exactly as passed in: name, scripts, variant,
    weight, style and font.
    """

    def __init__(self, name, scripts, variant, weight, style, font):
        # Values parse_fonts_xml() reads from the enclosing <family>.
        self.name, self.scripts, self.variant = name, scripts, variant
        # Values parse_fonts_xml() reads from the <font> element itself.
        self.weight, self.style, self.font = weight, style, font
2045dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
2055dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def parse_fonts_xml(fonts_xml_path):
    """Parse the fonts XML file into the module-level lookup structures.

    Resets and then fills two globals:
      * _fallback_chain: a list with one FontRecord per <font> element, in
        document order;
      * _script_to_font_map: a dict from script code to the set of
        (font_file, index) pairs registered for that script.

    Named families must carry neither a variant nor a language and are
    registered for Latn, Grek and Cyrl. Unnamed families are registered for
    the scripts derived from their 'lang' attribute.

    Args:
        fonts_xml_path: path of the XML file to parse.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []
    families = ElementTree.parse(fonts_xml_path).findall('family')
    # Minikin supports up to 254 families, but users can place their own font
    # in the first slot. That leaves 253 as the maximum number of font
    # families allowed in the default collection.
    assert len(families) < 254, (
        'System font collection can contains up to 253 font families.')
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if not name:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)
        else:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)

        scripts = (
            {lang_to_script(lang) for lang in langs.split()}
            if langs else set())

        # non-empty names are used for default LGC fonts
        map_scripts = {'Latn', 'Grek', 'Cyrl'} if name else scripts

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text.rstrip()
            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            # A missing (or empty) index is kept as-is; otherwise it is
            # converted to an int.
            index = child.get('index')
            if index:
                index = int(index)

            font = (font_file, index)
            record = FontRecord(
                name, frozenset(scripts), variant, weight, style, font)
            _fallback_chain.append(record)

            for script in map_scripts:
                _script_to_font_map[script].add(font)
2660e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
2670e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the emoji coverage checks against the emoji font.

    Looks up the emoji font with get_emoji_font() and forwards it, together
    with both arguments, to check_emoji_font_coverage().
    """
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
271f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
272f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def get_emoji_font():
    """Return the (font_file, index) pair of the single emoji font.

    The emoji font is the fallback-chain record whose scripts include 'Zsye'.
    Exactly one such record must exist.
    """
    emoji_fonts = []
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    font_count = len(emoji_fonts)
    assert font_count == 1, 'There are %d emoji fonts.' % font_count
    return emoji_fonts[0]
279f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
280fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected emoji.

    Four properties are asserted, in this order:
      1. every sequence in `all_emoji` is supported by the font;
      2. the font supports nothing outside `all_emoji`, apart from U+0000,
         U+000D and U+0020;
      3. each pair in `equivalent_emoji` maps to the same glyph;
      4. sequences that share a glyph resolve, by following
         `equivalent_emoji`, to one and the same sequence.

    Args:
        emoji_font: a (font_file, index) pair as accepted by open_font().
        all_emoji: container of code points and code point tuples that the
            font is expected to support.
        equivalent_emoji: dict mapping a sequence to the sequence it should
            share a glyph with.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    # The font needs to support a few extra characters, which is OK
    allowed_extras = {0x0000, 0x000D, 0x0020}
    for sequence in coverage:
        if sequence in allowed_extras:
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    for first, second in sorted(equivalent_emoji.items()):
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in a single pass, so that the coverage
    # map does not have to be rescanned once per distinct glyph.
    sequences_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        sequences_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in sequences_by_glyph.items():
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                # Follow the equivalence chain to its final sequence.
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))
3153b3c78e6ba90c58bc8a4cd4409cfc5bc854ddc3bRoozbeh Pournader
3165dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def check_emoji_defaults(default_emoji):
    """Check which fonts may carry default-emoji and default-text characters.

    Walks the fallback chain and asserts that no checked font supports any
    character in `default_emoji`. It also collects the 'Emoji' characters
    outside `default_emoji` that are found in fonts placed before the emoji
    font. Whatever is still missing afterwards, apart from the characters in
    _chars_by_age['7.0'], fails the final assertion.

    Args:
        default_emoji: set of code points that default to emoji style.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    past_emoji_font = False
    for record in _fallback_chain:
        record_scripts = record.scripts
        if 'Zsye' in record_scripts:
            # No need to check the emoji font
            past_emoji_font = True
            continue

        if past_emoji_font:
            # For later fonts, we only check them if they have a script
            # defined, since the defined script may get them to a higher
            # score even if they appear after the emoji font. However,
            # we should skip checking the text symbols font, since
            # symbol fonts should be able to override the emoji display
            # style when 'Zsym' is explicitly specified by the user.
            if not record_scripts or 'Zsym' in record_scripts:
                continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, sorted(default_emoji))

        # Mark default text-style characters appearing in fonts above the emoji
        # font as seen
        if not past_emoji_font:
            missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert not missing_text_chars, (
        'Text style version of some emoji characters are missing: ' +
            repr(missing_text_chars))
3487b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
3497b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-delimited Unicode data file.

    Everything from '#' to the end of a line is ignored, and so are lines
    that are empty once the comment is removed. In each remaining line the
    first field is a single code point, an inclusive 'START..END' range, or
    a space-separated sequence of code points (all hexadecimal), and the
    second field is the property value. Any further fields are ignored.

    Args:
        file_path: path of the data file to read.
        reverse: if true, return a dictionary that maps each property value
            to the set of characters having it, useful for some binary
            properties. Otherwise return a dictionary that maps characters
            to their property value, assuming each character appears only
            once in the file.

    Returns:
        A collections.defaultdict(set) when reverse is true, a plain dict
        otherwise. Single code points are ints and sequences are tuples of
        ints.

    Raises:
        AssertionError: in non-reverse mode, if a character or sequence is
            listed more than once.
        ValueError: if a line has fewer than two fields or a code point is
            not valid hexadecimal.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            # Drop the trailing comment, if any, then surrounding whitespace.
            line = line.partition('#')[0].strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                # split() without an argument tolerates repeated spaces.
                additions = [tuple(int(ch, 16) for ch in chars.split())]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                # range (not the Python 2-only xrange) so that this also
                # runs on Python 3.
                additions = range(int(char_start, 16), int(char_end, 16) + 1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict, (
                        'Duplicate entry for %r in %s' % (
                            addition, file_path))
                    output_dict[addition] = prop
    return output_dict
3887b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
3897b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
def parse_emoji_variants(file_path):
    """Read a variation-sequences data file and split it by display style.

    Text from '#' to the end of a line is ignored, as are lines left empty
    by that. Each remaining line must have exactly three ';'-separated
    fields: a space-separated pair of hexadecimal code points (base
    character and variation selector), a description, and a third field
    that is ignored.

    Args:
        file_path: path of the data file to read.

    Returns:
        A (text_set, emoji_set) tuple. Each is a set of (base, selector)
        int pairs, holding the lines described as 'text style' and
        'emoji style' respectively. Lines with any other description are
        parsed but not recorded.
    """
    by_style = {'text style': set(), 'emoji style': set()}
    with open(file_path) as datafile:
        for raw_line in datafile:
            content = raw_line.partition('#')[0].strip()
            if not content:
                continue
            code_points, style, _ = content.split(';')
            code_points = code_points.strip().split(' ')
            pair = (int(code_points[0], 16), int(code_points[1], 16))
            bucket = by_style.get(style.strip())
            if bucket is not None:
                bucket.add(pair)
    return by_style['text style'], by_style['emoji style']
4105dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4115dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def parse_ucd(ucd_path):
    """Load the Unicode data files under ucd_path into module globals.

    Files are read from ucd_path itself and, for emoji-data.txt,
    emoji-sequences.txt and emoji-zwj-sequences.txt, also from its
    'additions' subdirectory; the additions are merged into the main data.

    Sets the globals _emoji_properties, _chars_by_age,
    _text_variation_sequences, _emoji_variation_sequences,
    _emoji_sequences and _emoji_zwj_sequences.

    Args:
        ucd_path: directory containing the data files.
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def main_file(name):
        return path.join(ucd_path, name)

    def additions_file(name):
        return path.join(ucd_path, 'additions', name)

    # Property name -> set of characters, with the additions merged in.
    _emoji_properties = parse_unicode_datafile(
        main_file('emoji-data.txt'), reverse=True)
    extra_properties = parse_unicode_datafile(
        additions_file('emoji-data.txt'), reverse=True)
    for prop, chars in extra_properties.items():
        _emoji_properties[prop].update(chars)

    # Age value -> set of characters.
    _chars_by_age = parse_unicode_datafile(
        main_file('DerivedAge.txt'), reverse=True)

    (_text_variation_sequences,
     _emoji_variation_sequences) = parse_emoji_variants(
         main_file('emoji-variation-sequences.txt'))

    # Sequence -> property value, with the additions merged in.
    _emoji_sequences = parse_unicode_datafile(
        main_file('emoji-sequences.txt'))
    _emoji_sequences.update(parse_unicode_datafile(
        additions_file('emoji-sequences.txt')))

    _emoji_zwj_sequences = parse_unicode_datafile(
        main_file('emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(parse_unicode_datafile(
        additions_file('emoji-zwj-sequences.txt')))
4365dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4375dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def flag_sequence(territory_code):
    """Return the code point tuple for a territory code.

    Each character of territory_code is shifted so that 'A' maps to
    0x1F1E6, 'B' to 0x1F1E7, and so on; e.g. 'US' gives
    (0x1F1FA, 0x1F1F8).
    """
    shift = 0x1F1E6 - ord('A')
    return tuple(shift + ord(letter) for letter in territory_code)
4405dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4415dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# Flag sequences that compute_expected_emoji() removes from the set of
# sequences it expects.
UNSUPPORTED_FLAGS = frozenset(
    flag_sequence(territory) for territory in (
        'BL', 'BQ', 'DG',
        'EA', 'EH', 'FK',
        'GF', 'GP', 'GS',
        'MF', 'MQ', 'NC',
        'PM', 'RE', 'TF',
        'WF', 'XK', 'YT',
    ))
4505dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# Flag sequence -> flag sequence, merged into the equivalence map built by
# compute_expected_emoji().
# NOTE(review): presumably the territory on the left is drawn with the flag
# of the territory on the right -- confirm.
EQUIVALENT_FLAGS = dict(
    (flag_sequence(territory), flag_sequence(same_as))
    for territory, same_as in (
        ('BV', 'NO'),
        ('CP', 'FR'),
        ('HM', 'AU'),
        ('SJ', 'NO'),
        ('UM', 'US'),
    ))
4585dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# U+20E3 COMBINING ENCLOSING KEYCAP, the second element of the keycap
# sequences below.
COMBINING_KEYCAP = 0x20E3

# Maps single code points in the U+FExxx range to the flag or keycap
# sequence they stand for. compute_expected_emoji() adds the keys to both
# the "all emoji" and "default emoji" sets and merges the mapping into the
# equivalence map.
# NOTE(review): the name suggests these are code points used by older
# Android releases -- confirm.
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}
4845dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
ZWJ = 0x200D  # ZERO WIDTH JOINER
FEMALE_SIGN = 0x2640
MALE_SIGN = 0x2642

# ZWJ sequences that compute_expected_emoji() records as equivalent to a
# single code point.
ZWJ_IDENTICALS = {
    # KISS
    (0x1F469, ZWJ, 0x2764, ZWJ, 0x1F48B, ZWJ, 0x1F468): 0x1F48F,
    # COUPLE WITH HEART
    (0x1F469, ZWJ, 0x2764, ZWJ, 0x1F468): 0x1F491,
    # FAMILY
    (0x1F468, ZWJ, 0x1F469, ZWJ, 0x1F466): 0x1F46A,
}
497aa3ee8e079e470c72894f0833ac4eb518143e4ddRoozbeh Pournader
# (base character, gender sign) pairs. For each pair,
# compute_expected_emoji() records the sequence (base, ZWJ, sign) as
# equivalent to the bare base character, and, for every skin tone modifier
# whose (base, tone, ZWJ, sign) sequence is among the expected emoji,
# records that sequence as equivalent to (base, tone).
GENDER_DEFAULTS = [
    (0x26F9, MALE_SIGN), # PERSON WITH BALL
    (0x1F3C3, MALE_SIGN), # RUNNER
    (0x1F3C4, MALE_SIGN), # SURFER
    (0x1F3CA, MALE_SIGN), # SWIMMER
    (0x1F3CB, MALE_SIGN), # WEIGHT LIFTER
    (0x1F3CC, MALE_SIGN), # GOLFER
    (0x1F46E, MALE_SIGN), # POLICE OFFICER
    (0x1F46F, FEMALE_SIGN), # WOMAN WITH BUNNY EARS
    (0x1F471, MALE_SIGN), # PERSON WITH BLOND HAIR
    (0x1F473, MALE_SIGN), # MAN WITH TURBAN
    (0x1F477, MALE_SIGN), # CONSTRUCTION WORKER
    (0x1F481, FEMALE_SIGN), # INFORMATION DESK PERSON
    (0x1F482, MALE_SIGN), # GUARDSMAN
    (0x1F486, FEMALE_SIGN), # FACE MASSAGE
    (0x1F487, FEMALE_SIGN), # HAIRCUT
    (0x1F575, MALE_SIGN), # SLEUTH OR SPY
    (0x1F645, FEMALE_SIGN), # FACE WITH NO GOOD GESTURE
    (0x1F646, FEMALE_SIGN), # FACE WITH OK GESTURE
    (0x1F647, MALE_SIGN), # PERSON BOWING DEEPLY
    (0x1F64B, FEMALE_SIGN), # HAPPY PERSON RAISING ONE HAND
    (0x1F64D, FEMALE_SIGN), # PERSON FROWNING
    (0x1F64E, FEMALE_SIGN), # PERSON WITH POUTING FACE
    (0x1F6A3, MALE_SIGN), # ROWBOAT
    (0x1F6B4, MALE_SIGN), # BICYCLIST
    (0x1F6B5, MALE_SIGN), # MOUNTAIN BICYCLIST
    (0x1F6B6, MALE_SIGN), # PEDESTRIAN
    (0x1F926, FEMALE_SIGN), # FACE PALM
    (0x1F937, FEMALE_SIGN), # SHRUG
    (0x1F938, MALE_SIGN), # PERSON DOING CARTWHEEL
    (0x1F939, MALE_SIGN), # JUGGLING
    (0x1F93C, MALE_SIGN), # WRESTLERS
    (0x1F93D, MALE_SIGN), # WATER POLO
    (0x1F93E, MALE_SIGN), # HANDBALL
    (0x1F9D6, FEMALE_SIGN), # PERSON IN STEAMY ROOM
    (0x1F9D7, FEMALE_SIGN), # PERSON CLIMBING
    (0x1F9D8, FEMALE_SIGN), # PERSON IN LOTUS POSITION
    (0x1F9D9, FEMALE_SIGN), # MAGE
    (0x1F9DA, FEMALE_SIGN), # FAIRY
    (0x1F9DB, FEMALE_SIGN), # VAMPIRE
    (0x1F9DC, FEMALE_SIGN), # MERPERSON
    (0x1F9DD, FEMALE_SIGN), # ELF
    (0x1F9DE, FEMALE_SIGN), # GENIE
    (0x1F9DF, FEMALE_SIGN), # ZOMBIE
]
543f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def is_fitzpatrick_modifier(cp):
    """Return True if cp is in the range U+1F3FB..U+1F3FF, inclusive.

    That range holds the skin tone (Fitzpatrick) modifier characters.
    """
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, with skin tone modifiers moved back to the right.

    The sequence is reversed and then scanned once, left to right. Whenever
    the element just before the current index is a skin tone modifier, it
    is swapped with the current element, so that a modifier which followed
    its base character in seq follows it again in the result.

    Because the scan looks at the element it may just have swapped into
    place, a modifier that has been moved is examined again on the next
    step and keeps moving right until the scan ends.
    NOTE(review): this only matches "keep the modifier after the emoji it
    modifies" when the modified character is the first element of seq.
    Confirm against the tool that builds the reversed sequences into the
    fonts before changing it.

    Args:
        seq: an iterable of code points that supports reversed().

    Returns:
        The reordered code points as a tuple.
    """
    rev = list(reversed(seq))
    # range rather than the Python 2-only xrange, so that this also runs on
    # Python 3.
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)
556f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
557f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def compute_expected_emoji():
    """Build the emoji sets and equivalence map from the parsed data.

    Reads the module globals filled in by parse_ucd(). As a side effect,
    adds the (0x1F3F4, 0xE007F) tag sequence to the global
    _emoji_sequences.

    Returns:
        A tuple (all_emoji, default_emoji, equivalent_emoji):
        all_emoji: characters with the 'Emoji' property, every expected
            sequence, every code point used in an emoji or ZWJ sequence,
            the tag characters U+E0030..U+E0039 and U+E0061..U+E007A, and
            the keys of LEGACY_ANDROID_EMOJI.
        default_emoji: characters with the 'Emoji_Presentation' property,
            every expected sequence, and the keys of LEGACY_ANDROID_EMOJI.
        equivalent_emoji: dictionary mapping a character or sequence to
            the character or sequence it is equivalent to.
    """
    def without_emoji_vs(code_points):
        return tuple(cp for cp in code_points if cp != EMOJI_VS)

    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set(_emoji_variation_sequences)

    # Add empty flag tag sequence that is supported as fallback
    _emoji_sequences[(0x1F3F4, 0xE007F)] = 'Emoji_Tag_Sequence'

    for listed in _emoji_sequences:
        stripped = without_emoji_vs(listed)
        all_sequences.add(stripped)
        sequence_pieces.update(stripped)
        if _emoji_sequences.get(stripped, None) != 'Emoji_Tag_Sequence':
            continue
        # Also expect the reverse of every tag sequence, as is done for ZWJ
        # sequences below as a workaround to get the sequences to work in
        # RTL text.
        # TODO: test if these are actually needed by Minikin/HarfBuzz.
        backwards = reverse_emoji(stripped)
        all_sequences.add(backwards)
        equivalent_emoji[backwards] = stripped

    for listed in _emoji_zwj_sequences:
        stripped = without_emoji_vs(listed)
        all_sequences.add(stripped)
        sequence_pieces.update(stripped)
        # Add reverse of all emoji ZWJ sequences, which are added to the
        # fonts as a workaround to get the sequences work in RTL text.
        backwards = reverse_emoji(stripped)
        all_sequences.add(backwards)
        equivalent_emoji[backwards] = stripped

    # Remove unsupported flags
    all_sequences.difference_update(UNSUPPORTED_FLAGS)

    # Add all tag characters used in flags
    for first_tag, last_tag in ((0xE0030, 0xE0039), (0xE0061, 0xE007A)):
        sequence_pieces.update(range(first_tag, last_tag + 1))

    legacy_code_points = set(LEGACY_ANDROID_EMOJI)
    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        legacy_code_points)
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        all_sequences |
        legacy_code_points)

    for known_equivalents in (
            EQUIVALENT_FLAGS, LEGACY_ANDROID_EMOJI, ZWJ_IDENTICALS):
        equivalent_emoji.update(known_equivalents)

    # A base character followed by its default gender sign is equivalent to
    # the bare base character, with or without a skin tone modifier.
    skin_tones = range(0x1F3FB, 0x1F3FF+1)
    for base, gender_sign in GENDER_DEFAULTS:
        equivalent_emoji[(base, ZWJ, gender_sign)] = base
        for tone in skin_tones:
            gendered = (base, tone, ZWJ, gender_sign)
            if gendered in all_emoji:
                equivalent_emoji[gendered] = (base, tone)

    # An emoji-style variation sequence is equivalent to its base character.
    for variation_sequence in _emoji_variation_sequences:
        equivalent_emoji[variation_sequence] = variation_sequence[0]

    return all_emoji, default_emoji, equivalent_emoji
625fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
626fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def check_vertical_metrics():
    """Assert that the named font families have the expected metrics.

    For every record in the fallback chain named 'sans-serif' or
    'sans-serif-condensed', the font's head table must have yMax 2163 and
    yMin -555. For those two names plus 'serif' and 'monospace', the hhea
    table must have ascent 1900 and descent -500.

    Raises:
        AssertionError: if a font does not match the expected values.
    """
    head_checked_names = ('sans-serif', 'sans-serif-condensed')
    hhea_checked_names = head_checked_names + ('serif', 'monospace')

    for record in _fallback_chain:
        family_name = record.name

        if family_name in head_checked_names:
            head = open_font(record.font)['head']
            assert head.yMax == 2163 and head.yMin == -555, (
                'yMax and yMin of %s do not match expected values.' % (
                    record.font,))

        if family_name in hhea_checked_names:
            hhea = open_font(record.font)['hhea']
            assert hhea.ascent == 1900 and hhea.descent == -500, (
                'ascent and descent of %s do not match expected '
                'values.' % (record.font,))
64263d4d0d3580fc0e4d6ae1543b8ae1b186b8119deRoozbeh Pournader
64363d4d0d3580fc0e4d6ae1543b8ae1b186b8119deRoozbeh Pournader
def check_cjk_punctuation():
    """Assert that no font ahead of the first CJK font has CJK punctuation.

    Walks the fallback chain in order and, for each record, checks that its
    font supports none of U+3000..U+301F. The walk ends at the first record
    whose scripts include any of Hans, Hant, Jpan or Kore; that record and
    the ones after it are not checked.
    """
    first_char, last_char = 0x3000, 0x301F
    cjk_punctuation = range(first_char, last_char + 1)
    cjk_scripts = {'Hans', 'Hant', 'Jpan', 'Kore'}

    for record in _fallback_chain:
        is_cjk_font = bool(record.scripts.intersection(cjk_scripts))
        if is_cjk_font:
            # CJK font seen. Stop checking the rest of the fonts.
            return
        assert_font_supports_none_of_chars(record.font, cjk_punctuation)
652bac1aec6354cc1766cf4ff03578d32d0fa623cb0Roozbeh Pournader
653bac1aec6354cc1766cf4ff03578d32d0fa623cb0Roozbeh Pournader
def main():
    """Run the font chain checks using the command line arguments.

    sys.argv[1] is the target output directory; it is expected to contain
    'fonts', 'etc/fonts.xml' and 'usr/hyphen-data'. sys.argv[2] turns the
    emoji checks on when it is exactly 'true', in which case sys.argv[3] is
    the directory holding the Unicode data files.

    Sets the global _fonts_dir.
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))

    check_vertical_metrics()

    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))

    check_cjk_punctuation()

    # The emoji checks are optional and need the Unicode data files.
    if sys.argv[2] != 'true':
        return
    parse_ucd(sys.argv[3])
    all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
    check_emoji_coverage(all_emoji, equivalent_emoji)
    check_emoji_defaults(default_emoji)
676fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
6770e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
# Run the checks only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()
680