fontchain_lint.py revision aa3ee8e079e470c72894f0833ac4eb518143e4dd
10e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader#!/usr/bin/env python
20e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
30e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport collections
45dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournaderimport copy
50e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport glob
65dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournaderimport itertools
70e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom os import path
80e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderimport sys
90e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom xml.etree import ElementTree
100e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
110e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournaderfrom fontTools import ttLib
120e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
# U+FE0F VARIATION SELECTOR-16: the variation selector that requests emoji
# presentation for the character preceding it.
EMOJI_VS = 0xFE0F
145dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# Default script (ISO 15924 code) for each lowercase language subtag known to
# this tool. Languages not listed here must carry an explicit script subtag.
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bg': 'Cyrl',
    'bn': 'Beng',
    'cu': 'Cyrl',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}

def lang_to_script(lang_code):
    """Return the four-letter script code for a hyphen-separated language tag.

    The tag is lower-cased and then shortened from the right, one subtag at a
    time, until either the remaining prefix is a key of LANG_TO_SCRIPT or the
    subtag being removed looks like a script (exactly four alphabetic
    characters), in which case that subtag is returned in title case.

    Examples: 'en-US' -> 'Latn', 'und-Zsye' -> 'Zsye', 'zh-Hans-CN' -> 'Hans'.

    Args:
        lang_code: Language tag such as 'en', 'en-US' or 'und-Zsye'.

    Returns:
        The script code, e.g. 'Latn'.

    Raises:
        AssertionError: If no script can be determined for lang_code.
    """
    lang = lang_code.lower()
    while lang not in LANG_TO_SCRIPT:
        hyphen_idx = lang.rfind('-')
        if hyphen_idx == -1:
            # Raised explicitly instead of through an `assert` statement:
            # when asserts are stripped (python -O) the loop would otherwise
            # keep slicing with index -1 and never terminate once `lang`
            # becomes empty. The exception type is unchanged.
            raise AssertionError(
                'We do not know what script the "%s" language is written in.'
                % lang_code)
        assumed_script = lang[hyphen_idx+1:]
        if len(assumed_script) == 4 and assumed_script.isalpha():
            # This is actually the script
            return assumed_script.title()
        lang = lang[:hyphen_idx]
    return LANG_TO_SCRIPT[lang]
640e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
650e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def printable(inp):
    """Render a code point, a sequence, or a set of them for error messages.

    An int becomes 'U+XXXX', a tuple becomes '<...>' around its rendered
    members and a set becomes '{...}' around its rendered members. The exact
    types `set` and `tuple` are tested, so subclasses fall through to the
    single-character formatting.
    """
    kind = type(inp)
    if kind is set:  # set of character sequences
        return '{%s}' % ', '.join(printable(member) for member in inp)
    if kind is tuple:  # character sequence
        return '<%s>' % ', '.join(printable(code) for code in inp)
    # single character
    return 'U+%04X' % inp
735dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
745dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def open_font(font):
    """Open a font with fontTools.

    Args:
        font: (font_file, index) pair. font_file is joined onto the
            module-level _fonts_dir; index is passed as `fontNumber`
            unless it is None.

    Returns:
        A ttLib.TTFont for the requested file.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    if index is None:
        return ttLib.TTFont(font_path)
    return ttLib.TTFont(font_path, fontNumber=index)
825dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
835dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def get_best_cmap(font):
    """Return the character map of the widest Unicode cmap subtable of a font.

    The format 12 subtable (platform 3, encoding 10, "UCS-4") is preferred;
    if the font has none, the format 4 subtable (platform 3, encoding 1,
    "BMP") is used instead.

    Args:
        font: (font_file, index) pair as accepted by open_font().

    Returns:
        The `cmap` attribute of the chosen subtable.

    Raises:
        AssertionError: If the font has more than one subtable of either
            kind, or has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    if best is None:
        # Without this check the return below fails with an opaque
        # "'NoneType' object has no attribute 'cmap'" that does not name
        # the offending font.
        raise AssertionError(
            'Neither a BMP nor a UCS-4 cmap was found in %s' % (font, ))
    return best.cmap
990e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
1000e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def get_variation_sequences_cmap(font):
    """Return the variation-sequence cmap subtable of a font, if any.

    The subtable looked for has format 14, platform 0 and encoding 5.

    Args:
        font: (font_file, index) pair as accepted by open_font().

    Returns:
        The matching cmap subtable object, or None when the font has none.

    Raises:
        AssertionError: If the font has more than one such subtable.
    """
    ttfont = open_font(font)
    found = None
    for subtable in ttfont['cmap'].tables:
        key = (subtable.format, subtable.platformID, subtable.platEncID)
        if key != (14, 0, 5):
            continue
        assert found is None, 'More than one VS cmap in %s' % (font, )
        found = subtable
    return found
1105dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1115dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def get_emoji_map(font):
    """Build a map from characters and character sequences to glyph names.

    Keys are single code points (taken from the best cmap), (base, selector)
    pairs (taken from the variation-sequence cmap) and tuples of code points
    (taken from the GSUB ligature lookups). Values are glyph names.

    Args:
        font: (font_file, index) pair as accepted by open_font().

    Returns:
        A new dict; the cmap returned by get_best_cmap() is copied, not
        modified.

    Raises:
        AssertionError: If a ligature sequence, or any prefix of it of
            length two or more, is already present in the map.
    """
    # Add normal characters
    emoji_map = copy.copy(get_best_cmap(font))
    # Built before any sequence is added, so it only holds single code points.
    reverse_cmap = {glyph: code for code, glyph in emoji_map.items()}

    # Add variation sequences
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for vs, mappings in vs_dict.items():
        for base, glyph in mappings:
            # A glyph of None means the sequence uses the base character's
            # own glyph.
            emoji_map[(base, vs)] = emoji_map[base] if glyph is None else glyph

    # Add GSUB rules
    ttfont = open_font(font)
    for lookup in ttfont['GSUB'].table.LookupList.Lookup:
        if lookup.LookupType != 4:
            # Other lookups are used in the emoji font for fallback.
            # We ignore them for now.
            continue
        for subtable in lookup.SubTable:
            for first_glyph, ligature_list in subtable.ligatures.items():
                for ligature in ligature_list:
                    glyph_names = [first_glyph] + ligature.Component
                    sequence = tuple(
                        reverse_cmap[glyph] for glyph in glyph_names)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    for prefix_len in range(2, len(sequence)+1):
                        assert sequence[:prefix_len] not in emoji_map
                    emoji_map[sequence] = ligature.LigGlyph

    return emoji_map
1485dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1495dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of `chars`.

    Unlike the sibling assert_* helpers, failure is reported through
    sys.exit() with a message rather than through an assertion.
    """
    best_cmap = get_best_cmap(font)
    if not any(char in best_cmap for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
1560e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
1570e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every code point in `chars` is in the font's best cmap."""
    supported = get_best_cmap(font)
    for code_point in chars:
        is_mapped = code_point in supported
        assert is_mapped, (
            'U+%04X was not found in %s' % (code_point, font))
163fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
164fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no code point in `chars` is in the font's best cmap."""
    supported = get_best_cmap(font)
    for code_point in chars:
        is_absent = code_point not in supported
        assert is_absent, (
            'U+%04X was found in %s' % (code_point, font))
170fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
171fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def assert_font_supports_all_sequences(font, sequences):
    """Assert that the font lists every (base, selector) pair in `sequences`.

    A pair counts as supported only when the variation-sequence cmap holds
    the entry (base, None) under the selector, i.e. an entry with no glyph of
    its own. Pairs are checked in sorted order.
    """
    vs_dict = get_variation_sequences_cmap(font).uvsDict
    for base, vs in sorted(sequences):
        is_listed = vs in vs_dict and (base, None) in vs_dict[vs]
        assert is_listed, (
            '<U+%04X, U+%04X> was not found in %s' % (base, vs, font))
1775dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
1785dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def check_hyphens(hyphens_dir):
    """Check that fonts for every hyphenated script contain a hyphen.

    Each 'hyph-<lang>.hyb' file in `hyphens_dir` contributes the script of
    <lang>. Every font registered for such a script in _script_to_font_map
    must map U+002D or U+2010.

    Raises:
        AssertionError: If a .hyb file is not named 'hyph-...', or a script
            has no fonts registered.
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    for hyb_path in glob.iglob(path.join(hyphens_dir, '*.hyb')):
        hyb_file = path.basename(hyb_path)
        assert hyb_file.startswith('hyph-'), (
            'Unknown hyphenation file %s' % hyb_file)
        # The language code sits between the first '-' and the first '.'.
        start = hyb_file.index('-') + 1
        end = hyb_file.index('.')
        scripts.add(lang_to_script(hyb_file[start:end]))

    HYPHENS = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, HYPHENS)
1950e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
1960e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
class FontRecord(object):
    """One <font> entry of the fallback chain.

    Plain attribute holder for the family name, the scripts of the family,
    its variant, and the weight, style and (font_file, index) pair of the
    font itself.
    """

    def __init__(self, name, scripts, variant, weight, style, font):
        self.name, self.scripts, self.variant = name, scripts, variant
        self.weight, self.style, self.font = weight, style, font
2055dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
2065dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def parse_fonts_xml(fonts_xml_path):
    """Parse a fonts.xml file into the module-level lookup structures.

    Resets and fills two globals:
      _fallback_chain: list of FontRecord, one per <font>, in file order.
      _script_to_font_map: script code -> set of (font_file, index) pairs.

    Args:
        fonts_xml_path: Path of the XML file to read.

    Raises:
        AssertionError: If the file has 254 or more families, or a family or
            font has an unexpected attribute value or child tag.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []
    families = ElementTree.parse(fonts_xml_path).findall('family')
    # Minikin supports up to 254 but users can place their own font at the first
    # place. Thus, 253 is the maximum allowed number of font families in the
    # default collection.
    assert len(families) < 254, (
        'System font collection can contains up to 253 font families.')
    for family in families:
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if not name:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)
        else:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)

        scripts = set()
        if langs:
            scripts = {lang_to_script(lang) for lang in langs.split()}

        # non-empty names are used for default LGC fonts
        map_scripts = {'Latn', 'Grek', 'Cyrl'} if name else scripts

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            font_file = child.text.rstrip()

            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            # A missing index stays None; a present one becomes an int.
            index = child.get('index')
            if index:
                index = int(index)

            font = (font_file, index)
            _fallback_chain.append(FontRecord(
                name, frozenset(scripts), variant, weight, style, font))

            for script in map_scripts:
                _script_to_font_map[script].add(font)
2670e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
2680e969e2c0ba9ad863c7fcfc3973a16b1b599e50aRoozbeh Pournader
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the emoji coverage checks against the emoji font of the chain."""
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
272f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
273f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def get_emoji_font():
    """Return the (font_file, index) pair of the only emoji font.

    The emoji font is the fallback-chain record whose scripts include 'Zsye'.

    Raises:
        AssertionError: If the chain does not hold exactly one such record.
    """
    emoji_fonts = []
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    count = len(emoji_fonts)
    assert count == 1, 'There are %d emoji fonts.' % count
    return emoji_fonts[0]
280f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
281fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected sequences.

    Four checks run in order:
      1. every member of `all_emoji` is supported by the font;
      2. the font supports nothing outside `all_emoji`, apart from U+0000,
         U+000D and U+0020;
      3. each key of `equivalent_emoji` maps to the same glyph as its value;
      4. sequences that share a glyph all resolve, by following
         `equivalent_emoji`, to one and the same sequence.

    Args:
        emoji_font: (font_file, index) pair as accepted by open_font().
        all_emoji: Collection of code points and code point tuples; only
            membership is tested.
        equivalent_emoji: Dict mapping a sequence to the sequence it is
            expected to share a glyph with.

    Raises:
        AssertionError: If any of the checks fails.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    for first, second in sorted(equivalent_emoji.items()):
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in a single pass, instead of rescanning
    # the whole coverage map once for every distinct glyph.
    seqs_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        seqs_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in seqs_by_glyph.items():
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                # Follow the equivalence chain to its final sequence.
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))
3155dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader                    glyph))
3163b3c78e6ba90c58bc8a4cd4409cfc5bc854ddc3bRoozbeh Pournader
3175dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def check_emoji_defaults(default_emoji):
    """Check how non-emoji fonts treat default-emoji and text-style characters.

    Every checked font of the fallback chain must contain none of
    `default_emoji`. In addition, each character with the 'Emoji' property
    that is not in `default_emoji` must appear in some font placed before the
    emoji font, unless it is listed under '7.0' in _chars_by_age.

    Raises:
        AssertionError: If a checked font maps a default-emoji character, or
            some text-style characters are not covered.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    emoji_font_seen = False
    for record in _fallback_chain:
        scripts = record.scripts
        if 'Zsye' in scripts:
            # No need to check the emoji font
            emoji_font_seen = True
            continue

        # For later fonts, we only check them if they have a script
        # defined, since the defined script may get them to a higher
        # score even if they appear after the emoji font. However,
        # we should skip checking the text symbols font, since
        # symbol fonts should be able to override the emoji display
        # style when 'Zsym' is explicitly specified by the user.
        exempt = emoji_font_seen and (not scripts or 'Zsym' in scripts)
        if exempt:
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, sorted(default_emoji))

        # Mark default text-style characters appearing in fonts above the emoji
        # font as seen
        if not emoji_font_seen:
            missing_text_chars.difference_update(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert not missing_text_chars, (
        'Text style version of some emoji characters are missing: ' +
            repr(missing_text_chars))
3483b3c78e6ba90c58bc8a4cd4409cfc5bc854ddc3bRoozbeh Pournader            repr(missing_text_chars))
3497b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
3507b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-delimited Unicode data file.

    After any '#' comment is removed, each non-blank line is expected to look
    like ``<chars> ; <property> [; ...]``. <chars> is either a single
    hexadecimal code point, a ``start..end`` range of code points, or a
    whitespace-separated sequence of code points. Only the first two fields
    of a line are used.

    Args:
        file_path: path of the data file to read.
        reverse: if true, return a dictionary that maps each property value
            to the set of characters having it, useful for some binary
            properties. Otherwise, return a dictionary that maps characters
            to their property value, assuming there's only one property per
            character in the file.

    Returns:
        The dictionary described above. Single characters and members of a
        range are ints; character sequences are tuples of ints.

    Raises:
        AssertionError: if reverse is false and a character is listed twice.
        ValueError: if a line has fewer than two fields, or a code point is
            not valid hexadecimal.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            # Drop the trailing comment, if any, and surrounding whitespace.
            line = line.split('#', 1)[0].strip()
            if not line:
                continue

            chars, prop = [field.strip() for field in line.split(';')[:2]]

            # split() without an argument tolerates runs of spaces between
            # the code points of a sequence.
            code_points = chars.split()
            if len(code_points) > 1:  # character sequence
                additions = [tuple(int(ch, 16) for ch in code_points)]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                # range() instead of xrange() so that this works on both
                # Python 2 and Python 3.
                additions = range(int(char_start, 16), int(char_end, 16) + 1)
            else:  # single character
                additions = [int(chars, 16)]

            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict, (
                        'Duplicate entry in %s: %r' % (file_path, addition))
                    output_dict[addition] = prop
    return output_dict
3897b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
3907b822e5fc155a04fa808b1103da7663bd6dd7ba4Roozbeh Pournader
def parse_emoji_variants(file_path):
    """Parse a variation-sequences data file into text and emoji sets.

    After '#' comments are removed, every non-blank line must have exactly
    three ';'-separated fields. The first holds two space-separated hex code
    points (base character and variation selector), the second is the
    description, and the third is ignored.

    Returns:
        A tuple (text_set, emoji_set) of sets of (base, vs) int pairs, for
        lines described as 'text style' and 'emoji style' respectively.
        Lines with any other description are ignored.
    """
    by_style = {'text style': set(), 'emoji style': set()}
    with open(file_path) as datafile:
        for raw_line in datafile:
            content = raw_line.partition('#')[0].strip()
            if not content:
                continue
            code_points, style, _ = content.split(';')
            code_points = code_points.strip().split(' ')
            pair = (int(code_points[0], 16), int(code_points[1], 16))
            # An empty set is falsy, so test against None explicitly.
            bucket = by_style.get(style.strip())
            if bucket is not None:
                bucket.add(pair)
    return by_style['text style'], by_style['emoji style']
4115dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4125dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def parse_ucd(ucd_path):
    """Load the Unicode data files under ucd_path into module globals.

    Reads, in this order:
      * emoji-data.txt and additions/emoji-data.txt, merged per property
        into _emoji_properties (property -> set of characters);
      * DerivedAge.txt into _chars_by_age (age -> set of characters);
      * emoji-variation-sequences.txt into _text_variation_sequences and
        _emoji_variation_sequences;
      * emoji-sequences.txt into _emoji_sequences (sequence -> property);
      * emoji-zwj-sequences.txt and additions/emoji-zwj-sequences.txt,
        merged into _emoji_zwj_sequences (sequence -> property).
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    additions_dir = path.join(ucd_path, 'additions')

    _emoji_properties = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-data.txt'), reverse=True)
    extra_properties = parse_unicode_datafile(
        path.join(additions_dir, 'emoji-data.txt'), reverse=True)
    for prop, extra_chars in extra_properties.items():
        _emoji_properties[prop].update(extra_chars)

    _chars_by_age = parse_unicode_datafile(
        path.join(ucd_path, 'DerivedAge.txt'), reverse=True)

    _text_variation_sequences, _emoji_variation_sequences = (
        parse_emoji_variants(
            path.join(ucd_path, 'emoji-variation-sequences.txt')))

    _emoji_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-sequences.txt'))

    _emoji_zwj_sequences = parse_unicode_datafile(
        path.join(ucd_path, 'emoji-zwj-sequences.txt'))
    extra_zwj_sequences = parse_unicode_datafile(
        path.join(additions_dir, 'emoji-zwj-sequences.txt'))
    _emoji_zwj_sequences.update(extra_zwj_sequences)
4365dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4375dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
def flag_sequence(territory_code):
    """Return the flag sequence for a territory code as a tuple of ints.

    Each character of territory_code is shifted so that 'A' lands on
    U+1F1E6; an uppercase two-letter code therefore yields two code points
    in the U+1F1E6..U+1F1FF block.
    """
    shift = 0x1F1E6 - ord('A')
    return tuple(ord(letter) + shift for letter in territory_code)
4405dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
4415dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# Flag sequences that compute_expected_emoji() removes from the set of
# expected emoji sequences.
UNSUPPORTED_FLAGS = frozenset(
    flag_sequence(territory) for territory in (
        'BL', 'BQ', 'DG', 'EA', 'EH', 'FK', 'GF', 'GP', 'GS',
        'MF', 'MQ', 'NC', 'PM', 'RE', 'TF', 'WF', 'XK', 'YT'))

# Flag sequences recorded as equivalent to the flag of another territory
# (key: the alias, value: the flag it is equivalent to).
EQUIVALENT_FLAGS = dict(
    (flag_sequence(alias), flag_sequence(target))
    for alias, target in (
        ('BV', 'NO'),
        ('CP', 'FR'),
        ('HM', 'AU'),
        ('SJ', 'NO'),
        ('UM', 'US'),
    ))
4585dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
COMBINING_KEYCAP = 0x20E3

# Legacy single code points, each mapped to the flag or keycap sequence it
# is recorded as equivalent to.
# 0xFE4E5..0xFE4EE are ten consecutive flags.
LEGACY_ANDROID_EMOJI = dict(
    (0xFE4E5 + index, flag_sequence(territory))
    for index, territory in enumerate(
        ('JP', 'US', 'FR', 'DE', 'IT', 'GB', 'ES', 'RU', 'CN', 'KR')))
# 0xFE82C is the '#' keycap; 0xFE82D is not part of the table.
LEGACY_ANDROID_EMOJI[0xFE82C] = (ord('#'), COMBINING_KEYCAP)
# 0xFE82E..0xFE837 are the keycaps for '1' through '9', followed by '0'.
LEGACY_ANDROID_EMOJI.update(
    (0xFE82E + index, (ord(digit), COMBINING_KEYCAP))
    for index, digit in enumerate('1234567890'))
4845dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
# ZWJ sequences recorded as equivalent to a single code point
# (key: the sequence, value: the single character).
ZWJ_IDENTICALS = dict([
    # KISS
    ((0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468),
     0x1F48F),
    # COUPLE WITH HEART
    ((0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468),
     0x1F491),
    # FAMILY
    ((0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466),
     0x1F46A),
])
4935dde087811c255509a72aa9a51c27b40bf0cbf2cRoozbeh Pournader
ZWJ = 0x200D
FEMALE_SIGN = 0x2640
MALE_SIGN = 0x2642

# (character, gender sign) pairs. compute_expected_emoji() records the
# sequence (character, ZWJ, gender sign) as equivalent to the bare
# character, and likewise for the skin-toned variants it expects.
GENDER_DEFAULTS = [
    (0x26F9, MALE_SIGN),     # PERSON WITH BALL
    (0x1F3C3, MALE_SIGN),    # RUNNER
    (0x1F3C4, MALE_SIGN),    # SURFER
    (0x1F3CA, MALE_SIGN),    # SWIMMER
    (0x1F3CB, MALE_SIGN),    # WEIGHT LIFTER
    (0x1F3CC, MALE_SIGN),    # GOLFER
    (0x1F46E, MALE_SIGN),    # POLICE OFFICER
    (0x1F46F, FEMALE_SIGN),  # WOMAN WITH BUNNY EARS
    (0x1F471, MALE_SIGN),    # PERSON WITH BLOND HAIR
    (0x1F473, MALE_SIGN),    # MAN WITH TURBAN
    (0x1F477, MALE_SIGN),    # CONSTRUCTION WORKER
    (0x1F481, FEMALE_SIGN),  # INFORMATION DESK PERSON
    (0x1F482, MALE_SIGN),    # GUARDSMAN
    (0x1F486, FEMALE_SIGN),  # FACE MASSAGE
    (0x1F487, FEMALE_SIGN),  # HAIRCUT
    (0x1F575, MALE_SIGN),    # SLEUTH OR SPY
    (0x1F645, FEMALE_SIGN),  # FACE WITH NO GOOD GESTURE
    (0x1F646, FEMALE_SIGN),  # FACE WITH OK GESTURE
    (0x1F647, MALE_SIGN),    # PERSON BOWING DEEPLY
    (0x1F64B, FEMALE_SIGN),  # HAPPY PERSON RAISING ONE HAND
    (0x1F64D, FEMALE_SIGN),  # PERSON FROWNING
    (0x1F64E, FEMALE_SIGN),  # PERSON WITH POUTING FACE
    (0x1F6A3, MALE_SIGN),    # ROWBOAT
    (0x1F6B4, MALE_SIGN),    # BICYCLIST
    (0x1F6B5, MALE_SIGN),    # MOUNTAIN BICYCLIST
    (0x1F6B6, MALE_SIGN),    # PEDESTRIAN
    (0x1F926, FEMALE_SIGN),  # FACE PALM
    (0x1F937, FEMALE_SIGN),  # SHRUG
    (0x1F938, MALE_SIGN),    # PERSON DOING CARTWHEEL
    (0x1F939, MALE_SIGN),    # JUGGLING
    (0x1F93C, MALE_SIGN),    # WRESTLERS
    (0x1F93D, MALE_SIGN),    # WATER POLO
    (0x1F93E, MALE_SIGN),    # HANDBALL
    (0x1F9D6, FEMALE_SIGN),  # PERSON IN STEAMY ROOM
    (0x1F9D7, FEMALE_SIGN),  # PERSON CLIMBING
    (0x1F9D8, FEMALE_SIGN),  # PERSON IN LOTUS POSITION
    (0x1F9D9, FEMALE_SIGN),  # MAGE
    (0x1F9DA, FEMALE_SIGN),  # FAIRY
    (0x1F9DB, FEMALE_SIGN),  # VAMPIRE
    (0x1F9DC, FEMALE_SIGN),  # MERPERSON
    (0x1F9DD, FEMALE_SIGN),  # ELF
    (0x1F9DE, FEMALE_SIGN),  # GENIE
    (0x1F9DF, FEMALE_SIGN),  # ZOMBIE
]
543f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def is_fitzpatrick_modifier(cp):
    """Return True if cp lies in the range U+1F3FB..U+1F3FF, inclusive."""
    first_modifier, last_modifier = 0x1F3FB, 0x1F3FF
    return cp >= first_modifier and cp <= last_modifier
5463b3c78e6ba90c58bc8a4cd4409cfc5bc854ddc3bRoozbeh Pournader
5473b3c78e6ba90c58bc8a4cd4409cfc5bc854ddc3bRoozbeh Pournader
def reverse_emoji(seq):
    """Return the code points of seq in reverse order, as a tuple.

    Reversing puts a Fitzpatrick modifier in front of the character that
    preceded it in seq, so a pass over the reversed list swaps each modifier
    with the element that follows it.

    NOTE(review): after a swap the modifier sits at index i, so the next
    iteration finds it at i-1 and swaps it again. A modifier therefore keeps
    moving right until it reaches the end of the sequence. That equals
    "directly after the character it modifies" only when the modified
    character comes first in seq. The behavior is kept as-is here;
    presumably the expected sequences must match how the fonts were built,
    so confirm before changing it.
    """
    rev = list(reversed(seq))
    # range() rather than xrange(), which does not exist on Python 3.
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)
556f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
557f874a1949a7516596a0c2f5829e140dc6f69c326Doug Felt
def compute_expected_emoji():
    """Build the expected emoji sets from the data loaded by parse_ucd().

    As a side effect, adds the (0x1F3F4, 0xE007F) tag sequence to the
    module-level _emoji_sequences dictionary.

    Returns:
        A tuple (all_emoji, default_emoji, equivalent_emoji):
          * all_emoji: set of the 'Emoji' characters, all expected sequences,
            every code point used in a sequence, the tag characters, and the
            legacy Android code points;
          * default_emoji: set of the 'Emoji_Presentation' characters, all
            expected sequences, and the legacy Android code points;
          * equivalent_emoji: dict mapping a character or sequence to the
            character or sequence it is equivalent to.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set(_emoji_variation_sequences)

    # Add empty flag tag sequence that is supported as fallback
    _emoji_sequences[(0x1F3F4, 0xE007F)] = 'Emoji_Tag_Sequence'

    for listed_sequence in _emoji_sequences.keys():
        sequence = tuple(ch for ch in listed_sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        if _emoji_sequences.get(sequence) == 'Emoji_Tag_Sequence':
            # Also expect the reverse of every tag sequence, recorded as
            # equivalent to the original.
            # TODO: test if these are actually needed by Minikin/HarfBuzz.
            mirrored = reverse_emoji(sequence)
            all_sequences.add(mirrored)
            equivalent_emoji[mirrored] = sequence

    # Iterate over a snapshot of the parsed ZWJ sequences.
    for listed_sequence in dict(_emoji_zwj_sequences).keys():
        sequence = tuple(ch for ch in listed_sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the
        # fonts as a workaround to get the sequences work in RTL text.
        mirrored = reverse_emoji(sequence)
        all_sequences.add(mirrored)
        equivalent_emoji[mirrored] = sequence

    # Remove unsupported flags
    all_sequences -= UNSUPPORTED_FLAGS

    # Add all tag characters used in flags
    for first_tag, last_tag in ((0xE0030, 0xE0039), (0xE0061, 0xE007A)):
        sequence_pieces.update(range(first_tag, last_tag + 1))

    legacy_code_points = set(LEGACY_ANDROID_EMOJI.keys())
    all_emoji = _emoji_properties['Emoji'].union(
        all_sequences, sequence_pieces, legacy_code_points)
    default_emoji = _emoji_properties['Emoji_Presentation'].union(
        all_sequences, legacy_code_points)

    for known_equivalents in (
            EQUIVALENT_FLAGS, LEGACY_ANDROID_EMOJI, ZWJ_IDENTICALS):
        equivalent_emoji.update(known_equivalents)

    # A gendered sequence using the default gender of a character is
    # equivalent to the character without the gender sign.
    for ch, gender in GENDER_DEFAULTS:
        equivalent_emoji[(ch, ZWJ, gender)] = ch
        for skin_tone in range(0x1F3FB, 0x1F3FF + 1):
            skin_toned = (ch, skin_tone, ZWJ, gender)
            if skin_toned in all_emoji:
                equivalent_emoji[skin_toned] = (ch, skin_tone)

    # An emoji variation sequence is equivalent to its base character.
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
625fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
626fa1facc0fd3d04fbc442e23dd8e09f343c8932fcRoozbeh Pournader
def check_vertical_metrics():
    """Assert the vertical metrics of the main font families.

    For every record in _fallback_chain:
      * 'sans-serif' and 'sans-serif-condensed' must have head.yMax == 2163
        and head.yMin == -555;
      * those two plus 'serif' and 'monospace' must have hhea.ascent == 1900
        and hhea.descent == -500.

    Raises:
        AssertionError: if a checked font has different values.
    """
    head_checked_names = ('sans-serif', 'sans-serif-condensed')
    hhea_checked_names = head_checked_names + ('serif', 'monospace')

    for record in _fallback_chain:
        if record.name in head_checked_names:
            head = open_font(record.font)['head']
            assert head.yMax == 2163 and head.yMin == -555, (
                'yMax and yMin of %s do not match expected values.' %
                (record.font,))

        if record.name in hhea_checked_names:
            hhea = open_font(record.font)['hhea']
            assert hhea.ascent == 1900 and hhea.descent == -500, (
                'ascent and descent of %s do not match expected values.' %
                (record.font,))
638bac1aec6354cc1766cf4ff03578d32d0fa623cb0Roozbeh Pournader
639bac1aec6354cc1766cf4ff03578d32d0fa623cb0Roozbeh Pournader
def main():
    """Run the lint checks.

    Command-line arguments:
        argv[1]: target output directory, containing fonts/, etc/fonts.xml
            and usr/hyphen-data/.
        argv[2]: 'true' to also run the emoji checks.
        argv[3]: directory of the Unicode data files; only read when the
            emoji checks are enabled.
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))

    check_vertical_metrics()

    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))

    # The emoji checks are optional; skip them unless explicitly enabled.
    if sys.argv[2] != 'true':
        return

    parse_ucd(sys.argv[3])
    all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
    check_emoji_coverage(all_emoji, equivalent_emoji)
    check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()
664