fontchain_lint.py revision f9936b9b7cade30306d5f17534256e587c172254
1#!/usr/bin/env python
2
3import collections
4import copy
5import glob
6import itertools
7from os import path
8import sys
9from xml.etree import ElementTree
10
11from fontTools import ttLib
12
# U+FE0F VARIATION SELECTOR-16. compute_expected_emoji() strips it from emoji
# sequences before they are compared against the emoji font's coverage.
EMOJI_VS = 0xFE0F

# Maps a lowercase language code to the four-letter script code it is written
# in. lang_to_script() consults this table; the resulting script codes are the
# keys of the script-to-font map built by parse_fonts_xml().
LANG_TO_SCRIPT = {
    'as': 'Beng',
    'bn': 'Beng',
    'cy': 'Latn',
    'da': 'Latn',
    'de': 'Latn',
    'en': 'Latn',
    'es': 'Latn',
    'et': 'Latn',
    'eu': 'Latn',
    'fr': 'Latn',
    'ga': 'Latn',
    'gu': 'Gujr',
    'hi': 'Deva',
    'hr': 'Latn',
    'hu': 'Latn',
    'hy': 'Armn',
    'ja': 'Jpan',
    'kn': 'Knda',
    'ko': 'Kore',
    'ml': 'Mlym',
    'mn': 'Cyrl',
    'mr': 'Deva',
    'nb': 'Latn',
    'nn': 'Latn',
    'or': 'Orya',
    'pa': 'Guru',
    'pt': 'Latn',
    'sl': 'Latn',
    'ta': 'Taml',
    'te': 'Telu',
    'tk': 'Latn',
}
48
def lang_to_script(lang_code):
    """Return the four-letter script code for a language tag.

    The tag is lowercased and looked up in LANG_TO_SCRIPT. While there is no
    match, the last hyphen-separated subtag is removed and the lookup is
    retried. If the removed subtag consists of exactly four letters it is
    taken to be the script itself and is returned in title case.

    Fails an assertion when the tag is unknown and has no subtag left to
    remove.
    """
    remaining = lang_code.lower()
    while True:
        if remaining in LANG_TO_SCRIPT:
            return LANG_TO_SCRIPT[remaining]
        # rpartition() yields an empty separator when there is no hyphen left.
        prefix, hyphen, subtag = remaining.rpartition('-')
        assert hyphen, (
            'We do not know what script the "%s" language is written in.'
            % lang_code)
        if len(subtag) == 4 and subtag.isalpha():
            # This is actually the script
            return subtag.title()
        remaining = prefix
62
63
def printable(inp):
    """Format a code point, a sequence or a set of them for error messages.

    inp may be:
      - a set or frozenset, rendered as '{item, item, ...}' with each item
        formatted recursively (in the set's iteration order);
      - a tuple of code points, rendered as '<U+XXXX, U+XXXX, ...>';
      - a single integer code point, rendered as 'U+XXXX'.

    Returns the formatted string.
    """
    # isinstance() rather than an exact type check, so that frozensets (and
    # subclasses) are formatted as collections instead of crashing in the
    # single-character branch.
    if isinstance(inp, (set, frozenset)):  # set of character sequences
        return '{' + ', '.join(printable(seq) for seq in inp) + '}'
    if isinstance(inp, tuple):  # character sequence
        return '<' + ', '.join(printable(ch) for ch in inp) + '>'
    # single character
    return 'U+%04X' % inp
71
72
def open_font(font):
    """Open a font from the fonts directory with fontTools.

    font is a (file name, index) pair. The file name is resolved against the
    module-level _fonts_dir. When index is not None it is passed to TTFont as
    fontNumber; otherwise TTFont is called with the path only.

    Returns the ttLib.TTFont object.
    """
    font_file, index = font
    font_path = path.join(_fonts_dir, font_file)
    # Only pass fontNumber when an index was actually given.
    extra_args = {} if index is None else {'fontNumber': index}
    return ttLib.TTFont(font_path, **extra_args)
80
81
def get_best_cmap(font):
    """Return the preferred Unicode cmap dictionary of a font.

    font is a (file name, index) pair as accepted by open_font().

    Two cmap subtables are considered, identified by
    (format, platformID, platEncID): (12, 3, 10), the UCS-4 table, and
    (4, 3, 1), the BMP table. The UCS-4 table wins when both exist.

    Returns the chosen subtable's cmap mapping.
    Fails an assertion if either kind of subtable appears more than once, or
    if the font has neither of them.
    """
    ttfont = open_font(font)
    all_unicode_cmap = None
    bmp_cmap = None
    for cmap in ttfont['cmap'].tables:
        specifier = (cmap.format, cmap.platformID, cmap.platEncID)
        if specifier == (4, 3, 1):
            assert bmp_cmap is None, 'More than one BMP cmap in %s' % (font, )
            bmp_cmap = cmap
        elif specifier == (12, 3, 10):
            assert all_unicode_cmap is None, (
                'More than one UCS-4 cmap in %s' % (font, ))
            all_unicode_cmap = cmap

    best_cmap = all_unicode_cmap if all_unicode_cmap else bmp_cmap
    # Without this check a font lacking both subtables would fail with an
    # opaque AttributeError on None instead of naming the offending font.
    assert best_cmap is not None, (
        'No BMP or UCS-4 Unicode cmap in %s' % (font, ))
    return best_cmap.cmap
97
98
def get_variation_sequences_cmap(font):
    """Return the variation-sequence cmap subtable of a font, if any.

    font is a (file name, index) pair as accepted by open_font(). The
    subtable looked for has (format, platformID, platEncID) == (14, 0, 5).

    Returns the subtable object, or None when the font has none.
    Fails an assertion if the font has more than one such subtable.
    """
    vs_specifier = (14, 0, 5)
    matches = [
        table for table in open_font(font)['cmap'].tables
        if (table.format, table.platformID, table.platEncID) == vs_specifier]
    assert len(matches) <= 1, 'More than one VS cmap in %s' % (font, )
    return matches[0] if matches else None
108
109
def get_emoji_map(font):
    """Build the map of everything the emoji font can render.

    font is a (file name, index) pair as accepted by open_font().

    The returned dictionary is keyed by:
      - single code points, copied from the font's best cmap;
      - (base, variation selector) pairs from the variation-sequence cmap;
      - tuples of code points for every ligature in the GSUB table.
    Each value is the glyph the key resolves to.

    Fails an assertion if a GSUB lookup is not of type 4, or if any prefix
    (of length two or more) of a ligature sequence is already in the map.
    """
    # Add normal characters
    coverage = copy.copy(get_best_cmap(font))
    # Built before sequences are added, so it only holds single code points.
    glyph_to_code = {glyph: code for code, glyph in coverage.items()}

    # Add variation sequences
    uvs = get_variation_sequences_cmap(font).uvsDict
    for selector in uvs:
        for base, glyph in uvs[selector]:
            # A glyph of None means the sequence uses the base's own glyph.
            coverage[(base, selector)] = (
                coverage[base] if glyph is None else glyph)

    # Add GSUB rules
    lookups = open_font(font)['GSUB'].table.LookupList.Lookup
    for lookup in lookups:
        assert lookup.LookupType == 4, 'We only understand type 4 lookups'
        for subtable in lookup.SubTable:
            ligatures = subtable.ligatures
            for first_glyph in ligatures:
                for ligature in ligatures[first_glyph]:
                    glyphs = [first_glyph] + ligature.Component
                    sequence = tuple(glyph_to_code[name] for name in glyphs)
                    # Make sure no starting subsequence of 'sequence' has been
                    # seen before.
                    assert not any(
                        sequence[:prefix_len] in coverage
                        for prefix_len in range(2, len(sequence) + 1))
                    coverage[sequence] = ligature.LigGlyph

    return coverage
143
144
def assert_font_supports_any_of_chars(font, chars):
    """Exit the program unless the font maps at least one of chars.

    font is a (file name, index) pair; chars is an iterable of code points.
    Returns None when any of them is in the font's best cmap, otherwise
    terminates through sys.exit() with an explanatory message.
    """
    cmap = get_best_cmap(font)
    if not any(char in cmap for char in chars):
        sys.exit('None of characters in %s were found in %s' % (chars, font))
151
152
def assert_font_supports_all_of_chars(font, chars):
    """Assert that every code point in chars is in the font's best cmap.

    font is a (file name, index) pair; chars is an iterable of code points.
    The assertion message names the first missing code point encountered.
    """
    cmap = get_best_cmap(font)
    # First code point (in iteration order) the font does not map, if any.
    missing = next((char for char in chars if char not in cmap), None)
    assert missing is None, (
        'U+%04X was not found in %s' % (missing, font))
158
159
def assert_font_supports_none_of_chars(font, chars):
    """Assert that no code point in chars is in the font's best cmap.

    font is a (file name, index) pair; chars is an iterable of code points.
    The assertion message names the first offending code point encountered.
    """
    cmap = get_best_cmap(font)
    # First code point (in iteration order) the font does map, if any.
    unexpected = next((char for char in chars if char in cmap), None)
    assert unexpected is None, (
        'U+%04X was found in %s' % (unexpected, font))
165
166
def assert_font_supports_all_sequences(font, sequences):
    """Assert that the font lists every given variation sequence.

    font is a (file name, index) pair; sequences is an iterable of
    (base, variation selector) pairs, checked in sorted order. A sequence
    counts as supported when the font's variation-sequence cmap has the
    entry (base, None) under that selector.
    """
    uvs = get_variation_sequences_cmap(font).uvsDict
    for base, selector in sorted(sequences):
        supported = selector in uvs and (base, None) in uvs[selector]
        assert supported, (
            '<U+%04X, U+%04X> was not found in %s' % (base, selector, font))
172
173
def check_hyphens(hyphens_dir):
    """Check that fonts for hyphenated scripts contain a hyphen character.

    Every 'hyph-<lang>.hyb' file in hyphens_dir contributes the script of
    <lang>. Each font registered for one of those scripts must map U+002D or
    U+2010.

    Fails an assertion on an unexpected .hyb file name or on a script with
    no fonts; a font without a hyphen terminates the program (see
    assert_font_supports_any_of_chars).
    """
    # Find all the scripts that need automatic hyphenation
    scripts = set()
    pattern = path.join(hyphens_dir, '*.hyb')
    for hyb_path in glob.iglob(pattern):
        file_name = path.basename(hyb_path)
        assert file_name.startswith('hyph-'), (
            'Unknown hyphenation file %s' % file_name)
        # The language code sits between the first '-' and the first '.'.
        lang_start = file_name.index('-') + 1
        lang_end = file_name.index('.')
        scripts.add(lang_to_script(file_name[lang_start:lang_end]))

    hyphens = {0x002D, 0x2010}
    for script in scripts:
        fonts = _script_to_font_map[script]
        assert fonts, 'No fonts found for the "%s" script' % script
        for font in fonts:
            assert_font_supports_any_of_chars(font, hyphens)
190
191
class FontRecord(object):
    """One <font> entry of fonts.xml, as stored in the fallback chain.

    Attributes (all set verbatim from the constructor arguments):
      name:    the enclosing family's name attribute, or None
      scripts: frozenset of script codes derived from the family's languages
      variant: the family's variant attribute, or None
      weight:  integer font weight
      style:   'normal' or 'italic'
      font:    (file name, index) pair identifying the font file
    """
    def __init__(self, name, scripts, variant, weight, style, font):
        (self.name, self.scripts, self.variant,
         self.weight, self.style, self.font) = (
            name, scripts, variant, weight, style, font)
200
201
def parse_fonts_xml(fonts_xml_path):
    """Parse fonts.xml into the module-level font tables.

    Resets and fills two globals:
      _fallback_chain:     list of FontRecord, one per <font> element, in
                           document order;
      _script_to_font_map: script code -> set of (file name, index) pairs.

    Named families must have neither a variant nor a lang attribute and are
    registered for the Latn, Grek and Cyrl scripts. Unnamed families are
    registered for the scripts of their lang attribute.

    Fails an assertion on unexpected attributes, child tags, weights that
    are not multiples of 100, or unknown styles.
    """
    global _script_to_font_map, _fallback_chain
    _script_to_font_map = collections.defaultdict(set)
    _fallback_chain = []
    document = ElementTree.parse(fonts_xml_path)
    for family in document.findall('family'):
        name = family.get('name')
        variant = family.get('variant')
        langs = family.get('lang')
        if not name:
            assert variant in {None, 'elegant', 'compact'}, (
                'Unexpected value for variant: %s' % variant)
        else:
            assert variant is None, (
                'No variant expected for LGC font %s.' % name)
            assert langs is None, (
                'No language expected for LGC fonts %s.' % name)

        scripts = (
            {lang_to_script(lang) for lang in langs.split()}
            if langs else set())
        # non-empty names are used for default LGC fonts
        map_scripts = {'Latn', 'Grek', 'Cyrl'} if name else scripts

        for child in family:
            assert child.tag == 'font', (
                'Unknown tag <%s>' % child.tag)
            weight = int(child.get('weight'))
            assert weight % 100 == 0, (
                'Font weight "%d" is not a multiple of 100.' % weight)

            style = child.get('style')
            assert style in {'normal', 'italic'}, (
                'Unknown style "%s"' % style)

            # A missing (or empty) index attribute is kept as is.
            index = child.get('index')
            if index:
                index = int(index)
            font = (child.text, index)

            record = FontRecord(
                name, frozenset(scripts), variant, weight, style, font)
            _fallback_chain.append(record)

            for script in map_scripts:
                _script_to_font_map[script].add(font)
256
257
def check_emoji_coverage(all_emoji, equivalent_emoji):
    """Run the coverage checks against the single emoji font.

    all_emoji and equivalent_emoji are passed through unchanged to
    check_emoji_font_coverage() together with the font from get_emoji_font().
    """
    check_emoji_font_coverage(get_emoji_font(), all_emoji, equivalent_emoji)
261
262
def get_emoji_font():
    """Return the (file name, index) pair of the emoji font.

    The emoji font is the fallback-chain record whose scripts include
    'Zsye'. Fails an assertion unless there is exactly one such record.
    """
    emoji_fonts = []
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_fonts.append(record.font)
    count = len(emoji_fonts)
    assert count == 1, 'There are %d emoji fonts.' % count
    return emoji_fonts[0]
269
270
def check_emoji_font_coverage(emoji_font, all_emoji, equivalent_emoji):
    """Check that the emoji font covers exactly the expected emoji.

    emoji_font:       (file name, index) pair of the emoji font.
    all_emoji:        set of code points and sequences (tuples) the font
                      must support.
    equivalent_emoji: dict mapping a code point or sequence to another one
                      that must resolve to the same glyph.

    Asserts, in this order, that:
      1. every entry of all_emoji is in the font;
      2. the font supports nothing outside all_emoji, apart from U+0000,
         U+000D and U+0020;
      3. both sides of every equivalent_emoji entry share a glyph;
      4. entries that share a glyph are all declared equivalent.
    """
    coverage = get_emoji_map(emoji_font)
    for sequence in all_emoji:
        assert sequence in coverage, (
            '%s is not supported in the emoji font.' % printable(sequence))

    for sequence in coverage:
        if sequence in {0x0000, 0x000D, 0x0020}:
            # The font needs to support a few extra characters, which is OK
            continue
        assert sequence in all_emoji, (
            'Emoji font should not support %s.' % printable(sequence))

    # Keys are a mix of code points (ints) and sequences (tuples), which
    # Python 3 refuses to order against each other. Sorting on
    # (is-a-tuple, key) puts code points first, the same order CPython 2
    # used, and keeps the first reported failure deterministic. Keys are
    # unique, so the values never take part in the comparison.
    ordered_equivalents = sorted(
        equivalent_emoji.items(),
        key=lambda item: (isinstance(item[0], tuple), item[0]))
    for first, second in ordered_equivalents:
        assert coverage[first] == coverage[second], (
            '%s and %s should map to the same glyph.' % (
                printable(first),
                printable(second)))

    # Group the sequences by glyph in a single pass instead of rescanning
    # the whole coverage map once per glyph.
    sequences_by_glyph = collections.defaultdict(list)
    for seq, glyph in coverage.items():
        sequences_by_glyph[glyph].append(seq)

    for glyph, maps_to_glyph in sequences_by_glyph.items():
        if len(maps_to_glyph) > 1:
            # There are more than one sequences mapping to the same glyph. We
            # need to make sure they were expected to be equivalent.
            equivalent_seqs = set()
            for seq in maps_to_glyph:
                # Follow the equivalence chain to its final representative.
                equivalent_seq = seq
                while equivalent_seq in equivalent_emoji:
                    equivalent_seq = equivalent_emoji[equivalent_seq]
                equivalent_seqs.add(equivalent_seq)
            assert len(equivalent_seqs) == 1, (
                'The sequences %s should not result in the same glyph %s' % (
                    printable(equivalent_seqs),
                    glyph))
305
306
def check_emoji_defaults(default_emoji):
    """Check which fonts in the fallback chain may contain emoji characters.

    default_emoji is the set of code points and sequences (tuples) that
    should default to emoji style.

    Walks the fallback chain and asserts that no checked font maps any
    default-emoji code point. The emoji font itself ('Zsye') is skipped, as
    are fonts after it that either have no script or are the 'Zsym' symbols
    font. Finally asserts that every 'Emoji' character that is not
    default-emoji is mapped by some font ahead of the emoji font, ignoring
    characters whose age is '7.0'.

    Reads the globals _emoji_properties, _chars_by_age and _fallback_chain.
    """
    missing_text_chars = _emoji_properties['Emoji'] - default_emoji
    # default_emoji mixes code points (ints) with sequences (tuples), which
    # Python 3 cannot order against each other. Sorting on (is-a-tuple, item)
    # puts code points first, the same order CPython 2 used. The result does
    # not depend on the font, so it is computed once rather than per record.
    ordered_default_emoji = sorted(
        default_emoji, key=lambda item: (isinstance(item, tuple), item))
    emoji_font_seen = False
    for record in _fallback_chain:
        if 'Zsye' in record.scripts:
            emoji_font_seen = True
            # No need to check the emoji font
            continue
        # For later fonts, we only check them if they have a script
        # defined, since the defined script may get them to a higher
        # score even if they appear after the emoji font. However,
        # we should skip checking the text symbols font, since
        # symbol fonts should be able to override the emoji display
        # style when 'Zsym' is explicitly specified by the user.
        if emoji_font_seen and (not record.scripts or 'Zsym' in record.scripts):
            continue

        # Check default emoji-style characters
        assert_font_supports_none_of_chars(record.font, ordered_default_emoji)

        # Mark default text-style characters appearing in fonts above the emoji
        # font as seen
        if not emoji_font_seen:
            missing_text_chars -= set(get_best_cmap(record.font))

    # Noto does not have monochrome glyphs for Unicode 7.0 wingdings and
    # webdings yet.
    missing_text_chars -= _chars_by_age['7.0']
    assert missing_text_chars == set(), (
        'Text style version of some emoji characters are missing: ' +
            repr(missing_text_chars))
338
339
def parse_unicode_datafile(file_path, reverse=False):
    """Parse a semicolon-separated Unicode data file.

    Everything after a '#' is ignored, as are blank lines. The first field
    of a line is a single code point, a 'start..end' range or a
    space-separated sequence of code points (all hexadecimal); the second
    field is the property value. Further fields are ignored.

    Setting reverse to true returns a dictionary that maps the values to
    sets of characters, useful for some binary properties. Otherwise, we get
    a dictionary that maps characters to the property values, assuming
    there's only one property in the file.

    Single code points and range members are stored as ints, sequences as
    tuples of ints. In the non-reversed form an assertion fails if the same
    character or sequence is listed twice.
    """
    if reverse:
        output_dict = collections.defaultdict(set)
    else:
        output_dict = {}
    with open(file_path) as datafile:
        for line in datafile:
            # Drop the trailing comment, if any.
            line = line.partition('#')[0].strip()
            if not line:
                continue

            chars, prop = line.split(';')[:2]
            chars = chars.strip()
            prop = prop.strip()

            if ' ' in chars:  # character sequence
                # split() without an argument tolerates repeated spaces.
                additions = [tuple(int(ch, 16) for ch in chars.split())]
            elif '..' in chars:  # character range
                char_start, char_end = chars.split('..')
                # range() rather than xrange(), which no longer exists in
                # Python 3; the range end is inclusive in the data file.
                additions = range(int(char_start, 16), int(char_end, 16) + 1)
            else:  # single character
                additions = [int(chars, 16)]
            if reverse:
                output_dict[prop].update(additions)
            else:
                for addition in additions:
                    assert addition not in output_dict
                    output_dict[addition] = prop
    return output_dict
378
379
def parse_standardized_variants(file_path):
    """Collect text-style and emoji-style variation sequences from a file.

    Each non-comment line must have exactly three ';'-separated fields: a
    space-separated sequence of hexadecimal code points, a description, and
    a third field that is ignored. The first two code points are read as
    (base, variation selector).

    Returns (text_set, emoji_set): the pairs whose description is
    'text style' and 'emoji style' respectively. Lines with any other
    description are parsed but not recorded.
    """
    emoji_set = set()
    text_set = set()
    targets = {'text style': text_set, 'emoji style': emoji_set}
    with open(file_path) as datafile:
        for raw_line in datafile:
            # Drop the trailing comment, if any.
            line = raw_line.partition('#')[0].strip()
            if not line:
                continue
            sequence, description, _ = line.split(';')
            code_points = sequence.strip().split(' ')
            pair = (int(code_points[0], 16), int(code_points[1], 16))
            target = targets.get(description.strip())
            if target is not None:
                target.add(pair)
    return text_set, emoji_set
400
401
def parse_ucd(ucd_path):
    """Load the Unicode data files from ucd_path into module globals.

    Sets:
      _emoji_properties          from emoji-data.txt (property -> characters)
      _chars_by_age              from DerivedAge.txt (age -> characters)
      _text_variation_sequences,
      _emoji_variation_sequences from StandardizedVariants.txt
      _emoji_sequences           from emoji-sequences.txt
      _emoji_zwj_sequences       from emoji-zwj-sequences.txt
    """
    global _emoji_properties, _chars_by_age
    global _text_variation_sequences, _emoji_variation_sequences
    global _emoji_sequences, _emoji_zwj_sequences

    def ucd_file(file_name):
        return path.join(ucd_path, file_name)

    _emoji_properties = parse_unicode_datafile(
        ucd_file('emoji-data.txt'), reverse=True)
    _chars_by_age = parse_unicode_datafile(
        ucd_file('DerivedAge.txt'), reverse=True)
    _text_variation_sequences, _emoji_variation_sequences = (
        parse_standardized_variants(ucd_file('StandardizedVariants.txt')))
    _emoji_sequences = parse_unicode_datafile(
        ucd_file('emoji-sequences.txt'))
    _emoji_zwj_sequences = parse_unicode_datafile(
        ucd_file('emoji-zwj-sequences.txt'))
417
418
def flag_sequence(territory_code):
    """Return the flag sequence for a territory code.

    Each character of territory_code is shifted so that 'A' lands on
    U+1F1E6. The result is a tuple with one code point per character.
    """
    offset = 0x1F1E6 - ord('A')
    return tuple([ord(letter) + offset for letter in territory_code])
421
422
# Flag sequences that compute_expected_emoji() always counts among the "tofu"
# flags, i.e. the flags that are all expected to share one glyph.
UNSUPPORTED_FLAGS = frozenset({
    flag_sequence('BL'), flag_sequence('BQ'), flag_sequence('DG'),
    flag_sequence('EA'), flag_sequence('EH'), flag_sequence('FK'),
    flag_sequence('GF'), flag_sequence('GP'), flag_sequence('GS'),
    flag_sequence('MF'), flag_sequence('MQ'), flag_sequence('NC'),
    flag_sequence('PM'), flag_sequence('RE'), flag_sequence('TF'),
    flag_sequence('UN'), flag_sequence('WF'), flag_sequence('XK'),
    flag_sequence('YT'),
})

# Flags (keys) that are expected to resolve to the same glyph as another
# territory's flag (values).
EQUIVALENT_FLAGS = {
    flag_sequence('BV'): flag_sequence('NO'),
    flag_sequence('CP'): flag_sequence('FR'),
    flag_sequence('HM'): flag_sequence('AU'),
    flag_sequence('SJ'): flag_sequence('NO'),
    flag_sequence('UM'): flag_sequence('US'),
}
440
# U+20E3 COMBINING ENCLOSING KEYCAP, the second code point of the keycap
# sequences listed in LEGACY_ANDROID_EMOJI below.
COMBINING_KEYCAP = 0x20E3

# Characters that Android defaults to emoji style, different from the recommendations in UTR #51
ANDROID_DEFAULT_EMOJI = frozenset({
    0x2600, # BLACK SUN WITH RAYS
    0x2601, # CLOUD
    0x260E, # BLACK TELEPHONE
    0x261D, # WHITE UP POINTING INDEX
    0x263A, # WHITE SMILING FACE
    0x2660, # BLACK SPADE SUIT
    0x2663, # BLACK CLUB SUIT
    0x2665, # BLACK HEART SUIT
    0x2666, # BLACK DIAMOND SUIT
    0x270C, # VICTORY HAND
    0x2744, # SNOWFLAKE
    0x2764, # HEAVY BLACK HEART
})

# Legacy single code points (keys) that are expected to resolve to the same
# glyph as the flag or keycap sequence they map to (values). The keys are
# also counted as expected and default emoji by compute_expected_emoji().
LEGACY_ANDROID_EMOJI = {
    0xFE4E5: flag_sequence('JP'),
    0xFE4E6: flag_sequence('US'),
    0xFE4E7: flag_sequence('FR'),
    0xFE4E8: flag_sequence('DE'),
    0xFE4E9: flag_sequence('IT'),
    0xFE4EA: flag_sequence('GB'),
    0xFE4EB: flag_sequence('ES'),
    0xFE4EC: flag_sequence('RU'),
    0xFE4ED: flag_sequence('CN'),
    0xFE4EE: flag_sequence('KR'),
    0xFE82C: (ord('#'), COMBINING_KEYCAP),
    0xFE82E: (ord('1'), COMBINING_KEYCAP),
    0xFE82F: (ord('2'), COMBINING_KEYCAP),
    0xFE830: (ord('3'), COMBINING_KEYCAP),
    0xFE831: (ord('4'), COMBINING_KEYCAP),
    0xFE832: (ord('5'), COMBINING_KEYCAP),
    0xFE833: (ord('6'), COMBINING_KEYCAP),
    0xFE834: (ord('7'), COMBINING_KEYCAP),
    0xFE835: (ord('8'), COMBINING_KEYCAP),
    0xFE836: (ord('9'), COMBINING_KEYCAP),
    0xFE837: (ord('0'), COMBINING_KEYCAP),
}

# ZWJ sequences (keys) that are expected to resolve to the same glyph as a
# single code point (values).
ZWJ_IDENTICALS = {
    # KISS
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F48B, 0x200D, 0x1F468): 0x1F48F,
    # COUPLE WITH HEART
    (0x1F469, 0x200D, 0x2764, 0x200D, 0x1F468): 0x1F491,
    # FAMILY
    (0x1F468, 0x200D, 0x1F469, 0x200D, 0x1F466): 0x1F46A,
}
491
492
def is_fitzpatrick_modifier(cp):
    """Return True if cp is one of the modifiers U+1F3FB..U+1F3FF."""
    return 0x1F3FB <= cp <= 0x1F3FF


def reverse_emoji(seq):
    """Return seq reversed, with skin-tone modifiers moved behind their base.

    seq is a sequence of code points. The result is a tuple of the same
    code points in reverse order, except that a Fitzpatrick modifier is
    swapped with the code point that follows it in the reversed order.
    """
    rev = list(reversed(seq))
    # if there are fitzpatrick modifiers in the sequence, keep them after
    # the emoji they modify
    # NOTE(review): a modifier that has just been swapped forward is looked
    # at again on the next iteration, so it keeps moving until it reaches the
    # end of the result. That is only the intended position when the modifier
    # directly follows the first code point of seq; confirm how the font
    # builds its reversed sequences before changing this.
    # range() rather than xrange(), which no longer exists in Python 3.
    for i in range(1, len(rev)):
        if is_fitzpatrick_modifier(rev[i-1]):
            rev[i], rev[i-1] = rev[i-1], rev[i]
    return tuple(rev)
505
506
def compute_expected_emoji():
    """Compute what the emoji font is expected to support.

    Uses the module globals filled in by parse_ucd().

    Returns a tuple (all_emoji, default_emoji, equivalent_emoji):
      all_emoji:        set of code points (ints) and sequences (tuples) the
                        emoji font should support;
      default_emoji:    set of code points and sequences that should default
                        to emoji style;
      equivalent_emoji: dict mapping a code point or sequence to another one
                        that should resolve to the same glyph.
    """
    equivalent_emoji = {}
    sequence_pieces = set()
    all_sequences = set()
    all_sequences.update(_emoji_variation_sequences)

    # add zwj sequences not in the current emoji-zwj-sequences.txt
    adjusted_emoji_zwj_sequences = dict(_emoji_zwj_sequences)
    # NOTE(review): this update is a no-op, the dictionary was just created
    # as a copy of the very same data.
    adjusted_emoji_zwj_sequences.update(_emoji_zwj_sequences)
    # single parent families
    additional_emoji_zwj = (
        (0x1F468, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467),
        (0x1F468, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F468, 0x200D, 0x1F467, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467),
        (0x1F469, 0x200D, 0x1F466, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F466),
        (0x1F469, 0x200D, 0x1F467, 0x200D, 0x1F467),
    )
    # sequences formed from man and woman and optional fitzpatrick modifier
    modified_extensions = (
        0x2696,
        0x2708,
        0x1F3A8,
        0x1F680,
        0x1F692,
    )
    for seq in additional_emoji_zwj:
        adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
    for ext in modified_extensions:
        for base in (0x1F468, 0x1F469):
            seq = (base, 0x200D, ext)
            adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'
            # U+1F3FB..U+1F3FF, the range is_fitzpatrick_modifier() accepts.
            for modifier in range(0x1F3FB, 0x1F400):
                seq = (base, modifier, 0x200D, ext)
                adjusted_emoji_zwj_sequences[seq] = 'Emoji_ZWJ_Sequence'

    # Sequences are recorded without U+FE0F; their individual code points
    # are collected as well, since they become part of all_emoji below.
    for sequence in _emoji_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)

    for sequence in adjusted_emoji_zwj_sequences.keys():
        sequence = tuple(ch for ch in sequence if ch != EMOJI_VS)
        all_sequences.add(sequence)
        sequence_pieces.update(sequence)
        # Add reverse of all emoji ZWJ sequences, which are added to the fonts
        # as a workaround to get the sequences work in RTL text.
        reversed_seq = reverse_emoji(sequence)
        all_sequences.add(reversed_seq)
        equivalent_emoji[reversed_seq] = sequence

    # Add all two-letter flag sequences, as even the unsupported ones should
    # resolve to a flag tofu.
    all_letters = [chr(code) for code in range(ord('A'), ord('Z')+1)]
    all_two_letter_codes = itertools.product(all_letters, repeat=2)
    all_flags = {flag_sequence(code) for code in all_two_letter_codes}
    all_sequences.update(all_flags)
    # Tofu flags: the explicitly unsupported ones plus every two-letter flag
    # that is not a key of _emoji_sequences.
    tofu_flags = UNSUPPORTED_FLAGS | (all_flags - set(_emoji_sequences.keys()))

    all_emoji = (
        _emoji_properties['Emoji'] |
        all_sequences |
        sequence_pieces |
        set(LEGACY_ANDROID_EMOJI.keys()))
    default_emoji = (
        _emoji_properties['Emoji_Presentation'] |
        ANDROID_DEFAULT_EMOJI |
        all_sequences |
        set(LEGACY_ANDROID_EMOJI.keys()))

    # Every tofu flag is declared equivalent to the smallest one.
    first_tofu_flag = sorted(tofu_flags)[0]
    for flag in tofu_flags:
        if flag != first_tofu_flag:
            equivalent_emoji[flag] = first_tofu_flag
    # The order of these updates matters: on a key collision the later
    # update replaces the earlier entry.
    equivalent_emoji.update(EQUIVALENT_FLAGS)
    equivalent_emoji.update(LEGACY_ANDROID_EMOJI)
    equivalent_emoji.update(ZWJ_IDENTICALS)
    # An emoji variation sequence is equivalent to its base character.
    for seq in _emoji_variation_sequences:
        equivalent_emoji[seq] = seq[0]

    return all_emoji, default_emoji, equivalent_emoji
592
593
def check_vertical_metrics():
    """Check the vertical metrics of the named default font families.

    For every fallback-chain record named 'sans-serif' or
    'sans-serif-condensed', asserts head.yMax == 2163 and head.yMin == -555.
    For those two plus 'serif' and 'monospace', asserts hhea.ascent == 1900
    and hhea.descent == -500.
    """
    head_checked = ('sans-serif', 'sans-serif-condensed')
    hhea_checked = head_checked + ('serif', 'monospace')
    for record in _fallback_chain:
        if record.name in head_checked:
            head = open_font(record.font)['head']
            assert head.yMax == 2163 and head.yMin == -555, (
                'yMax and yMin of %s do not match expected values.' % (record.font,))

        if record.name in hhea_checked:
            hhea = open_font(record.font)['hhea']
            assert hhea.ascent == 1900 and hhea.descent == -500, (
                'ascent and descent of %s do not match expected values.' % (record.font,))
605
606
def main():
    """Run the font chain checks.

    Command-line arguments:
      argv[1]: output directory containing 'fonts', 'etc/fonts.xml' and
               'usr/hyphen-data';
      argv[2]: 'true' to also run the emoji checks;
      argv[3]: directory with the Unicode data files (read only when the
               emoji checks are enabled).
    """
    global _fonts_dir
    target_out = sys.argv[1]
    _fonts_dir = path.join(target_out, 'fonts')

    parse_fonts_xml(path.join(target_out, 'etc', 'fonts.xml'))

    check_vertical_metrics()

    check_hyphens(path.join(target_out, 'usr', 'hyphen-data'))

    # Emoji checks are optional and need the Unicode data directory.
    if sys.argv[2] != 'true':
        return
    parse_ucd(sys.argv[3])
    all_emoji, default_emoji, equivalent_emoji = compute_expected_emoji()
    check_emoji_coverage(all_emoji, equivalent_emoji)
    check_emoji_defaults(default_emoji)


if __name__ == '__main__':
    main()
631