def lines_get(f):
    '''Parse a file like object, removing comments and returning a list of
    lines.

    Everything from the first '#' to the end of a line is treated as a
    comment and dropped.  Lines that are empty or contain only whitespace
    once the comment has been removed are skipped.  Leading and trailing
    whitespace of the remaining lines is preserved (apart from the line
    terminator).

    @f must provide readlines().
    '''
    lines = []
    for raw in f.readlines():
        # Strip only the line terminator.  Blindly dropping the last character
        # would eat real data on a final line that has no trailing newline.
        text = raw.rstrip('\r\n')
        hash_at = text.find('#')
        if hash_at != -1:
            text = text[:hash_at]
        # Skip lines with nothing but whitespace left (e.g. an indented
        # comment); they carry no fields.
        if text.strip():
            lines.append(text)
    return lines


def line_split(line):
    '''Split a line based on a semicolon separator.

    Returns the list of fields with surrounding whitespace removed from
    each one.
    '''
    return [field.strip() for field in line.split(';')]


def codepoints_parse(token):
    '''Parse a Unicode style code-point range.

    @token is either a single hexadecimal code-point ("0041") or a range
    written as "start..end" ("0041..005A").

    Return either a single value or a tuple of (start, end) for a range of
    code-points.  Raises ValueError if @token has more than one ".."
    separator or a part is not valid hexadecimal.
    '''
    parts = token.split('..')
    if len(parts) == 1:
        return int(parts[0], 16)
    if len(parts) == 2:
        return (int(parts[0], 16), int(parts[1], 16))
    raise ValueError(token)


def unicode_file_parse(input, map, default_value=None):
    '''Parse a file like object, @input where the first column is a code-point
    range and the second column is mapped via the given dict, @map.

    Returns a list of (start, end, value) tuples in file order; a single
    code-point yields start == end.  Entries whose mapped value equals
    @default_value are left out.

    Raises ValueError for a line that does not have exactly two
    semicolon-separated fields or whose code-point field is malformed, and
    KeyError when the second field is not a key of @map.

    The parameter names shadow builtins but are kept so that existing
    keyword callers continue to work.
    '''
    ranges = []
    for raw_line in lines_get(input):
        fields = line_split(raw_line)
        if len(fields) != 2:
            raise ValueError(fields)

        codepoints = codepoints_parse(fields[0])
        value = map[fields[1]]
        if value == default_value:
            continue

        # Normalise a single code-point to a one element range.
        if not isinstance(codepoints, tuple):
            codepoints = (codepoints, codepoints)

        ranges.append((codepoints[0], codepoints[1], value))

    return ranges


def sort_and_merge(ranges):
    '''Given a list of (start, end, value), merge elements where the ranges are
    continuous and the values are the same.

    Two neighbouring entries are merged only when the second one starts
    exactly one past the end of the first and both carry an equal value.

    Note that @ranges is sorted in place as a side effect.  Returns a new
    list holding the merged (start, end, value) tuples in sorted order.
    '''
    ranges.sort()
    output = []
    current = None
    for entry in ranges:
        if current is None:
            current = entry
        elif current[1] + 1 == entry[0] and current[2] == entry[2]:
            # Directly adjacent with the same value: extend the open range.
            current = (current[0], entry[1], entry[2])
        else:
            output.append(current)
            current = entry
    if current is not None:
        output.append(current)

    return output