_c_m_a_p.py revision d299b55d14fa77411140c0cc1c2524583b4ffa58
1import DefaultTable
2import struct
3import array
4import Numeric
5import operator
6from fontTools import ttLib
7from fontTools.misc.textTools import safeEval, readHex
8from types import TupleType
9
10
11class table__c_m_a_p(DefaultTable.DefaultTable):
12
13	def getcmap(self, platformID, platEncID):
14		for subtable in self.tables:
15			if (subtable.platformID == platformID and
16					subtable.platEncID == platEncID):
17				return subtable
18		return None # not found
19
20	def decompile(self, data, ttFont):
21		tableVersion, numSubTables = struct.unpack(">HH", data[:4])
22		self.tableVersion = int(tableVersion)
23		self.tables = tables = []
24		seenOffsets = {}
25		for i in range(numSubTables):
26			platformID, platEncID, offset = struct.unpack(
27					">HHl", data[4+i*8:4+(i+1)*8])
28			platformID, platEncID = int(platformID), int(platEncID)
29			format, length = struct.unpack(">HH", data[offset:offset+4])
30			if (format < 8) and not length:
31				continue  # bogus cmap subtable?
32			if format in [8,10,12]:
33				format, reserved, length = struct.unpack(">HHL", data[offset:offset+8])
34			if not cmap_classes.has_key(format):
35				table = cmap_format_unknown(format)
36			else:
37				table = cmap_classes[format](format)
38			table.platformID = platformID
39			table.platEncID = platEncID
40			# Note that by default we decompile only the subtable header info;
41			# any other data gets decompiled only when an attribute of the
42			# subtable is referenced.
43			table.decompileHeader(data[offset:offset+int(length)], ttFont)
44			if seenOffsets.has_key(offset):
45				table.cmap = tables[seenOffsets[offset]].cmap
46			else:
47				seenOffsets[offset] = i
48			tables.append(table)
49
50	def compile(self, ttFont):
51		self.tables.sort()    # sort according to the spec; see CmapSubtable.__cmp__()
52		numSubTables = len(self.tables)
53		totalOffset = 4 + 8 * numSubTables
54		data = struct.pack(">HH", self.tableVersion, numSubTables)
55		tableData = ""
56		seen = {}  # Some tables are the same object reference. Don't compile them twice.
57		done = {}  # Some tables are different objects, but compile to the same data chunk
58		for table in self.tables:
59			try:
60				offset = seen[id(table.cmap)]
61			except KeyError:
62				chunk = table.compile(ttFont)
63				if done.has_key(chunk):
64					offset = done[chunk]
65				else:
66					offset = seen[id(table.cmap)] = done[chunk] = totalOffset + len(tableData)
67					tableData = tableData + chunk
68			data = data + struct.pack(">HHl", table.platformID, table.platEncID, offset)
69		return data + tableData
70
71	def toXML(self, writer, ttFont):
72		writer.simpletag("tableVersion", version=self.tableVersion)
73		writer.newline()
74		for table in self.tables:
75			table.toXML(writer, ttFont)
76
77	def fromXML(self, (name, attrs, content), ttFont):
78		if name == "tableVersion":
79			self.tableVersion = safeEval(attrs["version"])
80			return
81		if name[:12] <> "cmap_format_":
82			return
83		if not hasattr(self, "tables"):
84			self.tables = []
85		format = safeEval(name[12:])
86		if not cmap_classes.has_key(format):
87			table = cmap_format_unknown(format)
88		else:
89			table = cmap_classes[format](format)
90		table.platformID = safeEval(attrs["platformID"])
91		table.platEncID = safeEval(attrs["platEncID"])
92		table.fromXML((name, attrs, content), ttFont)
93		self.tables.append(table)
94
95
class CmapSubtable:
	"""Common behaviour of cmap subtables: header parsing, lazy decompilation,
	XML output and spec-order comparison."""

	def __init__(self, format):
		self.ttFont = None
		self.data = None
		self.format = format

	def __getattr__(self, attr):
		# Missing attributes trigger decompilation of the saved subtable data.
		# Names starting with '__' (member functions like '__lt__') are never
		# handled here, and nothing can be done once there is no saved data.
		isSpecialName = attr[:2] == '__'
		if isSpecialName or self.data == None:
			raise AttributeError(attr)
		self.decompile(None, None) # use saved data.
		# Drop the saved data: the subtable is decompiled now, and this also
		# stops the recursion when 'attr' is something the subtable lacks.
		self.data = None
		return getattr(self, attr)

	def decompileHeader(self, data, ttFont):
		"""Read format, length and language from the first six bytes of 'data'
		and keep the remaining bytes, and 'ttFont', for later decompilation."""
		header = struct.unpack(">HHH", data[:6])
		format, length, language = header
		assert len(data) == length, "corrupt cmap table format %d (data length: %d, header length: %d)" % (format, len(data), length)
		self.format, self.length, self.language = int(format), int(length), int(language)
		self.data = data[6:]
		self.ttFont = ttFont

	def toXML(self, writer, ttFont):
		"""Write this subtable as an element named after its class, with one
		'map' entry per character code, in ascending code order."""
		tagName = self.__class__.__name__
		tagAttrs = [
				("platformID", self.platformID),
				("platEncID", self.platEncID),
				("language", self.language),
				]
		writer.begintag(tagName, tagAttrs)
		writer.newline()
		codes = self.cmap.items()
		codes.sort()
		self._writeCodes(codes, writer)
		writer.endtag(tagName)
		writer.newline()

	def _writeCodes(self, codes, writer):
		# One 'map' tag per (code, name) pair. For platform 0 and for
		# platform 3 with encoding 1 or 10, a comment looked up in the
		# fontTools Unicode table is added as well.
		encoding = (self.platformID, self.platEncID)
		isUnicode = encoding == (3, 1) or encoding == (3, 10) or self.platformID == 0
		if isUnicode:
			from fontTools.unicode import Unicode
		for code, name in codes:
			writer.simpletag("map", code=hex(code), name=name)
			if isUnicode:
				writer.comment(Unicode[code])
			writer.newline()

	def __cmp__(self, other):
		# implemented so that list.sort() sorts according to the cmap spec.
		def sortKey(subtable):
			return (
					subtable.platformID,
					subtable.platEncID,
					subtable.language,
					subtable.__dict__)
		return cmp(sortKey(self), sortKey(other))
162
163
164class cmap_format_0(CmapSubtable):
165
166	def decompile(self, data, ttFont):
167		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
168		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
169		if data != None and ttFont != None:
170			self.decompileHeader(data[offset:offset+int(length)], ttFont)
171		else:
172			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
173		data = self.data # decompileHeader assigns the data after the header to self.data
174		assert 262 == self.length, "Format 0 cmap subtable not 262 bytes"
175		glyphIdArray = array.array("B")
176		glyphIdArray.fromstring(self.data)
177		self.cmap = cmap = {}
178		lenArray = len(glyphIdArray)
179		charCodes = range(lenArray)
180		names = map(self.ttFont.getGlyphName, glyphIdArray)
181		map(operator.setitem, [cmap]*lenArray, charCodes, names)
182
183
184	def compile(self, ttFont):
185		if self.data:
186			return struct.pack(">HHH", 0, 262, self.language) + self.data
187
188		charCodeList = self.cmap.items()
189		charCodeList.sort()
190		charCodes = [entry[0] for entry in charCodeList]
191		valueList = [entry[1] for entry in charCodeList]
192		assert charCodes == range(256)
193		valueList = map(ttFont.getGlyphID, valueList)
194
195		glyphIdArray = Numeric.array(valueList, Numeric.Int8)
196		data = struct.pack(">HHH", 0, 262, self.language) + glyphIdArray.tostring()
197		assert len(data) == 262
198		return data
199
200	def fromXML(self, (name, attrs, content), ttFont):
201		self.language = safeEval(attrs["language"])
202		if not hasattr(self, "cmap"):
203			self.cmap = {}
204		cmap = self.cmap
205		for element in content:
206			if type(element) <> TupleType:
207				continue
208			name, attrs, content = element
209			if name <> "map":
210				continue
211			cmap[safeEval(attrs["code"])] = attrs["name"]
212
213
subHeaderFormat = ">HHhH"
class SubHeader:
	"""Plain record for one format 2 subheader. The four numeric fields start
	out as None; glyphIndexArray starts out as a new empty list."""
	def __init__(self):
		self.glyphIndexArray = []
		self.firstCode = self.entryCount = None
		self.idDelta = self.idRangeOffset = None
222
223class cmap_format_2(CmapSubtable):
224
225	def setIDDelta(self, subHeader):
226		subHeader.idDelta = 0
227		# find the minGI which is not zero.
228		minGI = subHeader.glyphIndexArray[0]
229		for gid in subHeader.glyphIndexArray:
230			if (gid != 0) and (gid < minGI):
231				minGI = gid
232		# The lowest gid in glyphIndexArray, after subtracting idDelta, must be 1.
233		# idDelta is a short, and must be between -32K and 32K. minGI can be between 1 and 64K.
234		# We would like to pick an idDelta such that the first glyphArray GID is 1,
235		# so that we are more likely to be able to combine glypharray GID subranges.
236		# This means that we have a problem when minGI is > 32K
237		# Since the final gi is reconstructed from the glyphArray GID by:
238		#    (short)finalGID = (gid +  idDelta) % 0x10000),
239		# we can get from a glypharray GID of 1 to a final GID of 65K by subtracting 2, and casting the
240		# negative number to an unsigned short.
241
242		if  (minGI > 1):
243			if  minGI > 0x7FFF:
244				subHeader.idDelta = -(0x10000 - minGI) -1
245			else:
246				subHeader.idDelta =  minGI -1
247			idDelta = subHeader.idDelta
248			for i in range(subHeader.entryCount):
249				gid = subHeader.glyphIndexArray[i]
250				if gid > 0:
251					subHeader.glyphIndexArray[i] = gid - idDelta
252
253
254	def decompile(self, data, ttFont):
255		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
256		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
257		if data != None and ttFont != None:
258			self.decompileHeader(data[offset:offset+int(length)], ttFont)
259		else:
260			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
261
262		data = self.data # decompileHeader assigns the data after the header to self.data
263		subHeaderKeys = []
264		maxSubHeaderindex = 0
265		# get the key array, and determine the number of subHeaders.
266		allKeys = array.array("H")
267		allKeys.fromstring(data[:512])
268		data = data[512:]
269		if ttLib.endian <> "big":
270			allKeys.byteswap()
271		subHeaderKeys = [ key/8 for key in allKeys]
272		maxSubHeaderindex = max(subHeaderKeys)
273
274		#Load subHeaders
275		subHeaderList = []
276		pos = 0
277		for i in range(maxSubHeaderindex + 1):
278			subHeader = SubHeader()
279			(subHeader.firstCode, subHeader.entryCount, subHeader.idDelta, \
280				subHeader.idRangeOffset) = struct.unpack(subHeaderFormat, data[pos:pos + 8])
281			pos += 8
282			giDataPos = pos + subHeader.idRangeOffset-2
283			giList = array.array("H")
284			giList.fromstring(data[giDataPos:giDataPos + subHeader.entryCount*2])
285			if ttLib.endian <> "big":
286				giList.byteswap()
287			subHeader.glyphIndexArray = giList
288			subHeaderList.append(subHeader)
289		# How this gets processed.
290		# Charcodes may be one or two bytes.
291		# The first byte of a charcode is mapped through the  subHeaderKeys, to select
292		# a subHeader. For any subheader but 0, the next byte is then mapped through the
293		# selected subheader. If subheader Index 0 is selected, then the byte itself is
294		# mapped through the subheader, and there is no second byte.
295		# Then assume that the subsequent byte is the first byte of the next charcode,and repeat.
296		#
297		# Each subheader references a range in the glyphIndexArray whose length is entryCount.
298		# The range in glyphIndexArray referenced by a sunheader may overlap with the range in glyphIndexArray
299		# referenced by another subheader.
300		# The only subheader that will be referenced by more than one first-byte value is the subheader
301		# that maps the entire range of glyphID values to glyphIndex 0, e.g notdef:
302		#	 {firstChar 0, EntryCount 0,idDelta 0,idRangeOffset xx}
303		# A byte being mapped though a subheader is treated as in index into a mapping of array index to font glyphIndex.
304		# A subheader specifies a subrange within (0...256) by the
305		# firstChar and EntryCount values. If the byte value is outside the subrange, then the glyphIndex is zero
306		# (e.g. glyph not in font).
307		# If the byte index is in the subrange, then an offset index is calculated as (byteIndex - firstChar).
308		# The index to glyphIndex mapping is a subrange of the glyphIndexArray. You find the start of the subrange by
309		# counting idRangeOffset bytes from the idRangeOffset word. The first value in this subrange is the
310		# glyphIndex for the index firstChar. The offset index should then be used in this array to get the glyphIndex.
311		# Example for Logocut-Medium
312		# first byte of charcode = 129; selects subheader 1.
313		# subheader 1 = {firstChar 64, EntryCount 108,idDelta 42,idRangeOffset 0252}
314		# second byte of charCode = 66
315		# the index offset = 66-64 = 2.
316		# The subrange of the glyphIndexArray starting at 0x0252 bytes from the idRangeOffset word is:
317		# [glyphIndexArray index], [subrange array index] = glyphIndex
318		# [256], [0]=1 	from charcode [129, 64]
319		# [257], [1]=2  	from charcode [129, 65]
320		# [258], [2]=3  	from charcode [129, 66]
321		# [259], [3]=4  	from charcode [129, 67]
322		# So, the glyphIndex = 3 from the array. Then if idDelta is not zero and the glyph ID is not zero,
323		# add it to the glyphID to get the final glyphIndex
324		# value. In this case the final glyph index = 3+ 42 -> 45 for the final glyphIndex. Whew!
325
326		self.data = ""
327		self.cmap = cmap = {}
328		notdefGI = 0
329		for firstByte in range(256):
330			subHeadindex = subHeaderKeys[firstByte]
331			subHeader = subHeaderList[subHeadindex]
332			if subHeadindex == 0:
333				if (firstByte < subHeader.firstCode) or (firstByte >= subHeader.firstCode + subHeader.entryCount):
334					continue # gi is notdef.
335				else:
336					charCode = firstByte
337					offsetIndex = firstByte - subHeader.firstCode
338					gi = subHeader.glyphIndexArray[offsetIndex]
339					if gi != 0:
340						gi = (gi + subHeader.idDelta) % 0x10000
341					else:
342						continue # gi is notdef.
343				cmap[charCode] = gi
344			else:
345				if subHeader.entryCount:
346					charCodeOffset = firstByte * 256 + subHeader.firstCode
347					for offsetIndex in range(subHeader.entryCount):
348						charCode = charCodeOffset + offsetIndex
349						gi = subHeader.glyphIndexArray[offsetIndex]
350						if gi != 0:
351							gi = (gi + subHeader.idDelta) % 0x10000
352						else:
353							continue
354						cmap[charCode] = gi
355				# If not subHeader.entryCount, then all char codes with this first byte are
356				# mapped to .notdef. We can skip this subtable, and leave the glyphs un-encoded, which is the
357				# same as mapping it to .notdef.
358		# cmap values are GID's.
359		glyphOrder = self.ttFont.getGlyphOrder()
360		gids = cmap.values()
361		charCodes = cmap.keys()
362		lenCmap = len(gids)
363		try:
364			names = map(operator.getitem, [glyphOrder]*lenCmap, gids )
365		except IndexError:
366			getGlyphName = self.ttFont.getGlyphName
367			names = map(getGlyphName, gids )
368		map(operator.setitem, [cmap]*lenCmap, charCodes, names)
369
370
371	def compile(self, ttFont):
372		if self.data:
373			return struct.pack(">HHH", self.format, self.length, self.language) + self.data
374		kEmptyTwoCharCodeRange = -1
375		notdefGI = 0
376
377		items = self.cmap.items()
378		items.sort()
379		charCodes = [item[0] for item in items]
380		names = [item[1] for item in items]
381		nameMap = ttFont.getReverseGlyphMap()
382		lenCharCodes = len(charCodes)
383		try:
384			gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
385		except KeyError:
386			nameMap = ttFont.getReverseGlyphMap(rebuild=1)
387			try:
388				gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
389			except KeyError:
390				# allow virtual GIDs in format 2 tables
391				gids = []
392				for name in names:
393					try:
394						gid = nameMap[name]
395					except KeyError:
396						try:
397							if (name[:3] == 'gid'):
398								gid = eval(name[3:])
399							else:
400								gid = ttFont.getGlyphID(name)
401						except:
402							raise KeyError(name)
403
404					gids.append(gid)
405
406		# Process the (char code to gid) item list  in char code order.
407		# By definition, all one byte char codes map to subheader 0.
408		# For all the two byte char codes, we assume that the first byte maps maps to the empty subhead (with an entry count of 0,
409		# which defines all char codes in its range to map to notdef) unless proven otherwise.
410		# Note that since the char code items are processed in char code order, all the char codes with the
411		# same first byte are in sequential order.
412
413		subHeaderKeys = [ kEmptyTwoCharCodeRange for x in  range(256)] # list of indices into subHeaderList.
414		subHeaderList = []
415
416		# We force this subheader entry 0  to exist in the subHeaderList in the case where some one comes up
417		# with a cmap where all the one byte char codes map to notdef,
418		# with the result that the subhead 0 would not get created just by processing the item list.
419		charCode = charCodes[0]
420		if charCode > 255:
421			subHeader = SubHeader()
422			subHeader.firstCode = 0
423			subHeader.entryCount = 0
424			subHeader.idDelta = 0
425			subHeader.idRangeOffset = 0
426			subHeaderList.append(subHeader)
427
428
429		lastFirstByte = -1
430		items = zip(charCodes, gids)
431		for charCode, gid in items:
432			if gid == 0:
433				continue
434			firstbyte = charCode >> 8
435			secondByte = charCode & 0x00FF
436
437			if firstbyte != lastFirstByte: # Need to update the current subhead, and start a new one.
438				if lastFirstByte > -1:
439					# fix GI's and iDelta of current subheader.
440					self.setIDDelta(subHeader)
441
442					# If it was sunheader 0 for one-byte charCodes, then we need to set the subHeaderKeys value to zero
443					# for the indices matching the char codes.
444					if lastFirstByte == 0:
445						for index in range(subHeader.entryCount):
446							charCode = subHeader.firstCode + index
447							subHeaderKeys[charCode] = 0
448
449					assert (subHeader.entryCount == len(subHeader.glyphIndexArray)), "Error - subhead entry count does not match len of glyphID subrange."
450				# init new subheader
451				subHeader = SubHeader()
452				subHeader.firstCode = secondByte
453				subHeader.entryCount = 1
454				subHeader.glyphIndexArray.append(gid)
455				subHeaderList.append(subHeader)
456				subHeaderKeys[firstbyte] = len(subHeaderList) -1
457				lastFirstByte = firstbyte
458			else:
459				# need to fill in with notdefs all the code points between the last charCode and the current charCode.
460				codeDiff = secondByte - (subHeader.firstCode + subHeader.entryCount)
461				for i in range(codeDiff):
462					subHeader.glyphIndexArray.append(notdefGI)
463				subHeader.glyphIndexArray.append(gid)
464				subHeader.entryCount = subHeader.entryCount + codeDiff + 1
465
466		# fix GI's and iDelta of last subheader that we we added to the subheader array.
467		self.setIDDelta(subHeader)
468
469		# Now we add a final subheader for the subHeaderKeys which maps to empty two byte charcode ranges.
470		subHeader = SubHeader()
471		subHeader.firstCode = 0
472		subHeader.entryCount = 0
473		subHeader.idDelta = 0
474		subHeader.idRangeOffset = 2
475		subHeaderList.append(subHeader)
476		emptySubheadIndex = len(subHeaderList) - 1
477		for index in range(256):
478			if subHeaderKeys[index] == kEmptyTwoCharCodeRange:
479				subHeaderKeys[index] = emptySubheadIndex
480		# Since this is the last subheader, the GlyphIndex Array starts two bytes after the start of the
481		# idRangeOffset word of this subHeader. We can safely point to the first entry in the GlyphIndexArray,
482		# since the first subrange of the GlyphIndexArray is for subHeader 0, which always starts with
483		# charcode 0 and GID 0.
484
485		idRangeOffset = (len(subHeaderList)-1)*8  + 2 # offset to beginning of glyphIDArray from first subheader idRangeOffset.
486		subheadRangeLen = len(subHeaderList) -1 # skip last special empty-set subheader; we've already hardocodes its idRangeOffset to 2.
487		for index in range(subheadRangeLen):
488			subHeader = subHeaderList[index]
489			subHeader.idRangeOffset = 0
490			for j  in range(index):
491				prevSubhead = subHeaderList[j]
492				if prevSubhead.glyphIndexArray == subHeader.glyphIndexArray: # use the glyphIndexArray subarray
493					subHeader.idRangeOffset = prevSubhead.idRangeOffset - (index-j)*8
494					subHeader.glyphIndexArray = []
495					break
496			if subHeader.idRangeOffset == 0: # didn't find one.
497				subHeader.idRangeOffset = idRangeOffset
498				idRangeOffset = (idRangeOffset - 8) + subHeader.entryCount*2 # one less subheader, one more subArray.
499			else:
500				idRangeOffset = idRangeOffset - 8  # one less subheader
501
502		# Now we can write out the data!
503		length = 6 + 512 + 8*len(subHeaderList) # header, 256 subHeaderKeys, and subheader array.
504		for subhead in 	subHeaderList[:-1]:
505			length = length + len(subhead.glyphIndexArray)*2  # We can't use subhead.entryCount, as some of the subhead may share subArrays.
506		dataList = [struct.pack(">HHH", 2, length, self.language)]
507		for index in subHeaderKeys:
508			dataList.append(struct.pack(">H", index*8))
509		for subhead in 	subHeaderList:
510			dataList.append(struct.pack(subHeaderFormat, subhead.firstCode, subhead.entryCount, subhead.idDelta, subhead.idRangeOffset))
511		for subhead in 	subHeaderList[:-1]:
512			for gi in subhead.glyphIndexArray:
513				dataList.append(struct.pack(">H", gi))
514		data = "".join(dataList)
515		assert (len(data) == length), "Error: cmap format 2 is not same length as calculated! actual: " + str(len(data))+ " calc : " + str(length)
516		return data
517
518
519	def fromXML(self, (name, attrs, content), ttFont):
520		self.language = safeEval(attrs["language"])
521		if not hasattr(self, "cmap"):
522			self.cmap = {}
523		cmap = self.cmap
524
525		for element in content:
526			if type(element) <> TupleType:
527				continue
528			name, attrs, content = element
529			if name <> "map":
530				continue
531			cmap[safeEval(attrs["code"])] = attrs["name"]
532
533
# struct format of the fixed-size start of a format 4 subtable: seven
# big-endian uint16 values. cmap_format_4.compile() packs them as format,
# length, language, segCountX2, searchRange, entrySelector, rangeShift.
cmap_format_4_format = ">7H"

# cmap_format_4.compile() appends the following arrays after those fields,
# in this order:
#uint16  endCode[segCount]          # Ending character code for each segment, last = 0xFFFF.
#uint16  reservedPad                # This value should be zero
#uint16  startCode[segCount]        # Starting character code for each segment
#uint16  idDelta[segCount]          # Delta for all character codes in segment
#uint16  idRangeOffset[segCount]    # Offset in bytes to glyph indexArray, or 0
#uint16  glyphIndexArray[variable]  # Glyph index array
542
def splitRange(startCode, endCode, cmap):
	"""Split the code range startCode..endCode into cmap4 segments.

	'cmap' maps every character code in the range to its glyph ID.
	Returns (start, end): 'end' holds the last code of each resulting
	segment, 'start' the first code of every segment but the first one,
	so len(start) + 1 == len(end). Runs of consecutive glyph IDs become
	segments of their own only when they are long enough to pay for the
	extra segment(s); the result is a heuristic, not a proven optimum.
	"""
	if startCode == endCode:
		return [], [endCode]

	# Step 1: collect the runs of codes whose glyph IDs increase by one.
	runs = []
	runStart = None  # first code of the run being tracked, None when outside a run
	prevCode = startCode
	prevGID = cmap[startCode]
	for code in range(startCode + 1, endCode + 1):
		gid = cmap[code]
		if gid == prevGID + 1:
			if runStart is None:
				runStart = prevCode
		elif runStart is not None:
			runs.append((runStart, prevCode))
			runStart = None
		prevCode, prevGID = code, gid
	if runStart is not None:
		runs.append((runStart, prevCode))
	assert prevCode == endCode

	# Step 2: keep only the runs that make the data smaller. A new segment
	# costs 8 bytes; not using a new segment costs 2 bytes per character.
	kept = []
	for first, last in runs:
		if first == startCode and last == endCode:
			break  # the whole range, we're fine
		threshold = 8  # a run in the middle costs two more segments
		if first == startCode or last == endCode:
			threshold = 4  # a run at either edge costs one more segment
		if last - first + 1 > threshold:
			kept.append((first, last))

	if not kept:
		return [], [endCode]

	# Step 3: extend the list so that it reaches both ends of the range.
	if kept[0][0] != startCode:
		kept.insert(0, (startCode, kept[0][0] - 1))
	if kept[-1][1] != endCode:
		kept.append((kept[-1][1] + 1, endCode))

	# Step 4: put a segment into every hole between neighbours -- those are
	# the segments in which the glyph IDs are _not_ consecutive.
	segments = [kept[0]]
	for first, last in kept[1:]:
		holeStart = segments[-1][1] + 1
		if holeStart != first:
			segments.append((holeStart, first - 1))
		segments.append((first, last))

	# Step 5: turn the segments into the start/end lists; the first start
	# code is left out.
	start = [first for first, last in segments[1:]]
	end = [last for first, last in segments]

	assert len(start) + 1 == len(end)
	return start, end
620
621
622class cmap_format_4(CmapSubtable):
623
624	def decompile(self, data, ttFont):
625		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
626		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
627		if data != None and ttFont != None:
628			self.decompileHeader(self.data[offset:offset+int(length)], ttFont)
629		else:
630			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
631
632		data = self.data # decompileHeader assigns the data after the header to self.data
633		(segCountX2, searchRange, entrySelector, rangeShift) = \
634					struct.unpack(">4H", data[:8])
635		data = data[8:]
636		segCount = segCountX2 / 2
637
638		allCodes = array.array("H")
639		allCodes.fromstring(data)
640		self.data = data = None
641
642		if ttLib.endian <> "big":
643			allCodes.byteswap()
644
645		# divide the data
646		endCode = allCodes[:segCount]
647		allCodes = allCodes[segCount+1:]  # the +1 is skipping the reservedPad field
648		startCode = allCodes[:segCount]
649		allCodes = allCodes[segCount:]
650		idDelta = allCodes[:segCount]
651		allCodes = allCodes[segCount:]
652		idRangeOffset = allCodes[:segCount]
653		glyphIndexArray = allCodes[segCount:]
654		lenGIArray = len(glyphIndexArray)
655
656		# build 2-byte character mapping
657		charCodes = []
658		gids = []
659		for i in range(len(startCode) - 1):	# don't do 0xffff!
660			rangeCharCodes = range(startCode[i], endCode[i] + 1)
661			charCodes = charCodes + rangeCharCodes
662			for charCode in rangeCharCodes:
663				rangeOffset = idRangeOffset[i]
664				if rangeOffset == 0:
665					glyphID = charCode + idDelta[i]
666				else:
667					# *someone* needs to get killed.
668					index = idRangeOffset[i] / 2 + (charCode - startCode[i]) + i - len(idRangeOffset)
669					assert (index < lenGIArray), "In format 4 cmap, range (%d), the calculated index (%d) into the glyph index array  is not less than the length of the array (%d) !" % (i, index, lenGIArray)
670					if glyphIndexArray[index] <> 0:  # if not missing glyph
671						glyphID = glyphIndexArray[index] + idDelta[i]
672					else:
673						glyphID = 0  # missing glyph
674				gids.append(glyphID % 0x10000)
675
676		self.cmap = cmap = {}
677		lenCmap = len(gids)
678		glyphOrder = self.ttFont.getGlyphOrder()
679		try:
680			names = map(operator.getitem, [glyphOrder]*lenCmap, gids )
681		except IndexError:
682			getGlyphName = self.ttFont.getGlyphName
683			names = map(getGlyphName, gids )
684		map(operator.setitem, [cmap]*lenCmap, charCodes, names)
685
686
687
688	def setIDDelta(self, idDelta):
689		# The lowest gid in glyphIndexArray, after subtracting idDelta, must be 1.
690		# idDelta is a short, and must be between -32K and 32K
691		# startCode can be between 0 and 64K-1, and the first glyph index can be between 1 and 64K-1
692		# This means that we have a problem because we can need to assign to idDelta values
693		# between -(64K-2) and 64K -1.
694		# Since the final gi is reconstructed from the glyphArray GID by:
695		#    (short)finalGID = (gid +  idDelta) % 0x10000),
696		# we can get from a startCode of 0 to a final GID of 64 -1K by subtracting 1, and casting the
697		# negative number to an unsigned short.
698		# Similarly , we can get from a startCode of 64K-1 to a final GID of 1 by adding 2, because of
699		# the modulo arithmetic.
700
701		if idDelta > 0x7FFF:
702			idDelta = idDelta - 0x10000
703		elif idDelta <  -0x7FFF:
704			idDelta = idDelta + 0x10000
705
706		return idDelta
707
708
709	def compile(self, ttFont):
710		if self.data:
711			return struct.pack(">HHH", self.format, self.length, self.language) + self.data
712
713		from fontTools.ttLib.sfnt import maxPowerOfTwo
714
715		charCodes = self.cmap.keys()
716
717		charCodes.sort()
718		lenCharCodes = len(charCodes)
719		if lenCharCodes == 0:
720			startCode = [0xffff]
721			endCode = [0xffff]
722		else:
723			names = self.cmap.values()
724			nameMap = ttFont.getReverseGlyphMap()
725			try:
726				gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
727			except KeyError:
728				nameMap = ttFont.getReverseGlyphMap(rebuild=1)
729				try:
730					gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
731				except KeyError:
732					# allow virtual GIDs in format 4 tables
733					gids = []
734					for name in names:
735						try:
736							gid = nameMap[name]
737						except KeyError:
738							try:
739								if (name[:3] == 'gid'):
740									gid = eval(name[3:])
741								else:
742									gid = ttFont.getGlyphID(name)
743							except:
744								raise KeyError(name)
745
746						gids.append(gid)
747			cmap = {}  # code:glyphID mapping
748			map(operator.setitem, [cmap]*len(charCodes), charCodes, gids)
749
750			# Build startCode and endCode lists.
751			# Split the char codes in ranges of consecutive char codes, then split
752			# each range in more ranges of consecutive/not consecutive glyph IDs.
753			# See splitRange().
754			lastCode = charCodes[0]
755			endCode = []
756			startCode = [lastCode]
757			for charCode in charCodes[1:]:  # skip the first code, it's the first start code
758				if charCode == lastCode + 1:
759					lastCode = charCode
760					continue
761				start, end = splitRange(startCode[-1], lastCode, cmap)
762				startCode.extend(start)
763				endCode.extend(end)
764				startCode.append(charCode)
765				lastCode = charCode
766			endCode.append(lastCode)
767			startCode.append(0xffff)
768			endCode.append(0xffff)
769
770		# build up rest of cruft
771		idDelta = []
772		idRangeOffset = []
773		glyphIndexArray = []
774		for i in range(len(endCode)-1):  # skip the closing codes (0xffff)
775			indices = []
776			for charCode in range(startCode[i], endCode[i] + 1):
777				indices.append(cmap[charCode])
778			if  (indices == range(indices[0], indices[0] + len(indices))):
779				idDeltaTemp = self.setIDDelta(indices[0] - startCode[i])
780				idDelta.append( idDeltaTemp)
781				idRangeOffset.append(0)
782			else:
783				# someone *definitely* needs to get killed.
784				idDelta.append(0)
785				idRangeOffset.append(2 * (len(endCode) + len(glyphIndexArray) - i))
786				glyphIndexArray.extend(indices)
787		idDelta.append(1)  # 0xffff + 1 == (tadaa!) 0. So this end code maps to .notdef
788		idRangeOffset.append(0)
789
790		# Insane.
791		segCount = len(endCode)
792		segCountX2 = segCount * 2
793		maxExponent = maxPowerOfTwo(segCount)
794		searchRange = 2 * (2 ** maxExponent)
795		entrySelector = maxExponent
796		rangeShift = 2 * segCount - searchRange
797
798		charCodeArray = Numeric.array( endCode + [0] + startCode, Numeric.UInt16)
799		idDeltaeArray = Numeric.array(idDelta, Numeric.Int16)
800		restArray = Numeric.array(idRangeOffset + glyphIndexArray, Numeric.UInt16)
801		if ttLib.endian <> "big":
802			charCodeArray = charCodeArray.byteswapped()
803			idDeltaeArray = idDeltaeArray.byteswapped()
804			restArray = restArray.byteswapped()
805		data = charCodeArray.tostring() + idDeltaeArray.tostring() + restArray.tostring()
806
807		length = struct.calcsize(cmap_format_4_format) + len(data)
808		header = struct.pack(cmap_format_4_format, self.format, length, self.language,
809				segCountX2, searchRange, entrySelector, rangeShift)
810		return header + data
811
812	def fromXML(self, (name, attrs, content), ttFont):
813		self.language = safeEval(attrs["language"])
814		if not hasattr(self, "cmap"):
815			self.cmap = {}
816		cmap = self.cmap
817
818		for element in content:
819			if type(element) <> TupleType:
820				continue
821			nameMap, attrsMap, dummyContent = element
822			if nameMap <> "map":
823				assert 0, "Unrecognized keyword in cmap subtable"
824			cmap[safeEval(attrsMap["code"])] = attrsMap["name"]
825
826
827class cmap_format_6(CmapSubtable):
828
829	def decompile(self, data, ttFont):
830		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
831		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
832		if data != None and ttFont != None:
833			self.decompileHeader(data[offset:offset+int(length)], ttFont)
834		else:
835			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
836
837		data = self.data # decompileHeader assigns the data after the header to self.data
838		firstCode, entryCount = struct.unpack(">HH", data[:4])
839		firstCode = int(firstCode)
840		data = data[4:]
841		#assert len(data) == 2 * entryCount  # XXX not true in Apple's Helvetica!!!
842		glyphIndexArray = array.array("H")
843		glyphIndexArray.fromstring(data[:2 * int(entryCount)])
844		if ttLib.endian <> "big":
845			glyphIndexArray.byteswap()
846		self.data = data = None
847
848		self.cmap = cmap = {}
849
850		lenArray = len(glyphIndexArray)
851		charCodes = range(firstCode, firstCode + lenArray )
852		glyphOrder = self.ttFont.getGlyphOrder()
853		try:
854			names = map(operator.getitem, [glyphOrder]*lenArray, glyphIndexArray )
855		except IndexError:
856			getGlyphName = self.ttFont.getGlyphName
857			names = map(getGlyphName, glyphIndexArray )
858		map(operator.setitem, [cmap]*lenArray, charCodes, names)
859
860	def compile(self, ttFont):
861		if self.data:
862			return struct.pack(">HHH", self.format, self.length, self.language) + self.data
863		cmap = self.cmap
864		codes = cmap.keys()
865		if codes: # yes, there are empty cmap tables.
866			codes.sort()
867			lenCodes = len(codes)
868			assert codes == range(codes[0], codes[0] + lenCodes)
869			firstCode = codes[0]
870			valueList = map(operator.getitem, [cmap]*lenCodes, codes)
871			valueList = map(ttFont.getGlyphID, valueList)
872			glyphIndexArray = Numeric.array(valueList, Numeric.UInt16)
873			if ttLib.endian <> "big":
874				glyphIndexArray = glyphIndexArray.byteswapped()
875			data = glyphIndexArray.tostring()
876		else:
877			data = ""
878			firstCode = 0
879		header = struct.pack(">HHHHH",
880				6, len(data) + 10, self.language, firstCode, len(codes))
881		return header + data
882
883	def fromXML(self, (name, attrs, content), ttFont):
884		self.language = safeEval(attrs["language"])
885		if not hasattr(self, "cmap"):
886			self.cmap = {}
887		cmap = self.cmap
888
889		for element in content:
890			if type(element) <> TupleType:
891				continue
892			name, attrs, content = element
893			if name <> "map":
894				continue
895			cmap[safeEval(attrs["code"])] = attrs["name"]
896
897
898class cmap_format_12(CmapSubtable):
899
900	def __init__(self, format):
901		self.format = format
902		self.reserved = 0
903		self.data = None
904		self.ttFont = None
905
906	def decompileHeader(self, data, ttFont):
907		format, reserved, length, language, nGroups = struct.unpack(">HHLLL", data[:16])
908		assert len(data) == (16 + nGroups*12) == (length), "corrupt cmap table format 12 (data length: %d, header length: %d)" % (len(data), length)
909		self.format = format
910		self.reserved = reserved
911		self.length = length
912		self.language = language
913		self.nGroups = nGroups
914		self.data = data[16:]
915		self.ttFont = ttFont
916
917	def decompile(self, data, ttFont):
918		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
919		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
920		if data != None and ttFont != None:
921			self.decompileHeader(data[offset:offset+int(length)], ttFont)
922		else:
923			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
924
925		data = self.data # decompileHeader assigns the data after the header to self.data
926		charCodes = []
927		gids = []
928		pos = 0
929		for i in range(self.nGroups):
930			startCharCode, endCharCode, glyphID = struct.unpack(">LLL",data[pos:pos+12] )
931			pos += 12
932			lenGroup = 1 + endCharCode - startCharCode
933			charCodes += range(startCharCode, endCharCode +1)
934			gids += range(glyphID, glyphID + lenGroup)
935		self.data = data = None
936		self.cmap = cmap = {}
937		lenCmap = len(gids)
938		glyphOrder = self.ttFont.getGlyphOrder()
939		try:
940			names = map(operator.getitem, [glyphOrder]*lenCmap, gids )
941		except IndexError:
942			getGlyphName = self.ttFont.getGlyphName
943			names = map(getGlyphName, gids )
944		map(operator.setitem, [cmap]*lenCmap, charCodes, names)
945
946	def compile(self, ttFont):
947		if self.data:
948			return struct.pack(">HHLLL", self.format, self.reserved , self.length, self.language, self.nGroups) + self.data
949		charCodes = self.cmap.keys()
950		lenCharCodes = len(charCodes)
951		names = self.cmap.values()
952		nameMap = ttFont.getReverseGlyphMap()
953		try:
954			gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
955		except KeyError:
956			nameMap = ttFont.getReverseGlyphMap(rebuild=1)
957			try:
958				gids = map(operator.getitem, [nameMap]*lenCharCodes, names)
959			except KeyError:
960				# allow virtual GIDs in format 12 tables
961				gids = []
962				for name in names:
963					try:
964						gid = nameMap[name]
965					except KeyError:
966						try:
967							if (name[:3] == 'gid'):
968								gid = eval(name[3:])
969							else:
970								gid = ttFont.getGlyphID(name)
971						except:
972							raise KeyError(name)
973
974					gids.append(gid)
975
976		cmap = {}  # code:glyphID mapping
977		map(operator.setitem, [cmap]*len(charCodes), charCodes, gids)
978
979		charCodes.sort()
980		index = 0
981		startCharCode = charCodes[0]
982		startGlyphID = cmap[startCharCode]
983		lastGlyphID =  startGlyphID - 1
984		lastCharCode = startCharCode - 1
985		nGroups = 0
986		dataList =  []
987		maxIndex = len(charCodes)
988		for index in range(maxIndex):
989			charCode = charCodes[index]
990			glyphID = cmap[charCode]
991			if (glyphID != 1 + lastGlyphID) or (charCode != 1 + lastCharCode):
992				dataList.append(struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID))
993				startCharCode = charCode
994				startGlyphID = glyphID
995				nGroups = nGroups + 1
996			lastGlyphID = glyphID
997			lastCharCode = charCode
998		dataList.append(struct.pack(">LLL", startCharCode, lastCharCode, startGlyphID))
999		nGroups = nGroups + 1
1000		data = "".join(dataList)
1001		lengthSubtable = len(data) +16
1002		assert len(data) == (nGroups*12) == (lengthSubtable-16)
1003		return struct.pack(">HHLLL", self.format, self.reserved , lengthSubtable, self.language, nGroups) + data
1004
1005	def toXML(self, writer, ttFont):
1006		writer.begintag(self.__class__.__name__, [
1007				("platformID", self.platformID),
1008				("platEncID", self.platEncID),
1009				("format", self.format),
1010				("reserved", self.reserved),
1011				("length", self.length),
1012				("language", self.language),
1013				("nGroups", self.nGroups),
1014				])
1015		writer.newline()
1016		codes = self.cmap.items()
1017		codes.sort()
1018		self._writeCodes(codes, writer)
1019		writer.endtag(self.__class__.__name__)
1020		writer.newline()
1021
1022	def fromXML(self, (name, attrs, content), ttFont):
1023		self.format = safeEval(attrs["format"])
1024		self.reserved = safeEval(attrs["reserved"])
1025		self.length = safeEval(attrs["length"])
1026		self.language = safeEval(attrs["language"])
1027		self.nGroups = safeEval(attrs["nGroups"])
1028		if not hasattr(self, "cmap"):
1029			self.cmap = {}
1030		cmap = self.cmap
1031
1032		for element in content:
1033			if type(element) <> TupleType:
1034				continue
1035			name, attrs, content = element
1036			if name <> "map":
1037				continue
1038			cmap[safeEval(attrs["code"])] = attrs["name"]
1039
1040
1041class cmap_format_unknown(CmapSubtable):
1042
1043	def toXML(self, writer, ttFont):
1044		cmapName = self.__class__.__name__[:12] + str(self.format)
1045		writer.begintag(cmapName, [
1046				("platformID", self.platformID),
1047				("platEncID", self.platEncID),
1048				])
1049		writer.newline()
1050		writer.dumphex(self.data)
1051		writer.endtag(cmapName)
1052		writer.newline()
1053
1054	def fromXML(self, (name, attrs, content), ttFont):
1055		self.data = readHex(content)
1056		self.cmap = {}
1057
1058	def decompileHeader(self, data, ttFont):
1059		self.language = 0  # dummy value
1060		self.data = data
1061
1062	def decompile(self, data, ttFont):
1063		# we usually get here indirectly from the subtable __getattr__ function, in which case both args must be None.
1064		# If not, someone is calling  the subtable decompile() directly, and must provide both args.
1065		if data != None and ttFont != None:
1066			self.decompileHeader(data[offset:offset+int(length)], ttFont)
1067		else:
1068			assert(	(data == None and (ttFont == None), "Need both data and ttFont arguments"))
1069
1070	def compile(self, ttFont):
1071		if self.data:
1072			return self.data
1073		else:
1074			return None
1075
# Subtable classes by cmap format number.  table__c_m_a_p.decompile() looks
# the format up here and uses cmap_format_unknown for any format that is
# not listed.
cmap_classes = dict([
		(0, cmap_format_0),
		(2, cmap_format_2),
		(4, cmap_format_4),
		(6, cmap_format_6),
		(12, cmap_format_12),
		])
1083