_c_m_a_p.py revision 542b9510e6a8909e35e99a5279b7c2ec57c78e3c
1cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)import DefaultTable
2cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)import struct
3cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)import string
4cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)import array
5cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)from fontTools import ttLib
6cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)from fontTools.misc.textTools import safeEval, readHex
7cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)from types import TupleType
8cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
9cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
class table__c_m_a_p(DefaultTable.DefaultTable):
	"""The 'cmap' table: a table version plus a list of cmap subtables.

	After decompile() or fromXML(), self.tables holds one subtable object
	per directory entry; each carries platformID and platEncID attributes.
	"""

	def getcmap(self, platformID, platEncID):
		"""Return the first subtable with the given platform and encoding IDs.

		Returns None when no subtable matches.
		"""
		for subtable in self.tables:
			if (subtable.platformID == platformID and
					subtable.platEncID == platEncID):
				return subtable
		return None # not found

	def decompile(self, data, ttFont):
		"""Parse the binary table into self.tableVersion and self.tables.

		The table starts with a 4-byte header (version, subtable count),
		followed by one 8-byte directory entry per subtable (platformID,
		platEncID, offset from the start of the table).  Subtables whose
		length field reads as zero are skipped.
		"""
		tableVersion, numSubTables = struct.unpack(">HH", data[:4])
		self.tableVersion = int(tableVersion)
		self.tables = tables = []
		for i in range(numSubTables):
			entryStart = 4 + i * 8
			platformID, platEncID, offset = struct.unpack(
					">HHl", data[entryStart:entryStart+8])
			platformID, platEncID = int(platformID), int(platEncID)
			# NOTE(review): the second header word is read as a 16-bit length
			# for every format; a format whose header is laid out differently
			# would be misread here (and skipped if that word is zero) --
			# confirm against the cmap specification.
			format, length = struct.unpack(">HH", data[offset:offset+4])
			if not length:
				continue  # bogus cmap subtable?
			if format in cmap_classes:
				table = cmap_classes[format](format)
			else:
				table = cmap_format_unknown(format)
			table.platformID = platformID
			table.platEncID = platEncID
			table.decompile(data[offset:offset+int(length)], ttFont)
			tables.append(table)

	def compile(self, ttFont):
		"""Return the binary table: header, directory, then subtable data.

		Subtables that compile to identical bytes are stored once and
		shared by several directory entries.
		"""
		self.tables.sort()    # sort according to the spec; see CmapSubtable.__cmp__()
		numSubTables = len(self.tables)
		totalOffset = 4 + 8 * numSubTables  # where the subtable data starts
		directory = [struct.pack(">HH", self.tableVersion, numSubTables)]
		chunks = []
		chunksLength = 0
		done = {}  # remember the data so we can reuse the "pointers"
		for table in self.tables:
			chunk = table.compile(ttFont)
			if chunk in done:
				offset = done[chunk]
			else:
				offset = done[chunk] = totalOffset + chunksLength
				chunks.append(chunk)
				chunksLength = chunksLength + len(chunk)
			directory.append(struct.pack(">HHl", table.platformID, table.platEncID, offset))
		return "".join(directory + chunks)

	def toXML(self, writer, ttFont):
		"""Write the table version, then let every subtable write itself."""
		writer.simpletag("tableVersion", version=self.tableVersion)
		writer.newline()
		for table in self.tables:
			table.toXML(writer, ttFont)

	def fromXML(self, element, ttFont):
		"""Consume one child element: (name, attrs, content).

		"tableVersion" sets self.tableVersion; an element named
		"cmap_format_<suffix>" creates a subtable and appends it to
		self.tables.  Any other element is ignored.
		"""
		name, attrs, content = element
		if name == "tableVersion":
			self.tableVersion = safeEval(attrs["version"])
			return
		if name[:12] != "cmap_format_":
			return
		if not hasattr(self, "tables"):
			self.tables = []
		# Use the whole suffix, not just its first character: this keeps
		# multi-digit format numbers intact and copes with the non-numeric
		# "cmap_format_unknown" tag written for unsupported formats.
		try:
			format = int(name[12:])
		except ValueError:
			format = None
		if format in cmap_classes:
			table = cmap_classes[format](format)
		else:
			table = cmap_format_unknown(format)
		table.platformID = safeEval(attrs["platformID"])
		table.platEncID = safeEval(attrs["platEncID"])
		table.fromXML(element, ttFont)
		self.tables.append(table)
79cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
80cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)
class CmapSubtable:
	"""Base class for cmap subtables.

	Subclasses provide decompile() and compile(); the XML methods here
	fall back to a hex dump of the compiled subtable.
	"""

	def __init__(self, format):
		self.format = format

	def toXML(self, writer, ttFont):
		"""Write the subtable as a hex dump inside a tag named after the class."""
		tagName = self.__class__.__name__
		writer.begintag(tagName, [
				("platformID", self.platformID),
				("platEncID", self.platEncID),
				])
		writer.newline()
		writer.dumphex(self.compile(ttFont))
		writer.endtag(tagName)
		writer.newline()

	def fromXML(self, element, ttFont):
		"""Rebuild the subtable from the hex dump in element's content.

		element is the (name, attrs, content) tuple.
		"""
		name, attrs, content = element
		self.decompile(readHex(content), ttFont)

	def _sortTuple(self):
		# Subtables that never set a version (e.g. the raw fallback for
		# unknown formats) sort as version 0 instead of raising
		# AttributeError.
		return (
				self.platformID,
				self.platEncID,
				getattr(self, "version", 0),
				self.__dict__)

	def __cmp__(self, other):
		# implemented so that list.sort() sorts according to the cmap spec.
		selfTuple = self._sortTuple()
		otherTuple = CmapSubtable._sortTuple(other)
		if selfTuple < otherTuple:
			return -1
		if selfTuple > otherTuple:
			return 1
		return 0
112
113
114class cmap_format_0(CmapSubtable):
115
116	def decompile(self, data, ttFont):
117		format, length, version = struct.unpack(">HHH", data[:6])
118		self.version = int(version)
119		assert len(data) == 262 == length
120		glyphIdArray = array.array("B")
121		glyphIdArray.fromstring(data[6:])
122		self.cmap = cmap = {}
123		for charCode in range(len(glyphIdArray)):
124			cmap[charCode] = ttFont.getGlyphName(glyphIdArray[charCode])
125
126	def compile(self, ttFont):
127		charCodes = self.cmap.keys()
128		charCodes.sort()
129		assert charCodes == range(256)  # charCodes[charCode] == charCode
130		for charCode in charCodes:
131			# reusing the charCodes list!
132			charCodes[charCode] = ttFont.getGlyphID(self.cmap[charCode])
133		glyphIdArray = array.array("B", charCodes)
134		data = struct.pack(">HHH", 0, 262, self.version) + glyphIdArray.tostring()
135		assert len(data) == 262
136		return data
137
138	def toXML(self, writer, ttFont):
139		writer.begintag(self.__class__.__name__, [
140				("platformID", self.platformID),
141				("platEncID", self.platEncID),
142				("version", self.version),
143				])
144		writer.newline()
145		items = self.cmap.items()
146		items.sort()
147		for code, name in items:
148			writer.simpletag("map", code=hex(code), name=name)
149			writer.newline()
150		writer.endtag(self.__class__.__name__)
151		writer.newline()
152
153	def fromXML(self, (name, attrs, content), ttFont):
154		self.version = safeEval(attrs["version"])
155		self.cmap = {}
156		for element in content:
157			if type(element) <> TupleType:
158				continue
159			name, attrs, content = element
160			if name <> "map":
161				continue
162			self.cmap[safeEval(attrs["code"])] = attrs["name"]
163
164
class cmap_format_2(CmapSubtable):
	"""Format 2 subtable, kept as raw bytes; only the version field is read."""

	def decompile(self, data, ttFont):
		"""Store the raw data and pick the version out of the 6-byte header."""
		header = struct.unpack(">HHH", data[:6])
		self.version = int(header[2])
		self.data = data

	def compile(self, ttFont):
		"""Return the bytes exactly as they were given to decompile()."""
		return self.data
174
175
# struct format of the fixed 14-byte header of a format 4 subtable: seven
# big-endian uint16 fields (format, length, version, segCountX2, searchRange,
# entrySelector, rangeShift), as unpacked in cmap_format_4.decompile().
cmap_format_4_format = ">7H"

# The header is followed by these arrays:
#uint16	endCode[segCount]			# Ending character code for each segment, last = 0xFFFF.
#uint16	reservedPad					# This value should be zero
#uint16	startCode[segCount]			# Starting character code for each segment
#uint16	idDelta[segCount]			# Delta for all character codes in segment
#uint16	idRangeOffset[segCount]		# Offset in bytes to glyph indexArray, or 0
#uint16	glyphIndexArray[variable]	# Glyph index array
184
def splitRange(startCode, endCode, cmap):
	"""Decide where to split the contiguous code range startCode..endCode.

	cmap maps every character code in the range to a glyph ID.  Runs of
	codes whose glyph IDs increase by exactly one per code are candidates
	for a segment of their own; a run is only split off when it is long
	enough to pay for the extra segment(s) the split creates.

	Returns (start, end): the list of additional start codes and the list
	of end codes of the resulting sub-ranges.  The first sub-range starts
	at startCode itself, which is not repeated, so end always has exactly
	one more item than start.  When no split is worthwhile the result is
	([], [endCode]).

	Raises KeyError if a code in the range is missing from cmap.
	"""
	if startCode == endCode:
		return [], [endCode]

	lastID = cmap[startCode]
	lastCode = startCode
	inOrder = None
	orderedBegin = None
	parts = []  # (first code, last code) of every consecutive-ID run

	for code in range(startCode + 1, endCode + 1):
		glyphID = cmap[code]

		if glyphID - 1 == lastID:
			if not inOrder:
				# a new run starts at the previous code
				inOrder = 1
				orderedBegin = lastCode
		elif inOrder:
			# the run ended at the previous code
			inOrder = 0
			parts.append((orderedBegin, lastCode))
			orderedBegin = None

		lastID = glyphID
		lastCode = code

	if inOrder:
		parts.append((orderedBegin, lastCode))
	assert lastCode == endCode

	# keep only the runs that are long enough to be worth splitting off
	newParts = []
	for b, e in parts:
		if b == startCode and e == endCode:
			break  # the whole range, we're fine
		if b == startCode or e == endCode:
			threshold = 4  # split costs one more segment
		else:
			threshold = 8  # split costs two more segments
		if (e - b + 1) > threshold:
			newParts.append((b, e))

	if not newParts:
		return [], [endCode]

	# fill the gaps before, between and after the kept runs, so that the
	# sub-ranges cover startCode..endCode without holes
	filled = []
	nextCode = startCode
	for b, e in newParts:
		if b != nextCode:
			filled.append((nextCode, b - 1))
		filled.append((b, e))
		nextCode = e + 1
	if nextCode != endCode + 1:
		filled.append((nextCode, endCode))

	# the first start code is startCode itself and is left out
	start = [b for b, e in filled[1:]]
	end = [e for b, e in filled]

	assert len(start) + 1 == len(end)
	return start, end
252
253
class cmap_format_4(CmapSubtable):
	"""Format 4 subtable: 16-bit character codes described by segments.

	self.cmap maps character codes to glyph names.
	"""

	def decompile(self, data, ttFont):
		"""Parse a binary format 4 subtable into self.version and self.cmap.

		Raises AssertionError when the length field disagrees with len(data).
		"""
		(format, length, self.version, segCountX2,
				searchRange, entrySelector, rangeShift) = \
					struct.unpack(cmap_format_4_format, data[:14])
		assert len(data) == length, "corrupt cmap table (%d, %d)" % (len(data), length)
		segCount = segCountX2 // 2

		allCodes = array.array("H")
		allCodes.fromstring(data[14:])
		if ttLib.endian != "big":
			allCodes.byteswap()

		# divide the data
		endCode = allCodes[:segCount]
		allCodes = allCodes[segCount+1:]  # the +1 is skipping the reservedPad field
		startCode = allCodes[:segCount]
		allCodes = allCodes[segCount:]
		idDelta = allCodes[:segCount]
		allCodes = allCodes[segCount:]
		idRangeOffset = allCodes[:segCount]
		glyphIndexArray = allCodes[segCount:]

		# build 2-byte character mapping
		cmap = {}
		for i in range(len(startCode) - 1):	# don't do 0xffff!
			for charCode in range(startCode[i], endCode[i] + 1):
				rangeOffset = idRangeOffset[i]
				if rangeOffset == 0:
					glyphID = charCode + idDelta[i]
				else:
					# rangeOffset is a byte offset counted from this
					# segment's own idRangeOffset slot; turn it into an
					# index into glyphIndexArray.
					index = rangeOffset // 2 + (charCode - startCode[i]) + i - len(idRangeOffset)
					if glyphIndexArray[index] != 0:  # if not missing glyph
						glyphID = glyphIndexArray[index] + idDelta[i]
					else:
						glyphID = 0  # missing glyph
				cmap[charCode] = ttFont.getGlyphName(glyphID % 0x10000)
		self.cmap = cmap

	def compile(self, ttFont):
		"""Return the binary format 4 subtable for self.cmap.

		An empty self.cmap yields a subtable that contains only the
		closing 0xffff segment.
		"""
		from fontTools.ttLib.sfnt import maxPowerOfTwo

		cmap = {}  # code:glyphID mapping
		for code, glyphName in self.cmap.items():
			cmap[code] = ttFont.getGlyphID(glyphName)
		codes = list(cmap.keys())
		codes.sort()

		# build startCode and endCode lists
		startCode = []
		endCode = []
		if codes:
			lastCode = codes[0]
			startCode.append(lastCode)
			for charCode in codes[1:]:  # skip the first code, it's the first start code
				if charCode == lastCode + 1:
					lastCode = charCode
					continue
				# a gap: close the current range, possibly splitting it
				start, end = splitRange(startCode[-1], lastCode, cmap)
				startCode.extend(start)
				endCode.extend(end)
				startCode.append(charCode)
				lastCode = charCode
			endCode.append(lastCode)
		startCode.append(0xffff)
		endCode.append(0xffff)

		# build up the per-segment idDelta / idRangeOffset values
		idDelta = []
		idRangeOffset = []
		glyphIndexArray = []

		for i in range(len(endCode)-1):  # skip the closing codes (0xffff)
			indices = []
			for charCode in range(startCode[i], endCode[i] + 1):
				indices.append(cmap[charCode])
			if indices == list(range(indices[0], indices[0] + len(indices))):
				# consecutive glyph IDs: a single delta describes the segment
				idDelta.append((indices[0] - startCode[i]) % 0x10000)
				idRangeOffset.append(0)
			else:
				# byte offset from this segment's idRangeOffset slot to its
				# first entry in glyphIndexArray
				idDelta.append(0)
				idRangeOffset.append(2 * (len(endCode) + len(glyphIndexArray) - i))
				glyphIndexArray.extend(indices)
		idDelta.append(1)  # 0xffff + 1 == (tadaa!) 0. So this end code maps to .notdef
		idRangeOffset.append(0)

		# binary search helper fields of the header
		segCount = len(endCode)
		segCountX2 = segCount * 2
		maxExponent = maxPowerOfTwo(segCount)
		searchRange = 2 * (2 ** maxExponent)
		entrySelector = maxExponent
		rangeShift = 2 * segCount - searchRange

		allCodes = array.array("H",
				endCode + [0] + startCode + idDelta + idRangeOffset + glyphIndexArray)
		if ttLib.endian != "big":
			allCodes.byteswap()
		data = allCodes.tostring()
		length = struct.calcsize(cmap_format_4_format) + len(data)
		header = struct.pack(cmap_format_4_format, self.format, length, self.version,
				segCountX2, searchRange, entrySelector, rangeShift)
		return header + data

	def toXML(self, writer, ttFont):
		"""Write one <map> element per code, each followed by a comment from the Unicode table."""
		from fontTools.unicode import Unicode
		codes = list(self.cmap.items())
		codes.sort()
		writer.begintag(self.__class__.__name__, [
				("platformID", self.platformID),
				("platEncID", self.platEncID),
				("version", self.version),
				])
		writer.newline()

		for code, name in codes:
			writer.simpletag("map", code=hex(code), name=name)
			writer.comment(Unicode[code])
			writer.newline()

		writer.endtag(self.__class__.__name__)
		writer.newline()

	def fromXML(self, element, ttFont):
		"""Read the version attribute and the <map> children into self.cmap.

		element is the (name, attrs, content) tuple.
		"""
		name, attrs, content = element
		self.version = safeEval(attrs["version"])
		self.cmap = {}
		for child in content:
			if type(child) != TupleType:
				continue
			childName, childAttrs, childContent = child
			if childName != "map":
				continue
			self.cmap[safeEval(childAttrs["code"])] = childAttrs["name"]
394
395
class cmap_format_6(CmapSubtable):
	"""Format 6 subtable: one contiguous run of codes starting at firstCode.

	self.cmap maps character codes to glyph names.
	"""

	def decompile(self, data, ttFont):
		"""Parse the 10-byte header and the glyph index array into self.cmap."""
		format, length, version, firstCode, entryCount = struct.unpack(
				">HHHHH", data[:10])
		self.version = int(version)
		firstCode = int(firstCode)
		data = data[10:]
		# The remaining data is deliberately not required to be exactly
		# 2 * entryCount bytes long (not true in Apple's Helvetica); only
		# the first entryCount entries are read.
		glyphIndexArray = array.array("H")
		glyphIndexArray.fromstring(data[:2 * int(entryCount)])
		if ttLib.endian != "big":
			glyphIndexArray.byteswap()
		self.cmap = cmap = {}
		for i in range(len(glyphIndexArray)):
			cmap[i+firstCode] = ttFont.getGlyphName(glyphIndexArray[i])

	def compile(self, ttFont):
		"""Return the binary subtable; the codes in self.cmap must be contiguous.

		An empty self.cmap compiles to a header with firstCode 0 and no entries.
		"""
		codes = list(self.cmap.keys())
		codes.sort()
		if codes:
			firstCode = codes[0]
			assert codes == list(range(firstCode, firstCode + len(codes)))
		else:
			firstCode = 0
		glyphIndexArray = array.array("H", [0] * len(codes))
		for code in codes:
			glyphIndexArray[code-firstCode] = ttFont.getGlyphID(self.cmap[code])
		if ttLib.endian != "big":
			glyphIndexArray.byteswap()
		data = glyphIndexArray.tostring()
		header = struct.pack(">HHHHH",
				6, len(data) + 10, self.version, firstCode, len(self.cmap))
		return header + data

	def toXML(self, writer, ttFont):
		"""Write one <map> element per character code, in code order."""
		codes = list(self.cmap.items())
		codes.sort()
		writer.begintag(self.__class__.__name__, [
				("platformID", self.platformID),
				("platEncID", self.platEncID),
				("version", self.version),
				])
		writer.newline()

		for code, name in codes:
			writer.simpletag("map", code=hex(code), name=name)
			writer.newline()

		writer.endtag(self.__class__.__name__)
		writer.newline()

	def fromXML(self, element, ttFont):
		"""Read the version attribute and the <map> children into self.cmap.

		element is the (name, attrs, content) tuple.
		"""
		name, attrs, content = element
		self.version = safeEval(attrs["version"])
		self.cmap = {}
		for child in content:
			if type(child) != TupleType:
				continue
			childName, childAttrs, childContent = child
			if childName != "map":
				continue
			self.cmap[safeEval(childAttrs["code"])] = childAttrs["name"]
459
460
class cmap_format_unknown(CmapSubtable):
	"""Fallback for formats without a dedicated class: the bytes are kept untouched."""

	def decompile(self, data, ttFont):
		"""Remember the raw subtable data; nothing is parsed."""
		self.data = data

	def compile(self, ttFont):
		"""Hand back the raw data stored by decompile()."""
		return self.data
468
469
# Maps a subtable format number to the class that handles it; formats not
# listed here are handled by cmap_format_unknown.
cmap_classes = {
		0: cmap_format_0,
		2: cmap_format_2,
		4: cmap_format_4,
		6: cmap_format_6,
		}
476
477
478