1// =================================================================================================
2// ADOBE SYSTEMS INCORPORATED
3// Copyright 2006 Adobe Systems Incorporated
4// All Rights Reserved
5//
6// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
7// of the Adobe license agreement accompanying it.
8// =================================================================================================
9
10package com.adobe.xmp.impl;
11
12import java.io.IOException;
13import java.io.InputStream;
14import java.io.InputStreamReader;
15import java.io.Reader;
16import java.io.StringReader;
17import java.io.UnsupportedEncodingException;
18
19import javax.xml.XMLConstants;
20import javax.xml.parsers.DocumentBuilder;
21import javax.xml.parsers.DocumentBuilderFactory;
22import javax.xml.parsers.ParserConfigurationException;
23
24import org.w3c.dom.Document;
25import org.w3c.dom.Node;
26import org.w3c.dom.NodeList;
27import org.w3c.dom.ProcessingInstruction;
28import org.xml.sax.InputSource;
29import org.xml.sax.SAXException;
30
31import com.adobe.xmp.XMPConst;
32import com.adobe.xmp.XMPError;
33import com.adobe.xmp.XMPException;
34import com.adobe.xmp.XMPMeta;
35import com.adobe.xmp.options.ParseOptions;
36
37
38/**
39 * This class replaces the <code>ExpatAdapter.cpp</code> and does the
40 * XML-parsing and fixes the prefix. After the parsing several normalisations
41 * are applied to the XMPTree.
42 *
43 * @since 01.02.2006
44 */
45public class XMPMetaParser
46{
47	/**  */
48	private static final Object XMP_RDF = new Object();
49	/** the DOM Parser Factory, options are set */
50	private static DocumentBuilderFactory factory = createDocumentBuilderFactory();
51
52	/**
53	 * Hidden constructor, initialises the SAX parser handler.
54	 */
55	private XMPMetaParser()
56	{
57		// EMPTY
58	}
59
60
61
62	/**
63	 * Parses the input source into an XMP metadata object, including
64	 * de-aliasing and normalisation.
65	 *
66	 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
67	 * 			a byte buffer containing the XMP packet.
68	 * @param options the parse options
69	 * @return Returns the resulting XMP metadata object
70	 * @throws XMPException Thrown if parsing or normalisation fails.
71	 */
72	public static XMPMeta parse(Object input, ParseOptions options) throws XMPException
73	{
74		ParameterAsserts.assertNotNull(input);
75		options = options != null ? options : new ParseOptions();
76
77		Document document = parseXml(input, options);
78
79		boolean xmpmetaRequired = options.getRequireXMPMeta();
80		Object[] result = new Object[3];
81		result = findRootNode(document, xmpmetaRequired, result);
82
83		if (result != null  &&  result[1] == XMP_RDF)
84		{
85			XMPMetaImpl xmp = ParseRDF.parse((Node) result[0]);
86			xmp.setPacketHeader((String) result[2]);
87
88			// Check if the XMP object shall be normalized
89			if (!options.getOmitNormalization())
90			{
91				return XMPNormalizer.process(xmp, options);
92			}
93			else
94			{
95				return xmp;
96			}
97		}
98		else
99		{
100			// no appropriate root node found, return empty metadata object
101			return new XMPMetaImpl();
102		}
103	}
104
105
106	/**
107	 * Parses the raw XML metadata packet considering the parsing options.
108	 * Latin-1/ISO-8859-1 can be accepted when the input is a byte stream
109	 * (some old toolkits versions such packets). The stream is
110	 * then wrapped in another stream that converts Latin-1 to UTF-8.
111	 * <p>
112	 * If control characters shall be fixed, a reader is used that fixes the chars to spaces
113	 * (if the input is a byte stream is has to be read as character stream).
114	 * <p>
115	 * Both options reduce the performance of the parser.
116	 *
117	 * @param input the input can be an <code>InputStream</code>, a <code>String</code> or
118	 * 			a byte buffer containing the XMP packet.
119	 * @param options the parsing options
120	 * @return Returns the parsed XML document or an exception.
121	 * @throws XMPException Thrown if the parsing fails for different reasons
122	 */
123	private static Document parseXml(Object input, ParseOptions options)
124			throws XMPException
125	{
126		if (input instanceof InputStream)
127		{
128			return parseXmlFromInputStream((InputStream) input, options);
129		}
130		else if (input instanceof byte[])
131		{
132			return parseXmlFromBytebuffer(new ByteBuffer((byte[]) input), options);
133		}
134		else
135		{
136			return parseXmlFromString((String) input, options);
137		}
138	}
139
140
141	/**
142	 * Parses XML from an {@link InputStream},
143	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
144	 *
145	 * @param stream an <code>InputStream</code>
146	 * @param options the parsing options
147	 * @return Returns an XML DOM-Document.
148	 * @throws XMPException Thrown when the parsing fails.
149	 */
150	private static Document parseXmlFromInputStream(InputStream stream, ParseOptions options)
151			throws XMPException
152	{
153		if (!options.getAcceptLatin1()  &&  !options.getFixControlChars())
154		{
155			return parseInputSource(new InputSource(stream));
156		}
157		else
158		{
159			// load stream into bytebuffer
160			try
161			{
162				ByteBuffer buffer = new ByteBuffer(stream);
163				return parseXmlFromBytebuffer(buffer, options);
164			}
165			catch (IOException e)
166			{
167				throw new XMPException("Error reading the XML-file",
168						XMPError.BADSTREAM, e);
169			}
170		}
171	}
172
173
174	/**
175	 * Parses XML from a byte buffer,
176	 * fixing the encoding (Latin-1 to UTF-8) and illegal control character optionally.
177	 *
178	 * @param buffer a byte buffer containing the XMP packet
179	 * @param options the parsing options
180	 * @return Returns an XML DOM-Document.
181	 * @throws XMPException Thrown when the parsing fails.
182	 */
183	private static Document parseXmlFromBytebuffer(ByteBuffer buffer, ParseOptions options)
184		throws XMPException
185	{
186		InputSource source = new InputSource(buffer.getByteStream());
187		try
188		{
189			return parseInputSource(source);
190		}
191		catch (XMPException e)
192		{
193			if (e.getErrorCode() == XMPError.BADXML  ||
194				e.getErrorCode() == XMPError.BADSTREAM)
195			{
196				if (options.getAcceptLatin1())
197				{
198					buffer = Latin1Converter.convert(buffer);
199				}
200
201				if (options.getFixControlChars())
202				{
203					try
204					{
205						String encoding = buffer.getEncoding();
206						Reader fixReader = new FixASCIIControlsReader(
207							new InputStreamReader(
208								buffer.getByteStream(), encoding));
209						return parseInputSource(new InputSource(fixReader));
210					}
211					catch (UnsupportedEncodingException e1)
212					{
213						// can normally not happen as the encoding is provided by a util function
214						throw new XMPException("Unsupported Encoding",
215								XMPError.INTERNALFAILURE, e);
216					}
217				}
218				source = new InputSource(buffer.getByteStream());
219				return parseInputSource(source);
220			}
221			else
222			{
223				throw e;
224			}
225		}
226	}
227
228
229	/**
230	 * Parses XML from a {@link String},
231	 * fixing the illegal control character optionally.
232	 *
233	 * @param input a <code>String</code> containing the XMP packet
234	 * @param options the parsing options
235	 * @return Returns an XML DOM-Document.
236	 * @throws XMPException Thrown when the parsing fails.
237	 */
238	private static Document parseXmlFromString(String input, ParseOptions options)
239			throws XMPException
240	{
241		InputSource source = new InputSource(new StringReader(input));
242		try
243		{
244			return parseInputSource(source);
245		}
246		catch (XMPException e)
247		{
248			if (e.getErrorCode() == XMPError.BADXML  &&  options.getFixControlChars())
249			{
250				source = new InputSource(new FixASCIIControlsReader(new StringReader(input)));
251				return parseInputSource(source);
252			}
253			else
254			{
255				throw e;
256			}
257		}
258	}
259
260
261	/**
262	 * Runs the XML-Parser.
263	 * @param source an <code>InputSource</code>
264	 * @return Returns an XML DOM-Document.
265	 * @throws XMPException Wraps parsing and I/O-exceptions into an XMPException.
266	 */
267	private static Document parseInputSource(InputSource source) throws XMPException
268	{
269		try
270		{
271			DocumentBuilder builder = factory.newDocumentBuilder();
272			builder.setErrorHandler(null);
273			return builder.parse(source);
274		}
275		catch (SAXException e)
276		{
277			throw new XMPException("XML parsing failure", XMPError.BADXML, e);
278		}
279		catch (ParserConfigurationException e)
280		{
281			throw new XMPException("XML Parser not correctly configured",
282					XMPError.UNKNOWN, e);
283		}
284		catch (IOException e)
285		{
286			throw new XMPException("Error reading the XML-file", XMPError.BADSTREAM, e);
287		}
288	}
289
290
291	/**
292	 * Find the XML node that is the root of the XMP data tree. Generally this
293	 * will be an outer node, but it could be anywhere if a general XML document
294	 * is parsed (e.g. SVG). The XML parser counted all rdf:RDF and
295	 * pxmp:XMP_Packet nodes, and kept a pointer to the last one. If there is
296	 * more than one possible root use PickBestRoot to choose among them.
297	 * <p>
298	 * If there is a root node, try to extract the version of the previous XMP
299	 * toolkit.
300	 * <p>
301	 * Pick the first x:xmpmeta among multiple root candidates. If there aren't
302	 * any, pick the first bare rdf:RDF if that is allowed. The returned root is
303	 * the rdf:RDF child if an x:xmpmeta element was chosen. The search is
304	 * breadth first, so a higher level candiate is chosen over a lower level
305	 * one that was textually earlier in the serialized XML.
306	 *
307	 * @param root the root of the xml document
308	 * @param xmpmetaRequired flag if the xmpmeta-tag is still required, might be set
309	 * 		initially to <code>true</code>, if the parse option "REQUIRE_XMP_META" is set
310	 * @param result The result array that is filled during the recursive process.
311	 * @return Returns an array that contains the result or <code>null</code>.
312	 * 		   The array contains:
313	 * <ol>
314	 * 		<li>the rdf:RDF-node
315	 * 		<li>an object that is either XMP_RDF or XMP_PLAIN
316	 * 		<li>a flag that is true if a <?xpacket..> processing instruction has been found
317	 * 		<li>the body text of the xpacket-instruction.
318	 * </ol>
319	 *
320	 */
321	private static Object[] findRootNode(Node root, boolean xmpmetaRequired, Object[] result)
322	{
323		// Look among this parent's content for x:xapmeta or x:xmpmeta.
324		// The recursion for x:xmpmeta is broader than the strictly defined choice,
325		// but gives us smaller code.
326		NodeList children = root.getChildNodes();
327		for (int i = 0; i < children.getLength(); i++)
328		{
329			root = children.item(i);
330			if (Node.PROCESSING_INSTRUCTION_NODE == root.getNodeType()  &&
331				((ProcessingInstruction) root).getTarget() == XMPConst.XMP_PI)
332			{
333				// Store the processing instructions content
334				if (result != null)
335				{
336					result[2] = ((ProcessingInstruction) root).getData();
337				}
338			}
339			else if (Node.TEXT_NODE != root.getNodeType()  &&
340				Node.PROCESSING_INSTRUCTION_NODE != root.getNodeType())
341			{
342				String rootNS = root.getNamespaceURI();
343				String rootLocal = root.getLocalName();
344				if (
345						(
346							XMPConst.TAG_XMPMETA.equals(rootLocal)  ||
347							XMPConst.TAG_XAPMETA.equals(rootLocal)
348						)  &&
349						XMPConst.NS_X.equals(rootNS)
350				   )
351				{
352					// by not passing the RequireXMPMeta-option, the rdf-Node will be valid
353					return findRootNode(root, false, result);
354				}
355				else if (!xmpmetaRequired  &&
356						"RDF".equals(rootLocal)  &&
357						 XMPConst.NS_RDF.equals(rootNS))
358				{
359					if (result != null)
360					{
361						result[0] = root;
362						result[1] = XMP_RDF;
363					}
364					return result;
365				}
366				else
367				{
368					// continue searching
369					Object[] newResult = findRootNode(root, xmpmetaRequired, result);
370					if (newResult != null)
371					{
372						return newResult;
373					}
374					else
375					{
376						continue;
377					}
378				}
379			}
380		}
381
382		// no appropriate node has been found
383		return null;
384		//     is extracted here in the C++ Toolkit
385	}
386
387
388	/**
389	 * @return Creates, configures and returnes the document builder factory for
390	 *         the Metadata Parser.
391	 */
392	private static DocumentBuilderFactory createDocumentBuilderFactory()
393	{
394		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
395		factory.setNamespaceAware(true);
396		factory.setIgnoringComments(true);
397
398		try
399		{
400			// honor System parsing limits, e.g.
401			// System.setProperty("entityExpansionLimit", "10");
402			factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
403		}
404		catch (Exception e)
405		{
406			// Ignore IllegalArgumentException and ParserConfigurationException
407			// in case the configured XML-Parser does not implement the feature.
408		}
409		return factory;
410	}
411}