1// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2//
3// TagSoup is licensed under the Apache License,
4// Version 2.0.  You may obtain a copy of this license at
5// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6// additional legal rights not granted by this license.
7//
8// TagSoup is distributed in the hope that it will be useful, but
9// unless required by applicable law or agreed to in writing, TagSoup
10// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11// OF ANY KIND, either express or implied; not even the implied warranty
12// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13//
14//
15// The TagSoup command line UI
16
17package org.ccil.cowan.tagsoup;
18import java.util.Hashtable;
19import java.util.Enumeration;
20import java.io.*;
21import java.net.URL;
22import java.net.URLConnection;
23import org.xml.sax.*;
24import org.xml.sax.helpers.DefaultHandler;
25import org.xml.sax.ext.LexicalHandler;
26
27
28/**
29The stand-alone TagSoup program.
30**/
31public class CommandLine {
32
33	static Hashtable options = new Hashtable(); static {
34		options.put("--nocdata", Boolean.FALSE); // CDATA elements are normal
35		options.put("--files", Boolean.FALSE);	// process arguments as separate files
36		options.put("--reuse", Boolean.FALSE);	// reuse a single Parser
37		options.put("--nons", Boolean.FALSE);	// no namespaces
38		options.put("--nobogons", Boolean.FALSE);  // suppress unknown elements
39		options.put("--any", Boolean.FALSE);	// unknowns have ANY content model
40		options.put("--emptybogons", Boolean.FALSE);	// unknowns have EMPTY content model
41		options.put("--norootbogons", Boolean.FALSE);	// unknowns can't be the root
42		options.put("--pyxin", Boolean.FALSE);	// input is PYX
43		options.put("--lexical", Boolean.FALSE); // output comments
44		options.put("--pyx", Boolean.FALSE);	// output is PYX
45		options.put("--html", Boolean.FALSE);	// output is HTML
46		options.put("--method=", Boolean.FALSE); // output method
47		options.put("--doctype-public=", Boolean.FALSE); // override public id
48		options.put("--doctype-system=", Boolean.FALSE); // override system id
49		options.put("--output-encoding=", Boolean.FALSE); // output encoding
50		options.put("--omit-xml-declaration", Boolean.FALSE); // omit XML decl
51		options.put("--encoding=", Boolean.FALSE); // specify encoding
52		options.put("--help", Boolean.FALSE); 	// display help
53		options.put("--version", Boolean.FALSE);	// display version
54		options.put("--nodefaults", Boolean.FALSE); // no default attrs
55		options.put("--nocolons", Boolean.FALSE); // colon to underscore
56		options.put("--norestart", Boolean.FALSE); // no restartable elements
57		options.put("--ignorable", Boolean.FALSE);  // return ignorable whitespace
58		}
59
60	/**
61	Main method.  Processes specified files or standard input.
62	**/
63
64	public static void main(String[] argv) throws IOException, SAXException {
65		int optind = getopts(options, argv);
66		if (hasOption(options, "--help")) {
67			doHelp();
68			return;
69			}
70		if (hasOption(options, "--version")) {
71			System.err.println("TagSoup version 1.2");
72			return;
73			}
74		if (argv.length == optind) {
75			process("", System.out);
76			}
77		else if (hasOption(options, "--files")) {
78			for (int i = optind; i < argv.length; i++) {
79				String src = argv[i];
80				String dst;
81				int j = src.lastIndexOf('.');
82				if (j == -1)
83					dst = src + ".xhtml";
84				else if (src.endsWith(".xhtml"))
85					dst = src + "_";
86				else
87					dst = src.substring(0, j) + ".xhtml";
88				System.err.println("src: " + src + " dst: " + dst);
89				OutputStream os = new FileOutputStream(dst);
90				process(src, os);
91				}
92			}
93		else {
94			for (int i = optind; i < argv.length; i++) {
95				System.err.println("src: " + argv[i]);
96				process(argv[i], System.out);
97				}
98			}
99		}
100
101	// Print the help message
102
103	private static void doHelp() {
104		System.err.print("usage: java -jar tagsoup-*.jar ");
105		System.err.print(" [ ");
106		boolean first = true;
107		for (Enumeration e = options.keys(); e.hasMoreElements(); ) {
108			if (!first) {
109				System.err.print("| ");
110				}
111			first = false;
112			String key = (String)(e.nextElement());
113			System.err.print(key);
114			if (key.endsWith("="))
115				System.err.print("?");
116				System.err.print(" ");
117			}
118		System.err.println("]*");
119	}
120
121	private static Parser theParser = null;
122	private static HTMLSchema theSchema = null;
123	private static String theOutputEncoding = null;
124
125	// Process one source onto an output stream.
126
127	private static void process(String src, OutputStream os)
128			throws IOException, SAXException {
129		XMLReader r;
130		if (hasOption(options, "--reuse")) {
131			if (theParser == null) theParser = new Parser();
132			r = theParser;
133			}
134		else {
135			r = new Parser();
136			}
137		theSchema = new HTMLSchema();
138		r.setProperty(Parser.schemaProperty, theSchema);
139
140		if (hasOption(options, "--nocdata")) {
141			r.setFeature(Parser.CDATAElementsFeature, false);
142			}
143
144		if (hasOption(options, "--nons") || hasOption(options, "--html")) {
145			r.setFeature(Parser.namespacesFeature, false);
146			}
147
148		if (hasOption(options, "--nobogons")) {
149			r.setFeature(Parser.ignoreBogonsFeature, true);
150			}
151
152		if (hasOption(options, "--any")) {
153			r.setFeature(Parser.bogonsEmptyFeature, false);
154			}
155		else if (hasOption(options, "--emptybogons")) {
156			r.setFeature(Parser.bogonsEmptyFeature, true);
157			}
158
159		if (hasOption(options, "--norootbogons")) {
160			r.setFeature(Parser.rootBogonsFeature, false);
161			}
162
163		if (hasOption(options, "--nodefaults")) {
164			r.setFeature(Parser.defaultAttributesFeature, false);
165			}
166		if (hasOption(options, "--nocolons")) {
167			r.setFeature(Parser.translateColonsFeature, true);
168			}
169
170		if (hasOption(options, "--norestart")) {
171			r.setFeature(Parser.restartElementsFeature, false);
172			}
173
174		if (hasOption(options, "--ignorable")) {
175			r.setFeature(Parser.ignorableWhitespaceFeature, true);
176			}
177
178		if (hasOption(options, "--pyxin")) {
179			r.setProperty(Parser.scannerProperty, new PYXScanner());
180			}
181
182		Writer w;
183		if (theOutputEncoding == null) {
184			w = new OutputStreamWriter(os);
185			}
186		else {
187			w = new OutputStreamWriter(os, theOutputEncoding);
188			}
189		ContentHandler h = chooseContentHandler(w);
190		r.setContentHandler(h);
191		if (hasOption(options, "--lexical") && h instanceof LexicalHandler) {
192			r.setProperty(Parser.lexicalHandlerProperty, h);
193			}
194		InputSource s = new InputSource();
195		if (src != "") {
196			s.setSystemId(src);
197			}
198		else {
199			s.setByteStream(System.in);
200			}
201		if (hasOption(options, "--encoding=")) {
202//			System.out.println("%% Found --encoding");
203			String encoding = (String)options.get("--encoding=");
204			if (encoding != null) s.setEncoding(encoding);
205			}
206		r.parse(s);
207		}
208
209	// Pick a content handler to generate the desired format.
210
211	private static ContentHandler chooseContentHandler(Writer w) {
212		XMLWriter x;
213		if (hasOption(options, "--pyx")) {
214			return new PYXWriter(w);
215			}
216
217		x = new XMLWriter(w);
218		if (hasOption(options, "--html")) {
219			x.setOutputProperty(XMLWriter.METHOD, "html");
220			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
221			}
222		if (hasOption(options, "--method=")) {
223			String method = (String)options.get("--method=");
224			if (method != null) {
225				x.setOutputProperty(XMLWriter.METHOD, method);
226				}
227			}
228		if (hasOption(options, "--doctype-public=")) {
229			String doctype_public = (String)options.get("--doctype-public=");
230			if (doctype_public != null) {
231				x.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC, doctype_public);
232				}
233			}
234		if (hasOption(options, "--doctype-system=")) {
235			String doctype_system = (String)options.get("--doctype-system=");
236			if (doctype_system != null) {
237				x.setOutputProperty(XMLWriter.DOCTYPE_SYSTEM, doctype_system);
238				}
239			}
240		if (hasOption(options, "--output-encoding=")) {
241			theOutputEncoding = (String)options.get("--output-encoding=");
242//			System.err.println("%%%% Output encoding is " + theOutputEncoding);
243			if (theOutputEncoding != null) {
244				x.setOutputProperty(XMLWriter.ENCODING, theOutputEncoding);
245				}
246			}
247		if (hasOption(options, "--omit-xml-declaration")) {
248			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
249			}
250		x.setPrefix(theSchema.getURI(), "");
251		return x;
252		}
253
254	// Options processing
255
256	private static int getopts(Hashtable options, String[] argv) {
257		int optind;
258		for (optind = 0; optind < argv.length; optind++) {
259			String arg = argv[optind];
260			String value = null;
261			if (arg.charAt(0) != '-') break;
262			int eqsign = arg.indexOf('=');
263			if (eqsign != -1) {
264				value = arg.substring(eqsign + 1, arg.length());
265				arg = arg.substring(0, eqsign + 1);
266				}
267			if (options.containsKey(arg)) {
268				if (value == null) options.put(arg, Boolean.TRUE);
269				else options.put(arg, value);
270//				System.out.println("%% Parsed [" + arg + "]=[" + value + "]");
271				}
272			else {
273				System.err.print("Unknown option ");
274				System.err.println(arg);
275				System.exit(1);
276				}
277			}
278		return optind;
279		}
280
281	// Return true if an option exists.
282
283	private static boolean hasOption(Hashtable options, String option) {
284		if (Boolean.getBoolean(option)) return true;
285		else if (options.get(option) != Boolean.FALSE) return true;
286		return false;
287		}
288
289	}
290