1 // This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2 //
3 // TagSoup is licensed under the Apache License,
4 // Version 2.0.  You may obtain a copy of this license at
5 // http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6 // additional legal rights not granted by this license.
7 //
8 // TagSoup is distributed in the hope that it will be useful, but
9 // unless required by applicable law or agreed to in writing, TagSoup
10 // is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11 // OF ANY KIND, either express or implied; not even the implied warranty
12 // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 //
14 //
15 // The TagSoup command line UI
16 
17 package org.ccil.cowan.tagsoup;
18 import java.util.Hashtable;
19 import java.util.Enumeration;
20 import java.io.*;
21 import java.net.URL;
22 import java.net.URLConnection;
23 import org.xml.sax.*;
24 import org.xml.sax.helpers.DefaultHandler;
25 import org.xml.sax.ext.LexicalHandler;
26 
27 
28 /**
29 The stand-alone TagSoup program.
30 **/
31 public class CommandLine {
32 
33 	static Hashtable options = new Hashtable(); static {
34 		options.put("--nocdata", Boolean.FALSE); // CDATA elements are normal
35 		options.put("--files", Boolean.FALSE);	// process arguments as separate files
36 		options.put("--reuse", Boolean.FALSE);	// reuse a single Parser
37 		options.put("--nons", Boolean.FALSE);	// no namespaces
38 		options.put("--nobogons", Boolean.FALSE);  // suppress unknown elements
39 		options.put("--any", Boolean.FALSE);	// unknowns have ANY content model
40 		options.put("--emptybogons", Boolean.FALSE);	// unknowns have EMPTY content model
41 		options.put("--norootbogons", Boolean.FALSE);	// unknowns can't be the root
42 		options.put("--pyxin", Boolean.FALSE);	// input is PYX
43 		options.put("--lexical", Boolean.FALSE); // output comments
44 		options.put("--pyx", Boolean.FALSE);	// output is PYX
45 		options.put("--html", Boolean.FALSE);	// output is HTML
46 		options.put("--method=", Boolean.FALSE); // output method
47 		options.put("--doctype-public=", Boolean.FALSE); // override public id
48 		options.put("--doctype-system=", Boolean.FALSE); // override system id
49 		options.put("--output-encoding=", Boolean.FALSE); // output encoding
50 		options.put("--omit-xml-declaration", Boolean.FALSE); // omit XML decl
51 		options.put("--encoding=", Boolean.FALSE); // specify encoding
52 		options.put("--help", Boolean.FALSE); 	// display help
53 		options.put("--version", Boolean.FALSE);	// display version
54 		options.put("--nodefaults", Boolean.FALSE); // no default attrs
55 		options.put("--nocolons", Boolean.FALSE); // colon to underscore
56 		options.put("--norestart", Boolean.FALSE); // no restartable elements
57 		options.put("--ignorable", Boolean.FALSE);  // return ignorable whitespace
58 		}
59 
60 	/**
61 	Main method.  Processes specified files or standard input.
62 	**/
63 
main(String[] argv)64 	public static void main(String[] argv) throws IOException, SAXException {
65 		int optind = getopts(options, argv);
66 		if (hasOption(options, "--help")) {
67 			doHelp();
68 			return;
69 			}
70 		if (hasOption(options, "--version")) {
71 			System.err.println("TagSoup version 1.2");
72 			return;
73 			}
74 		if (argv.length == optind) {
75 			process("", System.out);
76 			}
77 		else if (hasOption(options, "--files")) {
78 			for (int i = optind; i < argv.length; i++) {
79 				String src = argv[i];
80 				String dst;
81 				int j = src.lastIndexOf('.');
82 				if (j == -1)
83 					dst = src + ".xhtml";
84 				else if (src.endsWith(".xhtml"))
85 					dst = src + "_";
86 				else
87 					dst = src.substring(0, j) + ".xhtml";
88 				System.err.println("src: " + src + " dst: " + dst);
89 				OutputStream os = new FileOutputStream(dst);
90 				process(src, os);
91 				}
92 			}
93 		else {
94 			for (int i = optind; i < argv.length; i++) {
95 				System.err.println("src: " + argv[i]);
96 				process(argv[i], System.out);
97 				}
98 			}
99 		}
100 
101 	// Print the help message
102 
doHelp()103 	private static void doHelp() {
104 		System.err.print("usage: java -jar tagsoup-*.jar ");
105 		System.err.print(" [ ");
106 		boolean first = true;
107 		for (Enumeration e = options.keys(); e.hasMoreElements(); ) {
108 			if (!first) {
109 				System.err.print("| ");
110 				}
111 			first = false;
112 			String key = (String)(e.nextElement());
113 			System.err.print(key);
114 			if (key.endsWith("="))
115 				System.err.print("?");
116 				System.err.print(" ");
117 			}
118 		System.err.println("]*");
119 	}
120 
121 	private static Parser theParser = null;
122 	private static HTMLSchema theSchema = null;
123 	private static String theOutputEncoding = null;
124 
125 	// Process one source onto an output stream.
126 
process(String src, OutputStream os)127 	private static void process(String src, OutputStream os)
128 			throws IOException, SAXException {
129 		XMLReader r;
130 		if (hasOption(options, "--reuse")) {
131 			if (theParser == null) theParser = new Parser();
132 			r = theParser;
133 			}
134 		else {
135 			r = new Parser();
136 			}
137 		theSchema = new HTMLSchema();
138 		r.setProperty(Parser.schemaProperty, theSchema);
139 
140 		if (hasOption(options, "--nocdata")) {
141 			r.setFeature(Parser.CDATAElementsFeature, false);
142 			}
143 
144 		if (hasOption(options, "--nons") || hasOption(options, "--html")) {
145 			r.setFeature(Parser.namespacesFeature, false);
146 			}
147 
148 		if (hasOption(options, "--nobogons")) {
149 			r.setFeature(Parser.ignoreBogonsFeature, true);
150 			}
151 
152 		if (hasOption(options, "--any")) {
153 			r.setFeature(Parser.bogonsEmptyFeature, false);
154 			}
155 		else if (hasOption(options, "--emptybogons")) {
156 			r.setFeature(Parser.bogonsEmptyFeature, true);
157 			}
158 
159 		if (hasOption(options, "--norootbogons")) {
160 			r.setFeature(Parser.rootBogonsFeature, false);
161 			}
162 
163 		if (hasOption(options, "--nodefaults")) {
164 			r.setFeature(Parser.defaultAttributesFeature, false);
165 			}
166 		if (hasOption(options, "--nocolons")) {
167 			r.setFeature(Parser.translateColonsFeature, true);
168 			}
169 
170 		if (hasOption(options, "--norestart")) {
171 			r.setFeature(Parser.restartElementsFeature, false);
172 			}
173 
174 		if (hasOption(options, "--ignorable")) {
175 			r.setFeature(Parser.ignorableWhitespaceFeature, true);
176 			}
177 
178 		if (hasOption(options, "--pyxin")) {
179 			r.setProperty(Parser.scannerProperty, new PYXScanner());
180 			}
181 
182 		Writer w;
183 		if (theOutputEncoding == null) {
184 			w = new OutputStreamWriter(os);
185 			}
186 		else {
187 			w = new OutputStreamWriter(os, theOutputEncoding);
188 			}
189 		ContentHandler h = chooseContentHandler(w);
190 		r.setContentHandler(h);
191 		if (hasOption(options, "--lexical") && h instanceof LexicalHandler) {
192 			r.setProperty(Parser.lexicalHandlerProperty, h);
193 			}
194 		InputSource s = new InputSource();
195 		if (src != "") {
196 			s.setSystemId(src);
197 			}
198 		else {
199 			s.setByteStream(System.in);
200 			}
201 		if (hasOption(options, "--encoding=")) {
202 //			System.out.println("%% Found --encoding");
203 			String encoding = (String)options.get("--encoding=");
204 			if (encoding != null) s.setEncoding(encoding);
205 			}
206 		r.parse(s);
207 		}
208 
209 	// Pick a content handler to generate the desired format.
210 
chooseContentHandler(Writer w)211 	private static ContentHandler chooseContentHandler(Writer w) {
212 		XMLWriter x;
213 		if (hasOption(options, "--pyx")) {
214 			return new PYXWriter(w);
215 			}
216 
217 		x = new XMLWriter(w);
218 		if (hasOption(options, "--html")) {
219 			x.setOutputProperty(XMLWriter.METHOD, "html");
220 			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
221 			}
222 		if (hasOption(options, "--method=")) {
223 			String method = (String)options.get("--method=");
224 			if (method != null) {
225 				x.setOutputProperty(XMLWriter.METHOD, method);
226 				}
227 			}
228 		if (hasOption(options, "--doctype-public=")) {
229 			String doctype_public = (String)options.get("--doctype-public=");
230 			if (doctype_public != null) {
231 				x.setOutputProperty(XMLWriter.DOCTYPE_PUBLIC, doctype_public);
232 				}
233 			}
234 		if (hasOption(options, "--doctype-system=")) {
235 			String doctype_system = (String)options.get("--doctype-system=");
236 			if (doctype_system != null) {
237 				x.setOutputProperty(XMLWriter.DOCTYPE_SYSTEM, doctype_system);
238 				}
239 			}
240 		if (hasOption(options, "--output-encoding=")) {
241 			theOutputEncoding = (String)options.get("--output-encoding=");
242 //			System.err.println("%%%% Output encoding is " + theOutputEncoding);
243 			if (theOutputEncoding != null) {
244 				x.setOutputProperty(XMLWriter.ENCODING, theOutputEncoding);
245 				}
246 			}
247 		if (hasOption(options, "--omit-xml-declaration")) {
248 			x.setOutputProperty(XMLWriter.OMIT_XML_DECLARATION, "yes");
249 			}
250 		x.setPrefix(theSchema.getURI(), "");
251 		return x;
252 		}
253 
254 	// Options processing
255 
getopts(Hashtable options, String[] argv)256 	private static int getopts(Hashtable options, String[] argv) {
257 		int optind;
258 		for (optind = 0; optind < argv.length; optind++) {
259 			String arg = argv[optind];
260 			String value = null;
261 			if (arg.charAt(0) != '-') break;
262 			int eqsign = arg.indexOf('=');
263 			if (eqsign != -1) {
264 				value = arg.substring(eqsign + 1, arg.length());
265 				arg = arg.substring(0, eqsign + 1);
266 				}
267 			if (options.containsKey(arg)) {
268 				if (value == null) options.put(arg, Boolean.TRUE);
269 				else options.put(arg, value);
270 //				System.out.println("%% Parsed [" + arg + "]=[" + value + "]");
271 				}
272 			else {
273 				System.err.print("Unknown option ");
274 				System.err.println(arg);
275 				System.exit(1);
276 				}
277 			}
278 		return optind;
279 		}
280 
281 	// Return true if an option exists.
282 
hasOption(Hashtable options, String option)283 	private static boolean hasOption(Hashtable options, String option) {
284 		if (Boolean.getBoolean(option)) return true;
285 		else if (options.get(option) != Boolean.FALSE) return true;
286 		return false;
287 		}
288 
289 	}
290