1 /*
2  ******************************************************************************
3  * Copyright (C) 2003-2014, International Business Machines Corporation and   *
4  * others. All Rights Reserved.                                               *
5  ******************************************************************************
6  */
7 /**
8  * @author Ram Viswanadha
9  *
 * This tool validates XML files against their DTD or, with --parseonly, only checks
 * that they are well-formed XML. IE 6 does not do a good job of this.
11  */
12 package org.unicode.cldr.util;
13 
14 import java.io.BufferedReader;
15 import java.io.File;
16 import java.io.FileInputStream;
17 import java.io.FileReader;
18 import java.io.FilenameFilter;
19 import java.io.IOException;
20 
21 import javax.xml.parsers.DocumentBuilder;
22 import javax.xml.parsers.DocumentBuilderFactory;
23 
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26 import org.w3c.dom.Text;
27 import org.xml.sax.ErrorHandler;
28 import org.xml.sax.InputSource;
29 import org.xml.sax.SAXException;
30 import org.xml.sax.SAXParseException;
31 
32 @CLDRTool(alias = "validate", description = "Check XML files for validity")
33 public class XMLValidator {
34     public static boolean quiet = false;
35     public static boolean parseonly = false;
36 
main(String[] args)37     public static void main(String[] args) throws IOException {
38         if (args.length == 0) {
39             System.out.println("No files specified. Validation failed. Use --help for help.");
40             return;
41         }
42         for (int i = 0; i < args.length; i++) {
43             if (args[i].equals("-q") || args[i].equals("--quiet")) {
44                 quiet = true;
45             } else if (args[i].equals("--help")) {
46                 usage();
47                 return;
48             } else if (args[i].equals("--parseonly")) {
49                 System.err.println("# DTD Validation is disabled. Will only check for well formed XML.");
50                 parseonly = true;
51             } else {
52                 File f = new File(args[i]);
53                 if (f.isDirectory()) {
54                     parseDirectory(f);
55                 } else {
56                     if (!quiet) System.out.println("Processing file " + args[i]);
57                     new fileParserThread(args[i]).run();
58                 }
59             }
60         }
61         if (parseonly) {
62             System.err.println("# DTD Validation is disabled. Only checked for well formed XML.");
63         }
64     }
65 
parseDirectory(File f)66     private static void parseDirectory(File f) throws IOException {
67         // System.err.println("Parsing directory " + f.getAbsolutePath());
68         for (File s : f.listFiles(new FilenameFilter() {
69             @Override
70             public boolean accept(File arg0, String arg1) {
71                 if (arg1.startsWith(".")) {
72                     return false; // skip .git, .svn, ...
73                 }
74                 File n = new File(arg0, arg1);
75                 // System.err.println("Considering " + n.getAbsolutePath() );
76                 if (n.isDirectory()) {
77                     try {
78                         parseDirectory(n);
79                     } catch (IOException e) {
80                         // TODO Auto-generated catch block
81                         e.printStackTrace();
82                         System.err.println("Error " + e.toString() + " parsing " + arg0.getPath());
83                     }
84                     return false;
85                 } else if (arg1.endsWith(".xml")) {
86                     return true;
87                 } else {
88                     return false;
89                 }
90             }
91         })) {
92             if (!quiet) System.out.println("Processing file " + s.getPath());
93             new fileParserThread(s.getCanonicalPath()).run();
94         }
95     }
96 
usage()97     private static void usage() {
98         System.err.println("usage:  " + XMLValidator.class.getName() + " [ -q ] [ --help ] [ --parseonly ] file ...");
99         System.err.println("usage:  " + XMLValidator.class.getName()
100             + " [ -q ] [ --help ] [ --parseonly ] directory ...");
101     }
102 
103     /**
104      * Utility method to translate a String filename to URL.
105      *
106      * Note: This method is not necessarily proven to get the correct URL for
107      * every possible kind of filename; it should be improved. It handles the
108      * most common cases that we've encountered when running Conformance tests
109      * on Xalan. Also note, this method does not handle other non-file: flavors
110      * of URLs at all.
111      *
112      * If the name is null, return null. If the name starts with a common URI
113      * scheme (namely the ones found in the examples of RFC2396), then simply
114      * return the name as-is (the assumption is that it's already a URL)
115      * Otherwise we attempt (cheaply) to convert to a file:/// URL.
116      *
117      * @param filename
118      *            a local path/filename of a file
119      * @return a file:/// URL, the same string if it appears to already be a
120      *         URL, or null if error
121      */
filenameToURL(String filename)122     public static String filenameToURL(String filename) {
123         // null begets null - something like the commutative property
124         if (null == filename)
125             return null;
126 
127         // Don't translate a string that already looks like a URL
128         if (filename.startsWith("file:") || filename.startsWith("http:")
129             || filename.startsWith("ftp:")
130             || filename.startsWith("gopher:")
131             || filename.startsWith("mailto:")
132             || filename.startsWith("news:")
133             || filename.startsWith("telnet:"))
134             return filename;
135 
136         File f = new File(filename);
137         String tmp = null;
138         try {
139             // This normally gives a better path
140             tmp = f.getCanonicalPath();
141         } catch (IOException ioe) {
142             // But this can be used as a backup, for cases
143             // where the file does not exist, etc.
144             tmp = f.getAbsolutePath();
145         }
146 
147         // URLs must explicitly use only forward slashes
148         if (File.separatorChar == '\\') {
149             tmp = tmp.replace('\\', '/');
150         }
151         // Note the presumption that it's a file reference
152         // Ensure we have the correct number of slashes at the
153         // start: we always want 3 /// if it's absolute
154         // (which we should have forced above)
155         if (tmp.startsWith("/"))
156             return "file://" + tmp;
157         else
158             return "file:///" + tmp;
159 
160     }
161 
162     public static class fileParserThread extends Thread {
163         String filename;
164 
fileParserThread(String _filename)165         fileParserThread(String _filename) {
166             filename = _filename;
167         }
168 
run()169         public void run() {
170             // Force filerefs to be URI's if needed: note this is independent of any
171             // other files
172             String docURI = filenameToURL(filename);
173             parse(new InputSource(docURI), filename);
174         }
175     }
176 
parse(InputSource docSrc, String filename)177     static Document parse(InputSource docSrc, String filename) {
178 
179         // Check for BOM.
180         try {
181             FileInputStream fis = null;
182             try {
183                 fis = new FileInputStream(filename);
184                 byte bytes[] = new byte[3];
185                 if (fis.read(bytes) == 3 &&
186                     bytes[0] == (byte) 0xef &&
187                     bytes[1] == (byte) 0xbb &&
188                     bytes[2] == (byte) 0xbf) {
189                     System.err.println(filename + ": ERROR: contains UTF-8 BOM (shouldn't happen in CLDR XML files)");
190                 }
191             } finally {
192                 if (fis != null) {
193                     fis.close();
194                 }
195             }
196         } catch (IOException ioe) { /* ignored- other branches will report an error. */
197         }
198 
199         DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance();
200         // Always set namespaces on
201         if (!parseonly) {
202             dfactory.setNamespaceAware(true);
203             dfactory.setValidating(true);
204         }
205         // Set other attributes here as needed
206         // applyAttributes(dfactory, attributes);
207 
208         // Local class: cheap non-printing ErrorHandler
209         // This is used to suppress validation warnings
210         final String filename2 = filename;
211         ErrorHandler nullHandler = new ErrorHandler() {
212             public void warning(SAXParseException e) throws SAXException {
213                 System.err.println(filename2 + ": Warning: " + e.getMessage());
214 
215             }
216 
217             public void error(SAXParseException e) throws SAXException {
218                 int col = e.getColumnNumber();
219                 System.err.println(filename2 + ":" + e.getLineNumber() + (col >= 0 ? ":" + col : "")
220                     + ": ERROR: Element " + e.getPublicId()
221                     + " is not valid because " + e.getMessage());
222             }
223 
224             public void fatalError(SAXParseException e) throws SAXException {
225                 System.err.println(filename2 + ": ERROR ");
226                 throw e;
227             }
228         };
229 
230         Document doc = null;
231         try {
232             // First, attempt to parse as XML (preferred)...
233             DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
234             docBuilder.setErrorHandler(nullHandler);
235             docBuilder.setEntityResolver(new CachingEntityResolver());
236             // if(docBuilder.isValidating()){
237             // System.out.println("The parser is a validating parser");
238             // }
239             doc = docBuilder.parse(docSrc);
240         } catch (Throwable se) {
241             // ... if we couldn't parse as XML, attempt parse as HTML...
242             if (se instanceof SAXParseException) {
243                 SAXParseException pe = (SAXParseException) se;
244                 int col = pe.getColumnNumber();
245                 System.err.println(filename + ":" + pe.getLineNumber() + (col >= 0 ? ":" + col : "") + ": ERROR:"
246                     + se.toString());
247             } else {
248                 System.err.println(filename + ": ERROR:" + se.toString());
249             }
250             try {
251                 // @todo need to find an HTML to DOM parser we can use!!!
252                 // doc = someHTMLParser.parse(new InputSource(filename));
253                 throw new RuntimeException(filename + ": XMLComparator not HTML parser!");
254             } catch (Exception e) {
255                 if (filename != null) {
256                     // ... if we can't parse as HTML, then just parse the text
257                     try {
258 
259                         // Parse as text, line by line
260                         // Since we already know it should be text, this should
261                         // work better than parsing by bytes.
262                         FileReader fr = new FileReader(filename);
263                         BufferedReader br = new BufferedReader(fr);
264                         StringBuffer buffer = new StringBuffer();
265                         for (;;) {
266                             String tmp = br.readLine();
267 
268                             if (tmp == null) {
269                                 break;
270                             }
271 
272                             buffer.append(tmp);
273                             buffer.append("\n"); // Put in the newlines as well
274                         }
275                         br.close();
276                         DocumentBuilder docBuilder = dfactory
277                             .newDocumentBuilder();
278                         doc = docBuilder.newDocument();
279                         Element outElem = doc.createElement("out");
280                         Text textNode = doc.createTextNode(buffer.toString());
281 
282                         // Note: will this always be a valid node? If we're
283                         // parsing
284                         // in as text, will there ever be cases where the diff that's
285                         // done later on will fail becuase some really garbage-like
286                         // text has been put into a node?
287                         outElem.appendChild(textNode);
288                         doc.appendChild(outElem);
289                     } catch (Throwable throwable) {
290 
291                         // throwable.printStackTrace();
292                     }
293                 }
294             }
295         }
296         return doc;
297     }
298 }
299