1 /*
2  ******************************************************************************
3  * Copyright (C) 2003-2014, International Business Machines Corporation and   *
4  * others. All Rights Reserved.                                               *
5  ******************************************************************************
6  */
7 /**
8  * @author Ram Viswanadha
9  *
10  * This tool validates xml against DTD or valid XML ... IE 6 does not do a good job
11  */
12 package org.unicode.cldr.util;
13 
14 import java.io.BufferedReader;
15 import java.io.File;
16 import java.io.FileInputStream;
17 import java.io.FileReader;
18 import java.io.FilenameFilter;
19 import java.io.IOException;
20 
21 import javax.xml.parsers.DocumentBuilder;
22 import javax.xml.parsers.DocumentBuilderFactory;
23 
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26 import org.w3c.dom.Text;
27 import org.xml.sax.ErrorHandler;
28 import org.xml.sax.InputSource;
29 import org.xml.sax.SAXException;
30 import org.xml.sax.SAXParseException;
31 
32 @CLDRTool(alias = "validate", description = "Check XML files for validity")
33 public class XMLValidator {
34     public static boolean quiet = false;
35     public static boolean parseonly = false;
36 
main(String[] args)37     public static void main(String[] args) throws IOException {
38         if (args.length == 0) {
39             System.out.println("No files specified. Validation failed. Use --help for help.");
40             return;
41         }
42         for (int i = 0; i < args.length; i++) {
43             if (args[i].equals("-q") || args[i].equals("--quiet")) {
44                 quiet = true;
45             } else if (args[i].equals("--help")) {
46                 usage();
47                 return;
48             } else if (args[i].equals("--parseonly")) {
49                 System.err.println("# DTD Validation is disabled. Will only check for well formed XML.");
50                 parseonly = true;
51             } else {
52                 File f = new File(args[i]);
53                 if (f.isDirectory()) {
54                     parseDirectory(f);
55                 } else {
56                     if (!quiet) System.out.println("Processing file " + args[i]);
57                     new fileParserThread(args[i]).run();
58                 }
59             }
60         }
61         if (parseonly) {
62             System.err.println("# DTD Validation is disabled. Only checked for well formed XML.");
63         }
64     }
65 
parseDirectory(File f)66     private static void parseDirectory(File f) throws IOException {
67         // System.err.println("Parsing directory " + f.getAbsolutePath());
68         for (File s : f.listFiles(new FilenameFilter() {
69             @Override
70             public boolean accept(File arg0, String arg1) {
71                 if (arg1.startsWith(".")) {
72                     return false; // skip .git, .svn, ...
73                 }
74                 File n = new File(arg0, arg1);
75                 // System.err.println("Considering " + n.getAbsolutePath() );
76                 if (n.isDirectory()) {
77                     try {
78                         parseDirectory(n);
79                     } catch (IOException e) {
80                         // TODO Auto-generated catch block
81                         e.printStackTrace();
82                         System.err.println("Error " + e.toString() + " parsing " + arg0.getPath());
83                     }
84                     return false;
85                 } else if (arg1.endsWith(".xml")) {
86                     return true;
87                 } else {
88                     return false;
89                 }
90             }
91         })) {
92             if (!quiet) System.out.println("Processing file " + s.getPath());
93             new fileParserThread(PathUtilities.getNormalizedPathString(s)).run();
94         }
95     }
96 
usage()97     private static void usage() {
98         System.err.println("usage:  " + XMLValidator.class.getName() + " [ -q ] [ --help ] [ --parseonly ] file ...");
99         System.err.println("usage:  " + XMLValidator.class.getName()
100             + " [ -q ] [ --help ] [ --parseonly ] directory ...");
101     }
102 
103     /**
104      * Utility method to translate a String filename to URL.
105      *
106      * Note: This method is not necessarily proven to get the correct URL for
107      * every possible kind of filename; it should be improved. It handles the
108      * most common cases that we've encountered when running Conformance tests
109      * on Xalan. Also note, this method does not handle other non-file: flavors
110      * of URLs at all.
111      *
112      * If the name is null, return null. If the name starts with a common URI
113      * scheme (namely the ones found in the examples of RFC2396), then simply
114      * return the name as-is (the assumption is that it's already a URL)
115      * Otherwise we attempt (cheaply) to convert to a file:/// URL.
116      *
117      * @param filename
118      *            a local path/filename of a file
119      * @return a file:/// URL, the same string if it appears to already be a
120      *         URL, or null if error
121      */
filenameToURL(String filename)122     public static String filenameToURL(String filename) {
123         // null begets null - something like the commutative property
124         if (null == filename)
125             return null;
126 
127         // Don't translate a string that already looks like a URL
128         if (filename.startsWith("file:") || filename.startsWith("http:")
129             || filename.startsWith("ftp:")
130             || filename.startsWith("gopher:")
131             || filename.startsWith("mailto:")
132             || filename.startsWith("news:")
133             || filename.startsWith("telnet:"))
134             return filename;
135 
136         File f = new File(filename);
137         String tmp = PathUtilities.getNormalizedPathString(f);
138 
139         // URLs must explicitly use only forward slashes
140         if (File.separatorChar == '\\') {
141             tmp = tmp.replace('\\', '/');
142         }
143         // Note the presumption that it's a file reference
144         // Ensure we have the correct number of slashes at the
145         // start: we always want 3 /// if it's absolute
146         // (which we should have forced above)
147         if (tmp.startsWith("/"))
148             return "file://" + tmp;
149         else
150             return "file:///" + tmp;
151 
152     }
153 
154     public static class fileParserThread extends Thread {
155         String filename;
156 
fileParserThread(String _filename)157         fileParserThread(String _filename) {
158             filename = _filename;
159         }
160 
161         @Override
run()162         public void run() {
163             // Force filerefs to be URI's if needed: note this is independent of any
164             // other files
165             String docURI = filenameToURL(filename);
166             parse(new InputSource(docURI), filename);
167         }
168     }
169 
parse(InputSource docSrc, String filename)170     static Document parse(InputSource docSrc, String filename) {
171 
172         // Check for BOM.
173         try {
174             FileInputStream fis = null;
175             try {
176                 fis = new FileInputStream(filename);
177                 byte bytes[] = new byte[3];
178                 if (fis.read(bytes) == 3 &&
179                     bytes[0] == (byte) 0xef &&
180                     bytes[1] == (byte) 0xbb &&
181                     bytes[2] == (byte) 0xbf) {
182                     System.err.println(filename + ": ERROR: contains UTF-8 BOM (shouldn't happen in CLDR XML files)");
183                 }
184             } finally {
185                 if (fis != null) {
186                     fis.close();
187                 }
188             }
189         } catch (IOException ioe) { /* ignored- other branches will report an error. */
190         }
191 
192         DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance();
193         // Always set namespaces on
194         if (!parseonly) {
195             dfactory.setNamespaceAware(true);
196             dfactory.setValidating(true);
197         }
198         // Set other attributes here as needed
199         // applyAttributes(dfactory, attributes);
200 
201         // Local class: cheap non-printing ErrorHandler
202         // This is used to suppress validation warnings
203         final String filename2 = filename;
204         ErrorHandler nullHandler = new ErrorHandler() {
205             @Override
206             public void warning(SAXParseException e) throws SAXException {
207                 System.err.println(filename2 + ": Warning: " + e.getMessage());
208 
209             }
210 
211             @Override
212             public void error(SAXParseException e) throws SAXException {
213                 int col = e.getColumnNumber();
214                 System.err.println(filename2 + ":" + e.getLineNumber() + (col >= 0 ? ":" + col : "")
215                     + ": ERROR: Element " + e.getPublicId()
216                     + " is not valid because " + e.getMessage());
217             }
218 
219             @Override
220             public void fatalError(SAXParseException e) throws SAXException {
221                 System.err.println(filename2 + ": ERROR ");
222                 throw e;
223             }
224         };
225 
226         Document doc = null;
227         try {
228             // First, attempt to parse as XML (preferred)...
229             DocumentBuilder docBuilder = dfactory.newDocumentBuilder();
230             docBuilder.setErrorHandler(nullHandler);
231             // if(docBuilder.isValidating()){
232             // System.out.println("The parser is a validating parser");
233             // }
234             doc = docBuilder.parse(docSrc);
235         } catch (Throwable se) {
236             // ... if we couldn't parse as XML, attempt parse as HTML...
237             if (se instanceof SAXParseException) {
238                 SAXParseException pe = (SAXParseException) se;
239                 int col = pe.getColumnNumber();
240                 System.err.println(filename + ":" + pe.getLineNumber() + (col >= 0 ? ":" + col : "") + ": ERROR:"
241                     + se.toString());
242             } else {
243                 System.err.println(filename + ": ERROR:" + se.toString());
244             }
245             try {
246                 // @todo need to find an HTML to DOM parser we can use!!!
247                 // doc = someHTMLParser.parse(new InputSource(filename));
248                 throw new RuntimeException(filename + ": XMLComparator not HTML parser!");
249             } catch (Exception e) {
250                 if (filename != null) {
251                     // ... if we can't parse as HTML, then just parse the text
252                     try {
253 
254                         // Parse as text, line by line
255                         // Since we already know it should be text, this should
256                         // work better than parsing by bytes.
257                         FileReader fr = new FileReader(filename);
258                         BufferedReader br = new BufferedReader(fr);
259                         StringBuffer buffer = new StringBuffer();
260                         for (;;) {
261                             String tmp = br.readLine();
262 
263                             if (tmp == null) {
264                                 break;
265                             }
266 
267                             buffer.append(tmp);
268                             buffer.append("\n"); // Put in the newlines as well
269                         }
270                         br.close();
271                         DocumentBuilder docBuilder = dfactory
272                             .newDocumentBuilder();
273                         doc = docBuilder.newDocument();
274                         Element outElem = doc.createElement("out");
275                         Text textNode = doc.createTextNode(buffer.toString());
276 
277                         // Note: will this always be a valid node? If we're
278                         // parsing
279                         // in as text, will there ever be cases where the diff that's
280                         // done later on will fail becuase some really garbage-like
281                         // text has been put into a node?
282                         outElem.appendChild(textNode);
283                         doc.appendChild(outElem);
284                     } catch (Throwable throwable) {
285 
286                         // throwable.printStackTrace();
287                     }
288                 }
289             }
290         }
291         return doc;
292     }
293 }
294