1 /* 2 ****************************************************************************** 3 * Copyright (C) 2003-2014, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 /** 8 * @author Ram Viswanadha 9 * 10 * This tool validates xml against DTD or valid XML ... IE 6 does not do a good job 11 */ 12 package org.unicode.cldr.util; 13 14 import java.io.BufferedReader; 15 import java.io.File; 16 import java.io.FileInputStream; 17 import java.io.FileReader; 18 import java.io.FilenameFilter; 19 import java.io.IOException; 20 21 import javax.xml.parsers.DocumentBuilder; 22 import javax.xml.parsers.DocumentBuilderFactory; 23 24 import org.w3c.dom.Document; 25 import org.w3c.dom.Element; 26 import org.w3c.dom.Text; 27 import org.xml.sax.ErrorHandler; 28 import org.xml.sax.InputSource; 29 import org.xml.sax.SAXException; 30 import org.xml.sax.SAXParseException; 31 32 @CLDRTool(alias = "validate", description = "Check XML files for validity") 33 public class XMLValidator { 34 public static boolean quiet = false; 35 public static boolean parseonly = false; 36 main(String[] args)37 public static void main(String[] args) throws IOException { 38 if (args.length == 0) { 39 System.out.println("No files specified. Validation failed. Use --help for help."); 40 return; 41 } 42 for (int i = 0; i < args.length; i++) { 43 if (args[i].equals("-q") || args[i].equals("--quiet")) { 44 quiet = true; 45 } else if (args[i].equals("--help")) { 46 usage(); 47 return; 48 } else if (args[i].equals("--parseonly")) { 49 System.err.println("# DTD Validation is disabled. Will only check for well formed XML."); 50 parseonly = true; 51 } else { 52 File f = new File(args[i]); 53 if (f.isDirectory()) { 54 parseDirectory(f); 55 } else { 56 if (!quiet) System.out.println("Processing file " + args[i]); 57 new fileParserThread(args[i]).run(); 58 } 59 } 60 } 61 if (parseonly) { 62 System.err.println("# DTD Validation is disabled. Only checked for well formed XML."); 63 } 64 } 65 parseDirectory(File f)66 private static void parseDirectory(File f) throws IOException { 67 // System.err.println("Parsing directory " + f.getAbsolutePath()); 68 for (File s : f.listFiles(new FilenameFilter() { 69 @Override 70 public boolean accept(File arg0, String arg1) { 71 if (arg1.startsWith(".")) { 72 return false; // skip .git, .svn, ... 73 } 74 File n = new File(arg0, arg1); 75 // System.err.println("Considering " + n.getAbsolutePath() ); 76 if (n.isDirectory()) { 77 try { 78 parseDirectory(n); 79 } catch (IOException e) { 80 // TODO Auto-generated catch block 81 e.printStackTrace(); 82 System.err.println("Error " + e.toString() + " parsing " + arg0.getPath()); 83 } 84 return false; 85 } else if (arg1.endsWith(".xml")) { 86 return true; 87 } else { 88 return false; 89 } 90 } 91 })) { 92 if (!quiet) System.out.println("Processing file " + s.getPath()); 93 new fileParserThread(PathUtilities.getNormalizedPathString(s)).run(); 94 } 95 } 96 usage()97 private static void usage() { 98 System.err.println("usage: " + XMLValidator.class.getName() + " [ -q ] [ --help ] [ --parseonly ] file ..."); 99 System.err.println("usage: " + XMLValidator.class.getName() 100 + " [ -q ] [ --help ] [ --parseonly ] directory ..."); 101 } 102 103 /** 104 * Utility method to translate a String filename to URL. 105 * 106 * Note: This method is not necessarily proven to get the correct URL for 107 * every possible kind of filename; it should be improved. It handles the 108 * most common cases that we've encountered when running Conformance tests 109 * on Xalan. Also note, this method does not handle other non-file: flavors 110 * of URLs at all. 111 * 112 * If the name is null, return null. If the name starts with a common URI 113 * scheme (namely the ones found in the examples of RFC2396), then simply 114 * return the name as-is (the assumption is that it's already a URL) 115 * Otherwise we attempt (cheaply) to convert to a file:/// URL. 116 * 117 * @param filename 118 * a local path/filename of a file 119 * @return a file:/// URL, the same string if it appears to already be a 120 * URL, or null if error 121 */ filenameToURL(String filename)122 public static String filenameToURL(String filename) { 123 // null begets null - something like the commutative property 124 if (null == filename) 125 return null; 126 127 // Don't translate a string that already looks like a URL 128 if (filename.startsWith("file:") || filename.startsWith("http:") 129 || filename.startsWith("ftp:") 130 || filename.startsWith("gopher:") 131 || filename.startsWith("mailto:") 132 || filename.startsWith("news:") 133 || filename.startsWith("telnet:")) 134 return filename; 135 136 File f = new File(filename); 137 String tmp = PathUtilities.getNormalizedPathString(f); 138 139 // URLs must explicitly use only forward slashes 140 if (File.separatorChar == '\\') { 141 tmp = tmp.replace('\\', '/'); 142 } 143 // Note the presumption that it's a file reference 144 // Ensure we have the correct number of slashes at the 145 // start: we always want 3 /// if it's absolute 146 // (which we should have forced above) 147 if (tmp.startsWith("/")) 148 return "file://" + tmp; 149 else 150 return "file:///" + tmp; 151 152 } 153 154 public static class fileParserThread extends Thread { 155 String filename; 156 fileParserThread(String _filename)157 fileParserThread(String _filename) { 158 filename = _filename; 159 } 160 161 @Override run()162 public void run() { 163 // Force filerefs to be URI's if needed: note this is independent of any 164 // other files 165 String docURI = filenameToURL(filename); 166 parse(new InputSource(docURI), filename); 167 } 168 } 169 parse(InputSource docSrc, String filename)170 static Document parse(InputSource docSrc, String filename) { 171 172 // Check for BOM. 173 try { 174 FileInputStream fis = null; 175 try { 176 fis = new FileInputStream(filename); 177 byte bytes[] = new byte[3]; 178 if (fis.read(bytes) == 3 && 179 bytes[0] == (byte) 0xef && 180 bytes[1] == (byte) 0xbb && 181 bytes[2] == (byte) 0xbf) { 182 System.err.println(filename + ": ERROR: contains UTF-8 BOM (shouldn't happen in CLDR XML files)"); 183 } 184 } finally { 185 if (fis != null) { 186 fis.close(); 187 } 188 } 189 } catch (IOException ioe) { /* ignored- other branches will report an error. */ 190 } 191 192 DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance(); 193 // Always set namespaces on 194 if (!parseonly) { 195 dfactory.setNamespaceAware(true); 196 dfactory.setValidating(true); 197 } 198 // Set other attributes here as needed 199 // applyAttributes(dfactory, attributes); 200 201 // Local class: cheap non-printing ErrorHandler 202 // This is used to suppress validation warnings 203 final String filename2 = filename; 204 ErrorHandler nullHandler = new ErrorHandler() { 205 @Override 206 public void warning(SAXParseException e) throws SAXException { 207 System.err.println(filename2 + ": Warning: " + e.getMessage()); 208 209 } 210 211 @Override 212 public void error(SAXParseException e) throws SAXException { 213 int col = e.getColumnNumber(); 214 System.err.println(filename2 + ":" + e.getLineNumber() + (col >= 0 ? ":" + col : "") 215 + ": ERROR: Element " + e.getPublicId() 216 + " is not valid because " + e.getMessage()); 217 } 218 219 @Override 220 public void fatalError(SAXParseException e) throws SAXException { 221 System.err.println(filename2 + ": ERROR "); 222 throw e; 223 } 224 }; 225 226 Document doc = null; 227 try { 228 // First, attempt to parse as XML (preferred)... 229 DocumentBuilder docBuilder = dfactory.newDocumentBuilder(); 230 docBuilder.setErrorHandler(nullHandler); 231 // if(docBuilder.isValidating()){ 232 // System.out.println("The parser is a validating parser"); 233 // } 234 doc = docBuilder.parse(docSrc); 235 } catch (Throwable se) { 236 // ... if we couldn't parse as XML, attempt parse as HTML... 237 if (se instanceof SAXParseException) { 238 SAXParseException pe = (SAXParseException) se; 239 int col = pe.getColumnNumber(); 240 System.err.println(filename + ":" + pe.getLineNumber() + (col >= 0 ? ":" + col : "") + ": ERROR:" 241 + se.toString()); 242 } else { 243 System.err.println(filename + ": ERROR:" + se.toString()); 244 } 245 try { 246 // @todo need to find an HTML to DOM parser we can use!!! 247 // doc = someHTMLParser.parse(new InputSource(filename)); 248 throw new RuntimeException(filename + ": XMLComparator not HTML parser!"); 249 } catch (Exception e) { 250 if (filename != null) { 251 // ... if we can't parse as HTML, then just parse the text 252 try { 253 254 // Parse as text, line by line 255 // Since we already know it should be text, this should 256 // work better than parsing by bytes. 257 FileReader fr = new FileReader(filename); 258 BufferedReader br = new BufferedReader(fr); 259 StringBuffer buffer = new StringBuffer(); 260 for (;;) { 261 String tmp = br.readLine(); 262 263 if (tmp == null) { 264 break; 265 } 266 267 buffer.append(tmp); 268 buffer.append("\n"); // Put in the newlines as well 269 } 270 br.close(); 271 DocumentBuilder docBuilder = dfactory 272 .newDocumentBuilder(); 273 doc = docBuilder.newDocument(); 274 Element outElem = doc.createElement("out"); 275 Text textNode = doc.createTextNode(buffer.toString()); 276 277 // Note: will this always be a valid node? If we're 278 // parsing 279 // in as text, will there ever be cases where the diff that's 280 // done later on will fail becuase some really garbage-like 281 // text has been put into a node? 282 outElem.appendChild(textNode); 283 doc.appendChild(outElem); 284 } catch (Throwable throwable) { 285 286 // throwable.printStackTrace(); 287 } 288 } 289 } 290 } 291 return doc; 292 } 293 } 294