/*
 *******************************************************************************
 * Copyright (C) 2002-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package org.unicode.cldr.util;

import java.io.BufferedReader;
import java.io.IOException;

import org.unicode.cldr.draft.FileUtilities;

import com.ibm.icu.text.Transliterator;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Static helpers around ICU {@link Transliterator}: registering a rule-based
 * transliterator (and its inverse) from a rules file, and ready-made
 * transliterators that escape text for XML/HTML and unescape it again.
 */
public class TransliteratorUtilities {
    /** When true, {@link #registerTransliteratorFromFile} prints what it registered to stdout. */
    public static boolean DEBUG = false;

    /**
     * Reads a rules file and registers both the forward and the reverse
     * transliterator built from it, replacing any previous registration under
     * the same IDs.
     *
     * <p>The file name is {@code id} with every {@code '-'} replaced by
     * {@code '_'}, plus {@code ".txt"}. If {@code id} contains no {@code '-'},
     * the forward ID becomes {@code "Any-" + id} and the reverse ID
     * {@code id + "-Any"}; otherwise the reverse ID swaps the parts before and
     * after the first {@code '-'}.
     *
     * @param dir directory passed through to {@link #getFileContents}
     * @param id  transliterator ID, e.g. {@code "Latin-Cyrillic"} or {@code "Foo"}
     * @throws ICUUncheckedIOException if the rules file cannot be read
     */
    public static void registerTransliteratorFromFile(String dir, String id) {
        try {
            String filename = id.replace('-', '_') + ".txt";
            String rules = getFileContents(dir, filename);

            String forwardId;
            String reverseId;
            int pos = id.indexOf('-');
            if (pos < 0) {
                forwardId = "Any-" + id;
                reverseId = id + "-Any";
            } else {
                forwardId = id;
                reverseId = id.substring(pos + 1) + "-" + id.substring(0, pos);
            }

            // Unregister first so that a stale definition under the same ID is replaced.
            Transliterator forward = Transliterator.createFromRules(forwardId, rules, Transliterator.FORWARD);
            Transliterator.unregister(forwardId);
            Transliterator.registerInstance(forward);

            Transliterator reverse = Transliterator.createFromRules(reverseId, rules, Transliterator.REVERSE);
            Transliterator.unregister(reverseId);
            Transliterator.registerInstance(reverse);

            if (DEBUG) {
                System.out.println("Registered new Transliterator: " + forwardId + ", " + reverseId);
            }
        } catch (IOException e) {
            throw new ICUUncheckedIOException("Can't open " + dir + ", " + id, e);
        }
    }

    /**
     * Returns the full text of a file, read line by line through
     * {@code FileUtilities.openUTF8Reader(dir, filename)}.
     *
     * <p>A leading U+FEFF (byte order mark) is removed from any line that
     * starts with one, and every line, including the last, is terminated
     * with {@code "\r\n"} in the result regardless of the terminator used in
     * the file.
     *
     * @param dir      directory argument for {@code FileUtilities.openUTF8Reader}
     * @param filename file name argument for {@code FileUtilities.openUTF8Reader}
     * @return the file contents with normalized line terminators
     * @throws IOException if opening or reading the file fails
     */
    public static String getFileContents(String dir, String filename) throws IOException {
        BufferedReader br = FileUtilities.openUTF8Reader(dir, filename);
        try {
            StringBuilder buffer = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() > 0 && line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                buffer.append(line).append("\r\n");
            }
            return buffer.toString();
        } finally {
            // Close even when readLine() fails so the underlying file handle is not leaked.
            br.close();
        }
    }

    /*
     * ICU rule syntax reminder: "a > b" is a forward-only rule, "a < b" is a
     * reverse-only rule (b becomes a), and a parenthesized "::" transform such
     * as ":: (hex-any/xml);" is applied only in the reverse direction.
     */

    /**
     * Rules shared by all escapers: forward rules escape {@code <} and
     * {@code &}; reverse rules decode numeric character references and the five
     * predefined entities (matched case-insensitively).
     */
    private static final String BASE_RULES = ":: (hex-any/xml);"
        + ":: (hex-any/xml10);"
        + "'<' > '&lt;' ;"
        + "'<' < '&'[lL][Tt]';' ;"
        + "'&' > '&amp;' ;"
        + "'&' < '&'[aA][mM][pP]';' ;"
        + "'>' < '&'[gG][tT]';' ;"
        + "'\"' < '&'[qQ][uU][oO][tT]';' ; "
        + "'' < '&'[aA][pP][oO][sS]';' ; ";

    /** Forward rule escaping {@code >}, needed in element content (see the XML excerpt below). */
    private static final String CONTENT_RULES = "'>' > '&gt;' ;";

    /** HTML escaping: the base and content rules plus escaping of the double quote. */
    private static final String HTML_RULES = BASE_RULES + CONTENT_RULES
        + "'\"' > '&quot;' ; ";

    /**
     * HTML escaping that additionally applies the hex/unicode transform to
     * control, separator, whitespace and default-ignorable characters.
     */
    private static final String HTML_RULES_CONTROLS = HTML_RULES
        + ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";

    /**
     * HTML escaping that additionally applies the any-hex/xml transform to
     * control and non-ASCII characters.
     */
    private static final String HTML_RULES_ASCII = HTML_RULES
        + ":: [[:C:][:^ASCII:]] any-hex/xml ; ";

    /** XML escaping: the HTML rules plus escaping of the apostrophe. */
    private static final String XML_RULES = HTML_RULES
        + "'' > '&apos;' ; ";

    /*
     * From the XML specification:
     *
     * The ampersand character (&) and the left angle bracket (<) MUST NOT appear in their literal form,
     * except when used as markup delimiters, or within a comment, a processing instruction, or a CDATA section.
     * If they are needed elsewhere, they MUST be escaped using either numeric character references or
     * the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY be represented using the
     * string "&gt;", and MUST, for compatibility, be escaped using either "&gt;" or a character reference
     * when it appears in the string "]]>" in content, when that string is not marking the end of a CDATA section.
     *
     * In the content of elements, character data is any string of characters which does not contain the
     * start-delimiter of any markup and does not include the CDATA-section-close delimiter, "]]>".
     * In a CDATA section, character data is any string of characters not including the
     * CDATA-section-close delimiter, "]]>".
     *
     * To allow attribute values to contain both single and double quotes, the apostrophe or
     * single-quote character (') MAY be represented as "&apos;", and the double-quote character (")
     * as "&quot;".
     */

    /** Escapes {@code < & > " '} for use in XML. */
    public static final Transliterator toXML = Transliterator.createFromRules(
        "any-xml", XML_RULES, Transliterator.FORWARD);

    /** Reverse of {@link #toXML}: decodes entities and numeric character references. */
    public static final Transliterator fromXML = Transliterator.createFromRules(
        "xml-any", XML_RULES, Transliterator.REVERSE);

    /** Escapes {@code < & > "} for use in HTML. */
    public static final Transliterator toHTML = Transliterator.createFromRules(
        "any-html", HTML_RULES, Transliterator.FORWARD);

    // NOTE(review): the next two instances reuse the ID "any-html" of toHTML. They are never
    // registered here, so nothing in this class collides, but getID() cannot tell them apart.

    /** Like {@link #toHTML}, built from {@code HTML_RULES_CONTROLS}. */
    public static final Transliterator toHTMLControl = Transliterator.createFromRules(
        "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);

    /** Like {@link #toHTML}, built from {@code HTML_RULES_ASCII}. */
    public static final Transliterator toHTMLAscii = Transliterator.createFromRules(
        "any-html", HTML_RULES_ASCII, Transliterator.FORWARD);

    /** Reverse of {@link #toHTML}: decodes entities and numeric character references. */
    public static final Transliterator fromHTML = Transliterator.createFromRules(
        "html-any", HTML_RULES, Transliterator.REVERSE);
}