1 /*
2  *******************************************************************************
3  * Copyright (C) 2002-2016, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package org.unicode.cldr.util;
8 
9 import java.io.BufferedReader;
10 import java.io.IOException;
11 
12 import org.unicode.cldr.draft.FileUtilities;
13 
14 import com.ibm.icu.text.Transliterator;
15 import com.ibm.icu.util.ICUUncheckedIOException;
16 
17 public class TransliteratorUtilities {
18     public static boolean DEBUG = false;
19 
registerTransliteratorFromFile(String dir, String id)20     public static void registerTransliteratorFromFile(String dir, String id) {
21         try {
22             String filename = id.replace('-', '_') + ".txt";
23             String rules = getFileContents(dir, filename);
24             Transliterator t;
25             int pos = id.indexOf('-');
26             String rid;
27             if (pos < 0) {
28                 rid = id + "-Any";
29                 id = "Any-" + id;
30             } else {
31                 rid = id.substring(pos + 1) + "-" + id.substring(0, pos);
32             }
33             t = Transliterator.createFromRules(id, rules, Transliterator.FORWARD);
34             Transliterator.unregister(id);
35             Transliterator.registerInstance(t);
36 
37             /*String test = "\u049A\u0430\u0437\u0430\u049B";
38             System.out.println(t.transliterate(test));
39             t = Transliterator.getInstance(id);
40             System.out.println(t.transliterate(test));
41             */
42 
43             t = Transliterator.createFromRules(rid, rules, Transliterator.REVERSE);
44             Transliterator.unregister(rid);
45             Transliterator.registerInstance(t);
46             if (DEBUG) System.out.println("Registered new Transliterator: " + id + ", " + rid);
47         } catch (IOException e) {
48 //#if defined(FOUNDATION10) || defined(J2SE13)
49 //##        throw (IllegalArgumentException) new IllegalArgumentException("Can't open " + dir + ", " + id+" "+ e.getMessage());
50 //#else
51             throw new ICUUncheckedIOException("Can't open " + dir + ", " + id, e);
52 //#endif
53         }
54     }
55 
56     /**
57      *
58      */
getFileContents(String dir, String filename)59     public static String getFileContents(String dir, String filename) throws IOException {
60 //#if defined(FOUNDATION10) || defined(J2SE13)
61 //##        BufferedReader br = TestUtil.openUTF8Reader(dir, filename);
62 //#else
63         BufferedReader br = FileUtilities.openUTF8Reader(dir, filename);
64 //#endif
65         StringBuffer buffer = new StringBuffer();
66         while (true) {
67             String line = br.readLine();
68             if (line == null) break;
69             if (line.length() > 0 && line.charAt(0) == '\uFEFF') line = line.substring(1);
70             buffer.append(line).append("\r\n");
71         }
72         br.close();
73         return buffer.toString();
74 
75     }
76 
77     private static final String BASE_RULES = ":: (hex-any/xml);" +
78         ":: (hex-any/xml10);" +
79         "'<' > '&lt;' ;" +
80         "'<' < '&'[lL][Tt]';' ;" +
81         "'&' > '&amp;' ;" +
82         "'&' < '&'[aA][mM][pP]';' ;" +
83         "'>' < '&'[gG][tT]';' ;" +
84         "'\"' < '&'[qQ][uU][oO][tT]';' ; " +
85         "'' < '&'[aA][pP][oO][sS]';' ; ";
86 
87     private static final String CONTENT_RULES = "'>' > '&gt;' ;";
88 
89     private static final String HTML_RULES = BASE_RULES + CONTENT_RULES +
90         "'\"' > '&quot;' ; ";
91 
92     private static final String HTML_RULES_CONTROLS = HTML_RULES +
93         ":: [[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]] hex/unicode ; ";
94 
95     private static final String HTML_RULES_ASCII = HTML_RULES +
96         ":: [[:C:][:^ASCII:]] any-hex/xml ; ";
97 
98     private static final String XML_RULES = HTML_RULES +
99         "'' > '&apos;' ; ";
100 
101     /*
102     The ampersand character (&) and the left angle bracket (<) MUST NOT appear
103 
104     in their literal form, except when used as markup delimiters, or within a
105 
106     comment, a processing instruction, or a CDATA section. If they are needed
107 
108     elsewhere, they MUST be escaped using either numeric character references or
109 
110     the strings "&amp;" and "&lt;" respectively. The right angle bracket (>) MAY
111 
112     be represented using the string "&gt;", and MUST, for compatibility, be
113 
114     escaped using either "&gt;" or a character reference when it appears in the string
115 
116     "]]>" in content, when that string is not marking the end of a CDATA section.
117 
118     In the content of elements, character data is any string of characters which does
119 
120     not contain the start-delimiter of any markup and does not include the
121 
122     CDATA-section-close delimiter, "]]>". In a CDATA section, character data is
123 
124     any string of characters not including the CDATA-section-close delimiter,
125 
126     "]]>".
127 
128     To allow attribute values to contain both single and double quotes, the
129 
130     apostrophe or single-quote character (') MAY be represented as "&apos;", and
131 
132     the double-quote character (") as "&quot;".
133 
134 
135      */
136 
137     public static final Transliterator toXML = Transliterator.createFromRules(
138         "any-xml", XML_RULES, Transliterator.FORWARD);
139     public static final Transliterator fromXML = Transliterator.createFromRules(
140         "xml-any", XML_RULES, Transliterator.REVERSE);
141     public static final Transliterator toHTML = Transliterator.createFromRules(
142         "any-html", HTML_RULES, Transliterator.FORWARD);
143     public static final Transliterator toHTMLControl = Transliterator.createFromRules(
144         "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
145     public static final Transliterator toHTMLAscii = Transliterator.createFromRules(
146         "any-html", HTML_RULES_ASCII, Transliterator.FORWARD);
147     public static final Transliterator fromHTML = Transliterator.createFromRules(
148         "html-any", HTML_RULES, Transliterator.REVERSE);
149 }
150