1 /*
2  **********************************************************************
3  * Copyright (c) 2005-2011, International Business Machines
4  * Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  * Author: John Emmons
7  **********************************************************************
8  */
9 package org.unicode.cldr.posix;
10 
11 import java.io.File;
12 import java.io.IOException;
13 import java.io.PrintWriter;
14 import java.nio.ByteBuffer;
15 import java.nio.charset.Charset;
16 import java.util.ArrayList;
17 import java.util.Collections;
18 import java.util.List;
19 import java.util.ListIterator;
20 
21 import org.unicode.cldr.draft.FileUtilities;
22 import org.unicode.cldr.icu.SimpleConverter;
23 
24 import com.ibm.icu.dev.tool.UOption;
25 import com.ibm.icu.impl.Utility;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
28 
29 /**
30  * Class to generate POSIX format charmap
31  *
32  * @author John C. Emmons
33  */
34 
35 public class GenerateCharmap {
36 
37     private static final int DESTDIR = 2,
38         UNICODESET = 3,
39         CHARSET = 4;
40 
41     private static final UOption[] options = {
42         UOption.HELP_H(),
43         UOption.HELP_QUESTION_MARK(),
44         UOption.create("destdir", 'd', UOption.REQUIRES_ARG).setDefault("."),
45         UOption.create("unicodeset", 'u', UOption.REQUIRES_ARG).setDefault("[\\u0000-\\U0010FFFF]"),
46         UOption.create("charset", 'c', UOption.REQUIRES_ARG).setDefault("UTF-8"),
47     };
48 
main(String[] args)49     public static void main(String[] args) throws IOException {
50         UOption.parseArgs(args, options);
51         String codeset = options[CHARSET].value;
52         GenerateCharmap gp = new GenerateCharmap(new UnicodeSet(options[UNICODESET].value),
53             Charset.forName(codeset), codeset);
54         PrintWriter out = FileUtilities.openUTF8Writer(options[DESTDIR].value + File.separator, codeset + ".cm");
55         gp.write(out);
56         out.close();
57     }
58 
59     public class CharmapLine implements Comparable<Object> {
60         public String CharacterValue;
61         public String CharacterName;
62         public String CharacterAltName;
63 
CharmapLine(String Name, String AltName, String Value)64         public CharmapLine(String Name, String AltName, String Value) {
65             CharacterName = Name;
66             CharacterAltName = AltName;
67             CharacterValue = Value;
68             if (Name.equals(AltName))
69                 CharacterAltName = "";
70         }
71 
compareTo(Object o)72         public int compareTo(Object o) {
73             CharmapLine c = (CharmapLine) o;
74             return (CharacterValue.compareTo(c.CharacterValue));
75         }
76     }
77 
78     UnicodeSet chars;
79     Charset cs;
80     String codeset;
81 
GenerateCharmap(UnicodeSet chars, Charset cs, String codeset)82     public GenerateCharmap(UnicodeSet chars, Charset cs, String codeset) {
83         this.cs = cs;
84         if (cs != null && !cs.name().equals("UTF-8")) {
85             UnicodeSet csset = new SimpleConverter(cs).getCharset();
86             chars = new UnicodeSet(chars).retainAll(csset);
87         }
88         this.chars = chars;
89         this.codeset = codeset;
90 
91     }
92 
write(PrintWriter out)93     public void write(PrintWriter out) {
94         out.println("######################");
95         out.println("# POSIX charmap ");
96         out.println("# Generated automatically from the Unicode Character Database and Common Locale Data Repository");
97         out.println("# see http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html");
98         out.println("# charset:\t" + codeset);
99         out.println("######################");
100         out.println("#################################################################################################");
101         out.println("# Copyright 1991-2011 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in  #");
102         out.println("# http://www.unicode.org/copyright.html.                                                        #");
103         out.println("#                                                                                               #");
104         out.println("# Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode   #");
105         out.println("# data files and any associated documentation (the \"Data Files\") or Unicode software and any    #");
106         out.println("# associated documentation (the \"Software\") to deal in the Data Files or Software without       #");
107         out.println("# restriction, including without limitation the rights to use, copy, modify, merge, publish,    #");
108         out.println("# distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom   #");
109         out.println("# the Data Files or Software are furnished to do so, provided that (a) the above copyright      #");
110         out.println("# notice(s) and this permission notice appear with all copies of the Data Files or Software,    #");
111         out.println("# (b) both the above copyright notice(s) and this permission notice appear in associated        #");
112         out.println("# documentation, and (c) there is clear notice in each modified Data File or in the Software as #");
113         out.println("# well as in the documentation associated with the Data File(s) or Software that the data or    #");
114         out.println("# software has been modified.                                                                   #");
115         out.println("#                                                                                               #");
116         out.println("# THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR    #");
117         out.println("# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A        #");
118         out.println("# PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT #");
119         out.println("# HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR #");
120         out.println("# CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, #");
121         out.println("# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN   #");
122         out.println("# CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE.                         #");
123         out.println("#################################################################################################");
124         out.println();
125         doCharmap(out, cs);
126         out.println("######################");
127         out.println();
128     }
129 
130     /**
131      * @param out
132      */
doCharmap(PrintWriter out, Charset cs)133     private void doCharmap(PrintWriter out, Charset cs) {
134 
135         // print character types, restricted to the charset
136         int LongestCharNameLength = 0;
137         int LongestCharValueLength = 0;
138         UnicodeSet us = new UnicodeSet("[^[:Noncharacter_Code_Point:][:Cn:][:Cs:]]").retainAll(chars);
139         List<CharmapLine> cml = new ArrayList<CharmapLine>();
140         CharmapLine current;
141         for (UnicodeSetIterator it = new UnicodeSetIterator(us); it.next();) {
142             String Name = POSIXUtilities.POSIXCharFullName(it.getString());
143             String AltName = POSIXUtilities.POSIXCharName(it.getString());
144             String Value = getCodepointValue(it.getString(), cs);
145             current = new CharmapLine(Name, AltName, Value);
146             cml.add(current);
147             if (current.CharacterName.length() > LongestCharNameLength)
148                 LongestCharNameLength = current.CharacterName.length();
149             if (current.CharacterValue.length() > LongestCharValueLength)
150                 LongestCharValueLength = current.CharacterValue.length();
151         }
152 
153         Collections.sort(cml);
154 
155         out.print("<code_set_name> \"");
156         out.print(codeset);
157         out.println("\"");
158         out.println("<mb_cur_min>    1");
159         out.print("<mb_cur_max>    ");
160         out.print(LongestCharValueLength / 4);
161         out.println();
162         out.println();
163         out.println("CHARMAP");
164 
165         for (ListIterator<CharmapLine> li = cml.listIterator(); li.hasNext();) {
166             current = li.next();
167 
168             out.print(current.CharacterName);
169             for (int i = LongestCharNameLength + 1; i > current.CharacterName.length(); i--)
170                 out.print(" ");
171             out.println(current.CharacterValue);
172             if (current.CharacterAltName.length() > 0) {
173                 out.print(current.CharacterAltName);
174                 for (int i = LongestCharNameLength + 1; i > current.CharacterAltName.length(); i--)
175                     out.print(" ");
176                 out.println(current.CharacterValue);
177             }
178         }
179 
180         out.println();
181         out.println("END CHARMAP");
182         out.println();
183 
184     }
185 
getCodepointValue(String cp, Charset cs)186     private String getCodepointValue(String cp, Charset cs) {
187         StringBuffer result = new StringBuffer();
188         ByteBuffer bb = cs.encode(cp);
189         int i;
190         while (bb.hasRemaining()) {
191             result.append("\\x");
192             byte b = bb.get();
193             if (b < 0)
194                 i = b + 256;
195             else
196                 i = b;
197 
198             result.append(Utility.hex(i, 2));
199         }
200         return result.toString();
201     }
202 }
203