1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  * Copyright (C) 1996-2011, International Business Machines Corporation and
5  * others. All Rights Reserved.
6  */
7 package com.ibm.icu.text;
8 import com.ibm.icu.impl.PatternProps;
9 import com.ibm.icu.impl.UCharacterName;
10 import com.ibm.icu.impl.Utility;
11 import com.ibm.icu.lang.UCharacter;
12 
13 /**
14  * A transliterator that performs name to character mapping.
15  * @author Alan Liu
16  */
17 class NameUnicodeTransliterator extends Transliterator {
18 
19     static final String _ID = "Name-Any";
20 
21     static final String OPEN_PAT    = "\\N~{~";
22     static final char   OPEN_DELIM  = '\\'; // first char of OPEN_PAT
23     static final char   CLOSE_DELIM = '}';
24     static final char   SPACE       = ' ';
25 
26 
27     /**
28      * System registration hook.
29      */
register()30     static void register() {
31         Transliterator.registerFactory(_ID, new Transliterator.Factory() {
32             @Override
33             public Transliterator getInstance(String ID) {
34                 return new NameUnicodeTransliterator(null);
35             }
36         });
37     }
38 
39     /**
40      * Constructs a transliterator.
41      */
NameUnicodeTransliterator(UnicodeFilter filter)42     public NameUnicodeTransliterator(UnicodeFilter filter) {
43         super(_ID, filter);
44     }
45 
46     /**
47      * Implements {@link Transliterator#handleTransliterate}.
48      */
49     @Override
handleTransliterate(Replaceable text, Position offsets, boolean isIncremental)50     protected void handleTransliterate(Replaceable text,
51                                        Position offsets, boolean isIncremental) {
52 
53         int maxLen = UCharacterName.INSTANCE.getMaxCharNameLength() + 1; // allow for temporary trailing space
54 
55         StringBuffer name = new StringBuffer(maxLen);
56 
57         // Get the legal character set
58         UnicodeSet legal = new UnicodeSet();
59         UCharacterName.INSTANCE.getCharNameCharacters(legal);
60 
61         int cursor = offsets.start;
62         int limit = offsets.limit;
63 
64         // Modes:
65         // 0 - looking for open delimiter
66         // 1 - after open delimiter
67         int mode = 0;
68         int openPos = -1; // open delim candidate pos
69 
70         int c;
71         while (cursor < limit) {
72             c = text.char32At(cursor);
73 
74             switch (mode) {
75             case 0: // looking for open delimiter
76                 if (c == OPEN_DELIM) { // quick check first
77                     openPos = cursor;
78                     int i = Utility.parsePattern(OPEN_PAT, text, cursor, limit);
79                     if (i >= 0 && i < limit) {
80                         mode = 1;
81                         name.setLength(0);
82                         cursor = i;
83                         continue; // *** reprocess char32At(cursor)
84                     }
85                 }
86                 break;
87 
88             case 1: // after open delimiter
89                 // Look for legal chars.  If \s+ is found, convert it
90                 // to a single space.  If closeDelimiter is found, exit
91                 // the loop.  If any other character is found, exit the
92                 // loop.  If the limit is reached, exit the loop.
93 
94                 // Convert \s+ => SPACE.  This assumes there are no
95                 // runs of >1 space characters in names.
96                 if (PatternProps.isWhiteSpace(c)) {
97                     // Ignore leading whitespace
98                     if (name.length() > 0 &&
99                         name.charAt(name.length()-1) != SPACE) {
100                         name.append(SPACE);
101                         // If we are too long then abort.  maxLen includes
102                         // temporary trailing space, so use '>'.
103                         if (name.length() > maxLen) {
104                             mode = 0;
105                         }
106                     }
107                     break;
108                 }
109 
110                 if (c == CLOSE_DELIM) {
111 
112                     int len = name.length();
113 
114                     // Delete trailing space, if any
115                     if (len > 0 &&
116                         name.charAt(len-1) == SPACE) {
117                         name.setLength(--len);
118                     }
119 
120                     c = UCharacter.getCharFromExtendedName(name.toString());
121                     if (c != -1) {
122                         // Lookup succeeded
123 
124                         // assert(UTF16.getCharCount(CLOSE_DELIM) == 1);
125                         cursor++; // advance over CLOSE_DELIM
126 
127                         String str = UTF16.valueOf(c);
128                         text.replace(openPos, cursor, str);
129 
130                         // Adjust indices for the change in the length of
131                         // the string.  Do not assume that str.length() ==
132                         // 1, in case of surrogates.
133                         int delta = cursor - openPos - str.length();
134                         cursor -= delta;
135                         limit -= delta;
136                         // assert(cursor == openPos + str.length());
137                     }
138                     // If the lookup failed, we leave things as-is and
139                     // still switch to mode 0 and continue.
140                     mode = 0;
141                     openPos = -1; // close off candidate
142                     continue; // *** reprocess char32At(cursor)
143                 }
144 
145                 if (legal.contains(c)) {
146                     UTF16.append(name, c);
147                     // If we go past the longest possible name then abort.
148                     // maxLen includes temporary trailing space, so use '>='.
149                     if (name.length() >= maxLen) {
150                         mode = 0;
151                     }
152                 }
153 
154                 // Invalid character
155                 else {
156                     --cursor; // Backup and reprocess this character
157                     mode = 0;
158                 }
159 
160                 break;
161             }
162 
163             cursor += UTF16.getCharCount(c);
164         }
165 
166         offsets.contextLimit += limit - offsets.limit;
167         offsets.limit = limit;
168         // In incremental mode, only advance the cursor up to the last
169         // open delimiter candidate.
170         offsets.start = (isIncremental && openPos >= 0) ? openPos : cursor;
171     }
172 
173     /* (non-Javadoc)
174      * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
175      */
176     @Override
addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet)177     public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
178         UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
179         if (!myFilter.containsAll(UnicodeNameTransliterator.OPEN_DELIM) || !myFilter.contains(CLOSE_DELIM)) {
180             return; // we have to contain both prefix and suffix
181         }
182         UnicodeSet items = new UnicodeSet()
183         .addAll('0', '9')
184         .addAll('A', 'F')
185         .addAll('a', 'z') // for controls
186         .add('<').add('>') // for controls
187         .add('(').add(')') // for controls
188         .add('-')
189         .add(' ')
190         .addAll(UnicodeNameTransliterator.OPEN_DELIM)
191         .add(CLOSE_DELIM);
192         items.retainAll(myFilter);
193         if (items.size() > 0) {
194             sourceSet.addAll(items);
195             // could produce any character
196             targetSet.addAll(0, 0x10FFFF);
197         }
198     }
199 }
200