1 /*
2  * Copyright (C) 2009 The Guava Authors
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.common.escape;
18 
19 import static com.google.common.base.Preconditions.checkNotNull;
20 
21 import com.google.common.annotations.Beta;
22 import com.google.common.annotations.GwtCompatible;
23 
24 import java.util.HashMap;
25 import java.util.Map;
26 
27 import javax.annotation.Nullable;
28 
29 /**
30  * Static utility methods pertaining to {@link Escaper} instances.
31  *
32  * @author Sven Mawson
33  * @author David Beaumont
34  * @since 15.0
35  */
36 @Beta
37 @GwtCompatible
38 public final class Escapers {
Escapers()39   private Escapers() {}
40 
41   /**
42    * Returns an {@link Escaper} that does no escaping, passing all character
43    * data through unchanged.
44    */
nullEscaper()45   public static Escaper nullEscaper() {
46     return NULL_ESCAPER;
47   }
48 
49   // An Escaper that efficiently performs no escaping.
50   // Extending CharEscaper (instead of Escaper) makes Escapers.compose() easier.
51   private static final Escaper NULL_ESCAPER = new CharEscaper() {
52     @Override public String escape(String string) {
53       return checkNotNull(string);
54     }
55 
56     @Override protected char[] escape(char c) {
57       // TODO: Fix tests not to call this directly and make it throw an error.
58       return null;
59     }
60   };
61 
62   /**
63    * Returns a builder for creating simple, fast escapers. A builder instance
64    * can be reused and each escaper that is created will be a snapshot of the
65    * current builder state. Builders are not thread safe.
66    *
67    * <p>The initial state of the builder is such that:
68    * <ul>
69    * <li>There are no replacement mappings<li>
70    * <li>{@code safeMin == Character.MIN_VALUE}</li>
71    * <li>{@code safeMax == Character.MAX_VALUE}</li>
72    * <li>{@code unsafeReplacement == null}</li>
73    * </ul>
74    * <p>For performance reasons escapers created by this builder are not
75    * Unicode aware and will not validate the well-formedness of their input.
76    */
builder()77   public static Builder builder() {
78     return new Builder();
79   }
80 
81   /**
82    * A builder for simple, fast escapers.
83    *
84    * <p>Typically an escaper needs to deal with the escaping of high valued
85    * characters or code points. In these cases it is necessary to extend either
86    * {@link ArrayBasedCharEscaper} or {@link ArrayBasedUnicodeEscaper} to
87    * provide the desired behavior. However this builder is suitable for creating
88    * escapers that replace a relative small set of characters.
89    *
90    * @author David Beaumont
91    * @since 15.0
92    */
93   @Beta
94   public static final class Builder {
95     private final Map<Character, String> replacementMap =
96         new HashMap<Character, String>();
97     private char safeMin = Character.MIN_VALUE;
98     private char safeMax = Character.MAX_VALUE;
99     private String unsafeReplacement = null;
100 
101     // The constructor is exposed via the builder() method above.
Builder()102     private Builder() {}
103 
104     /**
105      * Sets the safe range of characters for the escaper. Characters in this
106      * range that have no explicit replacement are considered 'safe' and remain
107      * unescaped in the output. If {@code safeMax < safeMin} then the safe range
108      * is empty.
109      *
110      * @param safeMin the lowest 'safe' character
111      * @param safeMax the highest 'safe' character
112      * @return the builder instance
113      */
setSafeRange(char safeMin, char safeMax)114     public Builder setSafeRange(char safeMin, char safeMax) {
115       this.safeMin = safeMin;
116       this.safeMax = safeMax;
117       return this;
118     }
119 
120     /**
121      * Sets the replacement string for any characters outside the 'safe' range
122      * that have no explicit replacement. If {@code unsafeReplacement} is
123      * {@code null} then no replacement will occur, if it is {@code ""} then
124      * the unsafe characters are removed from the output.
125      *
126      * @param unsafeReplacement the string to replace unsafe chracters
127      * @return the builder instance
128      */
setUnsafeReplacement(@ullable String unsafeReplacement)129     public Builder setUnsafeReplacement(@Nullable String unsafeReplacement) {
130       this.unsafeReplacement = unsafeReplacement;
131       return this;
132     }
133 
134     /**
135      * Adds a replacement string for the given input character. The specified
136      * character will be replaced by the given string whenever it occurs in the
137      * input, irrespective of whether it lies inside or outside the 'safe'
138      * range.
139      *
140      * @param c the character to be replaced
141      * @param replacement the string to replace the given character
142      * @return the builder instance
143      * @throws NullPointerException if {@code replacement} is null
144      */
addEscape(char c, String replacement)145     public Builder addEscape(char c, String replacement) {
146       checkNotNull(replacement);
147       // This can replace an existing character (the builder is re-usable).
148       replacementMap.put(c, replacement);
149       return this;
150     }
151 
152     /**
153      * Returns a new escaper based on the current state of the builder.
154      */
build()155     public Escaper build() {
156       return new ArrayBasedCharEscaper(replacementMap, safeMin, safeMax) {
157         private final char[] replacementChars =
158             unsafeReplacement != null ? unsafeReplacement.toCharArray() : null;
159         @Override protected char[] escapeUnsafe(char c) {
160           return replacementChars;
161         }
162       };
163     }
164   }
165 
166   /**
167    * Returns a {@link UnicodeEscaper} equivalent to the given escaper instance.
168    * If the escaper is already a UnicodeEscaper then it is simply returned,
169    * otherwise it is wrapped in a UnicodeEscaper.
170    *
171    * <p>When a {@link CharEscaper} escaper is wrapped by this method it acquires
172    * extra behavior with respect to the well-formedness of Unicode character
173    * sequences and will throw {@link IllegalArgumentException} when given bad
174    * input.
175    *
176    * @param escaper the instance to be wrapped
177    * @return a UnicodeEscaper with the same behavior as the given instance
178    * @throws NullPointerException if escaper is null
179    * @throws IllegalArgumentException if escaper is not a UnicodeEscaper or a
180    *         CharEscaper
181    */
asUnicodeEscaper(Escaper escaper)182   static UnicodeEscaper asUnicodeEscaper(Escaper escaper) {
183     checkNotNull(escaper);
184     if (escaper instanceof UnicodeEscaper) {
185       return (UnicodeEscaper) escaper;
186     } else if (escaper instanceof CharEscaper) {
187       return wrap((CharEscaper) escaper);
188     }
189     // In practice this shouldn't happen because it would be very odd not to
190     // extend either CharEscaper or UnicodeEscaper for non trivial cases.
191     throw new IllegalArgumentException("Cannot create a UnicodeEscaper from: " +
192         escaper.getClass().getName());
193   }
194 
195   /**
196    * Returns a string that would replace the given character in the specified
197    * escaper, or {@code null} if no replacement should be made. This method is
198    * intended for use in tests through the {@code EscaperAsserts} class;
199    * production users of {@link CharEscaper} should limit themselves to its
200    * public interface.
201    *
202    * @param c the character to escape if necessary
203    * @return the replacement string, or {@code null} if no escaping was needed
204    */
computeReplacement(CharEscaper escaper, char c)205   public static String computeReplacement(CharEscaper escaper, char c) {
206     return stringOrNull(escaper.escape(c));
207   }
208 
209   /**
210    * Returns a string that would replace the given character in the specified
211    * escaper, or {@code null} if no replacement should be made. This method is
212    * intended for use in tests through the {@code EscaperAsserts} class;
213    * production users of {@link UnicodeEscaper} should limit themselves to its
214    * public interface.
215    *
216    * @param cp the Unicode code point to escape if necessary
217    * @return the replacement string, or {@code null} if no escaping was needed
218    */
computeReplacement(UnicodeEscaper escaper, int cp)219   public static String computeReplacement(UnicodeEscaper escaper, int cp) {
220     return stringOrNull(escaper.escape(cp));
221   }
222 
stringOrNull(char[] in)223   private static String stringOrNull(char[] in) {
224     return (in == null) ? null : new String(in);
225   }
226 
227   /** Private helper to wrap a CharEscaper as a UnicodeEscaper. */
wrap(final CharEscaper escaper)228   private static UnicodeEscaper wrap(final CharEscaper escaper) {
229     return new UnicodeEscaper() {
230       @Override protected char[] escape(int cp) {
231         // If a code point maps to a single character, just escape that.
232         if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
233           return escaper.escape((char) cp);
234         }
235         // Convert the code point to a surrogate pair and escape them both.
236         // Note: This code path is horribly slow and typically allocates 4 new
237         // char[] each time it is invoked. However this avoids any
238         // synchronization issues and makes the escaper thread safe.
239         char[] surrogateChars = new char[2];
240         Character.toChars(cp, surrogateChars, 0);
241         char[] hiChars = escaper.escape(surrogateChars[0]);
242         char[] loChars = escaper.escape(surrogateChars[1]);
243 
244         // If either hiChars or lowChars are non-null, the CharEscaper is trying
245         // to escape the characters of a surrogate pair separately. This is
246         // uncommon and applies only to escapers that assume UCS-2 rather than
247         // UTF-16. See: http://en.wikipedia.org/wiki/UTF-16/UCS-2
248         if (hiChars == null && loChars == null) {
249           // We expect this to be the common code path for most escapers.
250           return null;
251         }
252         // Combine the characters and/or escaped sequences into a single array.
253         int hiCount = hiChars != null ? hiChars.length : 1;
254         int loCount = loChars != null ? loChars.length : 1;
255         char[] output = new char[hiCount + loCount];
256         if (hiChars != null) {
257           // TODO: Is this faster than System.arraycopy() for small arrays?
258           for (int n = 0; n < hiChars.length; ++n) {
259             output[n] = hiChars[n];
260           }
261         } else {
262           output[0] = surrogateChars[0];
263         }
264         if (loChars != null) {
265           for (int n = 0; n < loChars.length; ++n) {
266             output[hiCount + n] = loChars[n];
267           }
268         } else {
269           output[hiCount] = surrogateChars[1];
270         }
271         return output;
272       }
273     };
274   }
275 }
276