1 use regex_automata::DFA;
2 
3 use ext_slice::ByteSlice;
4 use unicode::fsm::grapheme_break_fwd::GRAPHEME_BREAK_FWD;
5 use unicode::fsm::grapheme_break_rev::GRAPHEME_BREAK_REV;
6 use unicode::fsm::regional_indicator_rev::REGIONAL_INDICATOR_REV;
7 use utf8;
8 
9 /// An iterator over grapheme clusters in a byte string.
10 ///
11 /// This iterator is typically constructed by
12 /// [`ByteSlice::graphemes`](trait.ByteSlice.html#method.graphemes).
13 ///
14 /// Unicode defines a grapheme cluster as an *approximation* to a single user
15 /// visible character. A grapheme cluster, or just "grapheme," is made up of
16 /// one or more codepoints. For end user oriented tasks, one should generally
17 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
18 /// always yields one codepoint at a time.
19 ///
20 /// Since graphemes are made up of one or more codepoints, this iterator yields
21 /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
22 /// are [substituted](index.html#handling-of-invalid-utf-8).
23 ///
24 /// This iterator can be used in reverse. When reversed, exactly the same
25 /// set of grapheme clusters are yielded, but in reverse order.
26 ///
27 /// This iterator only yields *extended* grapheme clusters, in accordance with
28 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
29 #[derive(Clone, Debug)]
30 pub struct Graphemes<'a> {
31     bs: &'a [u8],
32 }
33 
34 impl<'a> Graphemes<'a> {
new(bs: &'a [u8]) -> Graphemes<'a>35     pub(crate) fn new(bs: &'a [u8]) -> Graphemes<'a> {
36         Graphemes { bs }
37     }
38 
39     /// View the underlying data as a subslice of the original data.
40     ///
41     /// The slice returned has the same lifetime as the original slice, and so
42     /// the iterator can continue to be used while this exists.
43     ///
44     /// # Examples
45     ///
46     /// ```
47     /// use bstr::ByteSlice;
48     ///
49     /// let mut it = b"abc".graphemes();
50     ///
51     /// assert_eq!(b"abc", it.as_bytes());
52     /// it.next();
53     /// assert_eq!(b"bc", it.as_bytes());
54     /// it.next();
55     /// it.next();
56     /// assert_eq!(b"", it.as_bytes());
57     /// ```
58     #[inline]
as_bytes(&self) -> &'a [u8]59     pub fn as_bytes(&self) -> &'a [u8] {
60         self.bs
61     }
62 }
63 
64 impl<'a> Iterator for Graphemes<'a> {
65     type Item = &'a str;
66 
67     #[inline]
next(&mut self) -> Option<&'a str>68     fn next(&mut self) -> Option<&'a str> {
69         let (grapheme, size) = decode_grapheme(self.bs);
70         if size == 0 {
71             return None;
72         }
73         self.bs = &self.bs[size..];
74         Some(grapheme)
75     }
76 }
77 
78 impl<'a> DoubleEndedIterator for Graphemes<'a> {
79     #[inline]
next_back(&mut self) -> Option<&'a str>80     fn next_back(&mut self) -> Option<&'a str> {
81         let (grapheme, size) = decode_last_grapheme(self.bs);
82         if size == 0 {
83             return None;
84         }
85         self.bs = &self.bs[..self.bs.len() - size];
86         Some(grapheme)
87     }
88 }
89 
90 /// An iterator over grapheme clusters in a byte string and their byte index
91 /// positions.
92 ///
93 /// This iterator is typically constructed by
94 /// [`ByteSlice::grapheme_indices`](trait.ByteSlice.html#method.grapheme_indices).
95 ///
96 /// Unicode defines a grapheme cluster as an *approximation* to a single user
97 /// visible character. A grapheme cluster, or just "grapheme," is made up of
98 /// one or more codepoints. For end user oriented tasks, one should generally
99 /// prefer using graphemes instead of [`Chars`](struct.Chars.html), which
100 /// always yields one codepoint at a time.
101 ///
102 /// Since graphemes are made up of one or more codepoints, this iterator
103 /// yields `&str` elements (along with their start and end byte offsets).
104 /// When invalid UTF-8 is encountered, replacement codepoints are
105 /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
106 /// indices yielded by this iterator may not correspond to the length of the
107 /// grapheme cluster yielded with those indices. For example, when this
108 /// iterator encounters `\xFF` in the byte string, then it will yield a pair
109 /// of indices ranging over a single byte, but will provide an `&str`
110 /// equivalent to `"\u{FFFD}"`, which is three bytes in length. However, when
111 /// given only valid UTF-8, then all indices are in exact correspondence with
112 /// their paired grapheme cluster.
113 ///
114 /// This iterator can be used in reverse. When reversed, exactly the same
115 /// set of grapheme clusters are yielded, but in reverse order.
116 ///
117 /// This iterator only yields *extended* grapheme clusters, in accordance with
118 /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Grapheme_Cluster_Boundaries).
119 #[derive(Clone, Debug)]
120 pub struct GraphemeIndices<'a> {
121     bs: &'a [u8],
122     forward_index: usize,
123     reverse_index: usize,
124 }
125 
126 impl<'a> GraphemeIndices<'a> {
new(bs: &'a [u8]) -> GraphemeIndices<'a>127     pub(crate) fn new(bs: &'a [u8]) -> GraphemeIndices<'a> {
128         GraphemeIndices { bs: bs, forward_index: 0, reverse_index: bs.len() }
129     }
130 
131     /// View the underlying data as a subslice of the original data.
132     ///
133     /// The slice returned has the same lifetime as the original slice, and so
134     /// the iterator can continue to be used while this exists.
135     ///
136     /// # Examples
137     ///
138     /// ```
139     /// use bstr::ByteSlice;
140     ///
141     /// let mut it = b"abc".grapheme_indices();
142     ///
143     /// assert_eq!(b"abc", it.as_bytes());
144     /// it.next();
145     /// assert_eq!(b"bc", it.as_bytes());
146     /// it.next();
147     /// it.next();
148     /// assert_eq!(b"", it.as_bytes());
149     /// ```
150     #[inline]
as_bytes(&self) -> &'a [u8]151     pub fn as_bytes(&self) -> &'a [u8] {
152         self.bs
153     }
154 }
155 
156 impl<'a> Iterator for GraphemeIndices<'a> {
157     type Item = (usize, usize, &'a str);
158 
159     #[inline]
next(&mut self) -> Option<(usize, usize, &'a str)>160     fn next(&mut self) -> Option<(usize, usize, &'a str)> {
161         let index = self.forward_index;
162         let (grapheme, size) = decode_grapheme(self.bs);
163         if size == 0 {
164             return None;
165         }
166         self.bs = &self.bs[size..];
167         self.forward_index += size;
168         Some((index, index + size, grapheme))
169     }
170 }
171 
172 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
173     #[inline]
next_back(&mut self) -> Option<(usize, usize, &'a str)>174     fn next_back(&mut self) -> Option<(usize, usize, &'a str)> {
175         let (grapheme, size) = decode_last_grapheme(self.bs);
176         if size == 0 {
177             return None;
178         }
179         self.bs = &self.bs[..self.bs.len() - size];
180         self.reverse_index -= size;
181         Some((self.reverse_index, self.reverse_index + size, grapheme))
182     }
183 }
184 
185 /// Decode a grapheme from the given byte string.
186 ///
187 /// This returns the resulting grapheme (which may be a Unicode replacement
188 /// codepoint if invalid UTF-8 was found), along with the number of bytes
189 /// decoded in the byte string. The number of bytes decoded may not be the
190 /// same as the length of grapheme in the case where invalid UTF-8 is found.
decode_grapheme(bs: &[u8]) -> (&str, usize)191 pub fn decode_grapheme(bs: &[u8]) -> (&str, usize) {
192     if bs.is_empty() {
193         ("", 0)
194     } else if let Some(end) = GRAPHEME_BREAK_FWD.find(bs) {
195         // Safe because a match can only occur for valid UTF-8.
196         let grapheme = unsafe { bs[..end].to_str_unchecked() };
197         (grapheme, grapheme.len())
198     } else {
199         const INVALID: &'static str = "\u{FFFD}";
200         // No match on non-empty bytes implies we found invalid UTF-8.
201         let (_, size) = utf8::decode_lossy(bs);
202         (INVALID, size)
203     }
204 }
205 
decode_last_grapheme(bs: &[u8]) -> (&str, usize)206 fn decode_last_grapheme(bs: &[u8]) -> (&str, usize) {
207     if bs.is_empty() {
208         ("", 0)
209     } else if let Some(mut start) = GRAPHEME_BREAK_REV.rfind(bs) {
210         start = adjust_rev_for_regional_indicator(bs, start);
211         // Safe because a match can only occur for valid UTF-8.
212         let grapheme = unsafe { bs[start..].to_str_unchecked() };
213         (grapheme, grapheme.len())
214     } else {
215         const INVALID: &'static str = "\u{FFFD}";
216         // No match on non-empty bytes implies we found invalid UTF-8.
217         let (_, size) = utf8::decode_last_lossy(bs);
218         (INVALID, size)
219     }
220 }
221 
222 /// Return the correct offset for the next grapheme decoded at the end of the
223 /// given byte string, where `i` is the initial guess. In particular,
224 /// `&bs[i..]` represents the candidate grapheme.
225 ///
226 /// `i` is returned by this function in all cases except when `&bs[i..]` is
227 /// a pair of regional indicator codepoints. In that case, if an odd number of
228 /// additional regional indicator codepoints precedes `i`, then `i` is
229 /// adjusted such that it points to only a single regional indicator.
230 ///
231 /// This "fixing" is necessary to handle the requirement that a break cannot
232 /// occur between regional indicators where it would cause an odd number of
233 /// regional indicators to exist before the break from the *start* of the
234 /// string. A reverse regex cannot detect this case easily without look-around.
adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize235 fn adjust_rev_for_regional_indicator(mut bs: &[u8], i: usize) -> usize {
236     // All regional indicators use a 4 byte encoding, and we only care about
237     // the case where we found a pair of regional indicators.
238     if bs.len() - i != 8 {
239         return i;
240     }
241     // Count all contiguous occurrences of regional indicators. If there's an
242     // even number of them, then we can accept the pair we found. Otherwise,
243     // we can only take one of them.
244     //
245     // FIXME: This is quadratic in the worst case, e.g., a string of just
246     // regional indicator codepoints. A fix probably requires refactoring this
247     // code a bit such that we don't rescan regional indicators.
248     let mut count = 0;
249     while let Some(start) = REGIONAL_INDICATOR_REV.rfind(bs) {
250         bs = &bs[..start];
251         count += 1;
252     }
253     if count % 2 == 0 {
254         i
255     } else {
256         i + 4
257     }
258 }
259 
260 #[cfg(test)]
261 mod tests {
262     use ucd_parse::GraphemeClusterBreakTest;
263 
264     use super::*;
265     use ext_slice::ByteSlice;
266     use tests::LOSSY_TESTS;
267 
268     #[test]
forward_ucd()269     fn forward_ucd() {
270         for (i, test) in ucdtests().into_iter().enumerate() {
271             let given = test.grapheme_clusters.concat();
272             let got: Vec<String> = Graphemes::new(given.as_bytes())
273                 .map(|cluster| cluster.to_string())
274                 .collect();
275             assert_eq!(
276                 test.grapheme_clusters,
277                 got,
278                 "\ngrapheme forward break test {} failed:\n\
279                  given:    {:?}\n\
280                  expected: {:?}\n\
281                  got:      {:?}\n",
282                 i,
283                 uniescape(&given),
284                 uniescape_vec(&test.grapheme_clusters),
285                 uniescape_vec(&got),
286             );
287         }
288     }
289 
290     #[test]
reverse_ucd()291     fn reverse_ucd() {
292         for (i, test) in ucdtests().into_iter().enumerate() {
293             let given = test.grapheme_clusters.concat();
294             let mut got: Vec<String> = Graphemes::new(given.as_bytes())
295                 .rev()
296                 .map(|cluster| cluster.to_string())
297                 .collect();
298             got.reverse();
299             assert_eq!(
300                 test.grapheme_clusters,
301                 got,
302                 "\n\ngrapheme reverse break test {} failed:\n\
303                  given:    {:?}\n\
304                  expected: {:?}\n\
305                  got:      {:?}\n",
306                 i,
307                 uniescape(&given),
308                 uniescape_vec(&test.grapheme_clusters),
309                 uniescape_vec(&got),
310             );
311         }
312     }
313 
314     #[test]
forward_lossy()315     fn forward_lossy() {
316         for &(expected, input) in LOSSY_TESTS {
317             let got = Graphemes::new(input.as_bytes()).collect::<String>();
318             assert_eq!(expected, got);
319         }
320     }
321 
322     #[test]
reverse_lossy()323     fn reverse_lossy() {
324         for &(expected, input) in LOSSY_TESTS {
325             let expected: String = expected.chars().rev().collect();
326             let got =
327                 Graphemes::new(input.as_bytes()).rev().collect::<String>();
328             assert_eq!(expected, got);
329         }
330     }
331 
uniescape(s: &str) -> String332     fn uniescape(s: &str) -> String {
333         s.chars().flat_map(|c| c.escape_unicode()).collect::<String>()
334     }
335 
uniescape_vec(strs: &[String]) -> Vec<String>336     fn uniescape_vec(strs: &[String]) -> Vec<String> {
337         strs.iter().map(|s| uniescape(s)).collect()
338     }
339 
340     /// Return all of the UCD for grapheme breaks.
ucdtests() -> Vec<GraphemeClusterBreakTest>341     fn ucdtests() -> Vec<GraphemeClusterBreakTest> {
342         const TESTDATA: &'static str =
343             include_str!("data/GraphemeBreakTest.txt");
344 
345         let mut tests = vec![];
346         for mut line in TESTDATA.lines() {
347             line = line.trim();
348             if line.starts_with("#") || line.contains("surrogate") {
349                 continue;
350             }
351             tests.push(line.parse().unwrap());
352         }
353         tests
354     }
355 }
356