1 use core::fmt;
2 
3 /// A representation of byte oriented equivalence classes.
4 ///
5 /// This is used in a DFA to reduce the size of the transition table. This can
6 /// have a particularly large impact not only on the total size of a dense DFA,
7 /// but also on compile times.
8 #[derive(Clone, Copy)]
9 pub struct ByteClasses([u8; 256]);
10 
11 impl ByteClasses {
12     /// Creates a new set of equivalence classes where all bytes are mapped to
13     /// the same class.
empty() -> ByteClasses14     pub fn empty() -> ByteClasses {
15         ByteClasses([0; 256])
16     }
17 
18     /// Creates a new set of equivalence classes where each byte belongs to
19     /// its own equivalence class.
singletons() -> ByteClasses20     pub fn singletons() -> ByteClasses {
21         let mut classes = ByteClasses::empty();
22         for i in 0..256 {
23             classes.set(i as u8, i as u8);
24         }
25         classes
26     }
27 
28     /// Copies the byte classes given. The given slice must have length 0 or
29     /// length 256. Slices of length 0 are treated as singletons (every byte
30     /// is its own class).
from_slice(slice: &[u8]) -> ByteClasses31     pub fn from_slice(slice: &[u8]) -> ByteClasses {
32         assert!(slice.is_empty() || slice.len() == 256);
33 
34         if slice.is_empty() {
35             ByteClasses::singletons()
36         } else {
37             let mut classes = ByteClasses::empty();
38             for (b, &class) in slice.iter().enumerate() {
39                 classes.set(b as u8, class);
40             }
41             classes
42         }
43     }
44 
45     /// Set the equivalence class for the given byte.
46     #[inline]
set(&mut self, byte: u8, class: u8)47     pub fn set(&mut self, byte: u8, class: u8) {
48         self.0[byte as usize] = class;
49     }
50 
51     /// Get the equivalence class for the given byte.
52     #[inline]
get(&self, byte: u8) -> u853     pub fn get(&self, byte: u8) -> u8 {
54         self.0[byte as usize]
55     }
56 
57     /// Get the equivalence class for the given byte while forcefully
58     /// eliding bounds checks.
59     #[inline]
get_unchecked(&self, byte: u8) -> u860     pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
61         *self.0.get_unchecked(byte as usize)
62     }
63 
64     /// Return the total number of elements in the alphabet represented by
65     /// these equivalence classes. Equivalently, this returns the total number
66     /// of equivalence classes.
67     #[inline]
alphabet_len(&self) -> usize68     pub fn alphabet_len(&self) -> usize {
69         self.0[255] as usize + 1
70     }
71 
72     /// Returns true if and only if every byte in this class maps to its own
73     /// equivalence class. Equivalently, there are 256 equivalence classes
74     /// and each class contains exactly one byte.
75     #[inline]
is_singleton(&self) -> bool76     pub fn is_singleton(&self) -> bool {
77         self.alphabet_len() == 256
78     }
79 
80     /// Returns an iterator over a sequence of representative bytes from each
81     /// equivalence class. Namely, this yields exactly N items, where N is
82     /// equivalent to the number of equivalence classes. Each item is an
83     /// arbitrary byte drawn from each equivalence class.
84     ///
85     /// This is useful when one is determinizing an NFA and the NFA's alphabet
86     /// hasn't been converted to equivalence classes yet. Picking an arbitrary
87     /// byte from each equivalence class then permits a full exploration of
88     /// the NFA instead of using every possible byte value.
89     #[cfg(feature = "std")]
representatives(&self) -> ByteClassRepresentatives90     pub fn representatives(&self) -> ByteClassRepresentatives {
91         ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
92     }
93 
94     /// Returns all of the bytes in the given equivalence class.
95     ///
96     /// The second element in the tuple indicates the number of elements in
97     /// the array.
elements(&self, equiv: u8) -> ([u8; 256], usize)98     fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
99         let (mut array, mut len) = ([0; 256], 0);
100         for b in 0..256 {
101             if self.get(b as u8) == equiv {
102                 array[len] = b as u8;
103                 len += 1;
104             }
105         }
106         (array, len)
107     }
108 }
109 
110 impl fmt::Debug for ByteClasses {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result111     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
112         if self.is_singleton() {
113             write!(f, "ByteClasses({{singletons}})")
114         } else {
115             write!(f, "ByteClasses(")?;
116             for equiv in 0..self.alphabet_len() {
117                 let (members, len) = self.elements(equiv as u8);
118                 write!(f, "{} => {:?}", equiv, &members[..len])?;
119             }
120             write!(f, ")")
121         }
122     }
123 }
124 
125 /// An iterator over representative bytes from each equivalence class.
126 #[cfg(feature = "std")]
127 #[derive(Debug)]
128 pub struct ByteClassRepresentatives<'a> {
129     classes: &'a ByteClasses,
130     byte: usize,
131     last_class: Option<u8>,
132 }
133 
134 #[cfg(feature = "std")]
135 impl<'a> Iterator for ByteClassRepresentatives<'a> {
136     type Item = u8;
137 
next(&mut self) -> Option<u8>138     fn next(&mut self) -> Option<u8> {
139         while self.byte < 256 {
140             let byte = self.byte as u8;
141             let class = self.classes.get(byte);
142             self.byte += 1;
143 
144             if self.last_class != Some(class) {
145                 self.last_class = Some(class);
146                 return Some(byte);
147             }
148         }
149         None
150     }
151 }
152 
153 /// A byte class set keeps track of an *approximation* of equivalence classes
154 /// of bytes during NFA construction. That is, every byte in an equivalence
155 /// class cannot discriminate between a match and a non-match.
156 ///
157 /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
158 /// same equivalence class because it never matters whether an `a` or a `b` is
159 /// seen, and no combination of `a`s and `b`s in the text can discriminate
160 /// a match.
161 ///
162 /// Note though that this does not compute the minimal set of equivalence
163 /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
164 /// same equivalence class for the same reason that `a` and `b` are in the
165 /// same equivalence class in the aforementioned regex. However, in this
166 /// implementation, `a` and `c` are put into distinct equivalence classes.
167 /// The reason for this is implementation complexity. In the future, we should
168 /// endeavor to compute the minimal equivalence classes since they can have a
169 /// rather large impact on the size of the DFA.
170 ///
171 /// The representation here is 256 booleans, all initially set to false. Each
172 /// boolean maps to its corresponding byte based on position. A `true` value
173 /// indicates the end of an equivalence class, where its corresponding byte
174 /// and all of the bytes corresponding to all previous contiguous `false`
175 /// values are in the same equivalence class.
176 ///
177 /// This particular representation only permits contiguous ranges of bytes to
178 /// be in the same equivalence class, which means that we can never discover
179 /// the true minimal set of equivalence classes.
180 #[cfg(feature = "std")]
181 #[derive(Debug)]
182 pub struct ByteClassSet(Vec<bool>);
183 
184 #[cfg(feature = "std")]
185 impl ByteClassSet {
186     /// Create a new set of byte classes where all bytes are part of the same
187     /// equivalence class.
new() -> Self188     pub fn new() -> Self {
189         ByteClassSet(vec![false; 256])
190     }
191 
192     /// Indicate the the range of byte given (inclusive) can discriminate a
193     /// match between it and all other bytes outside of the range.
set_range(&mut self, start: u8, end: u8)194     pub fn set_range(&mut self, start: u8, end: u8) {
195         debug_assert!(start <= end);
196         if start > 0 {
197             self.0[start as usize - 1] = true;
198         }
199         self.0[end as usize] = true;
200     }
201 
202     /// Convert this boolean set to a map that maps all byte values to their
203     /// corresponding equivalence class. The last mapping indicates the largest
204     /// equivalence class identifier (which is never bigger than 255).
byte_classes(&self) -> ByteClasses205     pub fn byte_classes(&self) -> ByteClasses {
206         let mut classes = ByteClasses::empty();
207         let mut class = 0u8;
208         let mut i = 0;
209         loop {
210             classes.set(i as u8, class as u8);
211             if i >= 255 {
212                 break;
213             }
214             if self.0[i] {
215                 class = class.checked_add(1).unwrap();
216             }
217             i += 1;
218         }
219         classes
220     }
221 }
222 
223 #[cfg(test)]
224 mod tests {
225     #[cfg(feature = "std")]
226     #[test]
byte_classes()227     fn byte_classes() {
228         use super::ByteClassSet;
229 
230         let mut set = ByteClassSet::new();
231         set.set_range(b'a', b'z');
232 
233         let classes = set.byte_classes();
234         assert_eq!(classes.get(0), 0);
235         assert_eq!(classes.get(1), 0);
236         assert_eq!(classes.get(2), 0);
237         assert_eq!(classes.get(b'a' - 1), 0);
238         assert_eq!(classes.get(b'a'), 1);
239         assert_eq!(classes.get(b'm'), 1);
240         assert_eq!(classes.get(b'z'), 1);
241         assert_eq!(classes.get(b'z' + 1), 2);
242         assert_eq!(classes.get(254), 2);
243         assert_eq!(classes.get(255), 2);
244 
245         let mut set = ByteClassSet::new();
246         set.set_range(0, 2);
247         set.set_range(4, 6);
248         let classes = set.byte_classes();
249         assert_eq!(classes.get(0), 0);
250         assert_eq!(classes.get(1), 0);
251         assert_eq!(classes.get(2), 0);
252         assert_eq!(classes.get(3), 1);
253         assert_eq!(classes.get(4), 2);
254         assert_eq!(classes.get(5), 2);
255         assert_eq!(classes.get(6), 2);
256         assert_eq!(classes.get(7), 3);
257         assert_eq!(classes.get(255), 3);
258     }
259 
260     #[cfg(feature = "std")]
261     #[test]
full_byte_classes()262     fn full_byte_classes() {
263         use super::ByteClassSet;
264 
265         let mut set = ByteClassSet::new();
266         for i in 0..256u16 {
267             set.set_range(i as u8, i as u8);
268         }
269         assert_eq!(set.byte_classes().alphabet_len(), 256);
270     }
271 }
272