1 use core::fmt; 2 3 /// A representation of byte oriented equivalence classes. 4 /// 5 /// This is used in a DFA to reduce the size of the transition table. This can 6 /// have a particularly large impact not only on the total size of a dense DFA, 7 /// but also on compile times. 8 #[derive(Clone, Copy)] 9 pub struct ByteClasses([u8; 256]); 10 11 impl ByteClasses { 12 /// Creates a new set of equivalence classes where all bytes are mapped to 13 /// the same class. empty() -> ByteClasses14 pub fn empty() -> ByteClasses { 15 ByteClasses([0; 256]) 16 } 17 18 /// Creates a new set of equivalence classes where each byte belongs to 19 /// its own equivalence class. singletons() -> ByteClasses20 pub fn singletons() -> ByteClasses { 21 let mut classes = ByteClasses::empty(); 22 for i in 0..256 { 23 classes.set(i as u8, i as u8); 24 } 25 classes 26 } 27 28 /// Copies the byte classes given. The given slice must have length 0 or 29 /// length 256. Slices of length 0 are treated as singletons (every byte 30 /// is its own class). from_slice(slice: &[u8]) -> ByteClasses31 pub fn from_slice(slice: &[u8]) -> ByteClasses { 32 assert!(slice.is_empty() || slice.len() == 256); 33 34 if slice.is_empty() { 35 ByteClasses::singletons() 36 } else { 37 let mut classes = ByteClasses::empty(); 38 for (b, &class) in slice.iter().enumerate() { 39 classes.set(b as u8, class); 40 } 41 classes 42 } 43 } 44 45 /// Set the equivalence class for the given byte. 46 #[inline] set(&mut self, byte: u8, class: u8)47 pub fn set(&mut self, byte: u8, class: u8) { 48 self.0[byte as usize] = class; 49 } 50 51 /// Get the equivalence class for the given byte. 52 #[inline] get(&self, byte: u8) -> u853 pub fn get(&self, byte: u8) -> u8 { 54 self.0[byte as usize] 55 } 56 57 /// Get the equivalence class for the given byte while forcefully 58 /// eliding bounds checks. 59 #[inline] get_unchecked(&self, byte: u8) -> u860 pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { 61 *self.0.get_unchecked(byte as usize) 62 } 63 64 /// Return the total number of elements in the alphabet represented by 65 /// these equivalence classes. Equivalently, this returns the total number 66 /// of equivalence classes. 67 #[inline] alphabet_len(&self) -> usize68 pub fn alphabet_len(&self) -> usize { 69 self.0[255] as usize + 1 70 } 71 72 /// Returns true if and only if every byte in this class maps to its own 73 /// equivalence class. Equivalently, there are 256 equivalence classes 74 /// and each class contains exactly one byte. 75 #[inline] is_singleton(&self) -> bool76 pub fn is_singleton(&self) -> bool { 77 self.alphabet_len() == 256 78 } 79 80 /// Returns an iterator over a sequence of representative bytes from each 81 /// equivalence class. Namely, this yields exactly N items, where N is 82 /// equivalent to the number of equivalence classes. Each item is an 83 /// arbitrary byte drawn from each equivalence class. 84 /// 85 /// This is useful when one is determinizing an NFA and the NFA's alphabet 86 /// hasn't been converted to equivalence classes yet. Picking an arbitrary 87 /// byte from each equivalence class then permits a full exploration of 88 /// the NFA instead of using every possible byte value. 89 #[cfg(feature = "std")] representatives(&self) -> ByteClassRepresentatives90 pub fn representatives(&self) -> ByteClassRepresentatives { 91 ByteClassRepresentatives { classes: self, byte: 0, last_class: None } 92 } 93 94 /// Returns all of the bytes in the given equivalence class. 95 /// 96 /// The second element in the tuple indicates the number of elements in 97 /// the array. elements(&self, equiv: u8) -> ([u8; 256], usize)98 fn elements(&self, equiv: u8) -> ([u8; 256], usize) { 99 let (mut array, mut len) = ([0; 256], 0); 100 for b in 0..256 { 101 if self.get(b as u8) == equiv { 102 array[len] = b as u8; 103 len += 1; 104 } 105 } 106 (array, len) 107 } 108 } 109 110 impl fmt::Debug for ByteClasses { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result111 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 112 if self.is_singleton() { 113 write!(f, "ByteClasses({{singletons}})") 114 } else { 115 write!(f, "ByteClasses(")?; 116 for equiv in 0..self.alphabet_len() { 117 let (members, len) = self.elements(equiv as u8); 118 write!(f, "{} => {:?}", equiv, &members[..len])?; 119 } 120 write!(f, ")") 121 } 122 } 123 } 124 125 /// An iterator over representative bytes from each equivalence class. 126 #[cfg(feature = "std")] 127 #[derive(Debug)] 128 pub struct ByteClassRepresentatives<'a> { 129 classes: &'a ByteClasses, 130 byte: usize, 131 last_class: Option<u8>, 132 } 133 134 #[cfg(feature = "std")] 135 impl<'a> Iterator for ByteClassRepresentatives<'a> { 136 type Item = u8; 137 next(&mut self) -> Option<u8>138 fn next(&mut self) -> Option<u8> { 139 while self.byte < 256 { 140 let byte = self.byte as u8; 141 let class = self.classes.get(byte); 142 self.byte += 1; 143 144 if self.last_class != Some(class) { 145 self.last_class = Some(class); 146 return Some(byte); 147 } 148 } 149 None 150 } 151 } 152 153 /// A byte class set keeps track of an *approximation* of equivalence classes 154 /// of bytes during NFA construction. That is, every byte in an equivalence 155 /// class cannot discriminate between a match and a non-match. 156 /// 157 /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the 158 /// same equivalence class because it never matters whether an `a` or a `b` is 159 /// seen, and no combination of `a`s and `b`s in the text can discriminate 160 /// a match. 161 /// 162 /// Note though that this does not compute the minimal set of equivalence 163 /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the 164 /// same equivalence class for the same reason that `a` and `b` are in the 165 /// same equivalence class in the aforementioned regex. However, in this 166 /// implementation, `a` and `c` are put into distinct equivalence classes. 167 /// The reason for this is implementation complexity. In the future, we should 168 /// endeavor to compute the minimal equivalence classes since they can have a 169 /// rather large impact on the size of the DFA. 170 /// 171 /// The representation here is 256 booleans, all initially set to false. Each 172 /// boolean maps to its corresponding byte based on position. A `true` value 173 /// indicates the end of an equivalence class, where its corresponding byte 174 /// and all of the bytes corresponding to all previous contiguous `false` 175 /// values are in the same equivalence class. 176 /// 177 /// This particular representation only permits contiguous ranges of bytes to 178 /// be in the same equivalence class, which means that we can never discover 179 /// the true minimal set of equivalence classes. 180 #[cfg(feature = "std")] 181 #[derive(Debug)] 182 pub struct ByteClassSet(Vec<bool>); 183 184 #[cfg(feature = "std")] 185 impl ByteClassSet { 186 /// Create a new set of byte classes where all bytes are part of the same 187 /// equivalence class. new() -> Self188 pub fn new() -> Self { 189 ByteClassSet(vec![false; 256]) 190 } 191 192 /// Indicate the the range of byte given (inclusive) can discriminate a 193 /// match between it and all other bytes outside of the range. set_range(&mut self, start: u8, end: u8)194 pub fn set_range(&mut self, start: u8, end: u8) { 195 debug_assert!(start <= end); 196 if start > 0 { 197 self.0[start as usize - 1] = true; 198 } 199 self.0[end as usize] = true; 200 } 201 202 /// Convert this boolean set to a map that maps all byte values to their 203 /// corresponding equivalence class. The last mapping indicates the largest 204 /// equivalence class identifier (which is never bigger than 255). byte_classes(&self) -> ByteClasses205 pub fn byte_classes(&self) -> ByteClasses { 206 let mut classes = ByteClasses::empty(); 207 let mut class = 0u8; 208 let mut i = 0; 209 loop { 210 classes.set(i as u8, class as u8); 211 if i >= 255 { 212 break; 213 } 214 if self.0[i] { 215 class = class.checked_add(1).unwrap(); 216 } 217 i += 1; 218 } 219 classes 220 } 221 } 222 223 #[cfg(test)] 224 mod tests { 225 #[cfg(feature = "std")] 226 #[test] byte_classes()227 fn byte_classes() { 228 use super::ByteClassSet; 229 230 let mut set = ByteClassSet::new(); 231 set.set_range(b'a', b'z'); 232 233 let classes = set.byte_classes(); 234 assert_eq!(classes.get(0), 0); 235 assert_eq!(classes.get(1), 0); 236 assert_eq!(classes.get(2), 0); 237 assert_eq!(classes.get(b'a' - 1), 0); 238 assert_eq!(classes.get(b'a'), 1); 239 assert_eq!(classes.get(b'm'), 1); 240 assert_eq!(classes.get(b'z'), 1); 241 assert_eq!(classes.get(b'z' + 1), 2); 242 assert_eq!(classes.get(254), 2); 243 assert_eq!(classes.get(255), 2); 244 245 let mut set = ByteClassSet::new(); 246 set.set_range(0, 2); 247 set.set_range(4, 6); 248 let classes = set.byte_classes(); 249 assert_eq!(classes.get(0), 0); 250 assert_eq!(classes.get(1), 0); 251 assert_eq!(classes.get(2), 0); 252 assert_eq!(classes.get(3), 1); 253 assert_eq!(classes.get(4), 2); 254 assert_eq!(classes.get(5), 2); 255 assert_eq!(classes.get(6), 2); 256 assert_eq!(classes.get(7), 3); 257 assert_eq!(classes.get(255), 3); 258 } 259 260 #[cfg(feature = "std")] 261 #[test] full_byte_classes()262 fn full_byte_classes() { 263 use super::ByteClassSet; 264 265 let mut set = ByteClassSet::new(); 266 for i in 0..256u16 { 267 set.set_range(i as u8, i as u8); 268 } 269 assert_eq!(set.byte_classes().alphabet_len(), 256); 270 } 271 } 272