/*
 * Copyright 2024 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

use std::cmp;

/// An implementation of hyphenation for Android.
///
/// The hyphenation in Android is done in two steps: first, the Knuth-Liang hyphenation algorithm
/// is run to populate all possible hyphenation points. Then, the hyphenation type is resolved
/// from the scripts and locales.
///
/// The Knuth-Liang hyphenation works as follows:
/// The Knuth-Liang hyphenation uses two dictionaries: a pattern dictionary and exception files.
/// Files ending with ".hyp.txt" are exception files and files ending with ".pat.txt" are pattern
/// files. If the word is in the exception file, the hyphenation is performed exactly as it is
/// specified in the exception file.
///
/// Then, if the word is not in the exception file, the Knuth-Liang hyphenation is performed with
/// the hyphenation pattern dictionary. The hyphenation pattern dictionary is a list of sub-words
/// with hyphenation levels given as numbers. A level value is assigned between every pair of
/// adjacent letters, as well as before the first letter and after the last letter. If the value
/// is an odd number, the position is a hyphenation point. If the value is an even number, the
/// position is not a hyphenation point. In the pattern file the 0 is dropped, so the meaning of
/// "re4ti4z" is level values "004040" for sub-word "retiz". The hyphenation is performed by
/// iterating over all patterns and assigning level values to the possible break points. If a
/// break point can be assigned from multiple patterns, the maximum value is used. If none of the
/// patterns matches the break point, the level is zero, therefore do not break. And finally the
/// odd numbered positions are the break points.
///
/// Here is an example of how "hyphenation" is broken into "hy-phen-ation".
/// The relevant patterns in the pattern dictionary are
/// - hy3ph
/// - he2n
/// - hena4
/// - hen5at
/// - 1na
/// - n2at
/// - 1tio
/// - 2io
/// - o2n
///
/// Then, when these patterns are applied to the word "hyphenation", it becomes like
///
/// ```text
///  h y p h e n a t i o n
/// 0 0 3 0 0               : hy3ph
///       0 0 2 0           : he2n
///       0 0 0 0 4         : hena4
///       0 0 0 5 0 0       : hen5at
///           1 0 0         : 1na
///           0 2 0 0       : n2at
///               1 0 0 0   : 1tio
///                 2 0 0   : 2io
///                   0 2 0 : o2n
/// ---------------------------------
/// 0 0 3 0 0 2 5 4 2 0 2 0 : max
/// ```
///
/// Then, the odd-numbered break points are the break points where hyphenation is allowed, so the
/// result is "hy-phen-ation".
///
/// In the Android implementation, the hyphenation pattern file is preprocessed into a Trie at
/// build time. For the detailed file format, see comments of HyphenationData struct.
///
/// Once all possible hyphenation break points are collected, the hyphenation break type is
/// determined based on the script and locale. For example, in case of Arabic, the letter form
/// should not be changed by hyphenation, so ZWJ can be inserted before and after the hyphen
/// letter.
const CHAR_SOFT_HYPHEN: u16 = 0x00AD;
const CHAR_MIDDLE_DOT: u16 = 0x00B7;
const CHAR_HYPHEN_MINUS: u16 = 0x002D;
const CHAR_HYPHEN: u16 = 0x2010;

// The following U_JT_* constants must be same to the ones defined in
// frameworks/minikin/lib/minikin/ffi/IciBridge.h
// TODO: Replace with ICU4X once it becomes available in Android.
const U_JT_NON_JOINING: u8 = 0; const U_JT_DUAL_JOINING: u8 = 1; const U_JT_RIGHT_JOINING: u8 = 2; const U_JT_LEFT_JOINING: u8 = 3; const U_JT_JOIN_CAUSING: u8 = 4; const U_JT_TRANSPARENT: u8 = 5; // The following USCRIPT_* constants must be same to the ones defined in // frameworks/minikin/lib/minikin/ffi/IciBridge.h // TODO: Replace with ICU4X once it becomes available in Android. const USCRIPT_LATIN: u8 = 0; const USCRIPT_ARABIC: u8 = 1; const USCRIPT_KANNADA: u8 = 2; const USCRIPT_MALAYALAM: u8 = 3; const USCRIPT_TAMIL: u8 = 4; const USCRIPT_TELUGU: u8 = 5; const USCRIPT_ARMENIAN: u8 = 6; const USCRIPT_CANADIAN_ABORIGINAL: u8 = 7; use crate::ffi::getJoiningType; use crate::ffi::getScript; /// Hyphenation types /// The following values must be equal to the ones in /// frameworks/minikin/include/minikin/Hyphenator.h #[repr(u8)] #[derive(PartialEq, Copy, Clone)] pub enum HyphenationType { /// Do not break. DontBreak = 0, /// Break the line and insert a normal hyphen. BreakAndInsertHyphen = 1, /// Break the line and insert an Armenian hyphen (U+058A). BreakAndInsertArmenianHyphen = 2, /// Break the line and insert a Canadian Syllabics hyphen (U+1400). BreakAndInsertUcasHyphen = 4, /// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen /// present or the script does not use a hyphen (e.g. in Malayalam). BreakAndDontInsertHyphen = 5, /// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates /// as "l-/l". BreakAndReplaceWithHyphen = 6, /// Break the line, and repeat the hyphen (which is the last character) at the beginning of the /// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as /// "czerwono-/-niebieska") and Slovenian. BreakAndInsertHyphenAtNextLine = 7, /// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. /// This is used in Arabic script, mostly for writing systems of Central Asia. 
It's our default /// behavior when a soft hyphen is used in Arabic script. BreakAndInsertHyphenAndZwj = 8, } /// Hyphenation locale #[repr(u8)] #[derive(PartialEq, Copy, Clone)] pub enum HyphenationLocale { /// Other locale Other = 0, /// Catalan Catalan = 1, /// Polish Polish = 2, /// Slovenian Slovenian = 3, } const MAX_HYPHEN_SIZE: u32 = 64; struct HyphenationData<'a> { bytes: &'a [u8], } /// The Hyphenation pattern file is encoded into binary format during build time. /// The hyphenation pattern file is encoded into three objects: AlphabetTable, Trie, Patterns. /// /// First, to avoid high value of utf16 char values in Trie object, char values are mapped to /// internal alphabet codes. The AlphabetTable0 and AndroidTable1 has a map from utf16 char values /// to internal u16 alphabet codes. The AlphabetTable0 is used if the min and max used code points /// has less than 1024, i.e. max_codepoint - min_codepoint < 1024. The AlphabetTable1 is used /// otherwise. /// /// Then, the pattern file is encoded with Trie and Pattern object with using internal /// alphabet code. For example, in case of the entry "ef5i5nite", the hyphenation score "00550000" /// is stored in the Pattern object and the subword "efinite" is stored in the Trie object. /// /// The Trie object is encoded as one dimensional u32 arrays. Each u32 integer contains packed /// index to the Pattern object, index to the next node entry and alphabet code. /// Trie Entry: /// 0 1 2 3 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// | index to pattern data | index to the next node | code | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// Note: the layout is as an example of pattern_shift = 19, link_shift = 5. /// /// The Pattern object is encoded into two data: entry list and data payload. 
The entry is a packed /// u32 integer that contains length of the pattern, an amount of shift of the pattern index and /// an offset from the payload head. /// /// Pattern Entry: /// 0 1 2 3 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// |len of pat | pat shift | offset to the pattern data | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// /// The pattern data and related information can be obtained as follows: /// /// // Pattern information /// let entry = pattern[16 + pattern_index * 4] // read u32 as little endian. /// let pattern_length = entry >> 26 /// let pattern_shift = (entry > 20) & 0x3f /// /// // Pattern value retrieval: i-th offset in the word. /// let pattern_offset = pattern[8] // read u32 as little endian. /// let pattern_value = pattern[pattern_offset + (entry & 0xfffff) + i] impl<'a> HyphenationData<'a> { pub const fn new(bytes: &'a [u8]) -> Self { HyphenationData { bytes } } pub fn read_u32(&self, offset: u32) -> u32 { let usize_offset = offset as usize; self.bytes .get(usize_offset..usize_offset + 4) .map(|x: &[u8]| u32::from_le_bytes(x.try_into().unwrap())) .unwrap() } } /// Header struct of the hyphenation pattern file. /// The object layout follows: /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// | magic | version |alphabet offset| trie offset | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// |pattern offset | file size | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ pub struct Header<'a> { data: HyphenationData<'a>, } /// Alphabet Table version 0 struct of the hyphenation pattern file. 
/// The object layout follows:
/// ```text
///   0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   (bytes)
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// |    version    | min codepoint | max codepoint | payload
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// ```
pub struct AlphabetTable0<'a> {
    /// Bytes of the table, starting at its `version` field.
    data: HyphenationData<'a>,
    /// Value of the `min codepoint` field.
    min_codepoint: u32,
    /// Value of the `max codepoint` field.
    max_codepoint: u32,
}

/// Alphabet Table version 1 struct of the hyphenation pattern file.
/// The object layout follows:
/// ```text
///   0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   (bytes)
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// |    version    | num of entries| payload
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// ```
pub struct AlphabetTable1<'a> {
    /// Bytes of the table, starting at its `version` field.
    data: HyphenationData<'a>,
    /// Value of the `num of entries` field.
    num_entries: u32,
}

/// An entry of alphabet table version 1 struct of the hyphenation pattern file.
/// The entry is packed u32 value: the high 21 bits are code point and low 11 bits
/// are alphabet code value.
/// ```text
///  0                   1                   2                   3
///  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits)
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// |               code point                |     code value      |
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// ```
pub struct AlphabetTable1Entry {
    /// The packed entry value.
    entry: u32,
}

/// Trie struct of the hyphenation pattern file.
/// The object layout follows:
/// ```text
///   0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F   (bytes)
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// |    version    |   char mask   |  link shift   |   link mask   |
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// | pattern shift |  num entries  | payload
/// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
/// ```
pub struct Trie<'a> {
    /// Bytes of the trie, starting at its `version` field.
    data: HyphenationData<'a>,
}

/// Pattern struct of the hyphenation pattern file.
/// The object layout follows: /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// | version | num entries | pattern offset| pattern size | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// | payload /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ pub struct Pattern<'a> { data: HyphenationData<'a>, pattern_offset: u32, } /// An entry of pattern struct of the hyphenation pattern file. /// The entry is packed u32 value: the highest 6 bits are for length, next 6 bits are amount of /// shift, and lowest 20 bits are offset of the first value from the pattern offset value. /// 0 1 2 3 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ /// | length | shift | offset of the first value | /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ pub struct PatternEntry<'a> { data: HyphenationData<'a>, pattern_offset: u32, entry: u32, } impl<'a> Header<'a> { /// Construct a reader of the Header struct from the byte array. pub const fn new(bytes: &'a [u8]) -> Self { Header { data: HyphenationData::new(bytes) } } /// Returns the reader of the alphabet code. pub fn alphabet_table(&self) -> Option> { let offset = self.data.read_u32(8); let version = self.data.read_u32(offset); return match version { 0 => Some(Box::new(AlphabetTable0::new(self.read_offset_and_slice(8)))), 1 => Some(Box::new(AlphabetTable1::new(self.read_offset_and_slice(8)))), _ => None, }; } /// Returns the reader of the trie struct. pub fn trie_table(&self) -> Trie<'a> { Trie::new(self.read_offset_and_slice(12)) } /// Returns the reader of the pattern struct. 
pub fn pattern_table(&self) -> Pattern<'a> { Pattern::new(self.read_offset_and_slice(16)) } fn read_offset_and_slice(&self, offset: u32) -> &'a [u8] { let offset = self.data.read_u32(offset) as usize; self.data.bytes.get(offset..).unwrap() } } pub trait AlphabetLookup { /// Get the alphabet code for the code point. fn get_at(&self, c: u32) -> Option; /// Lookup the internal alphabet codes from UTF-16 character codes. fn lookup( &self, alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], word: &[u16], ) -> HyphenationType { let mut result = HyphenationType::BreakAndInsertHyphen; alpha_codes[0] = 0; // word start for i in 0..word.len() { let c = word[i] as u32; if let Some(code) = self.get_at(c) { alpha_codes[i + 1] = code; } else { return HyphenationType::DontBreak; } if result == HyphenationType::BreakAndInsertHyphen { result = Hyphenator::hyphenation_type_based_on_script(c); } } alpha_codes[word.len() + 1] = 0; // word termination result } } /// Map from utf16 code unit to the internal alphabet code. impl<'a> AlphabetTable0<'a> { /// Construct a reader of the Alphabet Table version 0 struct from the byte array. pub fn new(bytes: &'a [u8]) -> Self { let data = HyphenationData::new(bytes); let min_codepoint = data.read_u32(4); let max_codepoint = data.read_u32(8); AlphabetTable0 { data, min_codepoint, max_codepoint } } } impl<'a> AlphabetLookup for AlphabetTable0<'a> { /// Returns an entry of the specified offset. fn get_at(&self, offset: u32) -> Option { if offset < self.min_codepoint || offset >= self.max_codepoint { None } else { let code = self.data.bytes[(offset - self.min_codepoint) as usize + 12] as u16; if code == 0 { None } else { Some(code) } } } } /// Map from utf16 code unit to the internal alphabet code. impl<'a> AlphabetTable1<'a> { /// Construct a reader of the Alphabet Table version 1 struct from the byte array. 
pub fn new(bytes: &'a [u8]) -> Self { let data = HyphenationData::new(bytes); let num_entries = data.read_u32(4); AlphabetTable1 { data, num_entries } } fn lower_bounds(&self, value: u32) -> Option { let mut b = 0; let mut e = self.num_entries; while b != e { let m = b + (e - b) / 2; let c = self.data.read_u32(8 + m * 4); if c >= value { e = m; } else { b = m + 1; } } if b == self.num_entries { None } else { Some(b) } } } impl<'a> AlphabetLookup for AlphabetTable1<'a> { fn get_at(&self, c: u32) -> Option { if let Some(r) = self.lower_bounds(c << 11) { let entry = AlphabetTable1Entry::new(self.data.read_u32(8 + r * 4)); if entry.codepoint() == c { Some(entry.value()) } else { None } } else { None } } } /// A packed u32 entry of the AlphabetTable1. impl AlphabetTable1Entry { pub const fn new(entry_value: u32) -> Self { AlphabetTable1Entry { entry: entry_value } } /// Unpack code point from entry value. pub fn codepoint(&self) -> u32 { self.entry >> 11 } /// Unpack value from entry value. pub fn value(&self) -> u16 { (self.entry & 0x7ff).try_into().unwrap() } } /// A Trie object. /// See the function comment of HyphenationData for the details. impl<'a> Trie<'a> { /// Construct a reader of the Trie struct from the byte array. pub const fn new(bytes: &'a [u8]) -> Self { Trie { data: HyphenationData::new(bytes) } } /// Returns an entry of at the offset. /// The entry of the next alphabet code is /// /// let entry = trie.get_at(node + alphabet_codes[char]) pub fn get_at(&self, offset: u32) -> u32 { self.data.read_u32(24 + offset * 4) } /// Returns the bit mask for the character code point of the node. /// You can get node's character code point by /// /// let node_character = entry & char_mask. pub fn char_mask(&self) -> u32 { self.data.read_u32(4) } /// Returns the amount of shift of the node index. 
/// You can get node number as following /// /// let next_node = (entry & link_mask) >> link_shift pub fn link_shift(&self) -> u32 { self.data.read_u32(8) } /// Returns the mask for the node index. /// You can get node number as following /// /// let next_node = (entry & link_mask) >> link_shift pub fn link_mask(&self) -> u32 { self.data.read_u32(12) } /// Returns the amount of shift of the pattern index. /// You can get pattern index as following /// /// let pattern_index = entry >> pattern_shift pub fn pattern_shift(&self) -> u32 { self.data.read_u32(16) } } /// A Pattern object. /// See the function comment of HyphenationData for the details. impl<'a> Pattern<'a> { /// Construct a reader of the Pattern struct from the byte array. pub fn new(bytes: &'a [u8]) -> Self { let data = HyphenationData::new(bytes); let pattern_offset = data.read_u32(8); Pattern { data, pattern_offset } } /// Returns a packed u32 entry at the given offset. pub fn entry_at(&self, offset: u32) -> PatternEntry<'a> { let entry = self.data.read_u32(16 + offset * 4); PatternEntry::new(self.data.bytes, self.pattern_offset, entry) } } /// An entry of the pattern object. impl<'a> PatternEntry<'a> { /// Construct a reader of the Pattern struct from the byte array. pub const fn new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self { PatternEntry { data: HyphenationData::new(bytes), pattern_offset, entry } } /// Unpack length of the pattern from the packed entry value. pub fn len(&self) -> u32 { self.entry >> 26 } /// Unpack an amount of shift of the pattern data from the packed entry value. pub fn shift(&self) -> u32 { (self.entry >> 20) & 0x3f } /// Returns a hyphenation score value at the offset in word with the entry. 
pub fn value_at(&self, offset: u32) -> u8 { self.data.bytes[(self.pattern_offset + (self.entry & 0xfffff) + offset) as usize] } } /// Performs hyphenation pub struct Hyphenator { data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: HyphenationLocale, } impl Hyphenator { /// Create a new hyphenator instance pub fn new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self { logger::init( logger::Config::default() .with_tag_on_device("Minikin") .with_max_level(log::LevelFilter::Trace), ); Self { data, min_prefix, min_suffix, locale: if locale == "pl" { HyphenationLocale::Polish } else if locale == "ca" { HyphenationLocale::Catalan } else if locale == "sl" { HyphenationLocale::Slovenian } else { HyphenationLocale::Other }, } } /// Performs a hyphenation pub fn hyphenate(&self, word: &[u16], out: &mut [u8]) { let len: u32 = word.len().try_into().unwrap(); let padded_len = len + 2; if !self.data.is_empty() && len >= self.min_prefix + self.min_suffix && padded_len <= MAX_HYPHEN_SIZE { let header = Header::new(self.data); let mut alpha_codes: [u16; MAX_HYPHEN_SIZE as usize] = [0; MAX_HYPHEN_SIZE as usize]; let hyphen_value = if let Some(alphabet) = header.alphabet_table() { alphabet.lookup(&mut alpha_codes, word) } else { HyphenationType::DontBreak }; if hyphen_value != HyphenationType::DontBreak { self.hyphenate_from_codes(alpha_codes, padded_len, hyphen_value, out); return; } // TODO: try NFC normalization // TODO: handle non-BMP Unicode (requires remapping of offsets) } // Note that we will always get here if the word contains a hyphen or a soft hyphen, because // the alphabet is not expected to contain a hyphen or a soft hyphen character, so // alphabetLookup would return DONT_BREAK. 
self.hyphenate_with_no_pattern(word, out); } /// This function determines whether a character is like U+2010 HYPHEN in line breaking and /// usage: a character immediately after which line breaks are allowed, but words containing /// it should not be automatically hyphenated using patterns. This is a curated set, created by /// manually inspecting all the characters that have the Unicode line breaking property of BA or /// HY and seeing which ones are hyphens. fn is_line_breaking_hyphen(c: u16) -> bool { c == 0x002D || // HYPHEN-MINUS c == 0x058A || // ARMENIAN HYPHEN c == 0x05BE || // HEBREW PUNCTUATION MAQAF c == 0x1400 || // CANADIAN SYLLABICS HYPHEN c == 0x2010 || // HYPHEN c == 0x2013 || // EN DASH c == 0x2027 || // HYPHENATION POINT c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN c == 0x2E40 // DOUBLE HYPHEN } /// Resolves the hyphenation type for Arabic text. /// In case of Arabic text, the letter form should not be changed by hyphenation. /// So, if the hyphenation is in the middle of the joining context, insert ZWJ for keeping the /// form from the original text. fn get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType { let mut i = location; let mut join_type: u8 = U_JT_NON_JOINING; while i < word.len().try_into().unwrap() { join_type = getJoiningType(word[i as usize].into()); if join_type != U_JT_TRANSPARENT { break; } i += 1; } if join_type == U_JT_DUAL_JOINING || join_type == U_JT_RIGHT_JOINING || join_type == U_JT_JOIN_CAUSING { // The next character is of the type that may join the last character. See if the last // character is also of the right type. 
join_type = U_JT_NON_JOINING; if i >= 2 { i = location - 2; // skip the soft hyphen loop { join_type = getJoiningType(word[i as usize].into()); if join_type != U_JT_TRANSPARENT { break; } if i == 0 { break; } i -= 1; } } if join_type == U_JT_DUAL_JOINING || join_type == U_JT_LEFT_JOINING || join_type == U_JT_JOIN_CAUSING { return HyphenationType::BreakAndInsertHyphenAndZwj; } } HyphenationType::BreakAndInsertHyphen } /// Performs the hyphenation without pattern files. fn hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8]) { let word_len: u32 = word.len().try_into().unwrap(); out[0] = HyphenationType::DontBreak as u8; for i in 1..word_len { let prev_char = word[i as usize - 1]; if i > 1 && Self::is_line_breaking_hyphen(prev_char) { if (prev_char == CHAR_HYPHEN_MINUS || prev_char == CHAR_HYPHEN) && (self.locale == HyphenationLocale::Polish || self.locale == HyphenationLocale::Slovenian) && getScript(word[i as usize].into()) == USCRIPT_LATIN { // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, // we will do this only if the next character is Latin. out[i as usize] = HyphenationType::BreakAndInsertHyphenAtNextLine as u8; } else { out[i as usize] = HyphenationType::BreakAndDontInsertHyphen as u8; } } else if i > 1 && prev_char == CHAR_SOFT_HYPHEN { // Break after soft hyphens, but only if they don't start the word (a soft hyphen // starting the word doesn't give any useful break opportunities). The type of the // break is based on the script of the character we break on. if getScript(word[i as usize].into()) == USCRIPT_ARABIC { // For Arabic, we need to look and see if the characters around the soft hyphen // actually join. If they don't, we'll just insert a normal hyphen. 
out[i as usize] = Self::get_hyph_type_for_arabic(word, i) as u8; } else { out[i as usize] = Self::hyphenation_type_based_on_script(word[i as usize] as u32) as u8; } } else if prev_char == CHAR_MIDDLE_DOT && self.min_prefix < i && i <= word_len - self.min_suffix && ((word[i as usize - 2] == 'l' as u16 && word[i as usize] == 'l' as u16) || (word[i as usize - 2] == 'L' as u16 && word[i as usize] == 'L' as u16)) && self.locale == HyphenationLocale::Catalan { // In Catalan, "l·l" should break as "l-" on the first line // and "l" on the next line. out[i as usize] = HyphenationType::BreakAndReplaceWithHyphen as u8; } else { out[i as usize] = HyphenationType::DontBreak as u8; } } } /// Performs the hyphenation with pattern file. fn hyphenate_from_codes( &self, codes: [u16; MAX_HYPHEN_SIZE as usize], len: u32, hyphen_value: HyphenationType, out: &mut [u8], ) { let header = Header::new(self.data); let trie = header.trie_table(); let pattern = header.pattern_table(); let char_mask = trie.char_mask(); let link_shift = trie.link_shift(); let link_mask = trie.link_mask(); let pattern_shift = trie.pattern_shift(); let max_offset = len - self.min_suffix - 1; for i in 0..(len - 1) { let mut node: u32 = 0; // index into Trie table for j in i..len { let c: u32 = codes[j as usize].into(); let entry = trie.get_at(node + c); if (entry & char_mask) == c { node = (entry & link_mask) >> link_shift; } else { break; } let pat_ix = trie.get_at(node) >> pattern_shift; // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an // offset into the buf pool. This is the pattern for the substring (i..j) we just // matched, which we combine (via point-wise max) into the buffer vector. 
if pat_ix != 0 { let pat_entry = pattern.entry_at(pat_ix); let pat_len = pat_entry.len(); let pat_shift = pat_entry.shift(); let offset = j + 1 - (pat_len + pat_shift); // offset is the index within buffer that lines up with the start of pat_buf let start = if self.min_prefix < offset { 0 } else { self.min_prefix - offset }; if offset > max_offset { continue; } let end = cmp::min(pat_len, max_offset - offset); for k in start..end { out[(offset + k) as usize] = cmp::max(out[(offset + k) as usize], pat_entry.value_at(k)); } } } } // Since the above calculation does not modify values outside // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK. for r in out.iter_mut().take(max_offset as usize).skip(self.min_prefix as usize) { *r = if *r & 1 != 0 { hyphen_value as u8 } else { HyphenationType::DontBreak as u8 }; } } fn hyphenation_type_based_on_script(code_point: u32) -> HyphenationType { let script = getScript(code_point); if script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL || script == USCRIPT_TELUGU { HyphenationType::BreakAndDontInsertHyphen } else if script == USCRIPT_ARMENIAN { HyphenationType::BreakAndInsertArmenianHyphen } else if script == USCRIPT_CANADIAN_ABORIGINAL { HyphenationType::BreakAndInsertUcasHyphen } else { HyphenationType::BreakAndInsertHyphen } } }