#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (C) 2002-2016, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file:  word.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
#      with additions for Emoji Sequences from https://goo.gl/cluFCn
#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
#
# Note:  Updates to word.txt will usually need to be merged into
#        word_POSIX.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Rule-builder options: turn on rule chaining for the whole file, and require
# literal characters in rules to be quoted.
!!chain;
!!quoted_literals_only;


#
#  Character Class Definitions.
#
#  One set per Word_Break property value referenced by the rules in this file.
#

$CR                 = [\p{Word_Break = CR}];
$LF                 = [\p{Word_Break = LF}];
$Newline            = [\p{Word_Break = Newline}];
$Extend             = [\p{Word_Break = Extend}];
$ZWJ                = [\p{Word_Break = ZWJ}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format             = [\p{Word_Break = Format}];
$Katakana           = [\p{Word_Break = Katakana}];
$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
$ALetter            = [\p{Word_Break = ALetter}];
$Single_Quote       = [\p{Word_Break = Single_Quote}];
$Double_Quote       = [\p{Word_Break = Double_Quote}];
$MidNumLet          = [\p{Word_Break = MidNumLet}];
$MidLetter          = [\p{Word_Break = MidLetter}];
$MidNum             = [\p{Word_Break = MidNum}];
$Numeric            = [\p{Word_Break = Numeric}];
$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
$WSegSpace          = [\p{Word_Break = WSegSpace}];
$Extended_Pict      = [:ExtPict:];

# Script-based sets; they feed the CJK dictionary sets defined further down.
$Han                = [:Han:];
$Hiragana           = [:Hiragana:];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context.
#   Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
$dictionary     = [$ComplexContext $dictionaryCJK];

# leave CJK scripts out of ALetterPlus
$ALetterPlus    = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
#  Rule 4    Ignore Format and Extend characters,
#            except when they appear at the beginning of a region of text.
#
#  Every "...Ex" name below is the base class followed by any number of
#  trailing Extend / Format / ZWJ characters.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void

# The characters that may trail a base character: union of Extend, Format and ZWJ.
$ExFm                 = [$Extend $Format $ZWJ];

$KatakanaEx           = $Katakana           $ExFm*;
$Hebrew_LetterEx      = $Hebrew_Letter      $ExFm*;
$ALetterEx            = $ALetterPlus        $ExFm*;
$Single_QuoteEx       = $Single_Quote       $ExFm*;
$Double_QuoteEx       = $Double_Quote       $ExFm*;
$MidNumLetEx          = $MidNumLet          $ExFm*;
$MidLetterEx          = $MidLetter          $ExFm*;
$MidNumEx             = $MidNum             $ExFm*;
$NumericEx            = $Numeric            $ExFm*;
$ExtendNumLetEx       = $ExtendNumLet       $ExFm*;
$Regional_IndicatorEx = $Regional_Indicator $ExFm*;

$Ideographic          = [\p{Ideographic}];
$HiraganaEx           = $Hiragana           $ExFm*;
$IdeographicEx        = $Ideographic        $ExFm*;

## -------------------------------------------------

# Rule 3 - CR x LF
#
$CR $LF;

# Rule 3c   ZWJ x Extended_Pict.  Precedes WB4, so no intervening Extend chars allowed.
#
$ZWJ $Extended_Pict;

# Rule 3d - Keep horizontal whitespace together.
#
$WSegSpace $WSegSpace;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s), or is not a CJK dictionary character.
#
#          As written: an optional single character other than CR / LF / Newline,
#          followed by one or more Extend, Format or ZWJ characters.
[^$CR $LF $Newline]? [$Extend $Format $ZWJ]+;

# Single-character matches, each tagged with a rule status value.
# (The values appear to line up with ICU's UWordBreak tags, 100 = number,
#  200 = letter, 400 = ideographic; confirm against ubrk.h.)
$NumericEx       {100};
$ALetterEx       {200};
$HangulSyllable  {200};
$Hebrew_LetterEx {200};
$KatakanaEx      {400};       # note:  these status values override those from rule 5
$HiraganaEx      {400};       #        by virtue of being numerically larger.
$IdeographicEx   {400};       #

# An Extended_Pict character keeps its trailing Extend / Format / ZWJ characters.
$Extended_Pict [$Extend $Format $ZWJ]*;

#
# rule 5
#    Do not break between most letters.
#
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 6 and 7
#    A single MidLetter / MidNumLet / Single_Quote between two letters does not break.
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};

# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};

# rule 8
#    Digit followed by digit.

$NumericEx $NumericEx {100};

# rule 9
#    Letter followed by digit.

($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};

# rule 10
#    Digit followed by letter.

$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 11 and 12
#    A single MidNum / MidNumLet / Single_Quote between two digits does not break.

$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

# rule 13
#     to be consistent with $KanaKanji $KanaKanji, changed
#     from 300 to 400.
#     See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx $KatakanaEx {400};

# rule 13a/b
#    ExtendNumLet characters join to a preceding (13a) or following (13b)
#    letter, digit or Katakana.  Every operand uses the "...Ex" form, so trailing
#    Extend / Format / ZWJ characters are handled the same way on both sides.

$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
$NumericEx       $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx  $ALetterEx       {200};   #  (13b)
$ExtendNumLetEx  $Hebrew_LetterEx {200};   #  (13b)
$ExtendNumLetEx  $NumericEx       {100};   #  (13b)
$ExtendNumLetEx  $KatakanaEx      {400};   #  (13b)

# rules 15 - 17
#    Pairs of Regional Indicators stay together.
#    With rule chaining disabled by ^, this rule will match exactly two of them.
#    No other rule begins with a Regional_Indicator, so chaining cannot extend the match.
#
^$Regional_IndicatorEx $Regional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji      $KanaKanji      {400};     # different rule status if both kana and kanji found

# Rule 999
#     Match a single code point if no other rule applies.
.;