1#
2# Copyright (C) 2002-2013, International Business Machines Corporation
3# and others. All Rights Reserved.
4#
5# file:  word.txt
6#
7# ICU Word Break Rules
8#      See Unicode Standard Annex #29.
9#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
10#
11# Note:  Updates to word.txt will usually need to be merged into
12#        word_POSIX.txt also.
13
14##############################################################################
15#
16#  Character class definitions from TR 29
17#
18##############################################################################
19
20!!chain;    # option: enable rule chaining (see the ICU break-rules documentation)
21
22
23#
24#  Character Class Definitions.  One set per UAX #29 Word_Break property value.
25#
26
27$CR                 = [\p{Word_Break = CR}];
28$LF                 = [\p{Word_Break = LF}];
29$Newline            = [\p{Word_Break = Newline}];
30$Extend             = [\p{Word_Break = Extend}];
31$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
32$Format             = [\p{Word_Break = Format}];
33$Katakana           = [\p{Word_Break = Katakana}];
34$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
35$ALetter            = [\p{Word_Break = ALetter}];
36$Single_Quote       = [\p{Word_Break = Single_Quote}];
37$Double_Quote       = [\p{Word_Break = Double_Quote}];
38$MidNumLet          = [\p{Word_Break = MidNumLet}];
39$MidLetter          = [\p{Word_Break = MidLetter}];
40$MidNum             = [\p{Word_Break = MidNum}];
41$Numeric            = [\p{Word_Break = Numeric}];
42$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
43
44$Han                = [:Han:];         # script set; combined into $KanaKanji below
45$Hiragana           = [:Hiragana:];    # script set; combined into $KanaKanji below
46
47
48#   Dictionary character set, for triggering language-based break engines: the union of
49#   LineBreak=Complex_Context and the CJK set (Han, Hiragana, Katakana, Hangul syllables).
50#   Note that the Complex_Context part only works in Unicode 5.0 or later, as the
51#   definition of Complex_Context was corrected to include all characters requiring dictionary break.
52
53$Control        = [\p{Grapheme_Cluster_Break = Control}];    # subtracted from $ComplexContext in $ALetterPlus
54$HangulSyllable = [\uac00-\ud7a3];
55$ComplexContext = [:LineBreak = Complex_Context:];
56$KanaKanji      = [$Han $Hiragana $Katakana];
57$dictionaryCJK  = [$KanaKanji $HangulSyllable];
58$dictionary     = [$ComplexContext $dictionaryCJK];
59
60# leave CJK scripts out of ALetterPlus; add Complex_Context characters that are not Extend or Control
61$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
62
63
64#
65#  Rules 4    Ignore Format and Extend characters,
66#             except when they appear at the beginning of a region of text.
67#
68# TODO: check if handling of katakana in dictionary makes rules incorrect/void
69$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
70$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
71$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;    # built on $ALetterPlus, not on $ALetter
72$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
73$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
74$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
75$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
76$MidNumEx             = $MidNum             ($Extend |  $Format)*;
77$NumericEx            = $Numeric            ($Extend |  $Format)*;
78$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
79$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
80
81$Ideographic    = [\p{Ideographic}];                      # referenced only through $IdeographicEx
82$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;    # used by the single-character {400} rule in !!forward
83$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;    # used by the single-character {400} rule in !!forward
84
85## -------------------------------------------------
86
87!!forward;
88
89
90# Rule 3 - CR x LF
91#
92$CR $LF;
93
94# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
95#          of a region of Text.   The rule here comes into play when the start of text
96#          begins with a group of Format chars, or with a "word" consisting of a single
97#          char that is not in any of the listed word break categories followed by
98#          format char(s), or is not a CJK dictionary character.
99[^$CR $LF $Newline]? ($Extend |  $Format)+;
100
101$NumericEx {100};        # status tag 100 for numeric words
102$ALetterEx {200};        # status tag 200 for letter words
103$HangulSyllable {200};   # Hangul syllables share the letter tag
104$Hebrew_LetterEx{200};   # Hebrew letters share the letter tag
105$KatakanaEx {400};       # note:  these status values override those from rule 5
106$HiraganaEx {400};       #        by virtue of being numerically larger.
107$IdeographicEx {400};    #        NOTE(review): tags presumably match ICU's UWordBreak values - confirm in ubrk.h
108
109#
110# rule 5
111#    Do not break between most letters.
112#
113($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
114
115# rule 6 and 7 - do not break letters across MidLetter, MidNumLet or Single_Quote
116($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
117
118# rule 7a - do not break between a Hebrew letter and a following single quote
119$Hebrew_LetterEx $Single_QuoteEx {200};
120
121# rule 7b and 7c - do not break Hebrew letters across a double quote
122$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
123
124# rule 8 - do not break between digits
125
126$NumericEx $NumericEx {100};
127
128# rule 9 - do not break between a letter and a following digit
129
130($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
131
132# rule 10 - do not break between a digit and a following letter
133
134$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
135
136# rule 11 and 12 - do not break digits across MidNum, MidNumLet or Single_Quote
137
138$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
139
140# rule 13 - do not break between Katakana
141# to be consistent with the $KanaKanji $KanaKanji rule, the status was changed
142# from 300 to 400.
143# See also TestRuleStatus in intltest/rbbiapts.cpp
144$KatakanaEx  $KatakanaEx {400};
145
146# rule 13a/b - do not break between ExtendNumLet and an adjacent letter, digit, Katakana or ExtendNumLet
147#   Every operand uses its ...Ex form, so trailing Extend/Format characters are absorbed by the rule itself.
148$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
149$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
150$NumericEx       $ExtendNumLetEx {100};    #  (13a)
151$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
152$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
153
154$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
155$ExtendNumLetEx  $Hebrew_LetterEx {200};    #  (13b)
156$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
157$ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)
158
159# rule 13c - do not break between regional indicator symbols
160
161$Regional_IndicatorEx $Regional_IndicatorEx;    # no {status} tag on this rule
162
163# special handling for CJK characters: chain for later dictionary segmentation
164$HangulSyllable $HangulSyllable {200};
165$KanaKanji $KanaKanji {400}; # any Han/Hiragana/Katakana pair gets 400, the same tag rule 13 gives Katakana pairs
166
167
168## -------------------------------------------------
169
170!!reverse;
171# Back...Ex forms mirror the forward ...Ex forms: the Format/Extend run is written before the base character.
172$BackHebrew_LetterEx      = ($Format | $Extend)* $Hebrew_Letter;
173$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
174$BackSingle_QuoteEx       = ($Format | $Extend)* $Single_Quote;
175$BackDouble_QuoteEx       = ($Format | $Extend)* $Double_Quote;
176$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
177$BackNumericEx            = ($Format | $Extend)* $Numeric;
178$BackMidNumEx             = ($Format | $Extend)* $MidNum;
179$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
180$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
181$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
182$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
183$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
184
185# rule 3 - CR x LF, written in reverse order
186$LF $CR;
187
188# rule 4 - mirror of the forward Extend/Format rule
189($Format | $Extend)*  [^$CR $LF $Newline]?;
190
191# rule 5 - do not break between letters
192
193($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
194
195# rule 6 and 7 - do not break letters across MidLetter, MidNumLet or Single_Quote
196
197($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
198
199# rule 7a - single quote then Hebrew letter (reverse order of the forward rule)
200$BackSingle_QuoteEx $BackHebrew_LetterEx;
201
202# Rule 7b and 7c - do not break Hebrew letters across a double quote
203$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
204
205# rule 8 - do not break between digits
206
207$BackNumericEx $BackNumericEx;
208
209# rule 9 - digit then letter (reverse order of the forward rule)
210
211$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
212
213# rule 10 - letter then digit (reverse order of the forward rule)
214
215($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
216
217# rule 11 and 12 - do not break digits across MidNum, MidNumLet or Single_Quote
218
219$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
220
221# rule 13 - do not break between Katakana
222
223$BackKatakanaEx $BackKatakanaEx;
224
225# rules 13 a/b - do not break between ExtendNumLet and an adjacent letter, digit, Katakana or ExtendNumLet
226#
227$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
228($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
229
230# rule 13c - do not break between regional indicator symbols
231
232$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
233
234# special handling for CJK characters: chain for later dictionary segmentation
235$HangulSyllable $HangulSyllable;
236$KanaKanji $KanaKanji; # no status tag here; the forward rules carry the {200}/{400} tags
237
238## -------------------------------------------------
239
240!!safe_reverse;    # NOTE(review): presumably the rules used to back up to a safe restart position - confirm in the ICU break-rules docs
241
242# rule 4 - a run of Extend/Format characters and at most one more character
243($Extend | $Format)+ .?;
244
245# rule 6 - MidLetter, MidNumLet or Single_Quote next to a letter
246($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
247
248# rule 7b - Double_Quote next to a Hebrew letter
249$Double_Quote $BackHebrew_LetterEx;
250
251
252# rule 11 - MidNum, MidNumLet or Single_Quote next to a digit
253($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
254
255# For dictionary-based break: keep runs of dictionary characters together
256$dictionary $dictionary;
257
258## -------------------------------------------------
259
260!!safe_forward;    # NOTE(review): presumably the forward counterpart of the !!safe_reverse rules - confirm in the ICU break-rules docs
261
262# rule 4 - a run of Extend/Format characters and at most one more character
263($Extend | $Format)+ .?;
264
265# rule 6 - MidLetter, MidNumLet or Single_Quote followed by a letter
266($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
267
268# rule 7b - Double_Quote followed by a Hebrew letter
269$Double_QuoteEx $Hebrew_LetterEx;
270
271# rule 11 - MidNum, MidNumLet or Single_Quote followed by a digit
272($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
273
274# For dictionary-based break: keep runs of dictionary characters together
275$dictionary $dictionary;
276