1#
2# Copyright (C) 2016 and later: Unicode, Inc. and others.
3# License & terms of use: http://www.unicode.org/copyright.html
4# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
5
6# file: word.txt
7#
8# Reference Word Break rules for intltest rbbi/RBBIMonkeyTest
9#
10# Note: Rule syntax and the monkey test itself are still a work in progress.
11#       They are expected to change with review and the addition of support for rule tailoring.
12
13
# Test configuration: the kind of boundary these reference rules describe, and the locale setting.
14type = word;      # one of grapheme | word | line | sentence
15locale = en;      # NOTE(review): presumably the locale of the break iterator under test - confirm
16
# Base character sets. Apart from Han and Extended_Pict, each set below is taken from the
# Word_Break property value of the same name.
17Han            = [:Han:];    # subtracted from Extend here; also part of KanaKanji
18
19CR                 = [\p{Word_Break = CR}];
20LF                 = [\p{Word_Break = LF}];
21Newline            = [\p{Word_Break = Newline}];
22Extend             = [\p{Word_Break = Extend}-Han];    # Word_Break=Extend minus any Han characters
23ZWJ                = [\p{Word_Break = ZWJ}];
24Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
25Format             = [\p{Word_Break = Format}];
26Katakana           = [\p{Word_Break = Katakana}];
27Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
28ALetter            = [\p{Word_Break = ALetter}];    # later redefined to exclude the dictionary set
29Single_Quote       = [\p{Word_Break = Single_Quote}];
30Double_Quote       = [\p{Word_Break = Double_Quote}];
31MidNumLet          = [\p{Word_Break = MidNumLet}];
32MidLetter          = [\p{Word_Break = MidLetter}];
33MidNum             = [\p{Word_Break = MidNum}];
34Numeric            = [\p{Word_Break = Numeric}];
35ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
36WSegSpace          = [\p{Word_Break = WSegSpace}];
37Extended_Pict      = [:ExtPict:];    # used by rules WB3c, WB15 and WB999.1
38
39# Define the dictionary set, with the effect being that those characters don't appear in test data.
40
41Hiragana       = [:Hiragana:];
42
43Control        = [\p{Grapheme_Cluster_Break = Control}];    # not referenced elsewhere in the visible sets or rules
44HangulSyllable = [\uac00-\ud7a3];                           # code points U+AC00 through U+D7A3
45ComplexContext = [:LineBreak = Complex_Context:];
46KanaKanji      = [Han Hiragana Katakana];                   # union of the three sets
47dictionaryCJK  = [KanaKanji HangulSyllable];
48dictionary     = [ComplexContext dictionaryCJK];            # the full dictionary set: ComplexContext plus the CJK sets
49
50# Derived sets. First, leave dictionary scripts out of ALetter.
51
52ALetter        = [ALetter - dictionary];      # redefinition: the earlier ALetter minus the dictionary set
53
54AHLetter       = [ALetter  Hebrew_Letter];    # ALetter (as redefined above) or Hebrew_Letter
55MidNumLetQ     = [MidNumLet  Single_Quote];   # MidNumLet or Single_Quote
56ExtFmt         = [Extend Format ZWJ];         # matched as ExtFmt* between the significant characters of the rules
57
# Break rules. "÷" marks a position where a break occurs; a rule written without "÷"
# describes a sequence that is kept together.
# NOTE(review): rule order is presumably significant (earlier rules win, as in UAX #29) - confirm.
58WB3:   CR LF;                    # keep CR LF together
59WB3a:  (Newline | CR | LF) ÷;    # break after Newline, CR or LF
60WB3b:  . ÷ (Newline | CR | LF);   # actually redundant? No other rule combines.
61                                  # (but needed with UAX treat-as scheme.)
62WB3c:   ZWJ Extended_Pict;       # keep ZWJ together with a following Extended_Pict
63WB3d:   WSegSpace WSegSpace;     # keep adjacent WSegSpace characters together
64
# Letter, number and Katakana rules. None of these contains "÷", so each describes a sequence
# that stays unbroken. ExtFmt* allows any run of Extend, Format or ZWJ characters between the
# significant characters of a rule.
65WB5:    AHLetter ExtFmt* AHLetter;
66
67# includes both WB6 and WB7
68WB6:    AHLetter ExtFmt* (MidLetter | MidNumLetQ) ExtFmt*  AHLetter;
69
70WB7a:   Hebrew_Letter ExtFmt* Single_Quote;
71WB7b:   Hebrew_Letter ExtFmt* Double_Quote ExtFmt* Hebrew_Letter;   # includes WB7c
72
73WB8:    Numeric ExtFmt* Numeric;
74WB9:    AHLetter ExtFmt* Numeric;
75WB10:   Numeric ExtFmt* AHLetter;
76
77WB11:   Numeric ExtFmt* (MidNum | MidNumLetQ) ExtFmt* Numeric;    # includes WB12
78WB13:   Katakana ExtFmt* Katakana;
79
80WB13a:  (AHLetter | Numeric | Katakana | ExtendNumLet) ExtFmt* ExtendNumLet;
81WB13b:  ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
82
83# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
84#              Interacts with WB3c.
85WB15:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ Extended_Pict;    # pair followed by ZWJ Extended_Pict: no break anywhere in the sequence
86WB17:  Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;                    # break after the pair and any trailing ExtFmt characters
87
88# Rule WB 999   Any ÷ Any
89#    Interacts with WB3c, do not break between ZWJ and Extended_Pict.
90WB999.1: . ExtFmt* ZWJ Extended_Pict;    # any character, then ExtFmt*, ZWJ and Extended_Pict stay together
91WB999.2: . ExtFmt* ÷;                    # break after any character and its trailing ExtFmt characters
92
93