1# Copyright (c) 2001-2015 International Business Machines 2# Corporation and others. All Rights Reserved. 3# 4# RBBI Test Data 5# 6# File: rbbitst.txt 7# 8# The format of this file looks vaguely like some kind of xml-ish markup, 9# but it is NOT. The syntax is this.. 10# 11# <word> any following data is for word break testing 12# <sent> any following data is for sentence break testing 13# <line> any following data is for line break testing 14# <char> any following data is for char break testing 15# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc. 16# <data> ... </data> test data. May span multiple lines. 17# <> Break position, status == 0 18# • Break position, status == 0 (Bullet, \u2022) 19# <nnn> Break position, status == nnn 20# \ Escape. Normal ICU unescape applied. 21# \ at end of line -> Line Continuation. Remove both the backslash and the new line 22# 23# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended. 24# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended 25# 26# There are two copies of this file in the source repository, 27# [ICU4C] source/test/testdata/rbbitst.txt 28# [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt 29# 30# ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sure they 31# are merged back into ICU4C's copy of the file, lest they get overwritten later. 32# TODO: figure out how to have a single copy of the file for use by both C and Java. 
33 34 35# Temp debugging tests 36<sent> 37<data>•\u00c0.•</data> 38 39#<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•</data> 40######################################################################################## 41# 42# 43# G r a p h e m e C l u s t e r T e s t s 44# 45# 46########################################################################################## 47<char> 48 49<data>•a•b•c• •,•\u0666•</data> # Quick Test 50<data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF 51 52# Always break after controls. Combining chars don't combine with them. 53<data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data> 54<data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data> 55 56# Surrogates 57<data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data> 58<data>•\ud800\udc00•\udbff\udfff•a•</data> 59 60# Extend (Combining chars) combine. 61<data>•A\N{COMBINING GRAVE ACCENT}•B•</data> 62<data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data> 63<data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•</data> 64 65<data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304•</data> 66 67# Don't 
break Hangul Syllables 68# L : \u1100 69# V : \u1161 70# T : \u11A8 71# LV : \uAC00 72# LVT : \uAC01 73 74<data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT 75<data>•\u1100\u1161•\u1100\u1161•</data> 76<data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data> 77<data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data> 78<data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data> 79 80 81 82# Hindi combining chars. (An old test) 83# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters 84#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930• 85#•\u0939•\u094c•\u0964•</data> 86#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data> 87 88 89# Bug 1587. Tamil. \u0baa\u0bc1 is an Extended Grapheme Cluster 90<data>•\u0baa\u0bc1•\u0baa\u0bc1•</data> 91 92# Regression test for bug 1889 93<data>•\u0f40\u0f7d•\u0000•\u0f7e•</data> 94 95 96# 0xffff is a legal character, and should not stop the break iterator early. 97# (Requires special casing in implementation, which is why it gets a test.) 
98<data>•\uffff•\uffff• •a•</data> 99 100# Treat Japanese Half Width voicing marks as combining 101<data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data> 102 103######################################################################################## 104# 105# 106# E x t e n d e d G r a p h e m e C l u s t e r T e s t s 107# 108# 109########################################################################################## 110#<xgc> 111 112# Plain Vanilla grapheme clusters 113#<data>•a•b•c•</data> 114#<data>•a\u0301\u0302• •b\u0303\u0304•</data> 115 116# Assorted Hindi combining marks 117#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data> 118 119# Thai Clusters 120# $Prepend $Extend* $PrependBase $Extend*; 121# 122#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data> 123 124 125######################################################################################## 126# 127# 128# W o r d B o u n d a r y T e s t s 129# 130# 131########################################################################################## 132 133<word> 134# 135# Quick sanity test 136# 137<data>•hello<200> •there<200> •goodbye<200></data> 138<data>•hello<200> •12345<100> •,•</data> 139 140 141# 142# Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPreviousPreceding() 143# 144 145<word> 146<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200>?• •2.25<100></data> 147 148 149 150# 151# Data originally from TestDefaultRuleBasedWordIteration() 152# 153<data>•Write<200> •wordrules<200>.• •123.456<100> •alpha\u00adbeta\u00adgamma<200> •\u092f\u0939<200> •</data> 154<data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data> 155 156#Hindi Numbers 157<data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\N{RUPEE 
SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<200>\n•</data> 158 159<data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.10<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data> 160 161<data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200> •STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<100>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•time<200> •</data> 162 163#Hangul 164<data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how<200> •are<200> •you<200> •</data> 165 166<data>•Hello<200>,• •how<200> •are<200> •you<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •</data> 167 168# Words containing non-BMP letters 169<data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data> 170 171# Unassigned code points 172<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> 173 174# Hiragana & Katakana stay together, but separates from each other and Latin. 175# *** what to do about theoretical combos of chars? i.e. 
hiragana + accent 176#<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<400>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<400>\N{HIRAGANA ITERATION MARK}<400>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<400>def<200>#•</data> 177 178# test normalization/dictionary handling of halfwidth katakana: same dictionary phrase in fullwidth and halfwidth 179<data>•芽キャベツ<400>芽キャベツ<400></data> 180 181# more Japanese tests 182# TODO: some script=common characters in the Hiragana and the Katakana block may not be treated correctly 183# (was formerly true for U+30FC); need to check and fix if so. 184#<data>•どー<400>せ<400>日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 185<data>•日本語<400>を<400>勉強<400>する<400>理由<400>について<400> •て<400>こと<400>は<400>我<400>でも<400>知<400>ら<400>も<400>い<400>こと<400>なん<400>だ<400>。•</data> 186 187# Testing of word boundary for dictionary word containing both kanji and kana 188<data>•中だるみ<400>蔵王の森<400>ウ離島<400></data> 189 190# Testing of Chinese segmentation (taken from a Chinese news article) 191<data>•400<100>余<400>名<400>中央<400>委员<400>和<400>中央<400>候补<400>委员<400>都<400>领<400>到了<400>“•推荐<400>票<400>”•,•有<400>资格<400>在<400>200<100>多<400>名<400>符合<400>条件<400>的<400>63<100>岁<400>以下<400>中共<400>正<400>部<400>级<400>干部<400>中<400>,•选出<400>他们<400>属意<400>的<400>中央<400>政治局<400>委员<400>以<400>向<400>政治局<400>常委<400>会<400>举荐<400>。•</data> 192 193# Words with interior formatting characters 194<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data> 195 196# to test for bug #4097779 197<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> 198 199# fullwidth numeric, midletter characters etc should be treated like their halfwidth counterparts 200# <data>•ISN'T<200> •19<100>日<400></data> 201# why was this added with the dbbi stuff? 
202 203# to test for bug #4098467 204# What follows is a string of Korean characters (I found it in the Yellow Pages 205# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 206# it correctly), first as precomposed syllables, and then as conjoining jamo. 207# Both sequences should be semantically identical and break the same way. 208# precomposed syllables... 209<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> 210 211# more Korean tests (Jamo not tested here, not counted as dictionary characters) 212# Disable them now because we don't include a Korean dictionary. 213#<data>•\ud55c\uad6d<200>\ub300\ud559\uad50<200>\uc790\uc5f0<200>\uacfc\ud559<200>\ub300\ud559<200>\ubb3c\ub9ac\ud559\uacfc<200></data> 214#<data>•\ud604\uc7ac<200>\ub294<200> •\uac80\ucc30<200>\uc774<200> •\ubd84\uc2dd<200>\ud68c\uacc4<200>\ubb38\uc81c<200>\ub97c<200> •\uc870\uc0ac<200>\ud560<200> •\uac00\ub2a5\uc131<200>\uc740<200> •\uc5c6\ub2e4<200>\u002e•</data> 215 216<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<400>\u4e03\u4e03<400>abc<200> •</data> 217 218<data>•\u06c9<200>\uc799\ufffa•</data> 219 220 221# 222# Try some words from other scripts. 223# 224 225# Try some words from other scripts. 226# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin 227# 228<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data> 229 230<data>•\u0301•A<200></data> 231 232 233# 234# Hindi word break tests, imported from the old RBBI tests. 235# An historical note: a much earlier version of ICU break iterators had a number 236# of special case rules for Hindi, which were tested by an earlier version of 237# this test data. 
The current RBBI rules do not special case Hindi in 238# any way, making this test data much less signfificant. 239# 240<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200> 241•\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<200>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<200> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930<200>\r•</data> 242 243# 244# Failures from monkey tests 245# 246<data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data> 247 248# 249# Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend 250# 251<data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data> 252 253# User guide example: 254<data>•Parlez<200>-•vous<200> •français<200> •?•</data> 255 256######################################################################################## 257# 258# 259# S e n t e n c e B o u n d a r y T e s t s 260# 261# 262########################################################################################## 263 264 265# 266# Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration() 267# 268<sent> 269 270 271<sent> 272<data>•This\n<100></data> 273<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \ 274doing? •This\n<100> costs $20,00,000. •</data> 275 276 277# Sentence ending in a quote. 278<data>•"Sentence ending with a quote." •Bye.•</data> 279 280# Sentence, and test data, ending without a period or other terminator. 
281<data>•Here is a random sentence, no ending period<100></data> 282 283 284<data>• (This is it). •Testing the sentence iterator. •\ 285"This isn't it." •Hi! \ 286•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\ 287"This isn't it." •\ 288Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why. 289•Not on my time (el timo.)! •</data> 290 291<data>•Hello. •So what!!\u2029•"But now," he said, \ 292"I know!" •\ 293Harris thumbed down several, including "Away We Go" (which became the huge success Oklahoma!). •One species, B. anthracis, is highly virulent. 294•Wolf said about Sounder:\ 295"Beautifully thought-out and directed." •\ 296Have you ever said, "This is where\tI shall live"? •He answered, \ 297"You may not!" •Another popular saying is: "How do you do?". \n•\ 298Yet another popular saying is: \ 299'I'm fine thanks.' •\ 300What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!!\ 301•Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data> 302 303<data>•No breaks when . is surrounded by UPPER.Case letters. •</data> 304<data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data> 305<data>•No breaks when . is followed by a lower, with possible intervening punct .,a .$a .)a. 
•</data> 306 307# 308# Sentence Breaks: no break at the boundary between CJK and other letters 309# 310<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2029•Bye, now.•</data> 311 312# 313# Treat fullwidth variants of .!? the same as their 314# normal counterparts 315# 316<data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data> 317 318 319# 320# Don't break sentences at boundary between CJK and digits 321# 322<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•Bye, now<100></data> 323 324# 325# Breaks around '(' following a sentence TERM. (Rule 9) 326# 327<data>•How do you do?(•Fine). •</data> 328<data>•How do you do? •(Fine). •</data> 329<data>•How do you do?(•fine). •</data> 330<data>•How do you do? •(fine). •</data> 331 332# 333<data>•Hello.123<100></data> # Rule 6 334<data>•Hello?•123<100></data> 335 336<data>•HELLO.Bye<100></data> # Rule 7 337<data>•HELLO?•Bye<100></data> 338 339<data>•Hello.goodbye<100></data> #Rule 8 340<data>•Hello. •Goodbye<100></data> 341<data>•Hello. goodbye<100></data> 342 343 344 345# 346# test for bug #4158381: No breaks when there are no terminators around 347# 348<data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible, work the same on all platforms. 
•</data> 349<data>•Another test.\u2029•</data> 350 351# test for bug #4143071: Make sure sentences that end with digits 352# work right 353# 354<data>•Today is the 27th of May, 1998. •</data> 355<data>•Tomorrow with be 28 May 1998. •</data> 356<data>•The day after will be the 30th.\u2029•</data> 357 358# test for bug #4152416: Make sure sentences ending with a capital 359# letter are treated correctly 360# 361<data>•The type of all primitive \<code>boolean\</code> values accessed in the target VM. •Calls to xxx will return an implementor of this interface. \u2029•</data> 362 363# test for bug #4152117: Make sure sentence breaking is handling 364# punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS 365# HERE TO MAKE SURE IT DOESN'T CROP UP] 366# 367<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc. 368•</data> 369 370# sentence breaks for hindi which used Devanagari script 371# make sure there is sentence break after ?,danda(hindi phrase separator), 372# fullstop followed by space. (VERY old test) 373# 374<data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u094d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\ 375\u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u093e\n\ 376<100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 means "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u0905\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u093f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. 
•</data> 377 378# Regression test for bug #1984, Sentence break in Arabic text. 379 380<data>\ 381•\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u0627\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u0645\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u0022\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u0630\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u0649\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data> 382 383# Try a few more of the less common sentence endings. 384<data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\u203c •Let's end here. •</data> 385 386 387 388 389################################################################ 390# 391# 392# L I N E B R E A K 393# 394# 395################################################################ 396 397<line> 398# 399# Test Character for each of the line break classes. 
400# 401# 00A1;AI # INVERTED EXCLAMATION MARK ¡ 402# 0041;AL # LATIN CAPITAL LETTER A 403# 0009;BA # <control> 404# 00B4;BB # ACUTE ACCENT 405# 000C;BK # <control> 406# 2014;B2 # EM DASH 407# FFFC;CB # OBJECT REPLACEMENT CHARACTER 408# 0029;CL # RIGHT PARENTHESIS 409# 0301;CM # COMBINING ACUTE ACCENT 410# 0021;EX # EXCLAMATION MARK 411# 00A0;GL # NO-BREAK SPACE 412# 002D;HY # HYPHEN-MINUS 413# 4E00;ID # <CJK Ideograph, First> 414# 2024;IN # ONE DOT LEADER 415# 002C;IS # COMMA 416# 000A;LF # <control> 417# 0E5A;NS # THAI CHARACTER ANGKHANKHU 418# 0032;NU # DIGIT TWO 419# 0028;OP # LEFT PARENTHESIS 420# 0025;PO # PERCENT SIGN 421# 0024;PR # DOLLAR SIGN 422# 0022;QU # QUOTATION MARK 423# 0E01;SA # THAI CHARACTER KO KAI 424# DB7F;SG # Surrogate 425# 0020;SP # SPACE 426# 002F;SY # SOLIDUS / 427# F8FF;XX # Private Use 428# 200B;ZW # ZERO WIDTH SPACE 429 430 431# 2b Always break at end of text 432 433<data>• •\u00A1•</data> 434<data>• •\u0041•</data> 435<data>• •\u0009•</data> 436<data>• •\u00B4•</data> 437<data>• \u000C<100></data> # LB3C × BK 438<data>• •\u2014•</data> 439<data>• •\uFFFC•</data> 440<data>• \u0029•</data> # LB 8 × CL 441# <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: SP CM 442<data>• \u0021•</data> # LB 8 × EX 443#<data>• \u00A0•</data> # LB 11b × GL TODO: fix. 444<data>• •\u002D•</data> 445<data>• •\u4E00•</data> 446<data>• •\u2024•</data> 447<data>• \u002C•</data> # LB 8 × IS 448<data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL ) 449<data>• •\u0E5A•</data> 450<data>• •\u0032•</data> 451<data>• •\u0028•</data> 452<data>• •\u0025•</data> 453<data>• •\u0024•</data> 454<data>• •\u0022•</data> 455<data>• •\u0E01•</data> 456<data>• •\uDB7F•</data> 457<data>• \u0020•</data> # LB4 - don't break before space. 458<data>• \u002F•</data> # LB 8 × SY 459<data>• •\uF8FF•</data> 460<data>• \u200B•</data> # LB4 - don't break before ZA 461 462 463# 3a Always break after hard line breaks. 464# 3c Never break before hard line breaks. 
465 466<data>• •\u00A1\u2028<100>\u00A1•</data> 467<data>• •\u0041\u2028<100>\u0041•</data> 468<data>• •\u0009\u2028<100>\u0009•</data> 469<data>• •\u00B4\u2028<100>\u00B4•</data> 470<data>• \u000C<100>\u2028<100>\u000C<100></data> 471<data>• •\u2014\u2028<100>\u2014•</data> 472<data>• •\uFFFC\u2028<100>\uFFFC•</data> 473<data>• \u0029\u2028<100>\u0029•</data> 474#<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix. 475<data>• \u0021\u2028<100>\u0021•</data> 476#<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix 477<data>• •\u002D\u2028<100>\u002D•</data> 478<data>• •\u4E00\u2028<100>\u4E00•</data> 479<data>• •\u2024\u2028<100>\u2024•</data> 480<data>• \u002C\u2028<100>\u002C•</data> 481<data>• \u000A<100>\u2028<100>\u000A<100></data> 482<data>• •\u0E5A\u2028<100>\u0E5A•</data> 483<data>• •\u0032\u2028<100>\u0032•</data> 484<data>• •\u0028\u2028<100>\u0028•</data> 485<data>• •\u0025\u2028<100>\u0025•</data> 486<data>• •\u0024\u2028<100>\u0024•</data> 487<data>• •\u0022\u2028<100>\u0022•</data> 488<data>• •\u0E01\u2028<100>\u0E01•</data> 489<data>• •\uDB7F\u2028<100>\uDB7F•</data> 490<data>• \u0020\u2028<100>\u0020•</data> 491<data>• \u002F\u2028<100>\u002F•</data> 492<data>• •\uF8FF\u2028<100>\uF8FF•</data> 493<data>• \u200B\u2028<100>\u200B•</data> 494 495# User Guide example 496 497<data>•Parlez-•vous •français ?•</data> 498 499# 500# Old Line Break Test data. Orginally located in RBBITest::TestDefaultRuleBasedLineIteration() 501# 502 503<line> 504 505<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence. 506<100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data> 507 508<line> 509<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar 510<100>How, •are, •you? 
•This, •costs •$20,00,000.•</data> 511 512# test for bug #4068133 513# 514<data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data> 515 516# to test for bug #4086052 517<data>•foo\u00a0bar•</data> 518 519# to test for bug #4097920 520<data>•dog,cat,mouse •(one)•(two)\n<100></data> 521 522# to test for bug #4035266 523<data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data> 524 525 526# to test for bug #4098467 527# What follows is a string of Korean characters (I found it in the Yellow Pages 528# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 529# it correctly), first as precomposed syllables, and then as conjoining jamo. 530# Both sequences should be semantically identical and break the same way. 531# precomposed syllables... (I == Rich Gillam?) 532# 533<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data> 534 535# conjoining jamo... 536<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab •\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> 537 538# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd 539<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> 540 541# Surrogate line break tests. 542# 543<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> #This line and the following are equivalent. 544<data>•\u4e01•\U00020001•\u4e02•abc •\ue000 •\U000f0001•</data> 545 546# Regression for bug 836 547# Note: Unicode 5.1 changed this behavior 548# Unicode 5.2 changed it again, there is no break following the '(' 549<data>•AAA(AAA •</data> 550 551# Try some words from other scripts. 
552# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin 553# 554<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data> 555 556# 557# ticket #4853: unpaired surrogates should behave like AL 558# 559<data>•abc\ud801xyz•</data> 560 561# 562# Regression tests for failures that originally came from the monkey test. 563# Monkey test failure lines can, with slight reformatting, be copied into this section 564# as test cases. The error display from here is more informative. 565# 566<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data> 567<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data> 568<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data> 569 570# Test for #10176 (in root) 571<line> 572<data>•abc/•s •def•</data> 573<data>•abc/\u05D9 •def•</data> 574<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> 575<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data> 576 577 578 579######################################################################################## 580# 581# 582# T i t l e B o u n d a r y T e s t s 583# 584# 585########################################################################################## 586<title> 587<data>•Here •is •a •short •sample •sentence. •And •another.•</data> 588<data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. 
•AND •ANOTHER.•</data> 589<data>• •Start •and •end •with •spaces •</data> 590<data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</data> 591 592<data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data> 593<data>•123 •Start •with •a •number.•</data> 594 595<data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data> 596<data>•' '' •start •with •case-•ignorable & •case-•insensitive •cha'r'a'cter•</data> 597<data>• ''•aaa' •bbb '•ccc' '•ddd''' '''•eee '''•fff''' •ggg ''•</data> 598# Note: apostrophe is case-ignorable. space is not cased. 599 600########################################################################################## 601# 602# Thai Tests 603# 604########################################################################################## 605<locale th> 606<word> 607# 608# Test data originally from the test code source file 609# // @suwit -- Thai sample data from GVT Guideline 610# 611<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\ 612\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\ 613\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ 614\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data> 615 616# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 617<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data> 618 619<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\ 620\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\ 621\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data> 622 623<line> 624<data>•\u0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\ 625\u0020•\u0E1B\u0E34\u0E49\u0E48•\u0E07\u0E2D•\u0E22\u0E39\u0E48•\ 626\u0E43\u0E19•\u0E16\u0E49\u0E33•</data> 627 628# Data originally from intltest RBBITest::TestThaiLineBreak() 629# 630# \u0e2f-- the Thai paiyannoi character-- isn't a letter. 
It's a symbol that 631# represents elided letters at the end of a long word. It should be bound to 632# the end of the word and not treated as an independent punctuation mark. 633# 634# the one time where the paiyannoi occurs somewhere other than at the end 635# of a word is in the Thai abbreviation for "etc.", which both begins and 636# ends with a paiyannoi 637# 638<line> 639<data>•\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2f•\ 640\u0e08\u0e30•\ 641\u0e23\u0e30\u0e14\u0e21•\ 642\u0e40\u0e08\u0e49\u0e32•\ 643\u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48•\ 644\u0e2d\u0e2d\u0e01•\ 645\u0e21\u0e32•\ 646\u0e40\u0e23\u0e48\u0e07•\ 647\u0e23\u0e30\u0e1a\u0e32\u0e22•\ 648\u0e2d\u0e22\u0e48\u0e32\u0e07•\ 649\u0e40\u0e15\u0e47\u0e21•\ 650\u0e2f\u0e25\u0e2f•\ 651\u0e17\u0e35\u0e48•\ 652\u0e19\u0e31\u0e49\u0e19•</data> 653 654# Data originally from RBBITest::TestMixedThaiLineBreak() 655# @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start 656# 657<line> 658<data>•\u0E1B\u0E35•\ 659\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\ 6602545 •\ 661\u0E40\u0E1B\u0E47\u0E19•\ 662\u0E1B\u0E35•\ 663\u0E09\u0E25\u0E2D\u0E07•\ 664\u0E04\u0E23\u0E1A•\ 665\u0E23\u0E2D\u0E1A •\ 666\"\u0E52\u0E52\u0E50 •\ 667\u0E1b\u0E35\" •\ 668\u0E02\u0E2d\u0E07•\ 669\u0E01\u0E23\u0E38\u0E07•\ 670\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\ 671(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\ 672\u0E2B\u0E23\u0E37\u0E2D •\ 673Bangkok)•</data> 674 675# Data originally from RBBITest::TestMaiyamok() 676# The Thai maiyamok character is a shorthand symbol that means "repeat the previous 677# word". Instead of appearing as a word unto itself, however, it's kept together 678# with the word before it. 
679# 680<line> 681<data>•\u0e44\u0e1b\u0e46•\ 682\u0e21\u0e32\u0e46•\ 683\u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07•\ 684\u0e01\u0e23\u0e38\u0e07•\ 685\u0e40\u0e17\u0e1e•\ 686\u0e41\u0e25\u0e30•\ 687\u0e40\u0e03\u0e35•\ 688\u0e22\u0e07•\ 689\u0e43\u0e2b\u0e21\u0e48•</data> 690 691# Test for #10296 692<line> 693<data>•ใช•มั้ย•</data> 694<data>•มั๊ยล่ะ•ที่รัก•</data> 695 696# Test for #10593 697<line> 698<data>•เล่น•ผ่าน•ทาง•บลูทูธ•บน•อุปกรณ์•</data> 699 700# Test for city names #10691 701<line> 702<data>•ไป•ที่•ซานฟรานซิสโก•</data> 703 704# Test for #10630, #10631 705<line> 706<data>•แท็ก•แอปพลิเคชัน•เป็น•พิเศษ•</data> 707 708# Test for #11019 709<line> 710<data>•เบ•เบราว์เซอร์•โพ•โพสต์•โพสท์•</data> 711 712########################################################################################## 713# 714# Lao Tests 715# 716########################################################################################## 717<locale en> 718# Basic check for #7647 719<line> 720<data>•ສະບາຍດີ•</data> 721<data>•ດີ•ຂອບໃຈ•</data> 722<data>•ເຈົ້າ•ເວົ້າ•ພາສາ•ອັງກິດ•ໄດ້•ບໍ່•</data> 723<data>•ກະລຸນາ•ເວົ້າ•ຊ້າ•ໆ•</data> 724 725########################################################################################## 726# 727# Burmese/Myanmar Tests 728# 729########################################################################################## 730<locale en> 731# Basic sanity check for #10326 (some text from http://www.unicode.org/udhr/d/udhr_mya.txt) 732<line> 733<data>•လူ•တိုင်း•သည် •တူညီ •လွတ်လပ်•သော •ဂုဏ်•သိ•က္•ခါ•ဖြ•င့် •လည်းကောင်း၊ •</data> 734<data>•တူညီ•လွတ်လပ်•သော •အ•ခွ•င့်•အရေး•များ•ဖြ•င့် •လည်းကောင်း၊ •မွေး•ဖွား•လာ•သူများ •ဖြစ်သည်။•</data> 735<data>•ထို•သူ•တို့၌ •ပိုင်းခြား •ဝေဖန်•တတ်•သော •ဉာဏ်•နှ•င့် •ကျ•င့်•ဝတ် •သိတတ်•သော •စိတ်•တို့•ရှိ•ကြ၍ •</data> 736<data>•ထို•သူ•တို့သည် •အချင်းချင်း •မေတ္တာ•ထား၍ •ဆက်ဆံ•ကျ•င့်•သုံး•</data> 737 738########################################################################################## 739# 740# Khmer Tests 741# 
742########################################################################################## 743 744# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327 745# from the file testdata/wordsegments.txt 746<locale en> 747<word> 748 749<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data> 750<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data> 751<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data> 752#ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> 753<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data> 754#ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data> 755<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data> 756<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data> 757<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data> 758<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data> 759<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data> 760<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data> 761<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data> 762<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data> 763<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data> 764<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data> 765<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data> 766<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data> 767<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data> 768<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data> 769#អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data> 770<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data> 771<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data> 772<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data> 773 774 775# 776# Jitterbug 3671 Test Case 777# 
778<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data> 779 780# 781# Trac ticket 5595 Test Case 782<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\ 783ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\ 784ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\ 785สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\ 786ไมล์<200></data> 787 788#################################################################################### 789# 790# Tailored (locale specific) breaking. 791# 792#################################################################################### 793 794# Japanese line break tailoring test 795 796<locale ja> 797<line> 798<data>•\u3041•\u3043•\u3045•\u31f1•</data> 799<locale en> 800<line> 801<data>•\u3041\u3043\u3045\u31f1•</data> 802 803# The following data was originally in RBBITest::TestJapaneseWordBreak() 804<locale ja> 805<word> 806<data>•\u4ECA\u65E5<400>\u306F<400>\u3044\u3044<400>\u5929\u6C17<400>\u3067\u3059<400>\u306D<400>\u3002•\u000D\u000A•</data> 807 808# UBreakIteratorType UBRK_WORD, Locale "ja" 809# Don't break in runs of hiragana or runs of ideograph, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009). 
810# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002 811# modified to work with dbbi code - should verify 812 813<locale ja> 814<word> 815<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。•</data> 816 817# Test for #10176 (in ja) 818<line> 819<data>•abc/•s •def•</data> 820<data>•abc/\u05D9 •def•</data> 821<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> 822<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data> 823 824 825<locale root> 826<word> 827<data>•私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。•</data> 828# The following test is for #10300 829<data>•例えば<400>オーストラリア<400>。•</data> 830# The following test is for #10571 831<data>•一部<400>の<400>地域<400>では<400>、<0>ブラジル<400>、<0>インドネシア<400>、<0>オーストリア<400>、<0>ニュージーランド<400>で<400>ある<400>。•</data> 832 833# UBreakIteratorType UBRK_SENTENCE, Locale "el" 834# Add break after Greek question mark (cldrbug #2069). 835# "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. " 836# "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3" 837# which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ" 838 839<locale root> 840<sent> 841<data>•Αβ, γδ; Ε ζη; Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> 842 843<locale el> 844<sent> 845<data>•Αβ, γδ; •Ε ζη; •Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data> 846 847# UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX" 848# Words don't include colon or period (cldrbug #1969). 
849 850<locale en_US> 851<word> 852<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \ 853•for<200> •CS<200>-•types<200>.•</data> 854<data>•\uFF92\uFF76\uFF9E<400> •</data> 855 856<locale en_US_POSIX> 857<word> 858<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct<200>.•field<200> \ 859•for<200> •CS<200>-•types<200>.•</data> 860<data>•\u06c9<200>\uc799\ufffa•</data> 861<data>•\uFF92\uFF76\uFF9E<400> •</data> 862 863 864# UBreakIteratorType UBRK_CHARACTER, Locale "th" 865# Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161). 866# Update: As of Unicode 6.1 root has same behavior as th for this. 867# 868# "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 " 869# "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) " 870# "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 " 871# which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา " 872 873<locale th> 874<char> 875<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\ 876(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\ 877\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data> 878 879# Finnish line breaking 880# 881# These rules deal with hyphens when there is a space on the leading side. 882# There should be a break opportunity between the space and the hyphen, and not after the hyphen. 883# See CLDR ticket 3029. 884# See ICU ticket 8151 885 886<locale root> 887<line> 888<data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASCII hyphen 889<data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen 890 891<locale fi> 892<line> 893# TODO: problems with Finnish line break rules cause these two lines to fail. 
894#<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen 895#<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen 896 897<data>•abc •- •def •abc •-def •abc- •def •</data> # With ASCII hyphen 898<data>•abc •‐ •def •abc •‐def •abc‐ •def •</data> # With Unicode u2010 hyphen 899 900# Test for #10176 (in fi) 901<line> 902<data>•abc/•s •def•</data> 903<data>•abc/\u05D9 •def•</data> 904<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data> 905<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data> 906 907#################################################################################### 908# 909# Test CSS line break variants: strict, normal, loose 910# 911#################################################################################### 912 913<locale ja@lb=strict> 914<line> 915# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01• 916<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data> 917 918<locale ja@lb=normal> 919<line> 920# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01• 921<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data> 922 923<locale ja@lb=loose> 924<line> 925# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01• 926<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data> 927 928<locale en@lb=strict> 929<line> 930# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01• 931<data>•\u3084\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data> 932 933<locale en@lb=normal> 934<line> 935# •brk OK before 3063 •no brk before 301C •no brk btw 2026 •no brk before FF01• 
936<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data> 937 938<locale en@lb=loose> 939<line> 940# •brk OK before 3063 •no brk before 301C •brk OK btw 2026 •no brk before FF01• 941<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031\u301C\u0020•\u2026•\u2026\u0020•\u30A2\uFF01\u0020•</data> 942