#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# created on: 2013jun05
# created by: Markus W. Scherer

"""Converts CLDR collation files from XML syntax to ICU syntax.

Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
Preserves indentation (except where it joins lines) and text vs. NCR etc.
Does not handle arbitrary LDML XML collation syntax."""

# Invoke with two arguments:
# - the source folder path
# - the destination folder path
# For example:
#   ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation

import codecs
import glob
import os.path
import sys


def GetIndent(s):
    """Returns the leading run of spaces and tabs of s.

    If s consists only of spaces and tabs (or is empty), all of s is returned.
    """
    return s[:len(s) - len(s.lstrip(" \t"))]


# Substring replacements, applied to each rules line in this order.
# The input is processed as raw text, not parsed as XML, so characters that
# XML escapes show up as entity references (&quot; &amp; &apos; &lt; &gt;)
# and have to be matched in that form. The literal forms are matched as well.
replacements = (
    # White space and syntax characters must be quoted.
    # Using '\\u0020' rather than just ' ' for clarity.
    ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
    (">!<", ">'!'<"),
    ('>"<', ">'\\\"'<"),
    (">&quot;<", ">'\\\"'<"),
    (">#<", ">'\\u0023'<"),
    (">$<", ">'$'<"),
    (">%<", ">'%'<"),
    (">&<", ">'&'<"),
    (">&amp;<", ">'&'<"),
    (">'<", ">''<"),
    (">&apos;<", ">''<"),
    (">(<", ">'('<"),
    (">)<", ">')'<"),
    (">*<", ">'*'<"),
    (">+<", ">'+'<"),
    (">,<", ">','<"),
    (">-<", ">'-'<"),
    (">.<", ">'.'<"),
    (">/<", ">'/'<"),
    (">:<", ">':'<"),
    (">;<", ">';'<"),
    ("><<", ">'<'<"),
    (">&lt;<", ">'<'<"),
    (">=<", ">'='<"),
    (">><", ">'>'<"),
    (">&gt;<", ">'>'<"),
    (">?<", ">'?'<"),
    (">@<", ">'@'<"),
    (">[<", ">'['<"),
    (">\\<", ">'\\\\'<"),
    (">]<", ">']'<"),
    (">^<", ">'^'<"),
    (">_<", ">'_'<"),
    (">`<", ">'`'<"),
    (">{<", ">'{'<"),
    (">|<", ">'|'<"),
    (">}<", ">'}'<"),
    (">~<", ">'~'<"),
    # ha.xml has the following
    ("'y", "''y"),
    ("'Y", "''Y"),
    # kl.xml has the following
    ("K'", "K''"),
    # not Pattern_White_Space, just obscure
    (u"\u00A0", u"\\u00A0"),
    (u"\u200C", u"\\u200C"),
    (u"\u200D", u"\\u200D"),
    (u"\u3000", u"\\u3000"),
    # obscure, and some tools do not handle noncharacters well
    (u"\uFDD0", u"'\\uFDD0'"),
    # The old ICU collation rule parser seems to need more escaping than it should.
    (u"≠", u"'≠'"),
    # fi.xml resets contain a space
    (u" ̵</reset>", u"'\\u0020'̵"),
    # fa.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
    # ml.xml strings contain spaces
    (u" </s>", u"'\\u0020'"),
    (u" </reset>", u"'\\u0020'"),
    # vi.xml <sc> with non-NFD_Inert chars
    (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
    # en_US_POSIX needs a lot of quoting.
    ("<pc> !&quot;#$%&amp;'()*+,-./</pc>", "<*'\\u0020'-'/'"),
    ("<pc>0123456789:;&lt;=&gt;?@</pc>", "<*0-'@'"),
    ("<pc>[\\]^_`</pc>", "<*'['-'`'"),
    # The backslash is doubled so that the escape is written out as text
    # (like the other \\uXXXX outputs above) on Python 2 and Python 3 alike.
    ("<pc>{|}~</pc>", "<*'{'-'\\u007F'"),
    # CJK parenthesized resets
    ("<reset>(", "&'('"),
    (")</reset>", "')'"),
    # Convert XML elements into ICU syntax.
    ("><!--", "> #"),  # add a space before an inline comment
    ("<!--", "#"),
    (" -->", ""),
    ("-->", ""),
    ("<reset>", "&"),
    ('<reset before="primary">', "&[before 1]"),
    ('<reset before="secondary">', "&[before 2]"),
    ('<reset before="tertiary">', "&[before 3]"),
    ("</reset>", ""),
    ("<p>", "<"),
    ("</p>", ""),
    ("<s>", "<<"),
    ("</s>", ""),
    ("<t>", "<<<"),
    ("</t>", ""),
    ("<i>", "="),
    ("</i>", ""),
    ("<pc>", "<*"),
    ("</pc>", ""),
    ("<sc>", "<<*"),
    ("</sc>", ""),
    ("<tc>", "<<<*"),
    ("</tc>", ""),
    ("<ic>", "=*"),
    ("</ic>", ""),
    ("<x>", ""),
    ("</x>", ""),
    ("<extend>", "/"),
    ("</extend>", ""),
    ("</context>", "|"),
    ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
    ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
    ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
    ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
    ("<first_primary_ignorable/>", "[first primary ignorable]"),
    ("<last_primary_ignorable/>", "[last primary ignorable]"),
    ("<first_variable/>", "[first variable]"),
    ("<last_variable/>", "[last variable]"),
    ("<first_non_ignorable/>", "[first regular]"),
    ("<last_non_ignorable/>", "[last regular]"),
    ("<last_non_ignorable />", "[last regular]"),
    ("<first_trailing/>", "[first trailing]"),
    ("<last_trailing/>", "[last trailing]"),
)

# One-line <rules> elements that contain nothing but an import.
# They are replaced with the bare import element.
_IMPORT_ONLY_RULES = (
    '<rules><import source="sr"/></rules>',
    '<rules><import source="hr" type="search"/></rules>',
    '<rules><import source="hr"/></rules>',
    '<rules><import source="ps"/></rules>',
)


def _ConvertNCRs(line):
    """Returns line with each hexadecimal NCR turned into a backslash-u escape.

    A reference such as "&#x41;" becomes a backslash, "u" and the hex digits
    left-padded with zeros to four digits.

    Raises:
      ValueError: if a reference is not terminated by ";".
      AssertionError: if a reference has more than four hex digits
          (supplementary code points are not handled).
    """
    while True:
        i = line.find("&#x")
        if i < 0:
            return line
        limit = line.find(";", i + 3)
        if limit < 0:
            # Without this check the slices below would use -1 and the
            # reference would never be consumed.
            raise ValueError(
                "unterminated numeric character reference in: " + line.rstrip())
        cp = line[i + 3:limit].rjust(4, "0")
        assert len(cp) == 4  # not handling supplementary code points
        line = line[:i] + "\\u" + cp + line[limit + 1:]


def ConvertFile(src, dest):
    """Copies src to dest, rewriting <rules> sections from XML to ICU syntax.

    Lines outside of <rules>...</rules> are copied unchanged. A known
    import-only <rules> line is replaced with its <import> element. Any other
    <rules> section is written as <cr><![CDATA[ ... ]]></cr>, with its lines
    converted via the replacements table, NCRs turned into backslash-u
    escapes, and consecutive rule lines joined until a reset, a comment, or
    a collected length of more than 80 characters ends the output line.

    Args:
      src: iterable of text lines, such as an open text file.
      dest: object with a write(text) method.

    Raises:
      ValueError: if a numeric character reference is not terminated.
      AssertionError: if the input uses syntax that this script does not
          handle (for example a <rules> line with other content, or text
          following the end of a comment on the same line).
    """
    in_rules = False
    partial = ""  # converted rule text collected for the current output line
    in_ml_comment = False
    comment_indent = ""  # indentation of the first line of a multi-line comment
    for line in src:
        if "<rules>" in line:
            indent = GetIndent(line)
            stripped = line.strip()
            if stripped in _IMPORT_ONLY_RULES:
                # Replace import-only rules with import elements.
                element = stripped[len("<rules>"):-len("</rules>")]
                dest.write(indent + element + "\n")
            else:
                # Replace the XML <rules> section with ICU syntax rules in <cr>.
                assert stripped == "<rules>"
                dest.write(indent + "<cr><![CDATA[\n")
                in_rules = True
        elif "</rules>" in line:
            # Flush, and go back to just copying lines until the next <rules>.
            if partial:
                dest.write(partial + "\n")
                partial = ""
            in_ml_comment = False
            dest.write(GetIndent(line) + "]]></cr>\n")
            in_rules = False
        else:
            if in_rules:
                # Find out whether we want to concatenate the current line
                # with the previous and/or next one.
                # These decisions look at the XML text, before conversion.
                finish_partial = False  # Finish collected, partial input.
                start_ml_comment = False  # Start of a multi-line comment.
                stop_comment = False  # End of a comment, must terminate the line.
                if ("<reset" in line) or line.lstrip().startswith("<!--"):
                    finish_partial = True
                if partial and len(partial.strip()) > 80:
                    finish_partial = True
                if "<!--" in line and "-->" not in line:
                    start_ml_comment = True
                if "-->" in line:
                    assert line.rstrip().endswith("-->")
                    stop_comment = True

                # Convert XML syntax to ICU syntax.
                if "<context>" in line:
                    # Swap context & relation:
                    #   <x><context>カ</context><i>ー</i></x>
                    # turns into
                    #   =カ|ー
                    if "<i>" in line:
                        line = line.replace("<i>", "").replace("<context>", "<i>")
                    elif "<t>" in line:
                        line = line.replace("<t>", "").replace("<context>", "<t>")

                for (xml, icu) in replacements:
                    line = line.replace(xml, icu)

                # Convert Numeric Character References to \uhhhh.
                line = _ConvertNCRs(line)

                # Start/continue/finish concatenation, and output.
                if partial and finish_partial:
                    # Write collected input.
                    dest.write(partial + "\n")
                    partial = ""

                if start_ml_comment:
                    # Start a multi-line comment.
                    assert not partial
                    comment_indent = GetIndent(line)  # can be the empty string
                    in_ml_comment = True
                elif in_ml_comment:
                    # Continue a multi-line comment.
                    assert not partial
                    if line.startswith(comment_indent):
                        rest = line[len(comment_indent):]
                        if rest.startswith((" ", "\t")):
                            # Preserve further indentation.
                            line = comment_indent + "#" + rest
                        else:
                            # Add a space after the #.
                            line = comment_indent + "# " + rest
                    else:
                        # Indent at least as much as the first line.
                        line = line.lstrip()
                        if line:
                            line = comment_indent + "# " + line
                        else:
                            line = comment_indent + "#\n"
                elif stop_comment:
                    # Just output the line, do not start collecting input.
                    # ICU-syntax comments end with the end of the line,
                    # do not append rules to them.
                    if partial:
                        # NOTE(review): line normally still ends with its own
                        # newline here, so this appears to emit an extra blank
                        # line -- confirm whether that is intended.
                        line = partial + line.lstrip() + "\n"
                        partial = ""
                elif not partial:
                    # Start collecting input.
                    partial = line.rstrip()
                elif partial:
                    # Continue collecting input.
                    partial += line.strip()

                if stop_comment:
                    in_ml_comment = False
            # Lines that were collected into partial are written later.
            if not partial:
                dest.write(line)


def main():
    """Converts every *.xml file in the source folder into the destination folder.

    The two folder paths are taken from the first two command-line arguments.
    Each output file gets the same base name as its input file.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: coll2icu.py <source folder> <destination folder>")
    (src_root, dest_root) = sys.argv[1:3]
    src_pattern = os.path.join(src_root, "*.xml")
    for src_path in glob.iglob(src_pattern):
        basename = os.path.basename(src_path)
        dest_path = os.path.join(dest_root, basename)
        with codecs.open(src_path, "r", "UTF-8") as src:
            with codecs.open(dest_path, "w", "UTF-8") as dest:
                ConvertFile(src, dest)


if __name__ == "__main__":
    main()