1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3#
4# created on: 2013jun05
5# created by: Markus W. Scherer
6
7"""Converts CLDR collation files from XML syntax to ICU syntax.
8
9Handles the CLDR collation data in the post-CLDR 23 trunk in 2013 June.
10Preserves indentation (except where it joins lines) and text vs. NCR etc.
11Does not handle arbitrary LDML XML collation syntax."""
12
13# Invoke with two arguments:
14# - the source folder path
15# - the destination folder path
16# For example:
17# ~/svn.cldr$ collicu/tools/scripts/coll2icu.py trunk/common/collation collicu/common/collation
18
19import codecs
20import glob
21import os.path
22import sys
23
def GetIndent(s):
  """Returns the leading run of space and tab characters of s.

  Only ' ' and '\\t' count as indentation; any other character, including
  a newline, ends the run. If s consists solely of spaces and tabs
  (or is empty), s itself is returned.
  """
  # Whatever lstrip() removes from the front is exactly the indentation.
  indent_length = len(s) - len(s.lstrip(" \t"))
  return s[:indent_length]
28
29
# Substring replacements, as (XML text, ICU text) pairs.
# The pairs are applied one after another, in this order, with str.replace()
# to each line of a <rules> section. Order matters: the special cases
# (quoting, per-locale fixes) must come before the generic element
# conversions further down, which would otherwise consume the tags
# that the special cases match on.
# ICU-syntax escapes are written with a doubled backslash ('\\u0020') so that
# the output contains the escape text itself, not the character.
replacements = (
  # White space and syntax characters must be quoted.
  # Using '\\u0020' rather than just ' ' for clarity.
  ("<reset> </reset>", "&'\\u0020'"),  # can't just replace all "> <"
  (">!<", ">'!'<"),
  ('>"<', ">'\\\"'<"),
  (">&quot;<", ">'\\\"'<"),
  (">#<", ">'\\u0023'<"),
  (">$<", ">'$'<"),
  (">%<", ">'%'<"),
  (">&<", ">'&'<"),
  (">&amp;<", ">'&'<"),
  (">'<", ">''<"),
  (">&apos;<", ">''<"),
  (">(<", ">'('<"),
  (">)<", ">')'<"),
  (">*<", ">'*'<"),
  (">+<", ">'+'<"),
  (">,<", ">','<"),
  (">-<", ">'-'<"),
  (">.<", ">'.'<"),
  (">/<", ">'/'<"),
  (">:<", ">':'<"),
  (">;<", ">';'<"),
  (">&lt;<", ">'<'<"),
  (">=<", ">'='<"),
  (">&gt;<", ">'>'<"),
  (">?<", ">'?'<"),
  (">@<", ">'@'<"),
  (">[<", ">'['<"),
  (">\\<", ">'\\\\'<"),
  (">]<", ">']'<"),
  (">^<", ">'^'<"),
  (">_<", ">'_'<"),
  (">`<", ">'`'<"),
  (">{<", ">'{'<"),
  (">|<", ">'|'<"),
  (">}<", ">'}'<"),
  (">~<", ">'~'<"),
  # ha.xml has the following
  ("'y", "''y"),
  ("'Y", "''Y"),
  # kl.xml has the following
  ("K'", "K''"),
  # not Pattern_White_Space, just obscure
  (u"\u00A0", u"\\u00A0"),
  (u"\u200C", u"\\u200C"),
  (u"\u200D", u"\\u200D"),
  (u"\u3000", u"\\u3000"),
  # obscure, and some tools do not handle noncharacters well
  (u"\uFDD0", u"'\\uFDD0'"),
  # The old ICU collation rule parser seems to need more escaping than it should.
  (u"≠", u"'≠'"),
  # fi.xml resets contain a space
  (u" ̵</reset>", u"'\\u0020'̵"),
  # fa.xml <sc> with non-NFD_Inert chars
  (u"<sc>\u0650\u064f\u064b\u064d\u064c</sc>", u"<<\u0650<<\u064f<<\u064b<<\u064d<<\u064c"),
  # ml.xml strings contain spaces
  (u" </s>", u"'\\u0020'"),
  (u" </reset>", u"'\\u0020'"),
  # vi.xml <sc> with non-NFD_Inert chars
  (u"<sc>\u0309\u0303\u0301\u0323</sc>", u"<<\u0309<<\u0303<<\u0301<<\u0323"),
  # en_US_POSIX needs a lot of quoting.
  ("<pc>&#x20;&#x21;&#x22;&#x23;&#x24;&#x25;&#x26;&#x27;&#x28;&#x29;&#x2a;&#x2b;&#x2c;&#x2d;&#x2e;&#x2f;</pc>", "<*'\\u0020'-'/'"),
  ("<pc>0123456789&#x3a;&#x3b;&#x3c;&#x3d;&#x3e;&#x3f;&#x40;</pc>", "<*0-'@'"),
  ("<pc>&#x5b;&#x5c;&#x5d;&#x5e;&#x5f;&#x60;</pc>", "<*'['-'`'"),
  # The backslash is doubled so that the output contains the escape text
  # \u007F on every Python version. In a plain (non-u) literal, Python 2
  # keeps a single-backslash \u sequence verbatim, but Python 3 would turn
  # it into a raw DEL control character.
  ("<pc>&#x7b;&#x7c;&#x7d;&#x7e;&#x7f;</pc>", "<*'{'-'\\u007F'"),
  # CJK parenthesized resets
  ("<reset>(", "&'('"),
  (")</reset>", "')'"),
  # Convert XML elements into ICU syntax.
  ("><!--", "> #"),  # add a space before an inline comment
  ("<!--", "#"),
  (" -->", ""),
  ("-->", ""),
  ("<reset>", "&"),
  ('<reset before="primary">', "&[before 1]"),
  ('<reset before="secondary">', "&[before 2]"),
  ('<reset before="tertiary">', "&[before 3]"),
  ("</reset>", ""),
  ("<p>", "<"),
  ("</p>", ""),
  ("<s>", "<<"),
  ("</s>", ""),
  ("<t>", "<<<"),
  ("</t>", ""),
  ("<i>", "="),
  ("</i>", ""),
  ("<pc>", "<*"),
  ("</pc>", ""),
  ("<sc>", "<<*"),
  ("</sc>", ""),
  ("<tc>", "<<<*"),
  ("</tc>", ""),
  ("<ic>", "=*"),
  ("</ic>", ""),
  ("<x>", ""),
  ("</x>", ""),
  ("<extend>", "/"),
  ("</extend>", ""),
  ("</context>", "|"),
  ("<first_tertiary_ignorable/>", "[first tertiary ignorable]"),
  ("<last_tertiary_ignorable/>", "[last tertiary ignorable]"),
  ("<first_secondary_ignorable/>", "[first secondary ignorable]"),
  ("<last_secondary_ignorable/>", "[last secondary ignorable]"),
  ("<first_primary_ignorable/>", "[first primary ignorable]"),
  ("<last_primary_ignorable/>", "[last primary ignorable]"),
  ("<first_variable/>", "[first variable]"),
  ("<last_variable/>", "[last variable]"),
  ("<first_non_ignorable/>", "[first regular]"),
  ("<last_non_ignorable/>", "[last regular]"),
  ("<last_non_ignorable />", "[last regular]"),
  ("<first_trailing/>", "[first trailing]"),
  ("<last_trailing/>", "[last trailing]")
)
146
147
def ConvertFile(src, dest):
  """Copies src to dest, converting each <rules> section to ICU rule syntax.

  Lines outside a <rules> section are written to dest unchanged.
  A line containing <rules> is either one of four known import-only
  one-liners, which is rewritten as a bare <import .../> element, or the
  opening tag on its own, which is replaced by the start of a
  <cr><![CDATA[ section. Inside that section each line is converted with
  the module-level `replacements` table, numeric character references are
  rewritten as \\uhhhh escapes, and consecutive rule lines are joined into
  longer output lines. The line containing </rules> flushes pending output
  and writes the closing ]]></cr>.

  Args:
    src: An iterable of input text lines.
    dest: An object whose write() method receives the output text.

  Raises:
    AssertionError: If the input does not have the shape this script
      expects: other text on a <rules> line that is not one of the known
      import one-liners, a "-->" that is not at the end of its line,
      a numeric character reference with more than four hex digits,
      or a multi-line comment that starts or continues while rule text
      is still being collected.
  """
  in_rules = False  # True between a bare <rules> line and its </rules> line.
  partial = ""  # Converted rule text collected so far but not yet written.
  in_ml_comment = False  # True while inside a multi-line XML comment.
  for line in src:
    if "<rules>" in line:
      indent = GetIndent(line)
      stripped = line.strip()
      # Replace import-only rules with import elements.
      if stripped == '<rules><import source="sr"/></rules>':
        dest.write(indent + '<import source="sr"/>\n')
      elif stripped == '<rules><import source="hr" type="search"/></rules>':
        dest.write(indent + '<import source="hr" type="search"/>\n')
      elif stripped == '<rules><import source="hr"/></rules>':
        dest.write(indent + '<import source="hr"/>\n')
      elif stripped == '<rules><import source="ps"/></rules>':
        dest.write(indent + '<import source="ps"/>\n')
      else:
        # Replace the XML <rules> section with ICU syntax rules in <cr>.
        # Anything else on the <rules> line is not supported.
        assert stripped == "<rules>"
        dest.write(indent + "<cr><![CDATA[\n")
        in_rules = True
    elif "</rules>" in line:
      # Flush, and go back to just copying lines until the next <rules>.
      if partial:
        dest.write(partial + "\n")
        partial = ""
      in_ml_comment = False
      dest.write(GetIndent(line) + "]]></cr>\n")
      in_rules = False
    else:
      if in_rules:
        # Find out whether we want to concatenate the current line
        # with the previous and/or next one.
        # These flags are computed from the original XML line,
        # before the replacements below remove the XML markers.
        finish_partial = False  # Finish collected, partial input.
        start_ml_comment = False  # Start of a multi-line comment.
        stop_comment = False  # End of a comment, must terminate the line.
        # A reset, or a line that begins with a comment, starts a new output line.
        if ("<reset" in line) or line.lstrip().startswith("<!--"):
          finish_partial = True
        # Do not let joined output lines grow much beyond 80 characters.
        if partial and len(partial.strip()) > 80:
          finish_partial = True
        if "<!--" in line and "-->" not in line:
          start_ml_comment = True
        if "-->" in line:
          # Rule text after the end of a comment on the same line
          # is not supported.
          assert line.rstrip().endswith("-->")
          stop_comment = True

        # Convert XML syntax to ICU syntax.
        if "<context>" in line:
          # Swap context & relation:
          #   <x><context>カ</context><i>ー</i></x>
          # turns into
          #   =カ|ー
          # Only the <i> and <t> relations are swapped here.
          if "<i>" in line:
            line = line.replace("<i>", "").replace("<context>", "<i>")
          elif "<t>" in line:
            line = line.replace("<t>", "").replace("<context>", "<t>")

        # Apply the replacements in table order; earlier entries handle
        # special cases before the generic element conversions.
        for (xml, icu) in replacements:
          line = line.replace(xml, icu)

        while True:
          # Convert a Numeric Character Reference to \\uhhhh.
          i = line.find("&#x")
          if i < 0: break
          # NOTE(review): if no ";" follows, find() returns -1 and the
          # slices below no longer delimit the reference. Presumably this
          # cannot happen with well-formed XML input -- confirm.
          limit = line.find(";", i + 3)
          cp = line[i + 3:limit]
          # Pad shorter references with leading zeros to four hex digits.
          while len(cp) < 4: cp = "0" + cp
          assert len(cp) == 4  # not handling supplementary code points
          line = line[:i] + "\\u" + cp + line[limit + 1:]

        # Start/continue/finish concatenation, and output.
        if partial and finish_partial:
          # Write collected input.
          dest.write(partial + "\n")
          partial = ""

        if start_ml_comment:
          # Start a multi-line comment.
          # Its first line is written as is by the dest.write(line) at the
          # end of the loop body; the replacements have already turned
          # the "<!--" into "#".
          assert not partial
          comment_indent = GetIndent(line)  # can be the empty string
          in_ml_comment = True
        elif in_ml_comment:
          # Continue a multi-line comment.
          # ICU-syntax comments are per-line, so each continuation line
          # gets its own "#" at the indentation of the first comment line.
          assert not partial
          if line.startswith(comment_indent):
            # NOTE(review): this indexing assumes that the line is longer
            # than comment_indent, e.g. because it ends with a newline.
            if line[len(comment_indent)] in " \t":
              # Preserve further indentation.
              line = comment_indent + "#" + line[len(comment_indent):]
            else:
              # Add a space after the #.
              line = comment_indent + "# " + line[len(comment_indent):]
          else:
            # Indent at least as much as the first line.
            # lstrip() also removes the newline of a whitespace-only line,
            # which is why the empty case adds one back.
            line = line.lstrip()
            if line:
              line = comment_indent + "# " + line
            else:
              line = comment_indent + "#\n"
        elif stop_comment:
          # Just output the line, do not start collecting input.
          # ICU-syntax comments end with the end of the line,
          # do not append rules to them.
          if partial:
            # NOTE(review): line.lstrip() keeps the newline of the input
            # line, if it had one, so the extra "\n" then yields an empty
            # output line after this one -- confirm that this is intended.
            line = partial + line.lstrip() + "\n"
            partial = ""
        elif not partial:
          # Start collecting input.
          partial = line.rstrip()
        elif partial:
          # Continue collecting input.
          partial += line.strip()

        if stop_comment:
          in_ml_comment = False
      # Write the line unless it was added to the collected partial input.
      # Outside of <rules>, partial is empty and the line is copied as is.
      if not partial: dest.write(line)
264
265
def main():
  """Converts every *.xml file in the source folder into the destination folder.

  Reads the source folder path from sys.argv[1] and the destination folder
  path from sys.argv[2]; any further arguments are ignored.
  Each output file gets the base name of its input file.
  Input and output files are read and written as UTF-8 text.

  Raises:
    SystemExit: With a usage message, if fewer than two folder paths
      were given on the command line.
  """
  if len(sys.argv) < 3:
    # Without this check, unpacking the too-short argument slice below
    # fails with a ValueError traceback that does not say what is missing.
    sys.exit("usage: coll2icu.py <source folder path> <destination folder path>")
  (src_root, dest_root) = sys.argv[1:3]
  src_pattern = os.path.join(src_root, "*.xml")
  for src_path in glob.iglob(src_pattern):
    basename = os.path.basename(src_path)
    dest_path = os.path.join(dest_root, basename)
    with codecs.open(src_path, "r", "UTF-8") as src:
      with codecs.open(dest_path, "w", "UTF-8") as dest:
        ConvertFile(src, dest)
275
276
277if __name__ == "__main__":
278  main()
279