1"""
2Looks for duplicate resource definitions and removes all but the last one.
3"""
4
import os.path
import re
import xml.parsers.expat
7
class DuplicateRemover:
    """Removes duplicate resource definitions from ``values*/*.xml`` files.

    When the same resource is defined more than once in a file, every
    definition except the last one is cut out of the text; the last definition
    is the one that takes precedence and is left intact.
    """

    # expat only counts LF, CR and CRLF as line terminators, so the text has to
    # be split on exactly those. unicode.splitlines() additionally breaks on
    # characters such as U+0085 and U+2028, which would shift every later line
    # index relative to the positions reported by the parser.
    _LINE_RE = re.compile(r"[^\r\n]*(?:\r\n|\r|\n)|[^\r\n]+")

    def matches(self, file_path):
        """Return True if `file_path` is an ``.xml`` file whose parent directory
        name starts with ``values``.
        """
        dirname, basename = os.path.split(file_path)
        dirname = os.path.split(dirname)[1]
        return dirname.startswith("values") and basename.endswith(".xml")

    def consume(self, xml_path, input):
        """Strip all but the last definition of every duplicated resource.

        Args:
            xml_path: Path of the file being processed; only used in the
                progress messages printed to stdout.
            input: The file contents as a UTF-8 encoded byte string.

        Returns:
            `input` itself when it contains no duplicates, otherwise the
            de-duplicated document encoded as UTF-8.

        Raises:
            xml.parsers.expat.ExpatError: if expat rejects the document.
        """
        parser = xml.parsers.expat.ParserCreate("utf-8")
        try:
            # Python 2 only; Python 3 parsers always hand out unicode strings
            # and reject this attribute.
            parser.returns_unicode = True
        except AttributeError:
            pass
        tracker = ResourceDefinitionLocator(parser)
        parser.StartElementHandler = tracker.start_element
        parser.EndElementHandler = tracker.end_element
        parser.Parse(input)

        # Extract the duplicate resource definitions, ignoring the last
        # definition which will take precedence and be left intact.
        duplicates = []
        for entries in tracker.resource_definitions.values():
            duplicates.extend(entries[:-1])
        if not duplicates:
            # Nothing to remove: skip decoding and rebuilding the document.
            return input

        # Sort the duplicates so that they are in document order. That way the
        # text only has to be walked once.
        duplicates.sort(key=lambda definition: definition.start)
        for definition in duplicates:
            print("{0}: removing duplicate resource '{1}'".format(xml_path, definition.name))

        # Treat the input as UTF-8 or else column numbers will be wrong.
        input_lines = self._split_lines(input.decode("utf-8"))
        spans = [(definition.start, definition.end) for definition in duplicates]
        output_lines = self._remove_spans(input_lines, spans)

        print("deduped {0}".format(xml_path))
        return "".join(output_lines).encode("utf-8")

    @staticmethod
    def _split_lines(text):
        """Split `text` into lines (terminators kept) on LF, CR and CRLF only."""
        return DuplicateRemover._LINE_RE.findall(text)

    @staticmethod
    def _remove_spans(lines, spans):
        """Return a copy of `lines` with every span of text cut out.

        Args:
            lines: The document as a list of lines, terminators included.
            spans: ``((start_line, start_col), (end_line, end_col))`` pairs with
                0-based line indices, sorted by start and not overlapping. The
                end position is exclusive.

        Returns:
            The remaining lines. A line that is left with nothing but
            whitespace after a removal is dropped entirely.
        """
        if not spans:
            return list(lines)

        output = []
        pending = ""            # Kept text of the line currently being assembled.
        pending_edited = False  # True once something was cut out of that line.
        line_no = 0
        col_no = 0
        for start, end in spans:
            if line_no < start[0]:
                # The next span starts on a later line, so finish the line
                # being assembled and copy the untouched lines in between.
                rest = pending + lines[line_no][col_no:]
                if not (pending_edited and rest.isspace()):
                    output.append(rest)
                pending = ""
                output.extend(lines[line_no + 1:start[0]])
                line_no = start[0]
                col_no = 0

            # Keep the text in front of the span, then jump to its end.
            pending += lines[line_no][col_no:start[1]]
            pending_edited = True
            line_no, col_no = end

        rest = pending + lines[line_no][col_no:]
        if not rest.isspace():
            output.append(rest)
        output.extend(lines[line_no + 1:])
        return output
78
class Duplicate:
    """Plain record describing one resource definition and where it sits in the file.

    `start` and `end` are the positions of the definition; `end` may be filled
    in after construction.
    """
    def __init__(self, name, product, depth, start, end):
        # Identity of the resource.
        self.name, self.product = name, product
        # Element nesting depth at which the definition was found.
        self.depth = depth
        # Extent of the definition in the source text.
        self.start, self.end = start, end
87
class ResourceDefinitionLocator:
    """Callback class for xml.parsers.expat which records resource definitions and their
    locations.

    `resource_definitions` maps ``"<type>/<name>:<product>"`` to the list of
    Duplicate records found for that key, in document order. Positions are
    ``(line, column)`` tuples with a 0-based line index; `end` points just past
    the last character of the definition.
    """

    # Second-level tags that are not tracked as resource definitions.
    _IGNORED_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

    def __init__(self, parser):
        """Remember `parser`, which is queried for positions inside the callbacks."""
        self.resource_definitions = {}
        self._parser = parser
        self._depth = 0
        self._current_resource = None

    def start_element(self, tag_name, attrs):
        """StartElementHandler: begin tracking a definition at nesting depth 2.

        Elements without a `name` attribute (or `<item>` elements without a
        `type`) cannot be keyed and are ignored.
        """
        self._depth += 1
        if self._depth != 2 or tag_name in self._IGNORED_TAGS:
            return

        name = attrs.get("name")
        resource_type = attrs.get("type") if tag_name == "item" else tag_name
        if name is None or resource_type is None:
            return

        start = (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber)

        # A self-closing element (<string name="x"/>) has no closing tag, so the
        # closing-tag arithmetic in end_element() does not apply to it. Work out
        # its end right here from the text of the tag itself.
        # NOTE: GetInputContext() returns the remaining buffered input, so this
        # costs time proportional to the rest of the document per element.
        end = None
        tag_text = self._read_start_tag(self._parser.GetInputContext())
        if tag_text is not None and tag_text.endswith("/>"):
            end = self._advance(start, tag_text)

        self._current_resource = Duplicate(
                "{0}/{1}".format(resource_type, name),
                attrs.get("product", ""),
                self._depth,
                start,
                end)

    def end_element(self, tag_name):
        """EndElementHandler: finish the tracked definition and file it under its key."""
        resource = self._current_resource
        if resource is not None and self._depth == resource.depth:
            if resource.end is None:
                # Record the end position of the element, which is the length
                # of the name plus the </> symbols (len("</>") == 3).
                resource.end = (self._parser.CurrentLineNumber - 1,
                        self._parser.CurrentColumnNumber + 3 + len(tag_name))
            key_name = "{0}:{1}".format(resource.name, resource.product)
            self.resource_definitions.setdefault(key_name, []).append(resource)
            self._current_resource = None
        self._depth -= 1

    @staticmethod
    def _read_start_tag(context):
        """Return the text of the start tag at the head of `context`.

        `context` is the byte string returned by GetInputContext(). Returns
        None when no context is available, when it does not begin with a tag,
        or when the tag cannot be decoded as UTF-8 (assumes the parser was
        created for UTF-8 input).
        """
        if not context or context[:1] != b"<":
            return None
        quote = None
        index = 1
        size = len(context)
        while index < size:
            char = context[index:index + 1]
            if quote is not None:
                # Inside an attribute value a '>' is ordinary text.
                if char == quote:
                    quote = None
            elif char == b'"' or char == b"'":
                quote = char
            elif char == b">":
                try:
                    return context[:index + 1].decode("utf-8")
                except UnicodeDecodeError:
                    return None
            index += 1
        return None

    @staticmethod
    def _advance(position, text):
        """Return the (line, column) reached after reading `text` from `position`.

        LF, CR and CRLF each count as one line break.
        """
        line, column = position
        normalized = text.replace("\r\n", "\n").replace("\r", "\n")
        line_breaks = normalized.count("\n")
        if line_breaks == 0:
            return (line, column + len(normalized))
        return (line + line_breaks, len(normalized) - normalized.rfind("\n") - 1)
133