"""
Looks for duplicate resource definitions and removes all but the last one.
"""

import os.path
import xml.parsers.expat


class DuplicateRemover:
    """Strips all but the last definition of each duplicated resource from a
    values XML file.
    """

    def matches(self, file_path):
        """Return True if file_path is an .xml file whose parent directory name
        starts with "values".
        """
        dirname, basename = os.path.split(file_path)
        dirname = os.path.split(dirname)[1]
        return dirname.startswith("values") and basename.endswith(".xml")

    def consume(self, xml_path, input):
        """Remove duplicate resource definitions from an XML document.

        Args:
            xml_path: Path of the file, used only in the progress messages.
            input: The UTF-8 encoded contents of the file (a byte string).

        Returns:
            The re-encoded document with every duplicate but the last removed,
            or `input` itself when there is nothing to remove.

        Raises:
            UnicodeDecodeError: if `input` is not valid UTF-8.
            xml.parsers.expat.ExpatError: if the parser rejects the document.
        """
        # Treat the input as UTF-8 or else column numbers will be wrong.
        input_lines = input.decode("utf-8").splitlines(True)

        parser = xml.parsers.expat.ParserCreate("utf-8")
        # Only Python 2 parsers expose this switch; Python 3 parsers reject the
        # assignment, so only set it where it exists.
        if hasattr(parser, "returns_unicode"):
            parser.returns_unicode = True
        tracker = ResourceDefinitionLocator(parser, input_lines)
        parser.StartElementHandler = tracker.start_element
        parser.EndElementHandler = tracker.end_element
        parser.Parse(input)

        # Extract the duplicate resource definitions, ignoring the last definition
        # which will take precedence and be left intact.
        duplicates = []
        for entries in tracker.resource_definitions.values():
            duplicates.extend(entries[:-1])

        # Nothing to strip: hand the input back untouched. This also keeps an
        # input with no lines from being indexed below.
        if not duplicates:
            return input

        # Sort the duplicates so that they are in order. That way we only do one pass.
        duplicates.sort(key=lambda definition: definition.start)

        last_line_no = 0
        last_col_no = 0
        output_lines = []
        current_line = ""
        for definition in duplicates:
            print("{0}: removing duplicate resource '{1}'".format(xml_path, definition.name))

            if last_line_no < definition.start[0]:
                # The next definition is on a new line, so write what we have
                # to the output. Lines left with only whitespace are dropped.
                new_line = current_line + input_lines[last_line_no][last_col_no:]
                if not new_line.isspace():
                    output_lines.append(new_line)
                current_line = ""
                last_col_no = 0
                last_line_no += 1

            # Copy all the lines up until this one.
            output_lines.extend(input_lines[last_line_no:definition.start[0]])

            # Add to the existing line we're building, by including the prefix of this line
            # and skipping the lines and characters until the end of this duplicate
            # definition.
            last_line_no = definition.start[0]
            current_line += input_lines[last_line_no][last_col_no:definition.start[1]]
            last_line_no = definition.end[0]
            last_col_no = definition.end[1]

        # Flush whatever follows the final removed definition on its line.
        new_line = current_line + input_lines[last_line_no][last_col_no:]
        if not new_line.isspace():
            output_lines.append(new_line)
        last_line_no += 1

        # Everything after that line is copied verbatim.
        output_lines.extend(input_lines[last_line_no:])

        print("deduped {0}".format(xml_path))
        return "".join(output_lines).encode("utf-8")


class Duplicate:
    """A small struct to maintain the positions of a Duplicate resource definition.

    `start` and `end` are (zero-based line, column) tuples; `end` points just
    past the element and is None until the element has been closed.
    """
    def __init__(self, name, product, depth, start, end):
        self.name = name
        self.product = product
        self.depth = depth
        self.start = start
        self.end = end


class ResourceDefinitionLocator:
    """Callback class for xml.parsers.expat which records resource definitions and their
    locations.

    Definitions are collected in `resource_definitions`, a dict mapping
    "<type>/<name>:<product>" to the list of Duplicate entries seen for that
    key, in document order.
    """

    # Elements directly under the root that never define a resource.
    _IGNORED_TAGS = ("public", "java-symbol", "eat-comment", "skip")

    def __init__(self, parser, source_lines=None):
        """
        Args:
            parser: The expat parser whose current position is queried from
                the callbacks.
            source_lines: Optional list of the decoded lines of the document
                (line endings kept). When given, element end positions are
                read from the text, which is required to get self-closing
                elements right. When omitted, the end is estimated as the
                event column plus len("</tag>").
        """
        self.resource_definitions = {}
        self._parser = parser
        self._source_lines = source_lines
        self._depth = 0
        self._current_resource = None

    def start_element(self, tag_name, attrs):
        """Record the start position of a resource defined directly under the root."""
        self._depth += 1
        if self._depth != 2 or tag_name in self._IGNORED_TAGS:
            return

        # <item> elements carry their resource type in an attribute; every
        # other element is named after its type.
        resource_type = attrs.get("type") if tag_name == "item" else tag_name
        resource_name = attrs.get("name")
        if resource_type is None or resource_name is None:
            # Without a type and a name the element cannot be keyed, so it can
            # never be matched as a duplicate. Leave it alone.
            return

        self._current_resource = Duplicate(
            "{0}/{1}".format(resource_type, resource_name),
            attrs.get("product", ""),
            self._depth,
            (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
            None)

    def end_element(self, tag_name):
        """Record the end position of the resource being tracked, if this closes it."""
        current = self._current_resource
        if current is not None and self._depth == current.depth:
            current.end = self._element_end(tag_name)
            key_name = "{0}:{1}".format(current.name, current.product)
            self.resource_definitions.setdefault(key_name, []).append(current)
            self._current_resource = None
        self._depth -= 1

    def _element_end(self, tag_name):
        """Return the (line, column) just past the element that is being closed."""
        line_no = self._parser.CurrentLineNumber - 1
        col_no = self._parser.CurrentColumnNumber
        # The length of the name plus the </> symbols (len("</>") == 3).
        estimated_end = (line_no, col_no + 3 + len(tag_name))

        lines = self._source_lines
        if lines is None:
            return estimated_end

        if not lines[line_no].startswith("</", col_no):
            # No closing tag here, so this is a self-closing element. When both
            # element handlers are installed expat reports its end event just
            # past the "/>", which is exactly where the element ends.
            return (line_no, col_no)

        # A real closing tag: the element ends after its ">", which also copes
        # with whitespace inside the tag (e.g. "</string >").
        search_from = col_no
        for index in range(line_no, len(lines)):
            close = lines[index].find(">", search_from)
            if close != -1:
                return (index, close + 1)
            search_from = 0
        return estimated_end