tools/aapt2/tools/remove-duplicates.py - platform/frameworks/base - Git at Google

 #!/usr/bin/env python

 import os
 import os.path
 import sys
 import tempfile
 import xml.parsers.expat

 """
 Scans each resource file in res/values/ looking for duplicates.
 All but the last occurrence of each resource definition are removed.
 This creates no semantic changes: the resulting APK, when built,
 should contain the same definitions.
 """

 class Duplicate:
     """Plain record describing where one resource definition sits in a file.

     Fields:
       name:    resource key of the form "<type>/<name>".
       product: value of the element's "product" attribute, "" when absent.
       depth:   element nesting depth at which the definition was opened.
       start:   (zero-based line, column) of the opening tag.
       end:     (zero-based line, column) just past the closing tag, or None
                while the closing tag has not been seen yet.
     """

     def __init__(self, name, product, depth, start, end):
         # Identity of the resource.
         self.name, self.product = name, product
         # Location of the definition inside the parsed document.
         self.depth, self.start, self.end = depth, start, end

 class ResourceDefinitionLocator:
     """Callback class for xml.parsers.expat which records resource definitions and their
     locations.

     Only elements that are direct children of the document root (depth 2) are
     considered. Every recorded definition is stored in `resource_definitions`,
     a dict mapping "<type>/<name>:<product>" to the list of Duplicate entries
     seen for that key, in the order their closing tags were reported.
     """

     # Tags directly under the root that are not recorded as resource definitions.
     _IGNORED_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

     def __init__(self, parser):
         """Remember the parser so handlers can query its current position."""
         self.resource_definitions = {}
         self._parser = parser
         self._depth = 0
         self._current_resource = None

     def start_element(self, tag_name, attrs):
         """Handle an opening tag; begin tracking it if it defines a resource.

         Args:
             tag_name: name of the element being opened.
             attrs: mapping of attribute names to values for the element.
         """
         self._depth += 1
         if self._depth != 2 or tag_name in self._IGNORED_TAGS:
             return

         # <item type="T" name="N"> is keyed by its declared type; every other
         # element is keyed by its own tag name.
         resource_type = attrs.get("type") if tag_name == "item" else tag_name
         name = attrs.get("name")
         if resource_type is None or name is None:
             # Without a type and a name the element cannot be matched against
             # another definition, so skip it instead of failing with KeyError.
             return

         self._current_resource = Duplicate(
                 "{0}/{1}".format(resource_type, name),
                 attrs.get("product", ""),
                 self._depth,
                 # Line is converted to a zero-based index; column is used as reported.
                 (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
                 None)

     def end_element(self, tag_name):
         """Handle a closing tag; finish and store the definition being tracked.

         Args:
             tag_name: name of the element being closed.
         """
         current = self._current_resource
         if current is not None and self._depth == current.depth:
             # Record the end position of the element, which is the length of the name
             # plus the </> symbols (len("</>") == 3).
             # NOTE(review): this assumes an explicit closing tag written exactly as
             # </tag_name>. Self-closing elements (<tag ... />) have no such tag, so
             # the offset is probably wrong for them - TODO confirm against the
             # position expat reports for empty elements.
             current.end = (self._parser.CurrentLineNumber - 1,
                     self._parser.CurrentColumnNumber + len("</>") + len(tag_name))
             key_name = "{0}:{1}".format(current.name, current.product)
             self.resource_definitions.setdefault(key_name, []).append(current)
             self._current_resource = None
         self._depth -= 1

 def remove_duplicates(xml_path):
     """Reads the input file and generates an output file with any duplicate
     resources removed, keeping the last occurring definition and removing
     the others. The output is written to a temporary and then renamed
     to the original file name.

     Definitions are duplicates when they share the same "<type>/<name>" and
     product. Lines that contain only whitespace once a definition has been cut
     out are dropped. A file without duplicates is left untouched.

     Args:
         xml_path: path of the XML resource file to rewrite in place.

     Raises:
         xml.parsers.expat.ExpatError: if expat rejects the file contents.
         UnicodeDecodeError: if the file is not valid UTF-8.
     """
     import shutil

     # Read raw bytes so the UTF-8 decoding below is explicit and behaves the
     # same on Python 2 and Python 3.
     with open(xml_path, "rb") as fin:
         raw = fin.read()

     parser = xml.parsers.expat.ParserCreate("utf-8")
     if hasattr(parser, "returns_unicode"):
         # The attribute only exists on Python 2 parsers.
         parser.returns_unicode = True
     tracker = ResourceDefinitionLocator(parser)
     parser.StartElementHandler = tracker.start_element
     parser.EndElementHandler = tracker.end_element
     parser.Parse(raw)

     # Treat the input as UTF-8 or else column numbers will be wrong.
     input_lines = raw.decode('utf-8').splitlines(True)

     # Extract the duplicate resource definitions, ignoring the last definition
     # which will take precedence and be left intact.
     duplicates = []
     for entries in tracker.resource_definitions.values():
         duplicates.extend(entries[:-1])
     if not duplicates:
         # Nothing to remove: leave the file alone (this also covers an empty file,
         # for which there are no lines to index below).
         return

     # Sort the duplicates so that they are in order. That way we only do one pass.
     duplicates.sort(key=lambda definition: definition.start)

     # (last_line_no, last_col_no) is the position just past the most recently
     # removed definition; current_line collects the kept pieces of the output
     # line currently being assembled.
     last_line_no = 0
     last_col_no = 0
     output_lines = []
     current_line = ""
     for definition in duplicates:
         start_line, start_col = definition.start
         end_line, end_col = definition.end
         print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
                 xml_path, start_line + 1, start_col, definition.name))

         if last_line_no < start_line:
             # The next definition is on a new line, so write what we have
             # to the output.
             new_line = current_line + input_lines[last_line_no][last_col_no:]
             if not new_line.isspace():
                 output_lines.append(new_line)
             current_line = ""
             last_col_no = 0
             last_line_no += 1

         # Copy all the lines up until this one.
         output_lines.extend(input_lines[last_line_no:start_line])

         # Add to the existing line we're building, by including the prefix of this line
         # and skipping the lines and characters until the end of this duplicate definition.
         current_line += input_lines[start_line][last_col_no:start_col]
         last_line_no = end_line
         last_col_no = end_col

     # Flush the remainder of the line holding the last removed definition,
     # then everything after it.
     new_line = current_line + input_lines[last_line_no][last_col_no:]
     if not new_line.isspace():
         output_lines.append(new_line)
     output_lines.extend(input_lines[last_line_no + 1:])

     print("{0}: writing deduped copy...".format(xml_path))

     # Write the lines to a temporary file next to the original.
     dirname, basename = os.path.split(xml_path)
     temp = tempfile.NamedTemporaryFile(prefix=basename, dir=dirname, delete=False)
     replaced = False
     try:
         with temp:
             temp.writelines(line.encode('utf-8') for line in output_lines)

         # The temporary file does not inherit the original's permission bits;
         # copy them so the rename does not silently change the file's mode.
         shutil.copymode(xml_path, temp.name)

         # Now rename that file to the original so we have an atomic write that is consistent.
         os.rename(temp.name, xml_path)
         replaced = True
     finally:
         if not replaced:
             # Do not leave a stray temporary file in the resource directory.
             os.remove(temp.name)

 def enumerate_files(res_path):
     """Enumerates all files in the resource directory that are XML files and
        within a values-* subdirectory. These types of files end up compiled
        in the resources.arsc table of an APK.

     Args:
         res_path: path of the resource directory to scan.

     Returns:
         A list of paths (res_path joined with the subdirectory and file name)
         of every entry ending in '.xml' inside each subdirectory whose name
         starts with 'values', ordered by directory name and then file name.
     """
     all_files = []
     for entry in sorted(os.listdir(res_path)):
         values_dir = os.path.join(res_path, entry)
         # The name prefix alone also matches plain files sitting in res_path;
         # listing those would raise, so only real directories are scanned.
         if not entry.startswith('values') or not os.path.isdir(values_dir):
             continue
         all_files.extend(
                 os.path.join(values_dir, file_name)
                 for file_name in sorted(os.listdir(values_dir))
                 if file_name.endswith('.xml'))
     return all_files

 # Script entry point: dedupe every values XML file under the resource
 # directory given as the first command-line argument.
 if __name__ == '__main__':
     if len(sys.argv) < 2:
         sys.stderr.write("please specify a path to a resource directory\n")
         sys.exit(1)

     res_path = os.path.abspath(sys.argv[1])
     if not os.path.isdir(res_path):
         # Fail with a usage-style error instead of a traceback from os.listdir.
         sys.stderr.write("{0} is not a directory\n".format(res_path))
         sys.exit(1)
     print("looking in {0} ...".format(res_path))

     for f in enumerate_files(res_path):
         print("checking {0} ...".format(f))
         remove_duplicates(f)
	#!/usr/bin/env python

	import os
	import os.path
	import sys
	import tempfile
	import xml.parsers.expat

	"""
	Scans each resource file in res/values/ looking for duplicates.
	All but the last occurrence of each resource definition are removed.
	This creates no semantic changes: the resulting APK, when built,
	should contain the same definitions.
	"""

	class Duplicate:
	    """A small struct to maintain the positions of a Duplicate resource definition.

	    Fields:
	      name:    resource key of the form "<type>/<name>".
	      product: value of the element's "product" attribute, "" when absent.
	      depth:   element nesting depth at which the definition was opened.
	      start:   (zero-based line, column) of the opening tag.
	      end:     (zero-based line, column) just past the closing tag, or None
	               while the closing tag has not been seen yet.
	    """

	    def __init__(self, name, product, depth, start, end):
	        self.name = name
	        self.product = product
	        self.depth = depth
	        self.start = start
	        self.end = end

	class ResourceDefinitionLocator:
	    """Callback class for xml.parsers.expat which records resource definitions and their
	    locations.

	    Only elements that are direct children of the document root (depth 2) are
	    considered. Every recorded definition is stored in `resource_definitions`,
	    a dict mapping "<type>/<name>:<product>" to the list of Duplicate entries
	    seen for that key, in the order their closing tags were reported.
	    """

	    # Tags directly under the root that are not recorded as resource definitions.
	    _IGNORED_TAGS = frozenset(["public", "java-symbol", "eat-comment", "skip"])

	    def __init__(self, parser):
	        """Remember the parser so handlers can query its current position."""
	        self.resource_definitions = {}
	        self._parser = parser
	        self._depth = 0
	        self._current_resource = None

	    def start_element(self, tag_name, attrs):
	        """Handle an opening tag; begin tracking it if it defines a resource.

	        Args:
	            tag_name: name of the element being opened.
	            attrs: mapping of attribute names to values for the element.
	        """
	        self._depth += 1
	        if self._depth != 2 or tag_name in self._IGNORED_TAGS:
	            return

	        # <item type="T" name="N"> is keyed by its declared type; every other
	        # element is keyed by its own tag name.
	        resource_type = attrs.get("type") if tag_name == "item" else tag_name
	        name = attrs.get("name")
	        if resource_type is None or name is None:
	            # Without a type and a name the element cannot be matched against
	            # another definition, so skip it instead of failing with KeyError.
	            return

	        self._current_resource = Duplicate(
	                "{0}/{1}".format(resource_type, name),
	                attrs.get("product", ""),
	                self._depth,
	                # Line is converted to a zero-based index; column is used as reported.
	                (self._parser.CurrentLineNumber - 1, self._parser.CurrentColumnNumber),
	                None)

	    def end_element(self, tag_name):
	        """Handle a closing tag; finish and store the definition being tracked.

	        Args:
	            tag_name: name of the element being closed.
	        """
	        current = self._current_resource
	        if current is not None and self._depth == current.depth:
	            # Record the end position of the element, which is the length of the name
	            # plus the </> symbols (len("</>") == 3).
	            # NOTE(review): this assumes an explicit closing tag written exactly as
	            # </tag_name>. Self-closing elements (<tag ... />) have no such tag, so
	            # the offset is probably wrong for them - TODO confirm against the
	            # position expat reports for empty elements.
	            current.end = (self._parser.CurrentLineNumber - 1,
	                    self._parser.CurrentColumnNumber + len("</>") + len(tag_name))
	            key_name = "{0}:{1}".format(current.name, current.product)
	            self.resource_definitions.setdefault(key_name, []).append(current)
	            self._current_resource = None
	        self._depth -= 1

	def remove_duplicates(xml_path):
	    """Reads the input file and generates an output file with any duplicate
	    resources removed, keeping the last occurring definition and removing
	    the others. The output is written to a temporary and then renamed
	    to the original file name.

	    Definitions are duplicates when they share the same "<type>/<name>" and
	    product. Lines that contain only whitespace once a definition has been cut
	    out are dropped. A file without duplicates is left untouched.

	    Args:
	        xml_path: path of the XML resource file to rewrite in place.

	    Raises:
	        xml.parsers.expat.ExpatError: if expat rejects the file contents.
	        UnicodeDecodeError: if the file is not valid UTF-8.
	    """
	    import shutil

	    # Read raw bytes so the UTF-8 decoding below is explicit and behaves the
	    # same on Python 2 and Python 3.
	    with open(xml_path, "rb") as fin:
	        raw = fin.read()

	    parser = xml.parsers.expat.ParserCreate("utf-8")
	    if hasattr(parser, "returns_unicode"):
	        # The attribute only exists on Python 2 parsers.
	        parser.returns_unicode = True
	    tracker = ResourceDefinitionLocator(parser)
	    parser.StartElementHandler = tracker.start_element
	    parser.EndElementHandler = tracker.end_element
	    parser.Parse(raw)

	    # Treat the input as UTF-8 or else column numbers will be wrong.
	    input_lines = raw.decode('utf-8').splitlines(True)

	    # Extract the duplicate resource definitions, ignoring the last definition
	    # which will take precedence and be left intact.
	    duplicates = []
	    for entries in tracker.resource_definitions.values():
	        duplicates.extend(entries[:-1])
	    if not duplicates:
	        # Nothing to remove: leave the file alone (this also covers an empty file,
	        # for which there are no lines to index below).
	        return

	    # Sort the duplicates so that they are in order. That way we only do one pass.
	    duplicates.sort(key=lambda definition: definition.start)

	    # (last_line_no, last_col_no) is the position just past the most recently
	    # removed definition; current_line collects the kept pieces of the output
	    # line currently being assembled.
	    last_line_no = 0
	    last_col_no = 0
	    output_lines = []
	    current_line = ""
	    for definition in duplicates:
	        start_line, start_col = definition.start
	        end_line, end_col = definition.end
	        print("{0}:{1}:{2}: removing duplicate resource '{3}'".format(
	                xml_path, start_line + 1, start_col, definition.name))

	        if last_line_no < start_line:
	            # The next definition is on a new line, so write what we have
	            # to the output.
	            new_line = current_line + input_lines[last_line_no][last_col_no:]
	            if not new_line.isspace():
	                output_lines.append(new_line)
	            current_line = ""
	            last_col_no = 0
	            last_line_no += 1

	        # Copy all the lines up until this one.
	        output_lines.extend(input_lines[last_line_no:start_line])

	        # Add to the existing line we're building, by including the prefix of this line
	        # and skipping the lines and characters until the end of this duplicate definition.
	        current_line += input_lines[start_line][last_col_no:start_col]
	        last_line_no = end_line
	        last_col_no = end_col

	    # Flush the remainder of the line holding the last removed definition,
	    # then everything after it.
	    new_line = current_line + input_lines[last_line_no][last_col_no:]
	    if not new_line.isspace():
	        output_lines.append(new_line)
	    output_lines.extend(input_lines[last_line_no + 1:])

	    print("{0}: writing deduped copy...".format(xml_path))

	    # Write the lines to a temporary file next to the original.
	    dirname, basename = os.path.split(xml_path)
	    temp = tempfile.NamedTemporaryFile(prefix=basename, dir=dirname, delete=False)
	    replaced = False
	    try:
	        with temp:
	            temp.writelines(line.encode('utf-8') for line in output_lines)

	        # The temporary file does not inherit the original's permission bits;
	        # copy them so the rename does not silently change the file's mode.
	        shutil.copymode(xml_path, temp.name)

	        # Now rename that file to the original so we have an atomic write that is consistent.
	        os.rename(temp.name, xml_path)
	        replaced = True
	    finally:
	        if not replaced:
	            # Do not leave a stray temporary file in the resource directory.
	            os.remove(temp.name)

	def enumerate_files(res_path):
	    """Enumerates all files in the resource directory that are XML files and
	    within a values-* subdirectory. These types of files end up compiled
	    in the resources.arsc table of an APK.

	    Args:
	        res_path: path of the resource directory to scan.

	    Returns:
	        A list of paths (res_path joined with the subdirectory and file name)
	        of every entry ending in '.xml' inside each subdirectory whose name
	        starts with 'values', ordered by directory name and then file name.
	    """
	    all_files = []
	    for entry in sorted(os.listdir(res_path)):
	        values_dir = os.path.join(res_path, entry)
	        # The name prefix alone also matches plain files sitting in res_path;
	        # listing those would raise, so only real directories are scanned.
	        if not entry.startswith('values') or not os.path.isdir(values_dir):
	            continue
	        all_files.extend(
	                os.path.join(values_dir, file_name)
	                for file_name in sorted(os.listdir(values_dir))
	                if file_name.endswith('.xml'))
	    return all_files

	# Script entry point: dedupe every values XML file under the resource
	# directory given as the first command-line argument.
	if __name__ == '__main__':
	    if len(sys.argv) < 2:
	        sys.stderr.write("please specify a path to a resource directory\n")
	        sys.exit(1)

	    res_path = os.path.abspath(sys.argv[1])
	    if not os.path.isdir(res_path):
	        # Fail with a usage-style error instead of a traceback from os.listdir.
	        sys.stderr.write("{0} is not a directory\n".format(res_path))
	        sys.exit(1)
	    print("looking in {0} ...".format(res_path))

	    for f in enumerate_files(res_path):
	        print("checking {0} ...".format(f))
	        remove_duplicates(f)