generator/extract_structures.py - platform/external/tpm2 - Git at Google

 #!/usr/bin/python
 # Copyright 2015 The Chromium OS Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.

 """Module for parsing TCG TPM2 library specification in HTML format.

 This module processes parts 2 and 3 of the specification, extracting
 information related to tables defined in the documents, feeding the
 information into the Table object for further processing and creating the
 appropriate TPM2 objects.
 """

 from __future__ import print_function

 import HTMLParser
 import os
 import re
 import sys

 import tpm_table

 # Matches the beginning of a specification table title: optional leading
 # whitespace, the word 'Table', whitespace and the decimal table number.
 table_name = re.compile(r'^\s*Table\s+[0-9]+')


 class SpecParser(HTMLParser.HTMLParser):
   """A class for parsing TCG specifications in html format.

   The parser is a state machine driven by the HTMLParser callbacks. The
   contents of every recognized specification table are fed into a
   tpm_table.Table object, which can be retrieved using GetTable().
   """

   # The state machine of the parser could be in one of the following states.
   ANCHOR = 0       # Look for table title anchor
   TABLE_NAME = 1   # Look for table title in the data stream
   TABLE_BODY = 2   # Scraping the actual table body
   MAYBE_DONE = 3   # Could be over, unless a single spec table is split in
                    # multiple HTML tables (to continue on the next page)
   SKIP_HEADER = 4  # Ignore the header of the split tables

   def __init__(self):
     """Initialize a parser object to default state."""
     HTMLParser.HTMLParser.__init__(self)
     self._state = self.ANCHOR
     self._title = ''
     self._table = tpm_table.Table()
     self._previous_table_number = 0  # Used to check if there are skipped tables

   def _Normalize(self, data):
     """Normalize HTML data.

     HTML files generated from TCG specifications sometimes include utf8
     characters (like long dashes), which appear only in comments/table titles
     and can be safely ignored.

     Args:
       data: a string representing portion of data from the HTML being parsed.

     Returns:
       a string, a single space followed by the unescaped input data with all
                 characters outside the 7-bit ASCII range excluded.
     """
     return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)

   def GetTable(self):
     """Return the Table object containing all information parsed so far."""
     return self._table

   def _SetState(self, new_state):
     """Switch the state machine to a new state.

     The accumulated table title is cleared when TABLE_NAME is entered from a
     different state. Requesting the current state again is a no-op.

     Args:
       new_state: one of the state constants defined in this class.
     """
     if self._state != new_state:
       self._state = new_state
       if new_state == self.TABLE_NAME:
         self._title = ''

   def handle_starttag(self, tag, attrs):
     """Invoked each time a new HTML tag is opened.

     This method drives changes in the parser FSM states, its heuristics are
     derived from the format of the HTML files the TCG specs get converted to.

     Each specification table is preceded with a title. The title is wrapped
     in an anchor tag with a property 'name' set to 'bookmark#xxx'. The title
     text starts with ' Table [0-9]+ '. Once the table title is detected,
     the state machine switches to looking for the actual HTML table, i.e. tags
     'table', 'tr' and 'td' (the generated specs do not use the 'th' tags).

     Large specification tables can be split into multiple HTML tables (so that
     they fit in a page). This is why the presence of the closing 'table' tag
     is not enough to close the parsing of the current specification table.

     In some cases the next table is defined in the spec immediately after the
     current one - this is when the new anchor tag is used as a signal that the
     previous table has been completely consumed.

     Args:
       tag: a string, the HTML tag
       attrs: a tuple of zero or more two-string tuples, the first element -
              the HTML tag's attribute, the second element - the attribute
              value (None when the attribute has no value).
     """
     if tag == 'a':
       # The value of a value-less attribute (as in <a name>) is None, it has
       # to be checked before startswith() can be invoked on it.
       if any(attr == 'name' and value and value.startswith('bookmark')
              for attr, value in attrs):
         if self._state == self.ANCHOR:
           self._SetState(self.TABLE_NAME)
         elif self._state == self.MAYBE_DONE:
           # Done indeed
           self._table.ProcessTable()
           self._table.Init()
           self._SetState(self.TABLE_NAME)
         elif self._state == self.TABLE_NAME:
           # Yet another anchor before any table showed up, start the title
           # over.
           self._title = ''
     elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
       # This was not a valid table start, back to looking for the right anchor.
       self._SetState(self.ANCHOR)
     elif self._state == self.TABLE_NAME and tag == 'table':
       title_match = table_name.search(self._title)
       if not title_match:
         # Table title does not match the expected format - back to square one.
         self._SetState(self.ANCHOR)
         return  # will have to start over
       # Take the number from the matched prefix and not from the raw title, so
       # that punctuation glued to the number (say 'Table 12:') can not break
       # the conversion.
       table_number = int(title_match.group().split()[1])
       self._previous_table_number += 1
       if table_number > self._previous_table_number:
         print('Table(s) %s missing' % ' '.join(
             '%d' % x for x in
             range(self._previous_table_number, table_number)), file=sys.stderr)
         self._previous_table_number = table_number
       self._table.Init(self._title)
       self._SetState(self.TABLE_BODY)
     elif self._state == self.MAYBE_DONE and tag == 'tr':
       # The table continues in one more HTML table, this row is its header.
       self._SetState(self.SKIP_HEADER)
     elif self._state == self.SKIP_HEADER and tag == 'tr':
       self._SetState(self.TABLE_BODY)
       self._table.NewRow()
     elif self._state == self.TABLE_BODY:
       if tag == 'tr':
         self._table.NewRow()
       elif tag == 'td':
         self._table.NewCell()

   def handle_endtag(self, tag):
     """Invoked each time an HTML tag is closed.

     Args:
       tag: a string, the HTML tag
     """
     if tag == 'table' and self._table.InProgress():
       self._SetState(self.MAYBE_DONE)

   def handle_data(self, data):
     """Process data outside HTML tags.

     Args:
       data: a string, the text found between the tags.
     """
     if self._state == self.TABLE_NAME:
       self._title += ' %s' % self._Normalize(data)
     elif self._state == self.TABLE_BODY:
       self._table.AddData(self._Normalize(data))
     elif self._state == self.MAYBE_DONE:
       # Done indeed
       self._table.ProcessTable()
       self._table.Init()
       self._SetState(self.ANCHOR)

   def close(self):
     """Finish processing of the HTML buffer."""
     # Let the base class flush whatever it still keeps buffered, this could
     # invoke the handlers above and thus has to happen before the final table
     # is processed.
     HTMLParser.HTMLParser.close(self)
     if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
       self._table.ProcessTable()
     self._state = self.ANCHOR

   def handle_entityref(self, name):
     """Process HTML escape sequence.

     Only the entities listed below are handled, the rest are dropped.

     Args:
       name: a string, the name of the entity (without '&' and ';').
     """
     entmap = {
         'amp': '&',
         'gt': '>',
         'lt': '<',
         'quot': '"',
     }
     if name in entmap:
       if self._state == self.TABLE_BODY:
         self._table.AddData(entmap[name])
       elif self._state == self.TABLE_NAME:
         self._title += entmap[name]


 def main(structs_html_file_name):
   """Parse a specification HTML file and print the result of GetHFile().

   Args:
     structs_html_file_name: a string, the name of the HTML file to parse.
   """
   spec_parser = SpecParser()
   with open(structs_html_file_name) as html_file:
     spec_parser.feed(html_file.read())
   spec_parser.close()
   h_file_text = spec_parser.GetTable().GetHFile()
   print(h_file_text)

 if __name__ == '__main__':
   # Exactly one command line argument is expected: the name of the HTML file.
   cmd_line_args = sys.argv[1:]
   if len(cmd_line_args) != 1:
     usage = ('%s: One parameter is required, the name of the html file '
              'which is the TPM2 library Part 2 specification')
     print(usage % os.path.basename(sys.argv[0]), file=sys.stderr)
     sys.exit(1)
   main(cmd_line_args[0])
	#!/usr/bin/python
	# Copyright 2015 The Chromium OS Authors. All rights reserved.
	# Use of this source code is governed by a BSD-style license that can be
	# found in the LICENSE file.

	"""Module for parsing TCG TPM2 library specification in HTML format.

	This module processes parts 2 and 3 of the specification, extracting
	information related to tables defined in the documents, feeding the
	information into the Table object for further processing and creating the
	appropriate TPM2 objects.
	"""

	from __future__ import print_function

	import HTMLParser
	import os
	import re
	import sys

	import tpm_table

	# Matches the beginning of a specification table title: optional leading
	# whitespace, the word 'Table', whitespace and the decimal table number.
	table_name = re.compile(r'^\s*Table\s+[0-9]+')


	class SpecParser(HTMLParser.HTMLParser):
	  """A class for parsing TCG specifications in html format.

	  The parser is a state machine driven by the HTMLParser callbacks. The
	  contents of every recognized specification table are fed into a
	  tpm_table.Table object, which can be retrieved using GetTable().
	  """

	  # The state machine of the parser could be in one of the following states.
	  ANCHOR = 0       # Look for table title anchor
	  TABLE_NAME = 1   # Look for table title in the data stream
	  TABLE_BODY = 2   # Scraping the actual table body
	  MAYBE_DONE = 3   # Could be over, unless a single spec table is split in
	                   # multiple HTML tables (to continue on the next page)
	  SKIP_HEADER = 4  # Ignore the header of the split tables

	  def __init__(self):
	    """Initialize a parser object to default state."""
	    HTMLParser.HTMLParser.__init__(self)
	    self._state = self.ANCHOR
	    self._title = ''
	    self._table = tpm_table.Table()
	    self._previous_table_number = 0  # Used to check if there are skipped tables

	  def _Normalize(self, data):
	    """Normalize HTML data.

	    HTML files generated from TCG specifications sometimes include utf8
	    characters (like long dashes), which appear only in comments/table titles
	    and can be safely ignored.

	    Args:
	      data: a string representing portion of data from the HTML being parsed.

	    Returns:
	      a string, a single space followed by the unescaped input data with all
	                characters outside the 7-bit ASCII range excluded.
	    """
	    return ' ' + ''.join(x for x in self.unescape(data) if ord(x) < 128)

	  def GetTable(self):
	    """Return the Table object containing all information parsed so far."""
	    return self._table

	  def _SetState(self, new_state):
	    """Switch the state machine to a new state.

	    The accumulated table title is cleared when TABLE_NAME is entered from a
	    different state. Requesting the current state again is a no-op.

	    Args:
	      new_state: one of the state constants defined in this class.
	    """
	    if self._state != new_state:
	      self._state = new_state
	      if new_state == self.TABLE_NAME:
	        self._title = ''

	  def handle_starttag(self, tag, attrs):
	    """Invoked each time a new HTML tag is opened.

	    This method drives changes in the parser FSM states, its heuristics are
	    derived from the format of the HTML files the TCG specs get converted to.

	    Each specification table is preceded with a title. The title is wrapped
	    in an anchor tag with a property 'name' set to 'bookmark#xxx'. The title
	    text starts with ' Table [0-9]+ '. Once the table title is detected,
	    the state machine switches to looking for the actual HTML table, i.e. tags
	    'table', 'tr' and 'td' (the generated specs do not use the 'th' tags).

	    Large specification tables can be split into multiple HTML tables (so that
	    they fit in a page). This is why the presence of the closing 'table' tag
	    is not enough to close the parsing of the current specification table.

	    In some cases the next table is defined in the spec immediately after the
	    current one - this is when the new anchor tag is used as a signal that the
	    previous table has been completely consumed.

	    Args:
	      tag: a string, the HTML tag
	      attrs: a tuple of zero or more two-string tuples, the first element -
	             the HTML tag's attribute, the second element - the attribute
	             value (None when the attribute has no value).
	    """
	    if tag == 'a':
	      # The value of a value-less attribute (as in <a name>) is None, it has
	      # to be checked before startswith() can be invoked on it.
	      if any(attr == 'name' and value and value.startswith('bookmark')
	             for attr, value in attrs):
	        if self._state == self.ANCHOR:
	          self._SetState(self.TABLE_NAME)
	        elif self._state == self.MAYBE_DONE:
	          # Done indeed
	          self._table.ProcessTable()
	          self._table.Init()
	          self._SetState(self.TABLE_NAME)
	        elif self._state == self.TABLE_NAME:
	          # Yet another anchor before any table showed up, start the title
	          # over.
	          self._title = ''
	    elif tag == 'p' and self._state == self.TABLE_NAME and not self._title:
	      # This was not a valid table start, back to looking for the right anchor.
	      self._SetState(self.ANCHOR)
	    elif self._state == self.TABLE_NAME and tag == 'table':
	      title_match = table_name.search(self._title)
	      if not title_match:
	        # Table title does not match the expected format - back to square one.
	        self._SetState(self.ANCHOR)
	        return  # will have to start over
	      # Take the number from the matched prefix and not from the raw title, so
	      # that punctuation glued to the number (say 'Table 12:') can not break
	      # the conversion.
	      table_number = int(title_match.group().split()[1])
	      self._previous_table_number += 1
	      if table_number > self._previous_table_number:
	        print('Table(s) %s missing' % ' '.join(
	            '%d' % x for x in
	            range(self._previous_table_number, table_number)), file=sys.stderr)
	        self._previous_table_number = table_number
	      self._table.Init(self._title)
	      self._SetState(self.TABLE_BODY)
	    elif self._state == self.MAYBE_DONE and tag == 'tr':
	      # The table continues in one more HTML table, this row is its header.
	      self._SetState(self.SKIP_HEADER)
	    elif self._state == self.SKIP_HEADER and tag == 'tr':
	      self._SetState(self.TABLE_BODY)
	      self._table.NewRow()
	    elif self._state == self.TABLE_BODY:
	      if tag == 'tr':
	        self._table.NewRow()
	      elif tag == 'td':
	        self._table.NewCell()

	  def handle_endtag(self, tag):
	    """Invoked each time an HTML tag is closed.

	    Args:
	      tag: a string, the HTML tag
	    """
	    if tag == 'table' and self._table.InProgress():
	      self._SetState(self.MAYBE_DONE)

	  def handle_data(self, data):
	    """Process data outside HTML tags.

	    Args:
	      data: a string, the text found between the tags.
	    """
	    if self._state == self.TABLE_NAME:
	      self._title += ' %s' % self._Normalize(data)
	    elif self._state == self.TABLE_BODY:
	      self._table.AddData(self._Normalize(data))
	    elif self._state == self.MAYBE_DONE:
	      # Done indeed
	      self._table.ProcessTable()
	      self._table.Init()
	      self._SetState(self.ANCHOR)

	  def close(self):
	    """Finish processing of the HTML buffer."""
	    # Let the base class flush whatever it still keeps buffered, this could
	    # invoke the handlers above and thus has to happen before the final table
	    # is processed.
	    HTMLParser.HTMLParser.close(self)
	    if self._state in (self.TABLE_BODY, self.MAYBE_DONE):
	      self._table.ProcessTable()
	    self._state = self.ANCHOR

	  def handle_entityref(self, name):
	    """Process HTML escape sequence.

	    Only the entities listed below are handled, the rest are dropped.

	    Args:
	      name: a string, the name of the entity (without '&' and ';').
	    """
	    entmap = {
	        'amp': '&',
	        'gt': '>',
	        'lt': '<',
	        'quot': '"',
	    }
	    if name in entmap:
	      if self._state == self.TABLE_BODY:
	        self._table.AddData(entmap[name])
	      elif self._state == self.TABLE_NAME:
	        self._title += entmap[name]


	def main(structs_html_file_name):
	  """When invoked standalone - dump .h file on the console.

	  Args:
	    structs_html_file_name: a string, the name of the HTML file to parse.
	  """
	  parser = SpecParser()
	  with open(structs_html_file_name) as input_file:
	    html_content = input_file.read()
	  parser.feed(html_content)
	  parser.close()
	  print(parser.GetTable().GetHFile())

	if __name__ == '__main__':
	  # Exactly one command line argument is expected: the name of the HTML file.
	  if len(sys.argv) != 2:
	    print('%s: One parameter is required, the name of the html file '
	          'which is the TPM2 library Part 2 specification' %
	          os.path.basename(sys.argv[0]), file=sys.stderr)
	    sys.exit(1)
	  main(sys.argv[1])