tools/compile_seccomp_policy: Add the beginning of a new parser am: d4ce449ed0 am: 93bb1d5d6d
am: 44ca0f3fdd

Change-Id: Id7aedd87e940789046b090caba2700602aa563ac
diff --git a/.gitignore b/.gitignore
index 2414029..fed0adb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,10 @@
 # libseccomp.
 /libseccomp/
 
+# Python-related files.
+/tools/__pycache__/
+*.pyc
+
 # Shared libraries when compiling in-tree.
 *.so
 
diff --git a/tools/arch.py b/tools/arch.py
new file mode 100644
index 0000000..6f2dfb2
--- /dev/null
+++ b/tools/arch.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Architecture-specific information."""
+
+import collections
+import json
+
+
class Arch(
        collections.namedtuple(
            'Arch',
            ['arch_nr', 'arch_name', 'bits', 'syscalls', 'constants'])):
    """Holds architecture-specific information.

    Attributes:
        arch_nr: numeric identifier of the architecture.
        arch_name: human-readable architecture name.
        bits: word size of the architecture, in bits.
        syscalls: mapping of syscall name to syscall number.
        constants: mapping of constant name to numeric value.
    """

    def truncate_word(self, value):
        """Return the value truncated to fit in a word.

        For negative values this yields the unsigned representation of the
        same value in the architecture's word size.
        """
        return value & self.max_unsigned

    @property
    def min_signed(self):
        """The smallest signed value that can be represented in a word."""
        return -(1 << (self.bits - 1))

    @property
    def max_unsigned(self):
        """The largest unsigned value that can be represented in a word."""
        return (1 << self.bits) - 1

    @classmethod
    def load_from_json(cls, json_path):
        """Return an Arch loaded from the .json file at json_path.

        The file must contain a JSON object with the keys listed in the
        class docstring; a missing key raises KeyError.
        """
        # Pin the encoding so parsing does not depend on the platform's
        # default locale encoding.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            arch_data = json.load(json_file)
        return cls(
            arch_nr=arch_data['arch_nr'],
            arch_name=arch_data['arch_name'],
            bits=arch_data['bits'],
            syscalls=arch_data['syscalls'],
            constants=arch_data['constants'],
        )
diff --git a/tools/parser.py b/tools/parser.py
new file mode 100644
index 0000000..05b6628
--- /dev/null
+++ b/tools/parser.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A parser for the Minijail policy file."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+
# A lexical token: its category, raw text, and source location
# (filename, 1-based line, 0-based column).
Token = collections.namedtuple('token',
                               ['type', 'value', 'filename', 'line', 'column'])

# A regex that can tokenize a Minijail policy file line.
#
# Keyword-like patterns are anchored with \b word boundaries so that they
# only match whole words. Without them, identifiers that merely start with
# a keyword are split apart: 'inotify_init' would lex as OP('in') +
# IDENTIFIER('otify_init'), 'arg0x' as ARGUMENT('arg0') + IDENTIFIER('x'),
# and 'returned' as RETURN + IDENTIFIER('ed'). Order matters: earlier
# alternatives win, so e.g. OR ('||') must precede BITWISE_OR ('|').
_TOKEN_SPECIFICATION = (
    ('COMMENT', r'#.*$'),
    ('WHITESPACE', r'\s+'),
    ('INCLUDE', r'@include\b'),
    ('PATH', r'(?:\.)?/\S+'),
    ('NUMERIC_CONSTANT', r'-?0[xX][0-9a-fA-F]+|-?0[Oo][0-7]+|-?[0-9]+'),
    ('COLON', r':'),
    ('SEMICOLON', r';'),
    ('COMMA', r','),
    ('BITWISE_COMPLEMENT', r'~'),
    ('LPAREN', r'\('),
    ('RPAREN', r'\)'),
    ('LBRACE', r'\{'),
    ('RBRACE', r'\}'),
    ('RBRACKET', r'\]'),
    ('LBRACKET', r'\['),
    ('OR', r'\|\|'),
    ('AND', r'&&'),
    ('BITWISE_OR', r'\|'),
    ('OP', r'&|\bin\b|==|!=|<=|<|>=|>'),
    ('EQUAL', r'='),
    ('ARGUMENT', r'\barg[0-9]+\b'),
    ('RETURN', r'\breturn\b'),
    ('ACTION',
     r'\ballow\b|\bkill-process\b|\bkill-thread\b|\bkill\b|\btrap\b|'
     r'\btrace\b|\blog\b'),
    ('IDENTIFIER', r'[a-zA-Z_][a-zA-Z_0-9@]*'),
)
# One named group per token category; match.lastgroup names the winner.
_TOKEN_RE = re.compile('|'.join(
    r'(?P<%s>%s)' % pair for pair in _TOKEN_SPECIFICATION))
+
+
class ParseException(Exception):
    """Raised when a policy file cannot be parsed.

    The exception message embeds the filename, position, the offending
    line, and a caret marker pointing at the problematic span.
    """

    # pylint: disable=too-many-arguments
    def __init__(self, message, filename, line, line_number=1, token=None):
        # With a token, the caret covers exactly that token's text;
        # without one, a single caret points one past the end of the line.
        if token:
            column, length = token.column, len(token.value)
        else:
            column, length = len(line), 1
        location = f'{filename}({line_number:d}:{column + 1:d}): {message}'
        pointer = ' ' * column + '^' * length
        super().__init__(f'{location}\n    {line}\n    {pointer}')
+
+
class ParserState:
    """Tracks the file, current line, and line number being parsed.

    Exists so that errors raised anywhere during parsing can report an
    accurate source location.
    """

    def __init__(self, filename):
        self._filename = filename
        self._line = ''
        self._line_number = 0

    @property
    def filename(self):
        """Return the name of the file being processed."""
        return self._filename

    @property
    def line(self):
        """Return the current line being processed."""
        return self._line

    @property
    def line_number(self):
        """Return the current line number being processed."""
        return self._line_number

    def set_line(self, line):
        """Record a new current line and advance the line counter."""
        self._line = line
        self._line_number += 1

    def error(self, message, token=None):
        """Raise a ParseException located at the current position."""
        raise ParseException(message, self.filename, self.line,
                             self.line_number, token)

    def tokenize(self):
        """Split the current line into a list of Tokens.

        Whitespace and comments are dropped from the result. Raises
        ParseException if any span of the line matches no token pattern.
        """
        result = []
        cursor = 0
        for match in _TOKEN_RE.finditer(self.line):
            if match.start() != cursor:
                # Unmatched text sits between the previous match and this
                # one — report exactly that gap.
                self.error(
                    'invalid token',
                    token=Token('INVALID', self.line[cursor:match.start()],
                                self.filename, self.line_number, cursor))
            cursor = match.end()
            # Whitespace and comments are filtered here so no later stage
            # has to deal with them.
            if match.lastgroup not in ('WHITESPACE', 'COMMENT'):
                result.append(
                    Token(match.lastgroup, match.group(), self.filename,
                          self.line_number, match.start()))
        if cursor != len(self.line):
            # Unmatched text at the tail of the line.
            self.error(
                'invalid token',
                token=Token('INVALID', self.line[cursor:], self.filename,
                            self.line_number, cursor))
        return result
+
+
# pylint: disable=too-few-public-methods
class PolicyParser:
    """A parser for the Minijail seccomp policy file format.

    Parsing methods consume Token objects from the *front* of a shared,
    mutable token list; on any malformed input they raise ParseException
    via the current ParserState rather than returning.
    """

    def __init__(self, arch):
        # Parser states are kept as a stack with the innermost one on top.
        # NOTE(review): presumably a stack so nested @include files can
        # push/pop their own state later — confirm when that lands.
        self._parser_states = [ParserState("<memory>")]
        self._arch = arch

    @property
    def _parser_state(self):
        # The innermost (current) ParserState.
        return self._parser_states[-1]

    # single-constant = identifier
    #                 | numeric-constant
    #                 ;
    def _parse_single_constant(self, token):
        """Return the numeric value of one constant token.

        The token must be an IDENTIFIER naming an architecture-defined
        constant or a NUMERIC_CONSTANT literal. The value is range-checked
        against the architecture word size, and negative values are
        converted to their unsigned representation. Raises ParseException
        (via ParserState.error) on any invalid or out-of-range constant.
        """
        if token.type == 'IDENTIFIER':
            if token.value not in self._arch.constants:
                self._parser_state.error('invalid constant', token=token)
            single_constant = self._arch.constants[token.value]
        elif token.type == 'NUMERIC_CONSTANT':
            try:
                # base=0 lets int() honor 0x.., 0o.., and decimal prefixes.
                single_constant = int(token.value, base=0)
            except ValueError:
                self._parser_state.error('invalid constant', token=token)
        else:
            self._parser_state.error('invalid constant', token=token)
        if single_constant > self._arch.max_unsigned:
            self._parser_state.error('unsigned overflow', token=token)
        elif single_constant < self._arch.min_signed:
            self._parser_state.error('signed underflow', token=token)
        elif single_constant < 0:
            # This converts the constant to an unsigned representation of the
            # same value, since BPF only uses unsigned values.
            single_constant = self._arch.truncate_word(single_constant)
        return single_constant

    # constant = [ '~' ] , '(' , value , ')'
    #          | [ '~' ] , single-constant
    #          ;
    def _parse_constant(self, tokens):
        """Consume one (optionally complemented or parenthesized) constant.

        Pops the tokens it uses from the front of *tokens* and returns the
        numeric value. Callers must pass a non-empty list. Raises
        ParseException (via ParserState.error) on malformed input.
        """
        negate = False
        if tokens[0].type == 'BITWISE_COMPLEMENT':
            negate = True
            tokens.pop(0)
            if not tokens:
                self._parser_state.error('empty complement')
            if tokens[0].type == 'BITWISE_COMPLEMENT':
                self._parser_state.error(
                    'invalid double complement', token=tokens[0])
        if tokens[0].type == 'LPAREN':
            last_open_paren = tokens.pop(0)
            single_value = self.parse_value(tokens)
            if not tokens or tokens[0].type != 'RPAREN':
                self._parser_state.error(
                    'unclosed parenthesis', token=last_open_paren)
        else:
            single_value = self._parse_single_constant(tokens[0])
        # Consumes either the closing RPAREN or the single-constant token.
        tokens.pop(0)
        if negate:
            # Python ints are unbounded, so ~ must be re-truncated to the
            # architecture word size.
            single_value = self._arch.truncate_word(~single_value)
        return single_value

    # value = constant , [ { '|' , constant } ]
    #       ;
    def parse_value(self, tokens):
        """Parse constants separated bitwise OR operator |.

        Constants can be:

        - A number that can be parsed with int(..., base=0)
        - A named constant expression.
        - A parenthesized, valid constant expression.
        - A valid constant expression prefixed with the unary bitwise
          complement operator ~.
        - A series of valid constant expressions separated by bitwise
          OR operator |.

        If there is an error parsing any of the constants, the whole process
        fails.
        """

        value = 0
        while tokens:
            value |= self._parse_constant(tokens)
            if not tokens or tokens[0].type != 'BITWISE_OR':
                break
            tokens.pop(0)
        else:
            # The while/else fires only when the loop exits without break:
            # either *tokens* was empty to begin with, or a trailing '|'
            # left nothing after it — both mean a missing constant.
            self._parser_state.error('empty constant')
        return value
diff --git a/tools/parser_unittest.py b/tools/parser_unittest.py
new file mode 100755
index 0000000..d40ab42
--- /dev/null
+++ b/tools/parser_unittest.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2018 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unittests for the parser module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import unittest
+
+import arch
+import parser  # pylint: disable=wrong-import-order
+
# Architecture description shared by all tests below, loaded from the
# checked-in test data next to this file.
ARCH_64 = arch.Arch.load_from_json(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'testdata/arch_64.json'))
+
+
class TokenizerTests(unittest.TestCase):
    """Tests for ParserState.tokenize."""

    @staticmethod
    def _tokenize(line):
        # Helper: run one line through a fresh ParserState and return the
        # resulting Token list.
        parser_state = parser.ParserState('<memory>')
        parser_state.set_line(line)
        return parser_state.tokenize()

    def test_tokenize(self):
        """Accept valid tokens."""
        self.assertEqual([
            (token.type, token.value)
            for token in TokenizerTests._tokenize('@include /minijail.policy')
        ], [
            ('INCLUDE', '@include'),
            ('PATH', '/minijail.policy'),
        ])
        self.assertEqual([
            (token.type, token.value)
            for token in TokenizerTests._tokenize('@include ./minijail.policy')
        ], [
            ('INCLUDE', '@include'),
            ('PATH', './minijail.policy'),
        ])
        # Note: the trailing '# ignored' comment and all whitespace are
        # dropped by tokenize(), so they do not appear in the expectation.
        self.assertEqual(
            [(token.type, token.value) for token in TokenizerTests._tokenize(
                'read: arg0 in ~0xffff || arg0 & (1|2) && arg0 == 0o755; '
                'return ENOSYS # ignored')], [
                    ('IDENTIFIER', 'read'),
                    ('COLON', ':'),
                    ('ARGUMENT', 'arg0'),
                    ('OP', 'in'),
                    ('BITWISE_COMPLEMENT', '~'),
                    ('NUMERIC_CONSTANT', '0xffff'),
                    ('OR', '||'),
                    ('ARGUMENT', 'arg0'),
                    ('OP', '&'),
                    ('LPAREN', '('),
                    ('NUMERIC_CONSTANT', '1'),
                    ('BITWISE_OR', '|'),
                    ('NUMERIC_CONSTANT', '2'),
                    ('RPAREN', ')'),
                    ('AND', '&&'),
                    ('ARGUMENT', 'arg0'),
                    ('OP', '=='),
                    ('NUMERIC_CONSTANT', '0o755'),
                    ('SEMICOLON', ';'),
                    ('RETURN', 'return'),
                    ('IDENTIFIER', 'ENOSYS'),
                ])

    def test_tokenize_invalid_token(self):
        """Reject tokenizer errors."""
        # The expected message pins the exact diagnostic format: location,
        # offending line, and caret marker.
        with self.assertRaisesRegex(parser.ParseException,
                                    (r'<memory>\(1:1\): invalid token\n'
                                     r'    %invalid-token%\n'
                                     r'    \^')):
            TokenizerTests._tokenize('%invalid-token%')
+
+
class ParseConstantTests(unittest.TestCase):
    """Tests for PolicyParser.parse_value."""

    def setUp(self):
        self.arch = ARCH_64
        self.parser = parser.PolicyParser(self.arch)

    def _tokenize(self, line):
        # Helper: tokenize through the parser's own ParserState so error
        # diagnostics reference the line under test.
        # pylint: disable=protected-access
        self.parser._parser_state.set_line(line)
        return self.parser._parser_state.tokenize()

    def test_parse_constant_unsigned(self):
        """Accept reasonably-sized unsigned constants."""
        self.assertEqual(
            self.parser.parse_value(self._tokenize('0x80000000')), 0x80000000)
        if self.arch.bits == 64:
            self.assertEqual(
                self.parser.parse_value(self._tokenize('0x8000000000000000')),
                0x8000000000000000)

    def test_parse_constant_unsigned_too_big(self):
        """Reject unreasonably-sized unsigned constants."""
        if self.arch.bits == 32:
            with self.assertRaisesRegex(parser.ParseException,
                                        'unsigned overflow'):
                self.parser.parse_value(self._tokenize('0x100000000'))
        # One bit wider than any supported word size: overflows everywhere.
        with self.assertRaisesRegex(parser.ParseException,
                                    'unsigned overflow'):
            self.parser.parse_value(self._tokenize('0x10000000000000000'))

    def test_parse_constant_signed(self):
        """Accept reasonably-sized signed constants."""
        # -1 is converted to its unsigned (all-bits-set) representation.
        self.assertEqual(
            self.parser.parse_value(self._tokenize('-1')),
            self.arch.max_unsigned)

    def test_parse_constant_signed_too_negative(self):
        """Reject unreasonably-sized signed constants."""
        if self.arch.bits == 32:
            with self.assertRaisesRegex(parser.ParseException,
                                        'signed underflow'):
                self.parser.parse_value(self._tokenize('-0x800000001'))
        with self.assertRaisesRegex(parser.ParseException, 'signed underflow'):
            self.parser.parse_value(self._tokenize('-0x8000000000000001'))

    def test_parse_mask(self):
        """Accept parsing a mask value."""
        self.assertEqual(
            self.parser.parse_value(self._tokenize('0x1|0x2|0x4|0x8')), 0xf)

    def test_parse_parenthesized_expressions(self):
        """Accept parsing parenthesized expressions."""
        bad_expressions = [
            '(1',
            '|(1)',
            '(1)|',
            '()',
            '(',
            '((',
            '(()',
            '(()1',
        ]
        for expression in bad_expressions:
            with self.assertRaises(parser.ParseException, msg=expression):
                self.parser.parse_value(self._tokenize(expression))

        # These parse successfully as far as the grammar goes, but leave
        # trailing tokens unconsumed — parse_value does not require EOF.
        bad_partial_expressions = [
            '1)',
            '(1)1',
            '1(0)',
        ]
        for expression in bad_partial_expressions:
            tokens = self._tokenize(expression)
            self.parser.parse_value(tokens)
            self.assertNotEqual(tokens, [])

        # Every expression here evaluates to 3.
        good_expressions = [
            '(3)',
            '(1)|2',
            '1|(2)',
            '(1)|(2)',
            '((3))',
            '0|(1|2)',
            '(0|1|2)',
        ]
        for expression in good_expressions:
            self.assertEqual(
                self.parser.parse_value(self._tokenize(expression)), 3)

    def test_parse_constant_complements(self):
        """Accept complementing constants."""
        self.assertEqual(
            self.parser.parse_value(self._tokenize('~0')),
            self.arch.max_unsigned)
        self.assertEqual(
            self.parser.parse_value(self._tokenize('~0|~0')),
            self.arch.max_unsigned)
        # The complements below exercise truncation to the word size.
        if self.arch.bits == 32:
            self.assertEqual(
                self.parser.parse_value(
                    self._tokenize('~0x005AF0FF|~0xFFA50FFF')), 0xFFFFFF00)
            self.assertEqual(
                self.parser.parse_value(
                    self._tokenize('0x0F|~(0x005AF000|0x00A50FFF)|0xF0')),
                0xFF0000FF)
        else:
            self.assertEqual(
                self.parser.parse_value(
                    self._tokenize('~0x00005A5AF0F0FFFF|~0xFFFFA5A50F0FFFFF')),
                0xFFFFFFFFFFFF0000)
            self.assertEqual(
                self.parser.parse_value(
                    self._tokenize(
                        '0x00FF|~(0x00005A5AF0F00000|0x0000A5A50F0FFFFF)|0xFF00'
                    )), 0xFFFF00000000FFFF)

    def test_parse_double_complement(self):
        """Reject double-complementing constants."""
        with self.assertRaisesRegex(parser.ParseException,
                                    'double complement'):
            self.parser.parse_value(self._tokenize('~~0'))

    def test_parse_empty_complement(self):
        """Reject complementing nothing."""
        with self.assertRaisesRegex(parser.ParseException, 'empty complement'):
            self.parser.parse_value(self._tokenize('0|~'))

    def test_parse_named_constant(self):
        """Accept parsing a named constant."""
        # O_RDONLY is defined as 0 in testdata/arch_64.json.
        self.assertEqual(
            self.parser.parse_value(self._tokenize('O_RDONLY')), 0)

    def test_parse_empty_constant(self):
        """Reject parsing nothing."""
        with self.assertRaisesRegex(parser.ParseException, 'empty constant'):
            self.parser.parse_value(self._tokenize(''))
        with self.assertRaisesRegex(parser.ParseException, 'empty constant'):
            self.parser.parse_value(self._tokenize('0|'))
+
+
# Allow running this test module directly: python3 parser_unittest.py.
if __name__ == '__main__':
    unittest.main()
diff --git a/tools/testdata/arch_64.json b/tools/testdata/arch_64.json
new file mode 100644
index 0000000..c23f988
--- /dev/null
+++ b/tools/testdata/arch_64.json
@@ -0,0 +1,14 @@
+{
+  "arch_nr": 3735928559,
+  "arch_name": "test",
+  "bits": 64,
+  "syscalls": {
+    "read": 0,
+    "write": 1
+  },
+  "constants": {
+    "O_RDONLY": 0,
+    "PROT_WRITE": 2,
+    "PROT_EXEC": 4
+  }
+}