"""Code generator for Code Completion Model Inference.

The tool runs on the Decision Forest model defined in the {model} directory.
It generates two files: {output_dir}/{filename}.h and {output_dir}/{filename}.cpp
The generated files define the Example class named {cpp_class} having all the features as class members.
The generated runtime provides an `Evaluate` function which can be used to score a code completion candidate.
"""
| |
| import argparse |
| import json |
| import struct |
| |
| |
class CppClass:
    """Splits a (possibly namespace-qualified) C++ class name into its parts.

    Attributes:
        ns: list of the enclosing namespace names, outermost first. Empty
            components (e.g. from a leading `::`) are dropped.
        name: the unqualified class name.
    """

    def __init__(self, cpp_class):
        """Parses `cpp_class`, e.g. `a::b::Example`.

        Raises:
            ValueError: if the component after the last `::` is empty.
        """
        *namespaces, class_name = cpp_class.split("::")
        self.ns = [part for part in namespaces if part]
        self.name = class_name
        if not class_name:
            raise ValueError("Empty class name.")

    def ns_begin(self):
        """Returns snippet for opening namespace declarations."""
        # One `namespace X {` line per enclosing namespace, outermost first.
        return "\n".join("namespace %s {" % ns for ns in self.ns)

    def ns_end(self):
        """Returns snippet for closing namespace declarations."""
        # Namespaces are closed innermost first, hence the reversed order.
        return "\n".join("} // namespace %s" % ns for ns in self.ns[::-1])
| |
| |
def header_guard(filename):
    """Returns the header guard for the generated header.

    The file name is upper-cased, and every character that is not an ASCII
    letter, a digit or an underscore is replaced by `_`, so that names such
    as `my-model` or `sub/model` still produce a valid preprocessor
    identifier. Names made only of ASCII letters, digits and underscores are
    simply upper-cased.
    """

    def macro_char(ch):
        # Keep only characters that are legal inside a C++ macro name.
        if "A" <= ch <= "Z" or "0" <= ch <= "9" or ch == "_":
            return ch
        return "_"

    token = "".join(macro_char(ch) for ch in filename.upper())
    return "GENERATED_DECISION_FOREST_MODEL_%s_H" % token
| |
| |
def boost_node(n, label, next_label):
    """Returns code snippet for a leaf/boost node.

    The snippet is a labelled `return` of the node's score as a C++ float
    literal.

    Args:
        n: node dict; its "score" entry is the value to return.
        label: label placed in front of the statement.
        next_label: unused; accepted so that all node emitters share the same
            signature.
    """
    score = n["score"]
    # JSON integers (e.g. `0` or `1`) would render as `0f` / `1f`, which is
    # not a valid C++ literal: the `f` suffix requires a floating-point
    # constant. Converting to float makes them render as `0.0f` / `1.0f`.
    if isinstance(score, int):
        score = float(score)
    return "%s: return %sf;" % (label, score)
| |
| |
def if_greater_node(n, label, next_label):
    """Returns code snippet for an if_greater node.

    The emitted statement jumps to `next_label` when the Example's NUMBER
    feature getter returns a value greater than or equal to the encoded
    threshold; otherwise control falls through to the following statement.

    Comparing integers is much faster than comparing floats. Assuming floats
    are represented as IEEE 754, the threshold is order-encoded to an integer
    here; the original float value is kept in a trailing C comment for
    readability.

    Args:
        n: node dict with "feature" (feature name) and "threshold" entries.
        label: label placed in front of the statement.
        next_label: label to jump to when the condition holds.
    """
    raw_threshold = n["threshold"]
    fields = (
        label,
        n["feature"],
        order_encode(raw_threshold),
        raw_threshold,
        next_label,
    )
    return "%s: if (E.get%s() >= %s /*%s*/) goto %s;" % fields
| |
| |
def if_member_node(n, label, next_label):
    """Returns code snippet for an if_member node.

    The emitted statement jumps to `next_label` when the Example's ENUM
    feature getter shares at least one bit with the mask built from the enum
    values listed in the node; otherwise control falls through.

    Args:
        n: node dict with "feature" (feature name) and "set" (enum value
            names) entries.
        label: label placed in front of the statement.
        next_label: label to jump to when the condition holds.
    """
    enum_values = n["set"]
    feature = n["feature"]
    # Each listed enum value contributes one BIT(...) term to the mask.
    bit_terms = []
    for enum_value in enum_values:
        bit_terms.append("BIT(%s_type::%s)" % (feature, enum_value))
    mask = "|".join(bit_terms)
    return "%s: if (E.get%s() & (%s)) goto %s;" % (label, feature, mask, next_label)
| |
| |
def node(n, label, next_label):
    """Returns code snippet for the node.

    Dispatches on the node's "operation" entry to the matching emitter.

    Raises:
        KeyError: if the operation is not "boost", "if_greater" or
            "if_member".
    """
    emitters = {
        "boost": boost_node,
        "if_greater": if_greater_node,
        "if_member": if_member_node,
    }
    emit = emitters[n["operation"]]
    return emit(n, label, next_label)
| |
| |
def tree(t, tree_num, node_num):
    """Returns code for inferencing a Decision Tree.

    Returns a pair `(lines, size)`: the list of generated statements for the
    subtree rooted at `t`, and the number of nodes in that subtree.

    Every node is emitted under the label `t{tree#}_n{node#}`.

    The tree contains two types of node:
    - Conditional node: evaluates a condition and, if it holds, jumps to the
      label of its "then" child. Code is generated by a pre-order traversal
      that visits the "else" child first, so the false branch is always the
      statement immediately following the condition.
    - Leaf ("boost") node: emitted through `node` as a single statement.

    Args:
        t: node dict; conditional nodes carry "then" and "else" sub-dicts.
        tree_num: index of the tree, used in labels.
        node_num: index of this node within the tree, used in labels.
    """
    label = "t%d_n%d" % (tree_num, node_num)

    if t["operation"] == "boost":
        leaf = node(t, label=label, next_label="t%d" % (tree_num + 1))
        return [leaf], 1

    # The false branch takes the node numbers right after this one.
    else_lines, else_count = tree(
        t["else"], tree_num=tree_num, node_num=node_num + 1
    )

    # The true branch is numbered after every node of the false branch.
    then_index = node_num + else_count + 1
    then_label = "t%d_n%d" % (tree_num, then_index)
    then_lines, then_count = tree(
        t["then"], tree_num=tree_num, node_num=then_index
    )

    condition = node(t, label=label, next_label=then_label)
    return [condition] + else_lines + then_lines, 1 + else_count + then_count
| |
| |
def gen_header_code(features_json, cpp_class, filename):
    """Returns code for header declaring the inference runtime.

    Declares the Example class named {cpp_class} inside the relevant
    namespaces, with one member, one setter and one getter per feature, plus
    a `float Evaluate(const {cpp_class}&)` declaration for scoring an Example.

    Args:
        features_json: list of feature dicts with "name" and "kind" entries.
        cpp_class: CppClass describing the class name and its namespaces.
        filename: base name of the generated files, used for the header guard.

    Raises:
        ValueError: if a feature kind is neither "NUMBER" nor "ENUM".
    """
    setters = []
    getters = []
    class_members = []
    for feature in features_json:
        name = feature["name"]
        kind = feature["kind"]

        if kind == "NUMBER":
            # Floats are order-encoded to integers for faster comparison.
            setter = "void set%s(float V) { %s = OrderEncode(V); }"
        elif kind == "ENUM":
            setter = "void set%s(unsigned V) { %s = 1LL << V; }"
        else:
            raise ValueError("Unhandled feature type.", kind)
        setters.append(setter % (name, name))

        # ENUM features are stored in 64-bit members, NUMBER ones in 32-bit.
        width = 64 if kind == "ENUM" else 32
        # Class members represent all the features of the Example.
        class_members.append("uint%d_t %s = 0;" % (width, name))
        getters.append(
            "LLVM_ATTRIBUTE_ALWAYS_INLINE uint%d_t get%s() const { return %s; }"
            % (width, name, name)
        )

    separator = "\n "
    guard = header_guard(filename)
    template = """#ifndef %s
#define %s
#include <cstdint>
#include "llvm/Support/Compiler.h"

%s
class %s {
public:
// Setters.
%s

// Getters.
%s

private:
%s

// Produces an integer that sorts in the same order as F.
// That is: a < b <==> orderEncode(a) < orderEncode(b).
static uint32_t OrderEncode(float F);
};

float Evaluate(const %s&);
%s
#endif // %s
"""
    return template % (
        guard,
        guard,
        cpp_class.ns_begin(),
        cpp_class.name,
        separator.join(setters),
        separator.join(getters),
        separator.join(class_members),
        cpp_class.name,
        cpp_class.ns_end(),
        guard,
    )
| |
| |
def order_encode(v):
    """Returns an integer that sorts in the same order as the float `v`.

    `v` is packed as a 32-bit little-endian float and its bit pattern is
    remapped: non-negative floats land in the upper half of the unsigned
    32-bit range, negative floats in the lower half with their order reversed.
    """
    sign_bit = 1 << 31
    (bits,) = struct.unpack("<I", struct.pack("<f", v))
    # IEEE 754 floats compare like sign-magnitude integers.
    if (bits & sign_bit) == 0:
        # Non-negative float: shift into the top half of the integers.
        return bits + sign_bit
    # Negative float: low half of the integers, order reversed.
    return (1 << 32) - bits
| |
| |
def evaluate_func(forest_json, cpp_class):
    """Generates the C++ evaluation code for the whole forest.

    Emits one `EvaluateTree{N}` function per tree inside an anonymous
    namespace, followed by `float Evaluate(const {Example}&)`, which sums the
    scores of all trees. `Evaluate` can be used to score an Example.

    Args:
        forest_json: list of tree dicts, one per decision tree.
        cpp_class: CppClass whose `name` is the Example class name.
    """
    pieces = ["namespace {\n"]

    # Generate evaluation function of each tree.
    for index, tree_json in enumerate(forest_json):
        pieces.append(
            "LLVM_ATTRIBUTE_NOINLINE float EvaluateTree%d(const %s& E) {\n"
            % (index, cpp_class.name)
        )
        statements, _ = tree(tree_json, tree_num=index, node_num=0)
        pieces.append(" " + "\n ".join(statements) + "\n")
        pieces.append("}\n\n")
    pieces.append("} // namespace\n\n")

    # Combine the scores of all trees in the final function.
    # MSAN will timeout if these functions are inlined.
    pieces.append("float Evaluate(const %s& E) {\n" % cpp_class.name)
    pieces.append(" float Score = 0;\n")
    pieces.extend(
        " Score += EvaluateTree%d(E);\n" % index
        for index in range(len(forest_json))
    )
    pieces.append(" return Score;\n")
    pieces.append("}\n")

    return "".join(pieces)
| |
| |
def gen_cpp_code(forest_json, features_json, filename, cpp_class):
    """Generates code for the .cpp file.

    The output contains the includes, the `BIT` helper macro, type aliases
    for the ENUM features, the definition of `{cpp_class}::OrderEncode` and
    the evaluation functions, wrapped in the class's namespaces.

    Args:
        forest_json: list of tree dicts, passed on to `evaluate_func`.
        features_json: list of feature dicts; ENUM features must also carry
            "header" and "type" entries.
        filename: base name of the generated files; `{filename}.h` is included.
        cpp_class: CppClass describing the class name and its namespaces.
    """
    enum_features = [f for f in features_json if f["kind"] == "ENUM"]

    # Headers required by OrderEncode(float F).
    system_includes = "\n".join(
        "#include <%s>" % header for header in ("cstring", "limits")
    )

    # The generated header, plus headers required by ENUM features used by
    # the model. A set removes duplicates; sorting keeps the output stable.
    quoted_headers = {filename + ".h", "llvm/ADT/bit.h"}
    for feature in enum_features:
        quoted_headers.add(feature["header"])
    local_includes = "\n".join(
        '#include "%s"' % header for header in sorted(quoted_headers)
    )

    # using-decl for ENUM features.
    using_decls = "\n".join(
        "using %s_type = %s;" % (feature["name"], feature["type"])
        for feature in enum_features
    )

    template = """%s

%s

#define BIT(X) (1LL << X)

%s

%s

uint32_t %s::OrderEncode(float F) {
static_assert(std::numeric_limits<float>::is_iec559, "");
constexpr uint32_t TopBit = ~(~uint32_t{0} >> 1);

// Get the bits of the float. Endianness is the same as for integers.
uint32_t U = llvm::bit_cast<uint32_t>(F);
std::memcpy(&U, &F, sizeof(U));
// IEEE 754 floats compare like sign-magnitude integers.
if (U & TopBit) // Negative float.
return 0 - U; // Map onto the low half of integers, order reversed.
return U + TopBit; // Positive floats map onto the high half of integers.
}

%s
%s
"""
    return template % (
        system_includes,
        local_includes,
        cpp_class.ns_begin(),
        using_decls,
        cpp_class.name,
        evaluate_func(forest_json, cpp_class),
        cpp_class.ns_end(),
    )
| |
| |
def main():
    """Command-line entry point: reads the model and writes the C++ files.

    Reads `{model}/features.json` and `{model}/forest.json`, then writes
    `{output_dir}/{filename}.cpp` and `{output_dir}/{filename}.h`.

    All four options are required. Both outputs are generated in memory
    before any file is opened for writing, so a failure during code
    generation does not leave a lone, freshly written .cpp file behind.
    """
    parser = argparse.ArgumentParser("DecisionForestCodegen")
    # Every option feeds a path or the class name, so none can be omitted.
    parser.add_argument("--filename", required=True, help="output file name.")
    parser.add_argument("--output_dir", required=True, help="output directory.")
    parser.add_argument("--model", required=True, help="path to model directory.")
    parser.add_argument(
        "--cpp_class",
        required=True,
        help="The name of the class (which may be a namespace-qualified) created in generated header.",
    )
    ns = parser.parse_args()

    output_dir = ns.output_dir
    filename = ns.filename
    header_file = "%s/%s.h" % (output_dir, filename)
    cpp_file = "%s/%s.cpp" % (output_dir, filename)
    cpp_class = CppClass(cpp_class=ns.cpp_class)

    model_file = "%s/forest.json" % ns.model
    features_file = "%s/features.json" % ns.model

    with open(features_file) as f:
        features_json = json.load(f)

    with open(model_file) as m:
        forest_json = json.load(m)

    # Generate everything first; only touch the output files once both
    # texts are known to be producible.
    cpp_code = gen_cpp_code(
        forest_json=forest_json,
        features_json=features_json,
        filename=filename,
        cpp_class=cpp_class,
    )
    header_code = gen_header_code(
        features_json=features_json, cpp_class=cpp_class, filename=filename
    )

    with open(cpp_file, "w") as output_cc:
        output_cc.write(cpp_code)

    with open(header_file, "w") as output_h:
        output_h.write(header_code)
| |
| |
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()