# torchgen/_autoheuristic/train.py - platform/external/pytorch - Git at Google

 # mypy: ignore-errors

 import argparse
 import json
 import warnings

 import pandas as pd  # type: ignore[import-untyped]

 from torch._inductor.autoheuristic.autoheuristic_utils import (
     CHOICE_COL,
     get_metadata_str_from_log,
 )


 # TODO (AlnisM): Fix these warnings
 # Known pandas deprecation messages that are silenced for now.  They are
 # registered in this order on purpose: each call puts its filter in front of
 # the previously registered ones.
 for _ignored_message in (
     "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
     "DataFrameGroupBy.apply operated on the grouping columns.",
 ):
     warnings.filterwarnings("ignore", message=_ignored_message)


 class AHTrain:
     """
     Base class for AutoHeuristic training.

     Provides the shared command-line interface, loading of AutoHeuristic logs
     into pandas DataFrames, one-hot encoding of categorical features and a
     few code-generation helpers.

     NOTE: this class calls ``self.main`` (in ``generate_heuristic``) and
     ``self.get_df`` (in ``add_real_datasets``) but defines neither of them;
     a subclass is expected to provide both.
     """

     def __init__(self) -> None:
         """Create the argument parser and register the shared arguments."""
         self.parser = argparse.ArgumentParser()
         self.add_base_arguments()
         # Filled in by generate_heuristic() once the command line is parsed.
         self.args = None

     def add_base_arguments(self):
         """Register the command-line arguments shared by all trainers."""
         self.parser.add_argument(
             "dataset",
             type=str,
             help="Path to text file containing data collected with AutoHeuristic.",
         )
         self.parser.add_argument(
             "--nrows",
             type=int,
             default=None,
             help="Only read first n rows of the dataset.",
         )
         self.parser.add_argument(
             "--heuristic-name",
             type=str,
             default="learned_heuristic",
             help="Name of the heuristic to be generated.",
         )
         # May be given several times; each occurrence appends a
         # [TYPE, PATH] pair.  The attribute stays None when never given.
         self.parser.add_argument(
             "--data",
             nargs=2,
             action="append",
             metavar=("TYPE", "PATH"),
             help="Specify name of datasets and file paths to be evaluated.",
         )
         self.parser.add_argument(
             "--save-dot",
             action="store_true",
             help="Export heuristic to graphviz dot.",
         )
         self.parser.add_argument(
             "--ranking",
             type=int,
             default=None,
             help="""
                 Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice.
                 The argument is the number of choices the heuristic will provide.
             """,
         )

     def parse_args(self):
         """Parse the process command line and return the resulting namespace."""
         return self.parser.parse_args()

     def parse_log(self, log_path, nrows=None):
         """
         Load a log file and derive the feature and choice lists from it.

         Args:
             log_path: Path of the AutoHeuristic log to read.
             nrows: If not None, keep only the first ``nrows`` rows.

         Returns:
             ``(df, metadata, features, categorical_features, choices)``.
             ``features`` is the numerical features followed by the
             categorical ones.  ``choices`` is collected from the whole log,
             i.e. before the ``nrows`` truncation and ``filter_df`` are applied.
         """
         (df, metadata) = self.deserialize_data(log_path)
         numerical_features = metadata["numerical_features"]
         categorical_features = metadata["categorical_features"]
         choices = df[CHOICE_COL].unique().tolist()
         features = numerical_features + categorical_features
         if nrows is not None:
             df = df.head(nrows)
         df = self.filter_df(df)
         return (df, metadata, features, categorical_features, choices)

     def generate_heuristic(self):
         """
         Parse the command line and hand the arguments to ``self.main``.

         ``main`` is not defined in this class and must come from a subclass.
         Its last argument is a bool: True when ``--ranking`` was given.
         """
         self.args = self.parse_args()
         self.main(
             self.args.dataset,
             self.args.data,
             self.args.nrows,
             self.args.heuristic_name,
             self.args.save_dot,
             self.args.ranking is not None,
         )

     def filter_df(self, df):
         """Return ``df`` unchanged; hook called by ``parse_log`` after truncation."""
         return df

     def add_new_features(self, results):
         """Return ``results`` unchanged together with an empty list."""
         return (results, [])

     def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
         """
         Load extra datasets and store them in ``datasets`` in place.

         Args:
             datasets: Mapping that receives one DataFrame per extra dataset.
             other_datasets: Iterable of ``(name, path)`` pairs, or a falsy
                 value (e.g. None) when there is nothing to add.
             cat_feature2cats: Forwarded to ``self.get_df`` (provided by a
                 subclass) together with ``apply_filters=False``.
         """
         if not other_datasets:
             return
         for name, path in other_datasets:
             # Only the DataFrame of the 5-tuple returned by get_df is needed.
             (df_other, _, _, _, _) = self.get_df(
                 path, cat_feature2cats=cat_feature2cats, apply_filters=False
             )
             datasets[name] = df_other

     def handle_categorical_features(
         self, cat_feature2cats, categorical_features, results
     ):
         """
         One-hot encode the categorical columns of ``results``.

         Args:
             cat_feature2cats: Mapping from categorical feature to its list of
                 categories, or a falsy value to start from an empty mapping.
                 Features missing from the mapping get the unique values found
                 in ``results`` and are added to it.
             categorical_features: Names of the columns to encode.
             results: DataFrame to encode.  Its categorical columns are
                 converted to ``pd.Categorical`` in place.

         Returns:
             ``(encoded_df, cat_feature2cats, dummy_col_2_col_val)`` where
             ``dummy_col_2_col_val`` maps every one-hot column name
             ``f"{col}_{val}"`` to its ``(col, val)`` pair.
         """
         # Doing this here because if we create another df for testing purposes
         # and that other df does not contain all categories for a categorical feature,
         # pd.dummies will not create columns for the missing categories
         if not cat_feature2cats:
             cat_feature2cats = {}
         for cat_feature in categorical_features:
             if cat_feature in cat_feature2cats:
                 categories = cat_feature2cats[cat_feature]
             else:
                 categories = results[cat_feature].unique()
                 cat_feature2cats[cat_feature] = categories
             results[cat_feature] = pd.Categorical(
                 results[cat_feature], categories=categories
             )

         # Build the mapping from the declared categories, not from the values
         # that happen to occur in this frame: get_dummies creates one column
         # per category, so a dataset lacking a category would otherwise get
         # columns without a mapping entry.
         dummy_col_2_col_val = {}
         for col in categorical_features:
             for val in results[col].cat.categories:
                 dummy_col_2_col_val[f"{col}_{val}"] = (col, val)
         # one-hot encode categorical features
         results = pd.get_dummies(results, columns=categorical_features)
         return (results, cat_feature2cats, dummy_col_2_col_val)

     def gen_precondition(self, opt_name, shared_memory, device_capa):
         """
         Return the source text of a generated ``check_precondition`` method.

         ``shared_memory`` and ``device_capa`` are interpolated into the
         template; ``opt_name`` is accepted but not used here.
         """
         return f"""    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
         return (
             metadata.name == self.get_name()
             and metadata.shared_memory == {shared_memory}
             and str(metadata.device_capa) == "{device_capa}"
         )"""

     def codegen_boilerplate(
         self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
     ):
         """No-op in the base class (returns None); presumably overridden by subclasses."""

     def gen_predict_fn_def(self):
         """No-op in the base class (returns None); presumably overridden by subclasses."""

     def write_heuristic_to_file(self, lines, heuristic_name):
         """
         Write ``lines`` joined by newlines, plus a trailing newline, to the
         artifacts file ``_{heuristic_name}.py``.

         The destination is a relative path, so it is resolved against the
         current working directory of the process.
         """
         output_file = (
             f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
         )
         # The output is Python source, which is read as UTF-8 by default, so
         # do not let the locale pick the encoding.
         with open(output_file, "w", encoding="utf-8") as f:
             f.write("\n".join(lines) + "\n")

     def deserialize_data(self, log_path):
         """
         Read a log file and return ``(df, metadata)``.

         The metadata comes from ``get_metadata_str_from_log`` (presumably the
         JSON header on the first line -- confirm against that helper); the
         DataFrame is read as CSV with the first line skipped and malformed
         rows dropped.
         """
         json_string = get_metadata_str_from_log(log_path)
         metadata = self.deserialize_metadata(json_string)

         df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
         return (df, metadata)

     def deserialize_metadata(self, json_string):
         """Decode the JSON metadata string into Python objects."""
         return json.loads(json_string)

 # Script entry point: build the trainer and run it straight away.
 if __name__ == "__main__":
     AHTrain().generate_heuristic()
	# mypy: ignore-errors

	import argparse
	import json
	import warnings

	import pandas as pd # type: ignore[import-untyped]

	from torch._inductor.autoheuristic.autoheuristic_utils import (
	CHOICE_COL,
	get_metadata_str_from_log,
	)


	# TODO (AlnisM): Fix these warnings
	# Known pandas deprecation messages that are silenced for now.  They are
	# registered in this order on purpose: each call puts its filter in front of
	# the previously registered ones.
	for _ignored_message in (
	    "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
	    "DataFrameGroupBy.apply operated on the grouping columns.",
	):
	    warnings.filterwarnings("ignore", message=_ignored_message)


	class AHTrain:
	    """
	    Base class for AutoHeuristic training.

	    Provides the shared command-line interface, loading of AutoHeuristic logs
	    into pandas DataFrames, one-hot encoding of categorical features and a
	    few code-generation helpers.

	    NOTE: this class calls ``self.main`` (in ``generate_heuristic``) and
	    ``self.get_df`` (in ``add_real_datasets``) but defines neither of them;
	    a subclass is expected to provide both.
	    """

	    def __init__(self) -> None:
	        """Create the argument parser and register the shared arguments."""
	        self.parser = argparse.ArgumentParser()
	        self.add_base_arguments()
	        # Filled in by generate_heuristic() once the command line is parsed.
	        self.args = None

	    def add_base_arguments(self):
	        """Register the command-line arguments shared by all trainers."""
	        self.parser.add_argument(
	            "dataset",
	            type=str,
	            help="Path to text file containing data collected with AutoHeuristic.",
	        )
	        self.parser.add_argument(
	            "--nrows",
	            type=int,
	            default=None,
	            help="Only read first n rows of the dataset.",
	        )
	        self.parser.add_argument(
	            "--heuristic-name",
	            type=str,
	            default="learned_heuristic",
	            help="Name of the heuristic to be generated.",
	        )
	        # May be given several times; each occurrence appends a
	        # [TYPE, PATH] pair.  The attribute stays None when never given.
	        self.parser.add_argument(
	            "--data",
	            nargs=2,
	            action="append",
	            metavar=("TYPE", "PATH"),
	            help="Specify name of datasets and file paths to be evaluated.",
	        )
	        self.parser.add_argument(
	            "--save-dot",
	            action="store_true",
	            help="Export heuristic to graphviz dot.",
	        )
	        self.parser.add_argument(
	            "--ranking",
	            type=int,
	            default=None,
	            help="""
	                Makes AutoHeuristic learn a heuristic that ranks choices instead of predicting a single choice.
	                The argument is the number of choices the heuristic will provide.
	            """,
	        )

	    def parse_args(self):
	        """Parse the process command line and return the resulting namespace."""
	        return self.parser.parse_args()

	    def parse_log(self, log_path, nrows=None):
	        """
	        Load a log file and derive the feature and choice lists from it.

	        Args:
	            log_path: Path of the AutoHeuristic log to read.
	            nrows: If not None, keep only the first ``nrows`` rows.

	        Returns:
	            ``(df, metadata, features, categorical_features, choices)``.
	            ``features`` is the numerical features followed by the
	            categorical ones.  ``choices`` is collected from the whole log,
	            i.e. before the ``nrows`` truncation and ``filter_df`` are applied.
	        """
	        (df, metadata) = self.deserialize_data(log_path)
	        numerical_features = metadata["numerical_features"]
	        categorical_features = metadata["categorical_features"]
	        choices = df[CHOICE_COL].unique().tolist()
	        features = numerical_features + categorical_features
	        if nrows is not None:
	            df = df.head(nrows)
	        df = self.filter_df(df)
	        return (df, metadata, features, categorical_features, choices)

	    def generate_heuristic(self):
	        """
	        Parse the command line and hand the arguments to ``self.main``.

	        ``main`` is not defined in this class and must come from a subclass.
	        Its last argument is a bool: True when ``--ranking`` was given.
	        """
	        self.args = self.parse_args()
	        self.main(
	            self.args.dataset,
	            self.args.data,
	            self.args.nrows,
	            self.args.heuristic_name,
	            self.args.save_dot,
	            self.args.ranking is not None,
	        )

	    def filter_df(self, df):
	        """Return ``df`` unchanged; hook called by ``parse_log`` after truncation."""
	        return df

	    def add_new_features(self, results):
	        """Return ``results`` unchanged together with an empty list."""
	        return (results, [])

	    def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
	        """
	        Load extra datasets and store them in ``datasets`` in place.

	        Args:
	            datasets: Mapping that receives one DataFrame per extra dataset.
	            other_datasets: Iterable of ``(name, path)`` pairs, or a falsy
	                value (e.g. None) when there is nothing to add.
	            cat_feature2cats: Forwarded to ``self.get_df`` (provided by a
	                subclass) together with ``apply_filters=False``.
	        """
	        if not other_datasets:
	            return
	        for name, path in other_datasets:
	            # Only the DataFrame of the 5-tuple returned by get_df is needed.
	            (df_other, _, _, _, _) = self.get_df(
	                path, cat_feature2cats=cat_feature2cats, apply_filters=False
	            )
	            datasets[name] = df_other

	    def handle_categorical_features(
	        self, cat_feature2cats, categorical_features, results
	    ):
	        """
	        One-hot encode the categorical columns of ``results``.

	        Args:
	            cat_feature2cats: Mapping from categorical feature to its list of
	                categories, or a falsy value to start from an empty mapping.
	                Features missing from the mapping get the unique values found
	                in ``results`` and are added to it.
	            categorical_features: Names of the columns to encode.
	            results: DataFrame to encode.  Its categorical columns are
	                converted to ``pd.Categorical`` in place.

	        Returns:
	            ``(encoded_df, cat_feature2cats, dummy_col_2_col_val)`` where
	            ``dummy_col_2_col_val`` maps every one-hot column name
	            ``f"{col}_{val}"`` to its ``(col, val)`` pair.
	        """
	        # Doing this here because if we create another df for testing purposes
	        # and that other df does not contain all categories for a categorical feature,
	        # pd.dummies will not create columns for the missing categories
	        if not cat_feature2cats:
	            cat_feature2cats = {}
	        for cat_feature in categorical_features:
	            if cat_feature in cat_feature2cats:
	                categories = cat_feature2cats[cat_feature]
	            else:
	                categories = results[cat_feature].unique()
	                cat_feature2cats[cat_feature] = categories
	            results[cat_feature] = pd.Categorical(
	                results[cat_feature], categories=categories
	            )

	        # Build the mapping from the declared categories, not from the values
	        # that happen to occur in this frame: get_dummies creates one column
	        # per category, so a dataset lacking a category would otherwise get
	        # columns without a mapping entry.
	        dummy_col_2_col_val = {}
	        for col in categorical_features:
	            for val in results[col].cat.categories:
	                dummy_col_2_col_val[f"{col}_{val}"] = (col, val)
	        # one-hot encode categorical features
	        results = pd.get_dummies(results, columns=categorical_features)
	        return (results, cat_feature2cats, dummy_col_2_col_val)

	    def gen_precondition(self, opt_name, shared_memory, device_capa):
	        """
	        Return the source text of a generated ``check_precondition`` method.

	        ``shared_memory`` and ``device_capa`` are interpolated into the
	        template; ``opt_name`` is accepted but not used here.  The template
	        carries its own indentation because it is emitted as a method body.
	        """
	        return f"""    def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
	        return (
	            metadata.name == self.get_name()
	            and metadata.shared_memory == {shared_memory}
	            and str(metadata.device_capa) == "{device_capa}"
	        )"""

	    def codegen_boilerplate(
	        self, heuristic_name, opt_name, threshold, shared_memory, device_capa, dt
	    ):
	        """No-op in the base class (returns None); presumably overridden by subclasses."""

	    def gen_predict_fn_def(self):
	        """No-op in the base class (returns None); presumably overridden by subclasses."""

	    def write_heuristic_to_file(self, lines, heuristic_name):
	        """
	        Write ``lines`` joined by newlines, plus a trailing newline, to the
	        artifacts file ``_{heuristic_name}.py``.

	        The destination is a relative path, so it is resolved against the
	        current working directory of the process.
	        """
	        output_file = (
	            f"../../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
	        )
	        # The output is Python source, which is read as UTF-8 by default, so
	        # do not let the locale pick the encoding.
	        with open(output_file, "w", encoding="utf-8") as f:
	            f.write("\n".join(lines) + "\n")

	    def deserialize_data(self, log_path):
	        """
	        Read a log file and return ``(df, metadata)``.

	        The metadata comes from ``get_metadata_str_from_log`` (presumably the
	        JSON header on the first line -- confirm against that helper); the
	        DataFrame is read as CSV with the first line skipped and malformed
	        rows dropped.
	        """
	        json_string = get_metadata_str_from_log(log_path)
	        metadata = self.deserialize_metadata(json_string)

	        df = pd.read_csv(log_path, skiprows=1, on_bad_lines="skip")
	        return (df, metadata)

	    def deserialize_metadata(self, json_string):
	        """Decode the JSON metadata string into Python objects."""
	        return json.loads(json_string)


	# Script entry point.  The guarded statements must be indented under the
	# `if`, otherwise the module does not parse.
	if __name__ == "__main__":
	    train = AHTrain()
	    train.generate_heuristic()