blob: c089ffd5cbb87c103b3cd6f06fbd92a777cd0f7f [file] [log] [blame] [edit]
# Python wrapper script for collecting Canary metrics, setting-up/tearing-down alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and pushing the log to S3 if successful.
# Needs to be installed prior to running
# Part of standard packages in Python 3.4+
import argparse
import time
import datetime
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *
# Code for command line argument parsing
# ================================================================================


def _parse_bool_argument(value):
    """Convert a command line value into a bool (or None when it is empty).

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value - including "False" and "0" - is parsed as True. This
    converter understands the usual textual spellings instead.

    Returns:
        True or False for a recognized value, or None for an empty value so
        the caller can fall back to the argument's default.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognized boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized == "":
        return None
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    if normalized in ("false", "f", "no", "n", "0", "off"):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got '" + str(value) + "'")


command_parser = argparse.ArgumentParser("CanaryWrapper")
command_parser.add_argument(
    "--canary_executable", type=str, required=True,
    help="The path to the canary executable (or program - like 'python3')")
command_parser.add_argument(
    "--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument(
    "--git_hash", type=str, required=True,
    help="The Git commit hash that we are running the canary with")
command_parser.add_argument(
    "--git_repo_name", type=str, required=True,
    help="The name of the Git repository")
command_parser.add_argument(
    "--git_hash_as_namespace", type=_parse_bool_argument, default=False,
    help="(OPTIONAL, default=False) If true, the git hash will be used as the name of the Cloudwatch namespace")
command_parser.add_argument(
    "--output_log_filepath", type=str, default="output.log",
    help="(OPTIONAL, default=output.log) The file to output log info to. Set to 'None' to disable")
command_parser.add_argument(
    "--output_to_console", type=_parse_bool_argument, default=True,
    help="(OPTIONAL, default=True) If true, info will be output to the console")
command_parser.add_argument(
    "--cloudwatch_region", type=str, default="us-east-1",
    help="(OPTIONAL, default=us-east-1) The AWS region for Cloudwatch")
command_parser.add_argument(
    "--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument(
    "--snapshot_wait_time", type=int, default=600,
    help="(OPTIONAL, default=600) The number of seconds between gathering and sending snapshot reports")
command_parser.add_argument(
    "--ticket_category", type=str, default="AWS",
    help="(OPTIONAL, default=AWS) The category to register the ticket under")
command_parser.add_argument(
    "--ticket_type", type=str, default="SDKs and Tools",
    help="(OPTIONAL, default='SDKs and Tools') The type to register the ticket under")
command_parser.add_argument(
    "--ticket_item", type=str, default="IoT SDK for CPP",
    help="(OPTIONAL, default='IoT SDK for CPP') The item to register the ticket under")
command_parser.add_argument(
    "--ticket_group", type=str, default="AWS IoT Device SDK",
    help="(OPTIONAL, default='AWS IoT Device SDK') The group to register the ticket under")
command_parser.add_argument(
    "--dependencies", type=str, default="",
    help="(OPTIONAL, default='') Any dependencies and their commit hashes. "
         "Current expected format is '(name or path);(hash);(next name or path);(hash);(etc...)'.")
command_parser.add_argument(
    "--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser.add_argument(
    "--codebuild_log_path", type=str, default="",
    help="The CODEBUILD_LOG_PATH environment variable. Leave blank to ignore")

command_parser_arguments = command_parser.parse_args()

# Both the literal text "None" and an empty value disable the log file
if command_parser_arguments.output_log_filepath in ("None", ""):
    command_parser_arguments.output_log_filepath = None

# A non-positive wait time is replaced with 60 seconds
if command_parser_arguments.snapshot_wait_time <= 0:
    command_parser_arguments.snapshot_wait_time = 60

# Deal with possibly empty values in semi-critical commands/arguments.
# The wrapper cannot run without these, so an empty value is fatal.
for _required_name in ("canary_executable", "git_hash", "git_repo_name"):
    if getattr(command_parser_arguments, _required_name) == "":
        print("ERROR - required " + _required_name + " is empty!", flush=True)
        exit(1)

# Empty boolean arguments (parsed as None) fall back to their declared defaults
for _boolean_name in ("git_hash_as_namespace", "output_to_console"):
    if getattr(command_parser_arguments, _boolean_name) is None:
        setattr(command_parser_arguments, _boolean_name,
                command_parser.get_default(_boolean_name))

# Empty optional text arguments fall back to their declared defaults
for _defaulted_name in ("cloudwatch_region", "s3_bucket_name", "ticket_category",
                        "ticket_type", "ticket_item", "ticket_group"):
    if getattr(command_parser_arguments, _defaulted_name) == "":
        setattr(command_parser_arguments, _defaulted_name,
                command_parser.get_default(_defaulted_name))
# ================================================================================
# Timestamp for this run, formatted as day-month-year/hour-minute-second.
# It is handed to the snapshot and reused later in the S3 log link.
datetime_now = datetime.datetime.now()
datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S")
print("Datetime string is: " + datetime_string, flush=True)

# Make the snapshot class
data_snapshot = DataSnapshot(
    git_hash=command_parser_arguments.git_hash,
    git_repo_name=command_parser_arguments.git_repo_name,
    datetime_string=datetime_string,
    git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
    git_fixed_namespace_text="mqtt5_canary",
    # NOTE(review): --output_log_filepath is parsed but not used here. It is left
    # hard-coded because its default ("output.log") differs from this value -
    # confirm which one is intended before wiring the argument through.
    output_log_filepath="output.txt",
    output_to_console=command_parser_arguments.output_to_console,
    # Honor --cloudwatch_region (defaults to "us-east-1") instead of ignoring it
    cloudwatch_region=command_parser_arguments.cloudwatch_region,
    cloudwatch_make_dashboard=False,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=True,
    s3_bucket_name=command_parser_arguments.s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed. Note that this path exits with status 0.
if data_snapshot.abort_due_to_internal_error == True:
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again", flush=True)
    exit(0)
# Register metrics
# Each entry holds the keyword arguments for one register_metric call; the
# entries are registered in the order listed.
_metric_registrations = (
    dict(
        new_metric_name="total_cpu_usage",
        new_metric_function=get_metric_total_cpu_usage,
        new_metric_unit="Percent",
        new_metric_alarm_threshold=70,
        new_metric_reports_to_skip=1,
        new_metric_alarm_severity=5,
        is_percent=True),
    dict(
        new_metric_name="total_memory_usage_value",
        new_metric_function=get_metric_total_memory_usage_value,
        new_metric_unit="Bytes"),
    dict(
        new_metric_name="total_memory_usage_percent",
        new_metric_function=get_metric_total_memory_usage_percent,
        new_metric_unit="Percent",
        new_metric_alarm_threshold=70,
        new_metric_reports_to_skip=0,
        new_metric_alarm_severity=5,
        is_percent=True),
)
for _metric_registration in _metric_registrations:
    data_snapshot.register_metric(**_metric_registration)

# Print diagnosis information (the dependencies argument is passed through as-is)
data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies)
# Build the monitor that gathers snapshots (metrics) for the data snapshot
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=command_parser_arguments.snapshot_wait_time,
)
# Stop right away (with exit status 0) if the snapshot monitor reported an internal error
if snapshot_monitor.had_internal_error == True:
    print(
        "INFO - Stopping application due to error caused by credentials",
        "Please fix your credentials and then restart this application again",
        sep="\n",
        flush=True,
    )
    exit(0)

# Build the monitor that runs the canary application itself
application_monitor = ApplicationMonitor(
    wrapper_application_path=command_parser_arguments.canary_executable,
    wrapper_application_arguments=command_parser_arguments.canary_arguments,
    wrapper_application_restart_on_finish=False,
    # the data_snapshot is passed along so the monitor can print to the log
    data_snapshot=data_snapshot,
)
# Stop right away (with exit status 0) if the application monitor reported an error
if application_monitor.error_has_occurred == True:
    print(
        "INFO - Stopping application due to error caused by credentials",
        "Please fix your credentials and then restart this application again",
        sep="\n",
        flush=True,
    )
    exit(0)

# For tracking if we stopped due to a metric alarm
stopped_due_to_metric_alarm = False
# Seconds slept between iterations of the execution loop; the same value is
# reported to both monitors as the time that has passed
execution_sleep_time = 30
def execution_loop():
    """Drive both monitors until one of them signals that the run must stop.

    Each pass calls the snapshot monitor and then the application monitor,
    reporting ``execution_sleep_time`` as the time passed. The loop ends when
    the snapshot monitor has cut a ticket (which is also recorded as an
    internal error so the caller goes down the right code path), or when
    either monitor reports an error. Otherwise it sleeps and goes again.
    """
    keep_running = True
    while keep_running:
        snapshot_monitor.monitor_loop_function(
            time_passed=execution_sleep_time,
            psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(time_passed=execution_sleep_time)

        if snapshot_monitor.has_cut_ticket == True:
            # A metric went into alarm: flag it as an 'internal error' for the caller
            snapshot_monitor.had_internal_error = True
            keep_running = False
        elif application_monitor.error_has_occurred == True or snapshot_monitor.had_internal_error == True:
            # An error occurred (or the run otherwise needs to stop)
            keep_running = False
        else:
            time.sleep(execution_sleep_time)
def _cut_wrapper_ticket(ticket_description, ticket_reason):
    """Cut a severity 4 ticket through Cloudwatch using the command line ticket settings."""
    cut_ticket_using_cloudwatch(
        git_repo_name=command_parser_arguments.git_repo_name,
        git_hash=command_parser_arguments.git_hash,
        git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
        git_fixed_namespace_text="mqtt5_canary",
        # Honor --cloudwatch_region (defaults to "us-east-1") instead of ignoring it
        cloudwatch_region=command_parser_arguments.cloudwatch_region,
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category=command_parser_arguments.ticket_category,
        ticket_item=command_parser_arguments.ticket_item,
        ticket_group=command_parser_arguments.ticket_group,
        ticket_type=command_parser_arguments.ticket_type,
        ticket_severity=4)


def application_thread():
    """Run the canary under both monitors, report the outcome, clean up and exit.

    Steps:
      1. Send a "Started" email and start both monitors.
      2. Run ``execution_loop`` until a monitor signals that the run is over.
      3. Work out why the run stopped and cut a ticket where one is needed.
      4. Clean up both monitors and send the "Finished"/"Had an error" email
         (skipped when the stop was caused by credentials).
      5. Exit the process with ``application_monitor.error_code``.

    The error flag and the failure text are recorded BEFORE a ticket is cut,
    so an exception raised while cutting the ticket can no longer cause a
    failed run to be cleaned up and reported as a success.
    """
    start_email_body = "MQTT5 Short Running Canary Wrapper has started for "
    start_email_body += "\"" + command_parser_arguments.git_repo_name + "\" commit \"" + command_parser_arguments.git_hash + "\""
    start_email_body += "\nThe wrapper will run for the length the MQTT5 Canary application is set to run for, which is determined by "
    start_email_body += "the arguments set. The arguments used for this run are listed below:"
    start_email_body += "\n Arguments: " + command_parser_arguments.canary_arguments
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False

    # Finished Email
    send_finished_email = True
    finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    try:
        # Find out why we stopped
        if snapshot_monitor.had_internal_error:
            wrapper_error_occurred = True
            if snapshot_monitor.has_cut_ticket:
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True)
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
            else:
                print("ERROR - Snapshot monitor stopped due to internal error!", flush=True)
                # str() keeps the report working even if no reason text was recorded
                internal_error_reason = str(snapshot_monitor.internal_error_reason)
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + internal_error_reason
                _cut_wrapper_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error")
        elif application_monitor.error_has_occurred:
            if application_monitor.error_due_to_credentials:
                print("INFO - Stopping application due to error caused by credentials")
                print("Please fix your credentials and then restart this application again", flush=True)
                wrapper_error_occurred = True
                send_finished_email = False
            elif application_monitor.error_code != 0:
                # Something in the canary itself failed
                wrapper_error_occurred = True
                finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed"
                _cut_wrapper_ticket(
                    ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                    ticket_reason="The Short Running Canary exited with a non-zero exit code")
            else:
                print("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True)
                application_monitor.print_stdout()
                finished_email_body += "Short Running Canary finished successfully and run without errors!"
                wrapper_error_occurred = False
        else:
            print("ERROR - Short Running Canary stopped due to unknown reason!", flush=True)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
            _cut_wrapper_ticket(
                ticket_description="The Short Running Canary stopped for an unknown reason!",
                ticket_reason="The Short Running Canary stopped for unknown reason")
    except Exception as e:
        # Best effort: failing to cut a ticket must not prevent cleanup and reporting
        print("ERROR: Could not (possibly) cut ticket due to exception!")
        print("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print("Short Running Canary finished!", flush=True)

    # Link to the log file in S3; failed runs are linked under a "Failed_Logs/" prefix
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + command_parser_arguments.cloudwatch_region
    finished_email_body += "&prefix=" + command_parser_arguments.git_repo_name + "/" + datetime_string + "/"
    if wrapper_error_occurred:
        finished_email_body += "Failed_Logs/"
    finished_email_body += command_parser_arguments.git_hash + ".log"

    if command_parser_arguments.codebuild_log_path != "":
        print("\n Codebuild log path: " + command_parser_arguments.codebuild_log_path + "\n")

    # Send the finish email
    if send_finished_email:
        if wrapper_error_occurred:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")

    # NOTE(review): the exit status is the application's error code even when the
    # wrapper itself flagged an error (metric alarm, snapshot monitor failure). If
    # error_code is 0 in those cases the process still exits 0 - confirm intended.
    exit(application_monitor.error_code)
# Start the application! This call runs the whole canary session and does not
# return: application_thread() ends by calling exit() with the application's
# error code.
application_thread()