blob: 877b825965dc85ffe6cdcc9bc9ac6ae7e7657699 [file] [log] [blame] [edit]
# Python wrapper script for collecting Canary metrics, setting up alarms, reporting metrics to Cloudwatch,
# checking the alarms to ensure everything is correct at the end of the run, and checking for new
# builds in S3, downloading them, and launching them if they exist (24/7 operation)
#
# Will only stop running if the Canary application itself has an issue - in which case the Canary application will
# need to be fixed and then the wrapper script restarted
# Needs to be installed prior to running
# Part of standard packages in Python 3.4+
import argparse
import time
# Dependencies in project folder
from CanaryWrapper_Classes import *
from CanaryWrapper_MetricFunctions import *
# TODO - Using subprocess may not work on Windows for starting/stopping the application thread.
# Canary will likely be running on Linux, so it's probably okay, but need to confirm/check at some point....
# ================================================================================
# Code for command line argument parsing
command_parser = argparse.ArgumentParser("CanaryWrapper_24_7")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
# BUGFIX: this argument is required, so its help text no longer claims it is
# optional with a 'canary-wrapper-folder' default (copy-paste from --s3_bucket_name)
command_parser.add_argument("--s3_bucket_application", type=str, required=True,
    help="The S3 URL to monitor for changes MINUS the bucket name")
# BUGFIX: the help string previously contained default="" which only parsed via
# implicit string-literal concatenation; single quotes make the intent explicit
command_parser.add_argument("--s3_bucket_application_in_zip", type=str, required=False, default="",
    help="(OPTIONAL, default='') The file path in the zip folder where the application is stored. Will be ignored if set to empty string")
# BUGFIX: the help text previously claimed the default was 'CanarySendEmailLambda',
# but the actual default is 'iot-send-email-lambda'
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser_arguments = command_parser.parse_args()
# ================================================================================
# Global variables shared between the two threads.
# NOTE - These should likely be replaced with futures or similar for better thread safety.
# However, each of these variables is only ever read or written from a single thread;
# no thread both reads and writes the same variable.

# Local path (including extension) of the Canary application this wrapper manages.
# A new build detected in S3 is downloaded to this same path/filename.
# [THIS IS READ ONLY]
canary_local_application_path = command_parser_arguments.canary_executable
if not canary_local_application_path:
    print ("ERROR - required canary_executable is empty!")
    exit (1) # cannot run without a canary executable

# Arguments handed to the application when it is launched
# [THIS IS READ ONLY]
canary_local_application_arguments = command_parser_arguments.canary_arguments

# "Git Hash" stub used for metrics and dimensions
# [THIS IS READ ONLY]
canary_local_git_hash_stub = "Canary"

# "Git Repo" stub used for metrics and dimensions. Hard-coded because this 24/7 canary only runs for MQTT
# [THIS IS READ ONLY]
canary_local_git_repo_stub = "MQTT5_24_7"

# Fixed Cloudwatch namespace name for the Canary
# [THIS IS READ ONLY]
canary_local_git_fixed_namespace = "MQTT5_24_7_Canary"

# S3 bucket that is watched for new application builds; empty falls back to the default bucket
# [THIS IS READ ONLY]
canary_s3_bucket_name = command_parser_arguments.s3_bucket_name
if not canary_s3_bucket_name:
    canary_s3_bucket_name = "canary-wrapper-folder"

# Key within the S3 bucket to monitor (the application filepath and file. Example: "canary/canary_application.exe")
# [THIS IS READ ONLY]
canary_s3_bucket_application_path = command_parser_arguments.s3_bucket_application
if not canary_s3_bucket_application_path:
    print ("ERROR - required s3_bucket_application is empty!")
    exit (1) # cannot run without a s3_bucket_application to monitor

# Path of the application inside the S3 zip, when the monitored S3 object is a zip.
# Normalized to None when unset so downstream code can test identity.
# (THIS IS READ ONLY)
canary_s3_bucket_application_path_zip = command_parser_arguments.s3_bucket_application_in_zip
if not canary_s3_bucket_application_path_zip:
    canary_s3_bucket_application_path_zip = None

# Name of the email lambda; an empty string falls back to 'iot-send-email-lambda'
if not command_parser_arguments.lambda_name:
    command_parser_arguments.lambda_name = "iot-send-email-lambda"

# Region the canary runs in
# (THIS IS READ ONLY)
canary_region_stub = "us-east-1"

# Seconds to wait between gathering metrics and pushing them to Cloudwatch
canary_metrics_wait_time = 600 # 10 minutes
# Seconds per Application thread loop iteration. Should be shorter than or equal to the metrics wait time
canary_application_loop_wait_time = 300 # 5 minutes

# For testing - set both to 30 seconds
# canary_metrics_wait_time = 30
# canary_application_loop_wait_time = 30
# ================================================================================
# Build the snapshot helper that handles metrics, alarms, the dashboard,
# log upload, and email sending for this canary run.
data_snapshot = DataSnapshot(
    git_hash=canary_local_git_hash_stub,
    git_repo_name=canary_local_git_repo_stub,
    git_hash_as_namespace=False,
    datetime_string=None,
    git_fixed_namespace_text=canary_local_git_fixed_namespace,
    output_log_filepath="output.txt",
    output_to_console=True,
    cloudwatch_region=canary_region_stub,
    cloudwatch_make_dashboard=True,
    cloudwatch_teardown_alarms_on_complete=True,
    cloudwatch_teardown_dashboard_on_complete=False,
    s3_bucket_name=canary_s3_bucket_name,
    s3_bucket_upload_on_complete=True,
    lambda_name=command_parser_arguments.lambda_name,
    metric_frequency=canary_metrics_wait_time)

# Bail out cleanly if the snapshot could not be set up (credential problem)
if data_snapshot.abort_due_to_internal_error:
    print ("INFO - Stopping application due to error caused by credentials")
    print ("Please fix your credentials and then restart this application again")
    exit(0)

# Metric definitions tracked for the lifetime of the canary. Entries that carry
# alarm settings also create Cloudwatch alarms.
_metric_definitions = [
    dict(new_metric_name="total_cpu_usage",
         new_metric_function=get_metric_total_cpu_usage,
         new_metric_unit="Percent",
         new_metric_alarm_threshold=70,
         new_metric_reports_to_skip=1,
         new_metric_alarm_severity=5,
         is_percent=True),
    dict(new_metric_name="total_memory_usage_value",
         new_metric_function=get_metric_total_memory_usage_value,
         new_metric_unit="Bytes"),
    dict(new_metric_name="total_memory_usage_percent",
         new_metric_function=get_metric_total_memory_usage_percent,
         new_metric_unit="Percent",
         new_metric_alarm_threshold=70,
         new_metric_reports_to_skip=0,
         new_metric_alarm_severity=5,
         is_percent=True),
]
for _metric_definition in _metric_definitions:
    data_snapshot.register_metric(**_metric_definition)

# Dashboard widgets: (title, metrics shown), each refreshed every 60 seconds
for _widget_title, _widget_metrics in (
        ("Process CPU Usage - Percentage", ["total_cpu_usage"]),
        ("Process Memory Usage - Percentage", ["total_memory_usage_percent"])):
    data_snapshot.register_dashboard_widget(_widget_title, _widget_metrics, 60)

# Print diagnosis information
data_snapshot.output_diagnosis_information("24/7 Canary cannot show dependencies!")
# Shared guard used after constructing each monitor: if the monitor flagged an
# internal (credential-caused) error, tell the operator and exit cleanly so the
# wrapper can be relaunched once credentials are fixed.
def _stop_if_monitor_failed(monitor_had_error):
    if monitor_had_error:
        print ("INFO - Stopping application due to error caused by credentials")
        print ("Please fix your credentials and then restart this application again")
        exit(0)

# Watches the S3 bucket for freshly uploaded canary builds
s3_monitor = S3Monitor(
    s3_bucket_name=canary_s3_bucket_name,
    s3_file_name=canary_s3_bucket_application_path,
    s3_file_name_in_zip=canary_s3_bucket_application_path_zip,
    canary_local_application_path=canary_local_application_path,
    data_snapshot=data_snapshot)
_stop_if_monitor_failed(s3_monitor.had_internal_error)

# Gathers metrics on a schedule and reports them through the data snapshot
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=canary_metrics_wait_time)
_stop_if_monitor_failed(snapshot_monitor.had_internal_error)

# Launches the canary executable and restarts it whenever it finishes
application_monitor = ApplicationMonitor(
    wrapper_application_path=canary_local_application_path,
    wrapper_application_arguments=canary_local_application_arguments,
    wrapper_application_restart_on_finish=True,
    data_snapshot=data_snapshot)
_stop_if_monitor_failed(application_monitor.error_has_occurred)
# For tracking if we stopped due to a metric alarm
# NOTE: currently never written by the loop below; kept for interface stability
stopped_due_to_metric_alarm = False


def execution_loop():
    """Main 24/7 loop: poll S3 for new builds, swap the running application when
    one appears, and tick the snapshot/application monitors.

    Runs forever until the S3 monitor, snapshot monitor, or application monitor
    reports an error, at which point the loop breaks and the caller inspects the
    monitors to determine why.
    """
    while True:
        s3_monitor.monitor_loop_function(time_passed=canary_application_loop_wait_time)

        # Is there an error?
        if (s3_monitor.had_internal_error == True):
            print ("[Debug] S3 monitor had an internal error!")
            break

        # Is there a new file? If so, stop the running application, download the
        # new build over the old one, and start it back up.
        if (s3_monitor.s3_file_needs_replacing == True):
            # Stop the application
            print ("[Debug] Stopping application monitor...")
            application_monitor.stop_monitoring()
            print ("[Debug] Getting S3 file...")
            s3_monitor.replace_current_file_for_new_file()
            # Start the application
            print ("[Debug] Starting application monitor...")
            application_monitor.start_monitoring()
            # Allow the snapshot monitor to cut a ticket
            snapshot_monitor.can_cut_ticket = True

        snapshot_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time, psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(
            time_passed=canary_application_loop_wait_time)

        # Did a metric go into alarm?
        if (snapshot_monitor.has_cut_ticket == True):
            # Do not allow it to cut anymore tickets until it gets a new build
            snapshot_monitor.can_cut_ticket = False

        # If an error has occurred or otherwise this thread needs to stop, then break the loop
        if (application_monitor.error_has_occurred == True or snapshot_monitor.had_internal_error == True):
            if (application_monitor.error_has_occurred == True):
                print ("[Debug] Application monitor error occurred!")
            else:
                # BUGFIX: corrected typo "ocurred" in the debug message
                print ("[Debug] Snapshot monitor internal error occurred!")
            break

        time.sleep(canary_application_loop_wait_time)
def _cut_wrapper_ticket(ticket_description, ticket_reason, ticket_severity):
    # Helper: cut a Cloudwatch ticket using the fixed dimensions and routing
    # fields shared by every failure path of the 24/7 canary wrapper.
    cut_ticket_using_cloudwatch(
        git_repo_name=canary_local_git_repo_stub,
        git_hash=canary_local_git_hash_stub,
        git_hash_as_namespace=False,
        git_fixed_namespace_text=canary_local_git_fixed_namespace,
        cloudwatch_region=canary_region_stub,
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category="AWS",
        ticket_type="SDKs and Tools",
        ticket_item="IoT SDK for CPP",
        ticket_group="AWS IoT Device SDK",
        ticket_severity=ticket_severity)


def application_thread():
    """Run the 24/7 canary: start the monitors, announce the start via email,
    run the execution loop until something fails, then diagnose why, cut a
    ticket where appropriate, clean up, and send a finish email.

    Always terminates the process with exit code -1, since reaching the end of
    this function means the (intended-to-be-endless) execution loop stopped.
    """
    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    start_email_body = "MQTT5 24/7 Canary Wrapper has started. This will run and continue to test new MQTT5 application builds as"
    start_email_body += " they pass CodeBuild and are uploaded to S3."
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the execution loop (blocks until a monitor reports an error)
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    send_finished_email = True
    finished_email_body = "MQTT5 24/7 Canary Wrapper has stopped."
    finished_email_body += "\n\n"
    try:
        # Find out why we stopped
        # S3 Monitor
        if (s3_monitor.had_internal_error == True):
            if (s3_monitor.error_due_to_credentials == False):
                print ("ERROR - S3 monitor stopped due to internal error!")
                # BUGFIX: the ticket description previously said "Snapshot monitor"
                # here even though this branch handles the S3 monitor
                _cut_wrapper_ticket(
                    ticket_description="S3 monitor stopped due to internal error! Reason info: " + s3_monitor.internal_error_reason,
                    ticket_reason="S3 monitor stopped due to internal error",
                    ticket_severity=4)
                finished_email_body += "Failure due to S3 monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + s3_monitor.internal_error_reason
                wrapper_error_occurred = True
        # Snapshot Monitor
        elif (snapshot_monitor.had_internal_error == True):
            if (snapshot_monitor.has_cut_ticket == True):
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print ("ERROR - Snapshot monitor stopped due to metric in alarm!")
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
                finished_email_body += "\nNOTE - this shouldn't occur in the 24/7 Canary! If it does, then the wrapper needs adjusting."
                wrapper_error_occurred = True
            else:
                print ("ERROR - Snapshot monitor stopped due to internal error!")
                _cut_wrapper_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + snapshot_monitor.internal_error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error",
                    ticket_severity=4)
                wrapper_error_occurred = True
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + snapshot_monitor.internal_error_reason
        # Application Monitor
        elif (application_monitor.error_has_occurred == True):
            if (application_monitor.error_due_to_credentials == True):
                print ("INFO - Stopping application due to error caused by credentials")
                print ("Please fix your credentials and then restart this application again")
                wrapper_error_occurred = True
                # Credential problems are operator-fixable; skip the finish email
                send_finished_email = False
            else:
                # Is the error something in the canary failed?
                if (application_monitor.error_code != 0):
                    _cut_wrapper_ticket(
                        ticket_description="The 24/7 Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                        ticket_reason="The 24/7 Canary exited with a non-zero exit code",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code!"
                    finished_email_body += " This means something in the Canary application itself failed"
                else:
                    # Exit code zero but the monitor did not auto-restart: wrapper fault
                    _cut_wrapper_ticket(
                        ticket_description="The 24/7 Canary exited with a zero exit code but did not restart!",
                        ticket_reason="The 24/7 Canary exited with a zero exit code but did not restart",
                        ticket_severity=3)
                    wrapper_error_occurred = True
                    finished_email_body += "Failure due to MQTT5 application stopping and not automatically restarting!"
                    finished_email_body += " This shouldn't occur and means something is wrong with the Canary wrapper!"
        # Other
        else:
            print ("ERROR - 24/7 Canary stopped due to unknown reason!")
            _cut_wrapper_ticket(
                ticket_description="The 24/7 Canary stopped for an unknown reason!",
                ticket_reason="The 24/7 Canary stopped for unknown reason",
                ticket_severity=3)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
    except Exception as e:
        # Best-effort diagnosis: never let ticket cutting itself crash the cleanup path
        print ("ERROR: Could not (possibly) cut ticket due to exception!")
        print ("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print ("24/7 Canary finished!")

    # Point the email reader at the uploaded log file in S3.
    # BUGFIX: use canary_s3_bucket_name (which has the default applied) instead of
    # the raw command line argument, which may be an empty string.
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += canary_s3_bucket_name
    finished_email_body += "?region=" + canary_region_stub
    finished_email_body += "&prefix=" + canary_local_git_repo_stub + "/"
    if (wrapper_error_occurred == True):
        finished_email_body += "Failed_Logs/"
    finished_email_body += canary_local_git_hash_stub + ".log"

    # Send the finish email
    if (send_finished_email == True):
        if (wrapper_error_occurred == True):
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Had an error")
        else:
            snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append="Finished")
    # Non-zero exit so a supervising process can tell the run did not end cleanly
    exit (-1)


# Start the application!
application_thread()