| # Python wrapper script for collecting Canary metrics, setting-up/tearing-down alarms, reporting metrics to Cloudwatch, |
| # checking the alarms to ensure everything is correct at the end of the run, and pushing the log to S3 if successful. |
| |
| # Needs to be installed prior to running |
| # Part of standard packages in Python 3.4+ |
| import argparse |
| import time |
| import datetime |
| # Dependencies in project folder |
| from CanaryWrapper_Classes import * |
| from CanaryWrapper_MetricFunctions import * |
| |
# Code for command line argument parsing
# ================================================================================


def _parse_bool_argument(value):
    """Convert the text of a command line argument into a real boolean.

    argparse's ``type=bool`` just calls ``bool()`` on the raw string, so every
    non-empty value (including "False" and "0") used to be parsed as True.

    Args:
        value: The raw argument text supplied on the command line.

    Returns:
        True or False. An empty string maps to False, which matches what
        ``bool("")`` returned before this parser existed.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognized boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "on", "1"):
        return True
    if normalized in ("false", "f", "no", "n", "off", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got '" + value + "'")


command_parser = argparse.ArgumentParser("CanaryWrapper")
command_parser.add_argument("--canary_executable", type=str, required=True,
    help="The path to the canary executable (or program - like 'python3')")
command_parser.add_argument("--canary_arguments", type=str, default="",
    help="The arguments to pass/launch the canary executable with")
command_parser.add_argument("--git_hash", type=str, required=True,
    help="The Git commit hash that we are running the canary with")
command_parser.add_argument("--git_repo_name", type=str, required=True,
    help="The name of the Git repository")
command_parser.add_argument("--git_hash_as_namespace", type=_parse_bool_argument, default=False,
    help="(OPTIONAL, default=False) If true, the git hash will be used as the name of the Cloudwatch namespace")
command_parser.add_argument("--output_log_filepath", type=str, default="output.log",
    help="(OPTIONAL, default=output.log) The file to output log info to. Set to 'None' to disable")
command_parser.add_argument("--output_to_console", type=_parse_bool_argument, default=True,
    help="(OPTIONAL, default=True) If true, info will be output to the console")
command_parser.add_argument("--cloudwatch_region", type=str, default="us-east-1",
    help="(OPTIONAL, default=us-east-1) The AWS region for Cloudwatch")
command_parser.add_argument("--s3_bucket_name", type=str, default="canary-wrapper-folder",
    help="(OPTIONAL, default=canary-wrapper-folder) The name of the S3 bucket where success logs will be stored")
command_parser.add_argument("--snapshot_wait_time", type=int, default=600,
    help="(OPTIONAL, default=600) The number of seconds between gathering and sending snapshot reports")
command_parser.add_argument("--ticket_category", type=str, default="AWS",
    help="(OPTIONAL, default=AWS) The category to register the ticket under")
command_parser.add_argument("--ticket_type", type=str, default="SDKs and Tools",
    help="(OPTIONAL, default='SDKs and Tools') The type to register the ticket under")
command_parser.add_argument("--ticket_item", type=str, default="IoT SDK for CPP",
    help="(OPTIONAL, default='IoT SDK for CPP') The item to register the ticket under")
command_parser.add_argument("--ticket_group", type=str, default="AWS IoT Device SDK",
    help="(OPTIONAL, default='AWS IoT Device SDK') The group to register the ticket under")
command_parser.add_argument("--dependencies", type=str, default="",
    help="(OPTIONAL, default='') Any dependencies and their commit hashes. "
         "Current expected format is '(name or path);(hash);(next name or path);(hash);(etc...)'.")
command_parser.add_argument("--lambda_name", type=str, default="iot-send-email-lambda",
    help="(OPTIONAL, default='iot-send-email-lambda') The name of the Lambda used to send emails")
command_parser.add_argument("--codebuild_log_path", type=str, default="",
    help="The CODEBUILD_LOG_PATH environment variable. Leave blank to ignore")
command_parser_arguments = command_parser.parse_args()

# Both the literal text 'None' and an empty value disable the log file
if command_parser_arguments.output_log_filepath in ("None", ""):
    command_parser_arguments.output_log_filepath = None
# A non-positive wait time is replaced with 60 seconds
if command_parser_arguments.snapshot_wait_time <= 0:
    command_parser_arguments.snapshot_wait_time = 60

# Deal with possibly empty values in semi-critical commands/arguments.
# argparse only guarantees the required options were supplied, not that they are non-empty.
for _required_name in ("canary_executable", "git_hash", "git_repo_name"):
    if getattr(command_parser_arguments, _required_name) == "":
        print("ERROR - required " + _required_name + " is empty!", flush=True)
        raise SystemExit(1)  # cannot run without this argument

# Optional text arguments fall back to their documented default when passed as an empty string.
# (The two boolean arguments need no such fallback: their parser always yields a real bool.)
_EMPTY_ARGUMENT_FALLBACKS = (
    ("cloudwatch_region", "us-east-1"),
    ("s3_bucket_name", "canary-wrapper-folder"),
    ("ticket_category", "AWS"),
    ("ticket_type", "SDKs and Tools"),
    ("ticket_item", "IoT SDK for CPP"),
    ("ticket_group", "AWS IoT Device SDK"),
)
for _argument_name, _fallback_value in _EMPTY_ARGUMENT_FALLBACKS:
    if getattr(command_parser_arguments, _argument_name) == "":
        setattr(command_parser_arguments, _argument_name, _fallback_value)

# ================================================================================
| |
# Timestamp for this run; also used later when building the S3 log link
datetime_now = datetime.datetime.now()
datetime_string = datetime_now.strftime("%d-%m-%Y/%H-%M-%S")
print("Datetime string is: " + datetime_string, flush=True)


def _stop_if_setup_failed(setup_failed):
    """Abort the wrapper when one of the setup steps reported a failure.

    Prints the credentials hint and terminates the process with exit status 0.

    Args:
        setup_failed: True when the component that was just created flagged an error.
    """
    if not setup_failed:
        return
    print("INFO - Stopping application due to error caused by credentials")
    print("Please fix your credentials and then restart this application again", flush=True)
    exit(0)


# Make the snapshot class
# NOTE(review): the log file path and Cloudwatch region are fixed literals here even though
# --output_log_filepath and --cloudwatch_region are parsed - confirm whether that is intentional.
_data_snapshot_settings = {
    "git_hash": command_parser_arguments.git_hash,
    "git_repo_name": command_parser_arguments.git_repo_name,
    "datetime_string": datetime_string,
    "git_hash_as_namespace": command_parser_arguments.git_hash_as_namespace,
    "git_fixed_namespace_text": "mqtt5_canary",
    "output_log_filepath": "output.txt",
    "output_to_console": command_parser_arguments.output_to_console,
    "cloudwatch_region": "us-east-1",
    "cloudwatch_make_dashboard": False,
    "cloudwatch_teardown_alarms_on_complete": True,
    "cloudwatch_teardown_dashboard_on_complete": True,
    "s3_bucket_name": command_parser_arguments.s3_bucket_name,
    "s3_bucket_upload_on_complete": True,
    "lambda_name": command_parser_arguments.lambda_name,
    "metric_frequency": command_parser_arguments.snapshot_wait_time,
}
data_snapshot = DataSnapshot(**_data_snapshot_settings)

# Make sure nothing failed
_stop_if_setup_failed(data_snapshot.abort_due_to_internal_error == True)

# Register metrics (in this order): CPU percent, memory bytes, memory percent.
# The two percent metrics carry an alarm threshold; the raw byte count does not.
_canary_metric_settings = [
    {
        "new_metric_name": "total_cpu_usage",
        "new_metric_function": get_metric_total_cpu_usage,
        "new_metric_unit": "Percent",
        "new_metric_alarm_threshold": 70,
        "new_metric_reports_to_skip": 1,
        "new_metric_alarm_severity": 5,
        "is_percent": True,
    },
    {
        "new_metric_name": "total_memory_usage_value",
        "new_metric_function": get_metric_total_memory_usage_value,
        "new_metric_unit": "Bytes",
    },
    {
        "new_metric_name": "total_memory_usage_percent",
        "new_metric_function": get_metric_total_memory_usage_percent,
        "new_metric_unit": "Percent",
        "new_metric_alarm_threshold": 70,
        "new_metric_reports_to_skip": 0,
        "new_metric_alarm_severity": 5,
        "is_percent": True,
    },
]
for _metric_settings in _canary_metric_settings:
    data_snapshot.register_metric(**_metric_settings)

# Print diagnosis information
data_snapshot.output_diagnosis_information(command_parser_arguments.dependencies)

# Make the snapshot (metrics) monitor
snapshot_monitor = SnapshotMonitor(
    wrapper_data_snapshot=data_snapshot,
    wrapper_metrics_wait_time=command_parser_arguments.snapshot_wait_time)

# Make sure nothing failed
_stop_if_setup_failed(snapshot_monitor.had_internal_error == True)

# Make the application monitor
application_monitor = ApplicationMonitor(
    wrapper_application_path=command_parser_arguments.canary_executable,
    wrapper_application_arguments=command_parser_arguments.canary_arguments,
    wrapper_application_restart_on_finish=False,
    data_snapshot=data_snapshot,  # pass the data_snapshot for printing to the log
)

# Make sure nothing failed
_stop_if_setup_failed(application_monitor.error_has_occurred == True)

# For tracking if we stopped due to a metric alarm
stopped_due_to_metric_alarm = False
| |
# Seconds to sleep between monitor passes; also reported to the monitors as the time passed
execution_sleep_time = 30


def execution_loop():
    """Drive both monitors until the run has to stop.

    Each pass calls the snapshot monitor and then the application monitor.
    The function returns as soon as a ticket has been cut (which is recorded
    as a snapshot-monitor internal error), the application monitor reports an
    error, or the snapshot monitor reports an internal error. Otherwise it
    sleeps for ``execution_sleep_time`` seconds and runs another pass.
    """
    while True:
        snapshot_monitor.monitor_loop_function(
            time_passed=execution_sleep_time,
            psutil_process=application_monitor.application_process_psutil)
        application_monitor.monitor_loop_function(time_passed=execution_sleep_time)

        # A metric went into alarm: flag it as an 'internal error' so the
        # caller goes down the error code path, then stop looping
        if snapshot_monitor.has_cut_ticket == True:
            snapshot_monitor.had_internal_error = True
            return

        # Either monitor asking to stop ends the loop
        if application_monitor.error_has_occurred == True:
            return
        if snapshot_monitor.had_internal_error == True:
            return

        time.sleep(execution_sleep_time)
| |
| |
def _cut_canary_ticket(ticket_description, ticket_reason):
    """Cut a severity 4 ticket for this canary run via Cloudwatch.

    The repository, commit hash and ticket routing fields are taken from the
    parsed command line arguments; duplicates are allowed.

    Args:
        ticket_description: Long description placed in the ticket.
        ticket_reason: Short reason placed in the ticket.
    """
    cut_ticket_using_cloudwatch(
        git_repo_name=command_parser_arguments.git_repo_name,
        git_hash=command_parser_arguments.git_hash,
        git_hash_as_namespace=command_parser_arguments.git_hash_as_namespace,
        git_fixed_namespace_text="mqtt5_canary",
        cloudwatch_region="us-east-1",
        ticket_description=ticket_description,
        ticket_reason=ticket_reason,
        ticket_allow_duplicates=True,
        ticket_category=command_parser_arguments.ticket_category,
        ticket_item=command_parser_arguments.ticket_item,
        ticket_group=command_parser_arguments.ticket_group,
        ticket_type=command_parser_arguments.ticket_type,
        ticket_severity=4)


def application_thread():
    """Run the canary from start to finish and exit with its exit code.

    Sends the "Started" email, starts both monitors, runs the execution loop,
    stops the monitors, works out why the run stopped (cutting a ticket where
    needed), cleans up, sends the finish email and finally terminates the
    process with ``application_monitor.error_code``.

    Raises:
        SystemExit: Always, carrying the application monitor's error code.
    """
    start_email_body = "MQTT5 Short Running Canary Wrapper has started for "
    start_email_body += "\"" + command_parser_arguments.git_repo_name + "\" commit \"" + command_parser_arguments.git_hash + "\""
    start_email_body += "\nThe wrapper will run for the length the MQTT5 Canary application is set to run for, which is determined by "
    start_email_body += "the arguments set. The arguments used for this run are listed below:"
    start_email_body += "\n  Arguments: " + command_parser_arguments.canary_arguments
    snapshot_monitor.send_email(email_body=start_email_body, email_subject_text_append="Started")

    # Start the application going
    snapshot_monitor.start_monitoring()
    application_monitor.start_monitoring()
    # Allow the snapshot monitor to cut tickets
    snapshot_monitor.can_cut_ticket = True

    # Start the execution loop
    execution_loop()

    # Make sure everything is stopped
    snapshot_monitor.stop_monitoring()
    application_monitor.stop_monitoring()

    # Track whether this counts as an error (and therefore we should cleanup accordingly) or not
    wrapper_error_occurred = False
    # Finished Email
    send_finished_email = True
    finished_email_body = "MQTT5 Short Running Canary Wrapper has stopped."
    finished_email_body += "\n\n"

    # In every failure branch the error flag and the email text are recorded BEFORE the ticket
    # is cut. If cutting the ticket raises, the exception is swallowed below, and the run must
    # still be cleaned up and reported as a failure rather than as a success.
    try:
        # Find out why we stopped
        if snapshot_monitor.had_internal_error:
            wrapper_error_occurred = True
            if snapshot_monitor.has_cut_ticket:
                # We do not need to cut a ticket here - it's cut by the snapshot monitor!
                print("ERROR - Snapshot monitor stopped due to metric in alarm!", flush=True)
                finished_email_body += "Failure due to required metrics being in alarm! A new ticket should have been cut!"
                finished_email_body += "\nMetrics in Alarm: " + str(snapshot_monitor.cloudwatch_current_alarms_triggered)
            else:
                print("ERROR - Snapshot monitor stopped due to internal error!", flush=True)
                # str() guards against a reason that is not a string (e.g. never set)
                error_reason = str(snapshot_monitor.internal_error_reason)
                finished_email_body += "Failure due to Snapshot monitor stopping due to an internal error."
                finished_email_body += " Reason given for error: " + error_reason
                _cut_canary_ticket(
                    ticket_description="Snapshot monitor stopped due to internal error! Reason info: " + error_reason,
                    ticket_reason="Snapshot monitor stopped due to internal error")

        elif application_monitor.error_has_occurred:
            if application_monitor.error_due_to_credentials:
                print("INFO - Stopping application due to error caused by credentials")
                print("Please fix your credentials and then restart this application again", flush=True)
                wrapper_error_occurred = True
                send_finished_email = False
            elif application_monitor.error_code != 0:
                # The canary itself failed
                wrapper_error_occurred = True
                finished_email_body += "Failure due to MQTT5 application exiting with a non-zero exit code! This means something in the Canary application itself failed"
                _cut_canary_ticket(
                    ticket_description="The Short Running Canary exited with a non-zero exit code! This likely means something in the canary failed.",
                    ticket_reason="The Short Running Canary exited with a non-zero exit code")
            else:
                print("INFO - Stopping application. No error has occurred, application has stopped normally", flush=True)
                application_monitor.print_stdout()
                finished_email_body += "Short Running Canary finished successfully and run without errors!"
                wrapper_error_occurred = False
        else:
            print("ERROR - Short Running Canary stopped due to unknown reason!", flush=True)
            wrapper_error_occurred = True
            finished_email_body += "Failure due to unknown reason! This shouldn't happen and means something has gone wrong!"
            _cut_canary_ticket(
                ticket_description="The Short Running Canary stopped for an unknown reason!",
                ticket_reason="The Short Running Canary stopped for unknown reason")
    except Exception as e:
        # Best effort: a failure to cut the ticket must not prevent cleanup and the finish email
        print("ERROR: Could not (possibly) cut ticket due to exception!")
        print("Exception: " + str(e), flush=True)

    # Clean everything up and stop
    snapshot_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    application_monitor.cleanup_monitor(error_occurred=wrapper_error_occurred)
    print("Short Running Canary finished!", flush=True)

    # Link to the uploaded log; failed runs are linked under the "Failed_Logs/" prefix
    finished_email_body += "\n\nYou can find the log file for this run at the following S3 location: "
    finished_email_body += "https://s3.console.aws.amazon.com/s3/object/"
    finished_email_body += command_parser_arguments.s3_bucket_name
    finished_email_body += "?region=" + command_parser_arguments.cloudwatch_region
    finished_email_body += "&prefix=" + command_parser_arguments.git_repo_name + "/" + datetime_string + "/"
    if wrapper_error_occurred:
        finished_email_body += "Failed_Logs/"
    finished_email_body += command_parser_arguments.git_hash + ".log"
    if command_parser_arguments.codebuild_log_path != "":
        print("\n Codebuild log path: " + command_parser_arguments.codebuild_log_path + "\n")

    # Send the finish email
    if send_finished_email:
        if wrapper_error_occurred:
            subject_suffix = "Had an error"
        else:
            subject_suffix = "Finished"
        snapshot_monitor.send_email(email_body=finished_email_body, email_subject_text_append=subject_suffix)

    # NOTE(review): the exit status is the canary's own exit code, so a run stopped by a metric
    # alarm or a snapshot-monitor error may still exit with 0 - confirm this is intended.
    raise SystemExit(application_monitor.error_code)


# Start the application!
application_thread()