| #!/usr/bin/Rscript |
| # |
| # Write an overview of task status, per-metric task status, task histograms. |
| |
| library(data.table) |
| library(ggplot2) |
| |
# Keep strings as character vectors instead of factors.  WriteAssocPairStatus
# depends on this when it converts a data.table row to a character vector.
options(stringsAsFactors = FALSE)
| |
# Print a sprintf()-style message to stdout, terminated by a newline.
#
# Args:
#   fmt: sprintf() format string.
#   ...: values substituted into fmt.
Log <- function(fmt, ...) {
  text <- sprintf(fmt, ...)
  cat(text)
  cat("\n")
}
| |
# Largest non-NA element of `values`; NA when there are no non-NA elements.
# The result is always coerced to double.
MaybeMax <- function(values) {
  present <- values[!is.na(values)]
  result <- if (length(present) > 0) max(present) else NA
  # data.table requires the numeric coercion; otherwise we get type errors.
  as.numeric(result)
}
| |
# Arithmetic mean of the non-NA elements of `values`; NA when there are no
# non-NA elements.  The result is always coerced to double.
MaybeMean <- function(values) {
  present <- values[!is.na(values)]
  result <- if (length(present) > 0) mean(present) else NA
  # data.table requires the numeric coercion; otherwise we get type errors.
  as.numeric(result)
}
| |
# Aggregate the 'dist' task summary to one row per metric and write the result
# to <output_dir>/overview.csv.
#
# Args:
#   summary: table of task results (main() passes the data frame returned by
#     read.csv).  Columns read here: metric, params_file, map_file, date,
#     num_reports, status, seconds, allocated_mass.
#   output_dir: directory that receives overview.csv.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteDistOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # data.table syntax is easier here

  overview <- tasks[, list(
    params_file = unique(params_file),
    map_file = unique(map_file),
    days = length(date),  # number of rows in this metric's group
    max_num_reports = MaybeMax(num_reports),

    # Count of rows per status value.
    ok = sum(status == "OK"),
    fail = sum(status == "FAIL"),
    timeout = sum(status == "TIMEOUT"),
    skipped = sum(status == "SKIPPED"),

    # TODO: Need to document the meaning of these metrics.
    # All could be NA
    # KiB -> MB
    #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
    #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),

    mean_secs = MaybeMean(seconds),
    mean_allocated_mass = MaybeMean(allocated_mass)

    # unique failure reasons
    # This can be used when there are different call stacks.
    #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
  ), by = "metric"]

  # Case-insensitive sort by metric name.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, "overview.csv")
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
| |
# For every metric in the 'dist' summary, write three CSV files under
# <output_dir>/<metric>/: status.csv, num_reports.csv and mass.csv.
#
# Args:
#   summary: table of task results.  Columns read here: metric, job_id, date,
#     status, num_reports, seconds, allocated_mass, num_rappor.
#   output_dir: root directory; files go into one subdirectory per metric.
WriteDistMetricStatus <- function(summary, output_dir) {
  tasks <- data.table(summary)
  metric_names <- unique(tasks$metric)

  # Pass 1: one status.csv per metric.
  for (cur_metric in metric_names) {
    # Select cols.  Don't need params / map / metric.
    status_rows <- tasks[tasks$metric == cur_metric,
                         list(job_id, date, status,
                              #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
                              #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
                              num_reports,
                              seconds,
                              allocated_mass, num_rappor)]

    # Newest first.  Alphabetical sort works fine for YYYY-MM-DD.
    status_rows <- status_rows[order(status_rows$date, decreasing = TRUE), ]

    status_path <- file.path(output_dir, cur_metric, "status.csv")
    write.csv(status_rows, file = status_path, row.names = FALSE)
    Log("Wrote %s", status_path)
  }

  # Pass 2: files that exist just for plotting with dygraphs.  TODO: can
  # dygraphs do something smarter?  Maybe you need to select the column in
  # JavaScript, and pass it an array, rather than CSV text.
  for (cur_metric in metric_names) {
    reports_by_date <- tasks[tasks$metric == cur_metric,
                             list(date, num_reports)]
    reports_path <- file.path(output_dir, cur_metric, "num_reports.csv")
    # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
    # "2015-04-03".  In general, we can't turn off quotes, because strings
    # with double quotes will be invalid CSV files.  But in this case, we only
    # have date and number columns, so we can.  dygraphs is mistaken here.
    write.csv(reports_by_date, file = reports_path, row.names = FALSE,
              quote = FALSE)
    Log("Wrote %s", reports_path)

    # Write unallocated mass.  TODO: Write the other 2 vars too?
    mass_by_date <- tasks[tasks$metric == cur_metric,
                          list(date,
                               unallocated_mass = 1.0 - allocated_mass)]

    mass_path <- file.path(output_dir, cur_metric, "mass.csv")
    write.csv(mass_by_date, file = mass_path, row.names = FALSE,
              quote = FALSE)
    Log("Wrote %s", mass_path)
  }
}
| |
# Render plot `p` to a PNG file and log the path that was written.
#
# Args:
#   p: plot object, drawn with plot().
#   outdir: directory that receives the image.
#   filename: file name inside outdir.
#   width, height: image size passed to png().
WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
  filename <- file.path(outdir, filename)
  png(filename, width = width, height = height)
  # Close the device even when rendering raises, so a failed plot does not
  # leave a dangling graphics device open.
  tryCatch(plot(p), finally = dev.off())
  Log('Wrote %s', filename)
}
| |
# Guard for histogram input: when every element of `v` is NA, log a fatal
# message naming the caller's argument expression and exit the R process with
# status 1.  Without this check ggplot blows up with an unintuitive error
# message.
CheckHistogramInput <- function(v) {
  if (!all(is.na(v))) {
    return(invisible(NULL))
  }
  arg_name <- deparse(substitute(v))  # R idiom to get the argument's text
  Log("FATAL: All values in %s are NA (no successful runs?)", arg_name)
  quit(status = 1)
}
| |
# Write one histogram PNG per task-level quantity into output_dir:
# allocated_mass.png, num_rappor.png, num_reports.png, seconds.png and, when
# any vm5_peak_kib value is present, memory.png.
#
# Args:
#   s: table of task results.  Columns read here: allocated_mass, num_rappor,
#     num_reports, seconds, vm5_peak_kib.
#   output_dir: directory that receives the PNG files.
#
# The process exits (via CheckHistogramInput) if any of the first four
# columns is entirely NA.
WriteDistHistograms <- function(s, output_dir) {
  # Build a histogram of `values` with the given title and x label, and write
  # it to output_dir/filename.  Uses ggplot() + geom_histogram() rather than
  # qplot(), which is deprecated in current ggplot2.
  write_histogram <- function(values, title, x_label, filename) {
    p <- ggplot(data.frame(value = values), aes(x = value)) +
      geom_histogram() +
      ggtitle(title) +
      xlab(x_label) +
      ylab("number of tasks")
    WritePlot(p, output_dir, filename)
  }

  # Each check is called with the literal column expression because
  # CheckHistogramInput reports that expression in its error message.
  CheckHistogramInput(s$allocated_mass)
  write_histogram(s$allocated_mass, "Allocated Mass by Task",
                  "allocated mass", 'allocated_mass.png')

  CheckHistogramInput(s$num_rappor)
  write_histogram(s$num_rappor, "Detected Strings by Task",
                  "detected strings", 'num_rappor.png')

  CheckHistogramInput(s$num_reports)
  write_histogram(s$num_reports / 1e6, "Raw Reports by Task",
                  "millions of reports", 'num_reports.png')

  CheckHistogramInput(s$seconds)
  write_histogram(s$seconds, "Analysis Duration by Task",
                  "seconds", 'seconds.png')

  # NOTE: Skipping this for 'series' jobs.
  if (sum(!is.na(s$vm5_peak_kib)) > 0) {
    # KiB -> megabytes (1e6 bytes)
    write_histogram(s$vm5_peak_kib * 1024 / 1e6, "Peak Memory Usage by Task",
                    "Peak megabytes (1e6 bytes) of memory", 'memory.png')
  }
}
| |
# Produce every 'dist' output in order: per-metric CSVs, histograms, then the
# aggregated overview.  Returns whatever WriteDistOverview returns.
#
# Args:
#   s: table of task results.
#   output_dir: root directory for all outputs.
ProcessAllDist <- function(s, output_dir) {
  Log("dist: Writing per-metric status.csv")
  WriteDistMetricStatus(summary = s, output_dir = output_dir)

  Log("dist: Writing histograms")
  WriteDistHistograms(s = s, output_dir = output_dir)

  Log("dist: Writing aggregated overview.csv")
  WriteDistOverview(summary = s, output_dir = output_dir)
}
| |
# Write the single CSV file loaded by assoc-overview.html:
# <output_dir>/assoc-overview.csv, with one aggregated row per metric.
#
# Args:
#   summary: table of association task results.  Columns read here: metric,
#     date, num_reports, status, total_elapsed_seconds, em_elapsed_seconds.
#   output_dir: directory that receives assoc-overview.csv.
#
# Returns:
#   The aggregated data.table, sorted case-insensitively by metric name.
WriteAssocOverview <- function(summary, output_dir) {
  tasks <- data.table(summary)  # data.table syntax is easier here

  overview <- tasks[, list(
    #params_file = unique(params_file),
    #map_file = unique(map_file),

    days = length(date),  # number of rows in this metric's group
    max_num_reports = MaybeMax(num_reports),

    # Count of rows per status value.
    ok = sum(status == "OK"),
    fail = sum(status == "FAIL"),
    timeout = sum(status == "TIMEOUT"),
    skipped = sum(status == "SKIPPED"),

    mean_total_secs = MaybeMean(total_elapsed_seconds),
    mean_em_secs = MaybeMean(em_elapsed_seconds)

  ), by = "metric"]

  # Case-insensitive sort by metric name.
  overview <- overview[order(tolower(overview$metric)), ]

  csv_path <- file.path(output_dir, "assoc-overview.csv")
  write.csv(overview, file = csv_path, row.names = FALSE)
  Log("Wrote %s", csv_path)

  overview
}
| |
# Write the CSV files loaded by assoc-metric.html -- that is, one
# metric-status.csv for each metric name, at
# <output_dir>/<metric>/metric-status.csv.  Each file has one aggregated row
# per (var1, var2) pair of that metric.
#
# Args:
#   summary: table of association task results.  Columns read here: metric,
#     var1, var2, date, num_reports, status, total_elapsed_seconds,
#     em_elapsed_seconds.
#   output_dir: root directory; files go into one subdirectory per metric.
WriteAssocMetricStatus <- function(summary, output_dir) {
  s <- data.table(summary)
  csv_list <- unique(s[, list(metric)])
  # seq_len() rather than 1:nrow(): with an empty summary, 1:0 would run the
  # body for i = 1 and i = 0 on rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]
    # Aggregate this metric's tasks by variable pair.
    by_pair <- s[s$metric == u$metric,
                 list(days = length(date),
                      max_num_reports = MaybeMax(num_reports),

                      # summarize status
                      ok = sum(status == 'OK'),
                      fail = sum(status == 'FAIL'),
                      timeout = sum(status == 'TIMEOUT'),
                      skipped = sum(status == 'SKIPPED'),

                      mean_total_secs = MaybeMean(total_elapsed_seconds),
                      mean_em_secs = MaybeMean(em_elapsed_seconds)
                      ),
                 by=list(var1, var2)]

    # Case insensitive sort by var1 name
    by_pair <- by_pair[order(tolower(by_pair$var1)), ]

    csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
    write.csv(by_pair, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)
  }
}
| |
# Relative directory for one (metric, var1, var2) association task:
# "<metric>/<var1>_X_<var2>", with every literal ".." in var2 replaced by "_".
# This naming convention is in task_spec.py AssocTaskSpec.
FormatAssocRelPath <- function(metric, var1, var2) {
  # fixed = TRUE: ".." is matched literally, not as a regex.
  safe_var2 <- gsub("..", "_", var2, fixed = TRUE)
  file.path(metric, sprintf("%s_X_%s", var1, safe_var2))
}
| |
# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
# for each (metric, var1, var2) pair -- plus a pair-metadata.txt next to it
# holding the raw metric / var1 / var2 names, one per line.
#
# Args:
#   summary: table of association task results.  Columns read here: metric,
#     var1, var2, job_id, date, status, num_reports, d1, d2,
#     total_elapsed_seconds, em_elapsed_seconds.
#   output_dir: root directory; files go under the relative path produced by
#     FormatAssocRelPath.
WriteAssocPairStatus <- function(summary, output_dir) {

  s <- data.table(summary)

  csv_list <- unique(s[, list(metric, var1, var2)])
  Log('CSV list:')
  print(csv_list)

  # Loop over unique (metric, var1, var2) combinations and write the files for
  # each one.  seq_len() rather than 1:nrow(): with an empty summary, 1:0
  # would run the body for i = 1 and i = 0 on rows that do not exist.
  for (i in seq_len(nrow(csv_list))) {
    u <- csv_list[i, ]

    # Select cols.  Don't need metric / var1 / var2.
    subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
                  list(job_id, date, status,
                       num_reports, d1, d2,
                       total_elapsed_seconds,
                       em_elapsed_seconds)]

    # Sort by descending date.  Alphabetical sort works fine for YYYY-MM-DD.
    subframe <- subframe[order(subframe$date, decreasing = TRUE), ]

    pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)

    csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
    write.csv(subframe, file = csv_path, row.names = FALSE)
    Log("Wrote %s", csv_path)

    # Write a file with the raw variable names.  Parsed by ui.sh, to pass to
    # csv_to_html.py.
    meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')

    # NOTE: The conversion from data.table to character vector requires
    # stringsAsFactors to work correctly!
    lines <- as.character(u)
    writeLines(lines, con = meta_path)
    Log("Wrote %s", meta_path)
  }
}
| |
# Produce every 'assoc' output in order: per-pair files, per-metric files,
# then the aggregated overview.  Returns whatever WriteAssocOverview returns.
#
# Args:
#   s: table of association task results.
#   output_dir: root directory for all outputs.
ProcessAllAssoc <- function(s, output_dir) {
  Log("assoc: Writing pair-status.csv for each variable pair in each metric")
  WriteAssocPairStatus(summary = s, output_dir = output_dir)

  Log("assoc: Writing metric-status.csv for each metric")
  WriteAssocMetricStatus(summary = s, output_dir = output_dir)

  Log("assoc: Writing aggregated overview.csv")
  WriteAssocOverview(summary = s, output_dir = output_dir)
}
| |
# Script entry point.
#
# Args:
#   argv: character vector of at least three elements:
#     argv[[1]]  action, either 'dist' or 'assoc'
#     argv[[2]]  path of the input summary CSV
#     argv[[3]]  output directory
#
# Stops with an error when arguments are missing or the action is unknown.
main <- function(argv) {
  if (length(argv) < 3) {
    stop('Usage: ACTION INPUT_CSV OUTPUT_DIR (ACTION is dist or assoc)')
  }

  action <- argv[[1]]
  input <- argv[[2]]
  output_dir <- argv[[3]]

  # Resolve the action before doing any work, so a bad action fails fast
  # without reading the input.
  process <- switch(action,
    dist = ProcessAllDist,
    assoc = ProcessAllAssoc,
    stop(sprintf('Invalid action %s', action))
  )

  # increase ggplot font size globally
  theme_set(theme_grey(base_size = 16))

  summary <- read.csv(input)
  process(summary, output_dir)

  Log('Done')
}
| |
# Invoke main() with the trailing command-line arguments, but only when there
# are no active function frames (length(sys.frames()) is zero).
if (identical(length(sys.frames()), 0L)) {
  main(commandArgs(trailingOnly = TRUE))
}