| #!/usr/bin/env python |
| # @lint-avoid-python-3-compatibility-imports |
| # |
| # cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU. |
| # For Linux, uses BCC, eBPF. |
| # |
| # This samples the length of the run queues and determine when there are idle |
| # CPUs, yet queued threads waiting their turn. Report the amount of idle |
| # (yet unclaimed by waiting threads) CPU as a system-wide percentage. |
| # |
| # This situation can happen for a number of reasons: |
| # |
| # - An application has been bound to some, but not all, CPUs, and has runnable |
| # threads that cannot migrate to other CPUs due to this configuration. |
| # - CPU affinity: an optimization that leaves threads on CPUs where the CPU |
| # caches are warm, even if this means short periods of waiting while other |
| # CPUs are idle. The wait period is tunale (see sysctl, kernel.sched*). |
| # - Scheduler bugs. |
| # |
| # An unclaimed idle of < 1% is likely to be CPU affinity, and not usually a |
| # cause for concern. By leaving the CPU idle, overall throughput of the system |
| # may be improved. This tool is best for identifying larger issues, > 2%, due |
| # to the coarseness of its 99 Hertz samples. |
| # |
| # This is an experimental tool that currently works by use of sampling to |
| # keep overheads low. Tool assumptions: |
| # |
| # - CPU samples consistently fire around the same offset. There will sometimes |
| # be a lag as a sample is delayed by higher-priority interrupts, but it is |
| # assumed the subsequent samples will catch up to the expected offsets (as |
| # is seen in practice). You can use -J to inspect sample offsets. Some |
| # systems can power down CPUs when idle, and when they wake up again they |
| # may begin firing at a skewed offset: this tool will detect the skew, print |
| # an error, and exit. |
| # - All CPUs are online (see ncpu). |
| # |
| # If this identifies unclaimed CPU, you can double check it by dumping raw |
| # samples (-j), as well as using other tracing tools to instrument scheduler |
| # events (although this latter approach has much higher overhead). |
| # |
| # This tool passes all sampled events to user space for post processing. |
| # I originally wrote this to do the calculations entirerly in kernel context, |
| # and only pass a summary. That involves a number of challenges, and the |
| # overhead savings may not outweigh the caveats. You can see my WIP here: |
| # https://gist.github.com/brendangregg/731cf2ce54bf1f9a19d4ccd397625ad9 |
| # |
| # USAGE: cpuunclaimed [-h] [-j] [-J] [-T] [interval] [count] |
| # |
| # If you see "Lost 1881 samples" warnings, try increasing wakeup_hz. |
| # |
| # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is |
| # a version of this tool that may work on Linux 4.6 - 4.8. |
| # |
| # Copyright 2016 Netflix, Inc. |
| # Licensed under the Apache License, Version 2.0 (the "License") |
| # |
| # 20-Dec-2016 Brendan Gregg Created this. |
| |
| from __future__ import print_function |
| from bcc import BPF, PerfType, PerfSWConfig |
| from time import sleep, strftime |
| import argparse |
| import multiprocessing |
| from os import getpid, system, open, close, dup, unlink, O_WRONLY |
| from tempfile import NamedTemporaryFile |
| |
| # arguments |
| examples = """examples: |
| ./cpuunclaimed # sample and calculate unclaimed idle CPUs, |
| # output every 1 second (default) |
| ./cpuunclaimed 5 10 # print 5 second summaries, 10 times |
| ./cpuunclaimed -T 1 # 1s summaries and timestamps |
| ./cpuunclaimed -j # raw dump of all samples (verbose), CSV |
| """ |
| parser = argparse.ArgumentParser( |
| description="Sample CPU run queues and calculate unclaimed idle CPU", |
| formatter_class=argparse.RawDescriptionHelpFormatter, |
| epilog=examples) |
| parser.add_argument("-j", "--csv", action="store_true", |
| help="print sample summaries (verbose) as comma-separated values") |
| parser.add_argument("-J", "--fullcsv", action="store_true", |
| help="print sample summaries with extra fields: CPU sample offsets") |
| parser.add_argument("-T", "--timestamp", action="store_true", |
| help="include timestamp on output") |
| parser.add_argument("interval", nargs="?", default=-1, |
| help="output interval, in seconds") |
| parser.add_argument("count", nargs="?", default=99999999, |
| help="number of outputs") |
| parser.add_argument("--ebpf", action="store_true", |
| help=argparse.SUPPRESS) |
| args = parser.parse_args() |
| countdown = int(args.count) |
| frequency = 99 |
| dobind = 1 |
| wakeup_hz = 10 # frequency to read buffers |
| wakeup_s = float(1) / wakeup_hz |
| ncpu = multiprocessing.cpu_count() # assume all are online |
| debug = 0 |
| |
| # Linux 4.15 introduced a new field runnable_weight |
| # in linux_src:kernel/sched/sched.h as |
| # struct cfs_rq { |
| # struct load_weight load; |
| # unsigned long runnable_weight; |
| # unsigned int nr_running, h_nr_running; |
| # ...... |
| # } |
| # and this tool requires to access nr_running to get |
| # runqueue len information. |
| # |
| # The commit which introduces cfs_rq->runnable_weight |
| # field also introduces the field sched_entity->runnable_weight |
| # where sched_entity is defined in linux_src:include/linux/sched.h. |
| # |
| # To cope with pre-4.15 and 4.15/post-4.15 releases, |
| # we run a simple BPF program to detect whether |
| # field sched_entity->runnable_weight exists. The existence of |
| # this field should infer the existence of cfs_rq->runnable_weight. |
| # |
| # This will need maintenance as the relationship between these |
| # two fields may change in the future. |
| # |
| def check_runnable_weight_field(): |
| # Define the bpf program for checking purpose |
| bpf_check_text = """ |
| #include <linux/sched.h> |
| unsigned long dummy(struct sched_entity *entity) |
| { |
| return entity->runnable_weight; |
| } |
| """ |
| |
| # Get a temporary file name |
| tmp_file = NamedTemporaryFile(delete=False) |
| tmp_file.close(); |
| |
| # Duplicate and close stderr (fd = 2) |
| old_stderr = dup(2) |
| close(2) |
| |
| # Open a new file, should get fd number 2 |
| # This will avoid printing llvm errors on the screen |
| fd = open(tmp_file.name, O_WRONLY) |
| try: |
| t = BPF(text=bpf_check_text) |
| success_compile = True |
| except: |
| success_compile = False |
| |
| # Release the fd 2, and next dup should restore old stderr |
| close(fd) |
| dup(old_stderr) |
| close(old_stderr) |
| |
| # remove the temporary file and return |
| unlink(tmp_file.name) |
| return success_compile |
| |
| |
| # process arguments |
| if args.fullcsv: |
| args.csv = True |
| if args.csv: |
| interval = 0.2 |
| if args.interval != -1 and (args.fullcsv or args.csv): |
| print("ERROR: cannot use interval with either -j or -J. Exiting.") |
| exit() |
| if args.interval == -1: |
| args.interval = "1" |
| interval = float(args.interval) |
| |
| # define BPF program |
| bpf_text = """ |
| #include <uapi/linux/ptrace.h> |
| #include <uapi/linux/bpf_perf_event.h> |
| #include <linux/sched.h> |
| |
| struct data_t { |
| u64 ts; |
| u64 cpu; |
| u64 len; |
| }; |
| |
| BPF_PERF_OUTPUT(events); |
| |
| // Declare enough of cfs_rq to find nr_running, since we can't #import the |
| // header. This will need maintenance. It is from kernel/sched/sched.h: |
| // The runnable_weight field is removed from Linux 5.7.0 |
| struct cfs_rq_partial { |
| struct load_weight load; |
| #if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) |
| RUNNABLE_WEIGHT_FIELD |
| #endif |
| unsigned int nr_running, h_nr_running; |
| }; |
| |
| int do_perf_event(struct bpf_perf_event_data *ctx) |
| { |
| int cpu = bpf_get_smp_processor_id(); |
| u64 now = bpf_ktime_get_ns(); |
| |
| /* |
| * Fetch the run queue length from task->se.cfs_rq->nr_running. This is an |
| * unstable interface and may need maintenance. Perhaps a future version |
| * of BPF will support task_rq(p) or something similar as a more reliable |
| * interface. |
| */ |
| unsigned int len = 0; |
| struct task_struct *task = NULL; |
| struct cfs_rq_partial *my_q = NULL; |
| task = (struct task_struct *)bpf_get_current_task(); |
| my_q = (struct cfs_rq_partial *)task->se.cfs_rq; |
| len = my_q->nr_running; |
| |
| struct data_t data = {.ts = now, .cpu = cpu, .len = len}; |
| events.perf_submit(ctx, &data, sizeof(data)); |
| |
| return 0; |
| } |
| """ |
| |
| # If target has BTF enabled, use BTF to check runnable_weight field exists in |
| # cfs_rq first, otherwise fallback to use check_runnable_weight_field(). |
| if BPF.kernel_struct_has_field(b'cfs_rq', b'runnable_weight') == 1 \ |
| or check_runnable_weight_field(): |
| bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;') |
| else: |
| bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '') |
| |
| # code substitutions |
| if debug or args.ebpf: |
| print(bpf_text) |
| if args.ebpf: |
| exit() |
| |
| # initialize BPF & perf_events |
| b = BPF(text=bpf_text) |
| # TODO: check for HW counters first and use if more accurate |
| b.attach_perf_event(ev_type=PerfType.SOFTWARE, |
| ev_config=PerfSWConfig.TASK_CLOCK, fn_name="do_perf_event", |
| sample_period=0, sample_freq=frequency) |
| |
| if args.csv: |
| if args.timestamp: |
| print("TIME", end=",") |
| print("TIMESTAMP_ns", end=",") |
| print(",".join("CPU" + str(c) for c in range(ncpu)), end="") |
| if args.fullcsv: |
| print(",", end="") |
| print(",".join("OFFSET_ns_CPU" + str(c) for c in range(ncpu)), end="") |
| print() |
| else: |
| print(("Sampling run queues... Output every %s seconds. " + |
| "Hit Ctrl-C to end.") % args.interval) |
| |
| samples = {} |
| group = {} |
| last = 0 |
| |
| # process event |
| def print_event(cpu, data, size): |
| event = b["events"].event(data) |
| samples[event.ts] = {} |
| samples[event.ts]['cpu'] = event.cpu |
| samples[event.ts]['len'] = event.len |
| |
| exiting = 0 if args.interval else 1 |
| slept = float(0) |
| |
| # Choose the elapsed time from one sample group to the next that identifies a |
| # new sample group (a group being a set of samples from all CPUs). The |
| # earliest timestamp is compared in each group. This trigger is also used |
| # for sanity testing, if a group's samples exceed half this value. |
| trigger = int(0.8 * (1000000000 / frequency)) |
| |
| # read events |
| b["events"].open_perf_buffer(print_event, page_cnt=64) |
| while 1: |
| # allow some buffering by calling sleep(), to reduce the context switch |
| # rate and lower overhead. |
| try: |
| if not exiting: |
| sleep(wakeup_s) |
| except KeyboardInterrupt: |
| exiting = 1 |
| b.perf_buffer_poll() |
| slept += wakeup_s |
| |
| if slept < 0.999 * interval: # floating point workaround |
| continue |
| slept = 0 |
| |
| positive = 0 # number of samples where an idle CPU could have run work |
| running = 0 |
| idle = 0 |
| if debug >= 2: |
| print("DEBUG: begin samples loop, count %d" % len(samples)) |
| for e in sorted(samples): |
| if debug >= 2: |
| print("DEBUG: ts %d cpu %d len %d delta %d trig %d" % (e, |
| samples[e]['cpu'], samples[e]['len'], e - last, |
| e - last > trigger)) |
| |
| # look for time jumps to identify a new sample group |
| if e - last > trigger: |
| |
| # first first group timestamp, and sanity test |
| g_time = 0 |
| g_max = 0 |
| for ge in sorted(group): |
| if g_time == 0: |
| g_time = ge |
| g_max = ge |
| |
| # process previous sample group |
| if args.csv: |
| lens = [0] * ncpu |
| offs = [0] * ncpu |
| for ge in sorted(group): |
| lens[samples[ge]['cpu']] = samples[ge]['len'] |
| if args.fullcsv: |
| offs[samples[ge]['cpu']] = ge - g_time |
| if g_time > 0: # else first sample |
| if args.timestamp: |
| print("%-8s" % strftime("%H:%M:%S"), end=",") |
| print("%d" % g_time, end=",") |
| print(",".join(str(lens[c]) for c in range(ncpu)), end="") |
| if args.fullcsv: |
| print(",", end="") |
| print(",".join(str(offs[c]) for c in range(ncpu))) |
| else: |
| print() |
| else: |
| # calculate stats |
| g_running = 0 |
| g_queued = 0 |
| for ge in group: |
| if samples[ge]['len'] > 0: |
| g_running += 1 |
| if samples[ge]['len'] > 1: |
| g_queued += samples[ge]['len'] - 1 |
| g_idle = ncpu - g_running |
| |
| # calculate the number of threads that could have run as the |
| # minimum of idle and queued |
| if g_idle > 0 and g_queued > 0: |
| if g_queued > g_idle: |
| i = g_idle |
| else: |
| i = g_queued |
| positive += i |
| running += g_running |
| idle += g_idle |
| |
| # now sanity test, after -J output |
| g_range = g_max - g_time |
| if g_range > trigger / 2: |
| # if a sample group exceeds half the interval, we can no |
| # longer draw conclusions about some CPUs idle while others |
| # have queued work. Error and exit. This can happen when |
| # CPUs power down, then start again on different offsets. |
| # TODO: Since this is a sampling tool, an error margin should |
| # be anticipated, so an improvement may be to bump a counter |
| # instead of exiting, and only exit if this counter shows |
| # a skewed sample rate of over, say, 1%. Such an approach |
| # would allow a small rate of outliers (sampling error), |
| # and, we could tighten the trigger to be, say, trigger / 5. |
| # In the case of a power down, if it's detectable, perhaps |
| # the tool could reinitialize the timers (although exiting |
| # is simple and works). |
| print(("ERROR: CPU samples arrived at skewed offsets " + |
| "(CPUs may have powered down when idle), " + |
| "spanning %d ns (expected < %d ns). Debug with -J, " + |
| "and see the man page. As output may begin to be " + |
| "unreliable, exiting.") % (g_range, trigger / 2)) |
| exit() |
| |
| # these are done, remove |
| for ge in sorted(group): |
| del samples[ge] |
| |
| # begin next group |
| group = {} |
| last = e |
| |
| # stash this timestamp in a sample group dict |
| group[e] = 1 |
| |
| if not args.csv: |
| total = running + idle |
| unclaimed = util = 0 |
| |
| if debug: |
| print("DEBUG: hit %d running %d idle %d total %d buffered %d" % ( |
| positive, running, idle, total, len(samples))) |
| |
| if args.timestamp: |
| print("%-8s " % strftime("%H:%M:%S"), end="") |
| |
| # output |
| if total: |
| unclaimed = float(positive) / total |
| util = float(running) / total |
| print("%%CPU %6.2f%%, unclaimed idle %0.2f%%" % (100 * util, |
| 100 * unclaimed)) |
| |
| countdown -= 1 |
| if exiting or countdown == 0: |
| exit() |