| #!/usr/bin/env python |
| # |
| # kvmexit.py |
| # |
| # Display the exit_reason and its statistics of each vm exit |
| # for all vcpus of all virtual machines. For example: |
| # $./kvmexit.py |
| # PID TID KVM_EXIT_REASON COUNT |
| # 1273551 1273568 EXIT_REASON_MSR_WRITE 6 |
| # 1274253 1274261 EXIT_REASON_EXTERNAL_INTERRUPT 1 |
| # 1274253 1274261 EXIT_REASON_HLT 12 |
| # ... |
| # |
| # Besides, we also allow users to specify one pid, tid(s), or one |
| # pid and its vcpu. See kvmexit_example.txt for more examples. |
| # |
| # @PID: each virtual machine's pid in the user space. |
| # @TID: the user space's thread of each vcpu of that virtual machine. |
| # @KVM_EXIT_REASON: the reason why the vm exits. |
| # @COUNT: the counts of the @KVM_EXIT_REASONS. |
| # |
| # REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support) |
| # |
| # Copyright (c) 2021 ByteDance Inc. All rights reserved. |
| # |
| # Author(s): |
| # Fei Li <[email protected]> |
| |
| |
| from __future__ import print_function |
| from time import sleep |
| from bcc import BPF |
| import argparse |
| import multiprocessing |
| import os |
| import subprocess |
| |
| # |
| # Process Arguments |
| # |
def valid_args_list(args):
    """Parse a comma separated list of integers given on the command line.

    Intended as an argparse ``type=`` callback (it is used for ``-T/--tids``).

    Args:
        args: the raw option value, e.g. ``"395490,395491"``.

    Returns:
        A list of decimal strings, one per item, in input order.  Every item
        is normalized through ``int()`` so it is always a plain decimal
        literal; the values are later joined into the C filter expression
        of the BPF program, where forms such as ``"+8"`` or ``" 7"`` that
        ``int()`` tolerates would not be welcome.

    Raises:
        argparse.ArgumentTypeError: if any item is not a valid integer
            (this includes an empty item, e.g. ``"1,,2"`` or ``""``).
    """
    args_list = []
    for arg in args.split(","):
        try:
            args_list.append(str(int(arg)))
        except ValueError:
            # Only a failed conversion means "bad input"; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed here.
            raise argparse.ArgumentTypeError("must be valid integer")
    return args_list
| |
# arguments
examples = """examples:
./kvmexit # Display kvm_exit_reason and its statistics in real-time until Ctrl-C
./kvmexit 5 # Display in real-time after sleeping 5s
./kvmexit -p 3195281 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order
./kvmexit -p 3195281 20 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order, and display after sleeping 20s
./kvmexit -p 3195281 -v 0 # Display only vcpu0 for pid 3195281, descending sort by default
./kvmexit -p 3195281 -a # Display all tids for pid 3195281
./kvmexit -t 395490 # Display only for tid 395490 with exit reasons sorted in descending order
./kvmexit -t 395490 20 # Display only for tid 395490 with exit reasons sorted in descending order after sleeping 20s
./kvmexit -T '395490,395491' # Display for a union like {395490, 395491}
"""
parser = argparse.ArgumentParser(
    description="Display kvm_exit_reason and its statistics at a timed interval",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("duration", nargs="?", default=99999999, type=int,
                    help="show delta for next several seconds")
parser.add_argument("-p", "--pid", type=int, help="trace this PID only")
# -t, -T, -v and -a select *which threads* to show and exclude each other.
exgroup = parser.add_mutually_exclusive_group()
exgroup.add_argument("-t", "--tid", type=int, help="trace this TID only")
exgroup.add_argument("-T", "--tids", type=valid_args_list,
                     help="trace a comma separated series of tids with no space in between")
exgroup.add_argument("-v", "--vcpu", type=int, help="trace this vcpu only")
exgroup.add_argument("-a", "--alltids", action="store_true",
                     help="trace all tids for this pid")
args = parser.parse_args()

# --vcpu and --alltids are only consulted by the filter setup when --pid is
# given.  Without it they used to be silently ignored (and --alltids alone
# produced rows that did not match the printed header), so reject those
# combinations up front with a proper usage error.
if args.pid is None:
    if args.vcpu is not None:
        parser.error("-v/--vcpu requires -p/--pid")
    if args.alltids:
        parser.error("-a/--alltids requires -p/--pid")

# The duration is handed to sleep(), which refuses negative values.
if args.duration < 0:
    parser.error("duration must be a non-negative number of seconds")

# Already an int thanks to type=int / the int default.
duration = args.duration
| |
| # |
| # Setup BPF |
| # |
| |
# load BPF program
#
# The text below is a template for the C program: the tokens FUNC_ENTRY,
# GET_ER and THREAD_FILTER are not C, they are replaced with str.replace()
# further down in this script before the text is handed to BPF().
#   FUNC_ENTRY    - the probe declaration (raw tracepoint or tracepoint).
#   GET_ER        - the expression yielding the exit reason number.
#   THREAD_FILTER - a C condition; when it is true the event is skipped.
#
# Maps declared by the program:
#   init_value    - per-CPU array with a single exit_count entry, used as the
#                   initial value when a new key is added to pcpu_kvm_stat
#                   (nothing in this file writes to it, so presumably all
#                   zero - see the b.b comment in the C code).
#   pcpu_kvm_stat - per-CPU hash keyed by the u64 pid_tgid value, holding one
#                   exit_count (REASON_NUM counters) per key.
#   pcpu_cache    - per-CPU array with a single cache_info entry: the
#                   pid_tgid last seen on that CPU plus its counters.  The
#                   reporting code at the end of this script reads both
#                   pcpu_cache and pcpu_kvm_stat.
#
# REASON_NUM is used as the bound for exit_ct[]; the exit_reasons tuple
# defined below is indexed with the same numbers.
bpf_text = """
#include <linux/delay.h>

#define REASON_NUM 69
#define TGID_NUM 1024

struct exit_count {
    u64 exit_ct[REASON_NUM];
};
BPF_PERCPU_ARRAY(init_value, struct exit_count, 1);
BPF_TABLE("percpu_hash", u64, struct exit_count, pcpu_kvm_stat, TGID_NUM);

struct cache_info {
    u64 cache_pid_tgid;
    struct exit_count cache_exit_ct;
};
BPF_PERCPU_ARRAY(pcpu_cache, struct cache_info, 1);

FUNC_ENTRY {
    int cache_miss = 0;
    int zero = 0;
    u32 er = GET_ER;
    if (er >= REASON_NUM) {
        return 0;
    }

    u64 cur_pid_tgid = bpf_get_current_pid_tgid();
    u32 tgid = cur_pid_tgid >> 32;
    u32 pid = cur_pid_tgid;

    if (THREAD_FILTER)
        return 0;

    struct exit_count *tmp_info = NULL, *initial = NULL;
    struct cache_info *cache_p;
    cache_p = pcpu_cache.lookup(&zero);
    if (cache_p == NULL) {
        return 0;
    }

    if (cache_p->cache_pid_tgid == cur_pid_tgid) {
        //a. If the cur_pid_tgid hit this physical cpu consecutively, save it to pcpu_cache
        tmp_info = &cache_p->cache_exit_ct;
    } else {
        //b. If another pid_tgid matches this pcpu for the last hit, OR it is the first time to hit this physical cpu.
        cache_miss = 1;

        // b.a Try to load the last cache struct if exists.
        tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid);

        // b.b If it is the first time for the cur_pid_tgid to hit this pcpu, employ a
        // per_cpu array to initialize pcpu_kvm_stat's exit_count with each exit reason's count is zero
        if (tmp_info == NULL) {
            initial = init_value.lookup(&zero);
            if (initial == NULL) {
                return 0;
            }

            pcpu_kvm_stat.update(&cur_pid_tgid, initial);
            tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid);
            // To pass the verifier
            if (tmp_info == NULL) {
                return 0;
            }
        }
    }

    if (er < REASON_NUM) {
        tmp_info->exit_ct[er]++;
        if (cache_miss == 1) {
            if (cache_p->cache_pid_tgid != 0) {
                // b.*.a Let's save the last hit cache_info into kvm_stat.
                pcpu_kvm_stat.update(&cache_p->cache_pid_tgid, &cache_p->cache_exit_ct);
            }
            // b.* As the cur_pid_tgid meets current pcpu_cache_array for the first time, save it.
            cache_p->cache_pid_tgid = cur_pid_tgid;
            bpf_probe_read(&cache_p->cache_exit_ct, sizeof(*tmp_info), tmp_info);
        }
        return 0;
    }

    return 0;
}
"""
| |
# format output
#
# Display names of the exit reasons.  The position in this tuple is the exit
# reason number: the BPF program counts into exit_ct[er] and the reporting
# loop prints exit_reasons[i] for counter i, so the order must not change
# and the length (69) has to stay equal to REASON_NUM in the BPF text.
# "N/A" entries are placeholders that keep the numbering aligned
# (presumably numbers without a defined exit reason - confirm against the
# kernel's VMX exit reason list before renaming any of them).
exit_reasons = (
    "EXCEPTION_NMI",        # 0
    "EXTERNAL_INTERRUPT",   # 1
    "TRIPLE_FAULT",         # 2
    "INIT_SIGNAL",          # 3
    "N/A",                  # 4
    "N/A",                  # 5
    "N/A",                  # 6
    "INTERRUPT_WINDOW",     # 7
    "NMI_WINDOW",           # 8
    "TASK_SWITCH",          # 9
    "CPUID",                # 10
    "N/A",                  # 11
    "HLT",                  # 12
    "INVD",                 # 13
    "INVLPG",               # 14
    "RDPMC",                # 15
    "RDTSC",                # 16
    "N/A",                  # 17
    "VMCALL",               # 18
    "VMCLEAR",              # 19
    "VMLAUNCH",             # 20
    "VMPTRLD",              # 21
    "VMPTRST",              # 22
    "VMREAD",               # 23
    "VMRESUME",             # 24
    "VMWRITE",              # 25
    "VMOFF",                # 26
    "VMON",                 # 27
    "CR_ACCESS",            # 28
    "DR_ACCESS",            # 29
    "IO_INSTRUCTION",       # 30
    "MSR_READ",             # 31
    "MSR_WRITE",            # 32
    "INVALID_STATE",        # 33
    "MSR_LOAD_FAIL",        # 34
    "N/A",                  # 35
    "MWAIT_INSTRUCTION",    # 36
    "MONITOR_TRAP_FLAG",    # 37
    "N/A",                  # 38
    "MONITOR_INSTRUCTION",  # 39
    "PAUSE_INSTRUCTION",    # 40
    "MCE_DURING_VMENTRY",   # 41
    "N/A",                  # 42
    "TPR_BELOW_THRESHOLD",  # 43
    "APIC_ACCESS",          # 44
    "EOI_INDUCED",          # 45
    "GDTR_IDTR",            # 46
    "LDTR_TR",              # 47
    "EPT_VIOLATION",        # 48
    "EPT_MISCONFIG",        # 49
    "INVEPT",               # 50
    "RDTSCP",               # 51
    "PREEMPTION_TIMER",     # 52
    "INVVPID",              # 53
    "WBINVD",               # 54
    "XSETBV",               # 55
    "APIC_WRITE",           # 56
    "RDRAND",               # 57
    "INVPCID",              # 58
    "VMFUNC",               # 59
    "ENCLS",                # 60
    "RDSEED",               # 61
    "PML_FULL",             # 62
    "XSAVES",               # 63
    "XRSTORS",              # 64
    "N/A",                  # 65
    "N/A",                  # 66
    "UMWAIT",               # 67
    "TPAUSE"                # 68
)
| |
| # |
| # Do some checks |
| # |
# Refuse to start unless the host looks usable for this tool.  Any failure
# inside the try block, including an unreadable /proc/cpuinfo, is re-raised
# with a common "precondition check" prefix.
try:
    # Currently only adapted to the Intel architecture.  Read the first
    # "vendor_id" line of /proc/cpuinfo directly rather than spawning a
    # `cat | grep | head` shell pipeline for it.
    arch_info = ""
    with open("/proc/cpuinfo", "r") as cpuinfo:
        for line in cpuinfo:
            if "vendor_id" in line:
                arch_info = line.strip()
                break
    # No vendor_id line at all leaves arch_info empty, which is rejected too.
    if "Intel" not in arch_info:
        raise Exception("Currently we only support Intel architecture, please do expansion if needs more.")

    # Check if kvm module is loaded (the device node must be readable and
    # writable by the current user).
    if not os.access("/dev/kvm", os.R_OK | os.W_OK):
        raise Exception("Please insmod kvm module to use kvmexit tool.")
except Exception as e:
    raise Exception("Failed to do precondition check, due to: %s." % e)
| |
# Pick the probe flavour.  Only the capability query can fail, so it is the
# only statement guarded; the two template values are chosen afterwards.
try:
    raw_tp_in_module = BPF.support_raw_tracepoint_in_module()
except Exception as e:
    raise Exception("Failed to catch kvm exit reasons due to: %s" % e)

if raw_tp_in_module:
    # Let's firstly try raw_tracepoint_in_module
    func_entry, get_er = "RAW_TRACEPOINT_PROBE(kvm_exit)", "ctx->args[0]"
else:
    # If raw_tp_in_module is not supported, fall back to regular tp
    func_entry, get_er = "TRACEPOINT_PROBE(kvm, kvm_exit)", "args->exit_reason"
| |
| |
def _names_vcpu(comm, tgt_vcpu):
    """Tell whether ``comm`` contains ``tgt_vcpu`` as a complete vcpu name.

    A hit only counts when it is not immediately followed by another digit,
    so that looking for "CPU 1" does not accept "CPU 10/KVM".
    """
    start = comm.find(tgt_vcpu)
    while start != -1:
        end = start + len(tgt_vcpu)
        if end >= len(comm) or not comm[end].isdigit():
            return True
        start = comm.find(tgt_vcpu, start + 1)
    return False


def find_tid(tgt_dir, tgt_vcpu):
    """Return the TID of the thread whose comm names the given vcpu.

    Args:
        tgt_dir: a ``/proc/<pid>/task`` style directory holding one
            sub-directory per thread, each with a ``comm`` file.
        tgt_vcpu: the text to look for in ``comm``, e.g. ``"CPU 1"``.

    Returns:
        The name of the matching sub-directory (the TID, as a string), or
        the integer ``-1`` when no thread matches.

    Raises:
        OSError: if ``tgt_dir`` itself cannot be listed.
    """
    for tid in os.listdir(tgt_dir):
        path = os.path.join(tgt_dir, tid, "comm")
        try:
            # Use a context manager so the handle is closed on every path.
            with open(path, "r") as fp:
                comm = fp.read()
        except (IOError, OSError):
            # The entry cannot be read any more (e.g. the thread went away
            # after listdir()); it cannot be the one we want, skip it.
            continue
        if _names_vcpu(comm, tgt_vcpu):
            return tid
    return -1
| |
# set process/thread filter
#
# thread_filter becomes the C condition substituted for THREAD_FILTER in the
# BPF text: an event is dropped when the condition is true.  thread_context
# is the human readable description used in the banner, header_format the
# leading column titles of the result table.
thread_context = ""
header_format = ""
need_collapse = not args.alltids

if args.tid is not None:
    # exactly one thread
    thread_filter = 'pid != %s' % args.tid
    thread_context = "TID %s" % args.tid
elif args.tids is not None:
    # a set of threads: drop the event only if it matches none of them
    thread_filter = "pid != " + " && pid != ".join(args.tids)
    thread_context = "TIDS %s" % args.tids
    header_format = "TIDS "
elif args.pid is not None:
    if args.vcpu is not None:
        # transfer vcpu to tid
        thread_context = "PID %s VCPU %s" % (args.pid, args.vcpu)
        tgt_dir = '/proc/' + str(args.pid) + '/task'
        tgt_vcpu = "CPU " + str(args.vcpu)
        args.tid = find_tid(tgt_dir, tgt_vcpu)
        if args.tid == -1:
            raise Exception("There's no v%s for PID %d." % (tgt_vcpu, args.pid))
        thread_filter = 'pid != %s' % args.tid
    else:
        # the whole process, optionally listed thread by thread
        thread_filter = 'tgid != %s' % args.pid
        if args.alltids:
            thread_context = "PID %s and its all threads" % args.pid
            header_format = "TID "
        else:
            thread_context = "PID %s" % args.pid
else:
    # no restriction at all
    thread_filter = '0'
    thread_context = "all threads"
    header_format = "PID TID "

# Fill in the template.  For kernel >= 5.0, use RAW_TRACEPOINT_MODULE for
# performance consideration (func_entry / get_er were chosen accordingly).
for placeholder, replacement in (('THREAD_FILTER', thread_filter),
                                 ('FUNC_ENTRY', func_entry),
                                 ('GET_ER', get_er)):
    bpf_text = bpf_text.replace(placeholder, replacement)
b = BPF(text=bpf_text)
| |
| |
# header
# 99999999 is the default of the duration argument, i.e. "no duration given".
if duration < 99999999:
    banner_tail = " after sleeping %d secs." % duration
else:
    banner_tail = "... Hit Ctrl-C to end."
print("Display kvm exit reasons and statistics for %s" % thread_context, end="")
print(banner_tail)

# Collect until the time is up or the user interrupts us.
try:
    sleep(duration)
except KeyboardInterrupt:
    print()
| |
| |
def _print_sorted(counts):
    """Print (count, reason index) pairs, highest count first, skipping zeros."""
    for count, reason in sorted(counts, reverse=True):
        if count == 0:
            continue
        print("%-35s %-8u" % (exit_reasons[reason], count))


# Currently, sort multiple tids in descending order is not supported.
if (args.pid or args.tid):
    # (count, reason index) pairs waiting to be printed in sorted order
    ct_reason = []
    if args.pid:
        # per-reason totals over all threads of args.pid
        tgid_exit = [0] * len(exit_reasons)

# output
print("%s%-35s %s" % (header_format, "KVM_EXIT_REASON", "COUNT"))

pcpu_kvm_stat = b["pcpu_kvm_stat"]
pcpu_cache = b["pcpu_cache"]
# NOTE(review): as before, multiprocessing.cpu_count() is taken as the number
# of per-CPU slots to add up - confirm this matches the width of the per-CPU
# tables on hosts with offline CPUs.
num_cpus = multiprocessing.cpu_count()
for k, v in pcpu_kvm_stat.items():
    # The key is the pid_tgid value: tgid in the upper, pid in the lower half.
    tgid = k.value >> 32
    pid = k.value & 0xffffffff

    # Fetch the per-CPU cache once per key.  Looking it up again for every
    # reason and CPU is wasteful, and reading the owner and the counter from
    # two different lookups can pair a counter with the wrong owner while
    # the BPF program keeps running.
    cache_slots = pcpu_cache[0]

    for i, reason_name in enumerate(exit_reasons):
        sum1 = 0
        for inner_cpu in range(num_cpus):
            slot = cache_slots[inner_cpu]
            # Take priority to check if it is in cache
            if slot.cache_pid_tgid == k.value:
                sum1 += slot.cache_exit_ct.exit_ct[i]
            # If not in cache, find from kvm_stat
            else:
                sum1 += v[inner_cpu].exit_ct[i]
        if sum1 == 0:
            continue

        if (args.pid and args.pid == tgid and need_collapse):
            tgid_exit[i] += sum1
        elif (args.tid and args.tid == pid):
            ct_reason.append((sum1, i))
        elif not need_collapse or args.tids:
            print("%-8u %-35s %-8u" % (pid, reason_name, sum1))
        else:
            print("%-8u %-8u %-35s %-8u" % (tgid, pid, reason_name, sum1))

    # Display only for the target tid in descending sort
    if (args.tid and args.tid == pid):
        _print_sorted(ct_reason)
        break


# Aggregate all tids' counts for this args.pid in descending sort
if args.pid and need_collapse:
    ct_reason.extend(zip(tgid_exit, range(len(exit_reasons))))
    _print_sorted(ct_reason)