/* -*- Mode: C++; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
// This file is included from PerfCounters.cc
static bool has_kvm_in_txcp_bug;
static bool has_xen_pmi_bug;
static bool supports_txcp;
/**
* Return the detected, known microarchitecture of this CPU, or don't
* return; i.e. never return UnknownCpu.
*/
static CpuMicroarch compute_cpu_microarch() {
auto cpuid_vendor = cpuid(CPUID_GETVENDORSTRING, 0);
  char vendor[13] = {}; // extra NUL byte so the error path below can print it safely
memcpy(&vendor[0], &cpuid_vendor.ebx, 4);
memcpy(&vendor[4], &cpuid_vendor.edx, 4);
memcpy(&vendor[8], &cpuid_vendor.ecx, 4);
if (strncmp(vendor, "GenuineIntel", sizeof(vendor)) &&
strncmp(vendor, "AuthenticAMD", sizeof(vendor))) {
CLEAN_FATAL() << "Unknown CPU vendor '" << vendor << "'";
}
auto cpuid_data = cpuid(CPUID_GETFEATURES, 0);
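  // Mask off the stepping (bits 0-3), keeping the base family (bits 8-11),
  // base model (bits 4-7) and extended model (bits 16-19); e.g. EAX
  // 0x000306C3 (a stepping-3 Haswell) yields cpu_type 0x306C0.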
unsigned int cpu_type = cpuid_data.eax & 0xF0FF0;
unsigned int ext_family = (cpuid_data.eax >> 20) & 0xff;
switch (cpu_type) {
case 0x006F0:
case 0x10660:
return IntelMerom;
case 0x10670:
case 0x106D0:
return IntelPenryn;
case 0x106A0:
case 0x106E0:
case 0x206E0:
return IntelNehalem;
case 0x20650:
case 0x206C0:
case 0x206F0:
return IntelWestmere;
case 0x206A0:
case 0x206D0:
case 0x306e0:
return IntelSandyBridge;
case 0x306A0:
return IntelIvyBridge;
case 0x306C0: /* Devil's Canyon */
case 0x306F0:
case 0x40650:
case 0x40660:
return IntelHaswell;
case 0x306D0:
case 0x40670:
case 0x406F0:
case 0x50660:
return IntelBroadwell;
case 0x406e0:
case 0x50650:
case 0x506e0:
return IntelSkylake;
case 0x30670:
case 0x406c0:
case 0x50670:
return IntelSilvermont;
case 0x506f0:
case 0x706a0:
case 0x506c0:
return IntelGoldmont;
case 0x706e0:
case 0x606a0:
return IntelIcelake;
case 0x806c0:
case 0x806d0:
return IntelTigerlake;
case 0x806e0:
case 0x906e0:
return IntelKabylake;
case 0xa0650:
case 0xa0660:
return IntelCometlake;
case 0xa0670:
return IntelRocketlake;
case 0x90670:
case 0x906a0:
return IntelAlderlake;
case 0xb0670:
return IntelRaptorlake;
case 0x806f0:
return IntelSapphireRapid;
case 0x30f00:
return AMDF15R30;
case 0x00f10: // Naples, Whitehaven, Summit Ridge, Snowy Owl (Zen), Milan (Zen 3) (UNTESTED)
case 0x10f10: // Raven Ridge, Great Horned Owl (Zen) (UNTESTED)
case 0x10f80: // Banded Kestrel (Zen), Picasso (Zen+) (UNTESTED)
case 0x20f00: // Dali (Zen) (UNTESTED)
case 0x00f80: // Colfax, Pinnacle Ridge (Zen+) (UNTESTED)
case 0x30f10: // Rome, Castle Peak (Zen 2)
case 0x60f00: // Renoir (Zen 2) (UNTESTED)
case 0x70f10: // Matisse (Zen 2) (UNTESTED)
case 0x60f80: // Lucienne
case 0x90f00: // Van Gogh (Zen 2)
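      // The base family in all these cpu_types is 0xf, so the effective
      // family is 0xf + ext_family: 8 -> family 17h (Zen/Zen+/Zen 2),
      // 0xa -> family 19h (Zen 3).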
if (ext_family == 8 || ext_family == 0xa) {
return AMDZen;
} else if (ext_family == 3) {
return AMDF15R30;
}
break;
case 0x20f10: // Vermeer (Zen 3)
case 0x50f00: // Cezanne (Zen 3)
case 0x40f40: // Rembrandt (Zen 3+)
case 0x60f10: // Raphael (Zen 4)
if (ext_family == 0xa) {
return AMDZen;
      }
      break;
    default:
break;
}
if (!strncmp(vendor, "AuthenticAMD", sizeof(vendor))) {
CLEAN_FATAL() << "AMD CPU type " << HEX(cpu_type) <<
" (ext family " << HEX(ext_family) << ") unknown";
} else {
CLEAN_FATAL() << "Intel CPU type " << HEX(cpu_type) << " unknown";
}
return UnknownCpu; // not reached
}
static std::vector<CpuMicroarch> compute_cpu_microarchs() {
return { compute_cpu_microarch() };
}
static void check_for_kvm_in_txcp_bug(const perf_event_attrs &perf_attr) {
int64_t count = 0;
struct perf_event_attr attr = perf_attr.ticks;
attr.config |= IN_TXCP;
attr.sample_period = 0;
bool disabled_txcp;
ScopedFd fd = start_counter(0, -1, &attr, &disabled_txcp);
if (fd.is_open() && !disabled_txcp) {
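    // The failure mode is a counter that stops counting after being disabled
    // and re-enabled, so cycle it once before measuring (see
    // always_recreate_counters below).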
ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
do_branches();
count = read_counter(fd);
}
supports_txcp = count > 0;
has_kvm_in_txcp_bug = supports_txcp && count < NUM_BRANCHES;
LOG(debug) << "supports txcp=" << supports_txcp;
LOG(debug) << "has_kvm_in_txcp_bug=" << has_kvm_in_txcp_bug
<< " count=" << count;
}
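// Check for the Xen PMU virtualization bug: program a sample period of
// NUM_BRANCHES - 1, execute exactly NUM_BRANCHES conditional branches in
// hand-written asm (so the compiler can't add or remove branches in the
// measured region), and test whether the counter overcounts when the PMI
// fires.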
static void check_for_xen_pmi_bug(const perf_event_attrs &perf_attr) {
int32_t count = -1;
struct perf_event_attr attr = perf_attr.ticks;
attr.sample_period = NUM_BRANCHES - 1;
ScopedFd fd = start_counter(0, -1, &attr);
if (fd.is_open()) {
// Do NUM_BRANCHES conditional branches that can't be optimized out.
// 'accumulator' is always odd and can't be zero
uint32_t accumulator = uint32_t(rand()) * 2 + 1;
int raw_fd = fd;
asm volatile(
#if defined(__x86_64__)
"mov %[_SYS_ioctl], %%rax;"
"mov %[raw_fd], %%edi;"
"xor %%rdx, %%rdx;"
"mov %[_PERF_EVENT_IOC_ENABLE], %%rsi;"
"syscall;"
"cmp $-4095, %%rax;"
"jae 2f;"
"mov %[_SYS_ioctl], %%rax;"
"mov %[_PERF_EVENT_IOC_RESET], %%rsi;"
"syscall;"
// From this point on all conditional branches count!
"cmp $-4095, %%rax;"
"jae 2f;"
// Reset the counter period to the desired value.
"mov %[_SYS_ioctl], %%rax;"
"mov %[_PERF_EVENT_IOC_PERIOD], %%rsi;"
"mov %[period], %%rdx;"
"syscall;"
"cmp $-4095, %%rax;"
"jae 2f;"
"mov %[_iterations], %%rax;"
"1: dec %%rax;"
// Multiply by 7.
"mov %[accumulator], %%edx;"
"shl $3, %[accumulator];"
"sub %%edx, %[accumulator];"
// Add 2.
"add $2, %[accumulator];"
// Mask off bits.
"and $0xffffff, %[accumulator];"
// And loop.
"test %%rax, %%rax;"
"jnz 1b;"
"mov %[_PERF_EVENT_IOC_DISABLE], %%rsi;"
"mov %[_SYS_ioctl], %%rax;"
"xor %%rdx, %%rdx;"
// We didn't touch rdi.
"syscall;"
"cmp $-4095, %%rax;"
"jae 2f;"
"movl $0, %[count];"
"2: nop;"
#elif defined(__i386__)
"mov %[_SYS_ioctl], %%eax;"
"mov %[raw_fd], %%ebx;"
"xor %%edx, %%edx;"
"mov %[_PERF_EVENT_IOC_ENABLE], %%ecx;"
"int $0x80;"
"cmp $-4095, %%eax;"
"jae 2f;"
"mov %[_SYS_ioctl], %%eax;"
"mov %[_PERF_EVENT_IOC_RESET], %%ecx;"
"int $0x80;"
// From this point on all conditional branches count!
"cmp $-4095, %%eax;"
"jae 2f;"
// Reset the counter period to the desired value.
"mov %[_SYS_ioctl], %%eax;"
"mov %[_PERF_EVENT_IOC_PERIOD], %%ecx;"
"mov %[period], %%edx;"
"int $0x80;"
"cmp $-4095, %%eax;"
"jae 2f;"
"mov %[_iterations], %%eax;"
"1: dec %%eax;"
// Multiply by 7.
"mov %[accumulator], %%edx;"
"shll $3, %[accumulator];"
"sub %%edx, %[accumulator];"
// Add 2.
"addl $2, %[accumulator];"
// Mask off bits.
"andl $0xffffff, %[accumulator];"
// And loop.
"test %%eax, %%eax;"
"jnz 1b;"
"mov %[_PERF_EVENT_IOC_DISABLE], %%ecx;"
"mov %[_SYS_ioctl], %%eax;"
"xor %%edx, %%edx;"
        // We didn't touch ebx.
"int $0x80;"
"cmp $-4095, %%eax;"
"jae 2f;"
"movl $0, %[count];"
"2: nop;"
#else
#error unknown CPU architecture
#endif
        // `count` must be read-write ("+"): the early-exit paths jump to
        // label 2 without writing it, and its initial -1 must survive.
        : [accumulator] "+rm"(accumulator), [count] "+rm"(count)
: [_SYS_ioctl] "i"(SYS_ioctl),
[_PERF_EVENT_IOC_DISABLE] "i"(PERF_EVENT_IOC_DISABLE),
[_PERF_EVENT_IOC_ENABLE] "i"(PERF_EVENT_IOC_ENABLE),
[_PERF_EVENT_IOC_PERIOD] "i"(PERF_EVENT_IOC_PERIOD),
[_PERF_EVENT_IOC_RESET] "i"(PERF_EVENT_IOC_RESET),
// The check for the failure of some of our ioctls is in
// the measured region, so account for that when looping.
[_iterations] "i"(NUM_BRANCHES - 2),
[period] "rm"(&attr.sample_period), [raw_fd] "rm"(raw_fd)
:
#if defined(__x86_64__)
"rax", "rdx", "rdi", "rsi"
// `syscall` clobbers rcx and r11.
,
"rcx", "r11"
#elif defined(__i386__)
"eax", "ebx", "ecx", "edx"
#else
#error unknown CPU architecture
#endif
);
// If things worked above, `count` should have been set to 0.
if (count == 0) {
count = read_counter(fd);
}
// Use 'accumulator' so it can't be optimized out.
accumulator_sink = accumulator;
}
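  // `count` stays -1 if the counter couldn't be opened or any ioctl in the
  // asm failed; treat that as buggy rather than silently passing the check.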
has_xen_pmi_bug = count > NUM_BRANCHES || count == -1;
if (has_xen_pmi_bug) {
LOG(debug) << "has_xen_pmi_bug=" << has_xen_pmi_bug << " count=" << count;
if (!Flags::get().force_things) {
FATAL()
<< "Overcount triggered by PMU interrupts detected due to Xen PMU "
"virtualization bug.\n"
"Aborting. Retry with -F to override, but it will probably\n"
"fail.";
}
}
}
static void check_for_zen_speclockmap() {
// When the SpecLockMap optimization is not disabled, rr will not work
// reliably (e.g. it would work fine on a single process with a single
// thread, but not more). When the optimization is disabled, the
// perf counter for retired lock instructions of type SpecLockMapCommit
// (on PMC 0x25) stays at 0.
// See more details at https://github.com/rr-debugger/rr/issues/2034.
struct perf_event_attr attr;
// 0x25 == RETIRED_LOCK_INSTRUCTIONS - Counts the number of retired locked instructions
// + 0x08 == SPECLOCKMAPCOMMIT
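  // + 0x51 in bits 16-23 == USR|INT|EN in AMD's PERF_CTL event-select layout
  //   (count user mode, enable the overflow interrupt, enable the counter)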
init_perf_event_attr(&attr, PERF_TYPE_RAW, 0x510825);
ScopedFd fd = start_counter(0, -1, &attr);
if (fd.is_open()) {
int atomic = 0;
int64_t count = read_counter(fd);
// A lock add is known to increase the perf counter we're looking at.
asm volatile("lock addl $1, %0": "+m" (atomic));
if (read_counter(fd) == count) {
LOG(debug) << "SpecLockMap is disabled";
} else {
LOG(debug) << "SpecLockMap is not disabled";
fprintf(stderr,
"On Zen CPUs, rr will not work reliably unless you disable the "
"hardware SpecLockMap optimization.\nFor instructions on how to "
"do this, see https://github.com/rr-debugger/rr/wiki/Zen\n");
}
}
}
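// With freeze_on_smi set, the PMU stops counting while the CPU is handling a
// System Management Interrupt, so SMM activity can't perturb tick counts.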
static void check_for_freeze_on_smi() {
  ScopedFd fd("/sys/devices/cpu/freeze_on_smi", O_RDONLY);
if (!fd.is_open()) {
LOG(debug) << "/sys/devices/cpu/freeze_on_smi not present";
return;
}
char freeze_on_smi = 0;
ssize_t ret = read(fd, &freeze_on_smi, 1);
if (ret != 1) {
FATAL() << "Can't read freeze_on_smi";
}
  if (freeze_on_smi == '1') {
    LOG(debug) << "freeze_on_smi is set";
  } else if (freeze_on_smi == '0') {
    LOG(warn) << "freeze_on_smi is not set";
if (!Flags::get().suppress_environment_warnings) {
fprintf(stderr,
"Freezing performance counters on SMIs should be enabled for maximum rr\n"
"reliability on Comet Lake and later CPUs. To manually enable this setting, run\n"
"\techo 1 | sudo tee /sys/devices/cpu/freeze_on_smi\n"
"On systemd systems, consider putting\n"
"'w /sys/devices/cpu/freeze_on_smi - - - - 1' into /etc/tmpfiles.d/10-rr.conf\n"
"to automatically apply this setting on every reboot.\n"
"See 'man 5 sysfs', 'man 5 tmpfiles.d'.\n"
"If you are seeing this message, the setting has not been enabled.\n");
}
} else {
LOG(warn) << "Unrecognized freeze_on_smi value " << freeze_on_smi;
}
}
static void check_for_arch_bugs(perf_event_attrs &perf_attr) {
DEBUG_ASSERT(rr::perf_attrs.size() == 1);
CpuMicroarch uarch = (CpuMicroarch)perf_attr.bug_flags;
if (uarch >= FirstIntel && uarch <= LastIntel) {
check_for_kvm_in_txcp_bug(perf_attr);
check_for_xen_pmi_bug(perf_attr);
}
if (uarch >= IntelCometlake && uarch <= LastIntel) {
check_for_freeze_on_smi();
}
if (uarch == AMDZen) {
check_for_zen_speclockmap();
}
}
static void post_init_pmu_uarchs(std::vector<PmuConfig> &pmu_uarchs) {
  if (pmu_uarchs.size() != 1) {
    CLEAN_FATAL() << "rr only supports a single PMU on x86; "
                  << pmu_uarchs.size() << " were specified.";
  }
}
}
static bool always_recreate_counters(const perf_event_attrs &perf_attr) {
// When we have the KVM IN_TXCP bug, reenabling the TXCP counter after
// disabling it does not work.
DEBUG_ASSERT(perf_attr.checked);
return perf_attr.has_ioc_period_bug || has_kvm_in_txcp_bug;
}
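// CPUID leaf 7 reports Hardware Lock Elision in EBX bit 4, which
// HLE_FEATURE_FLAG is expected to select.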
static void arch_check_restricted_counter() {
if ((cpuid(CPUID_GETEXTENDEDFEATURES, 0).ebx & HLE_FEATURE_FLAG) &&
!Flags::get().suppress_environment_warnings) {
fprintf(stderr,
"Your CPU supports Hardware Lock Elision but you only have one\n"
"hardware performance counter available. Record and replay\n"
"of code that uses HLE will fail unless you alter your\n"
"configuration to make more than one hardware performance counter\n"
"available.\n");
}
}
template <typename Arch>
void PerfCounters::reset_arch_extras() {
DEBUG_ASSERT(rr::perf_attrs.size() == 1);
if (supports_txcp) {
struct perf_event_attr attr = rr::perf_attrs[0].ticks;
if (has_kvm_in_txcp_bug) {
      // IN_TXCP isn't going to work reliably. Assume that HLE/RTM are not
      // used, and check that.
attr.sample_period = 0;
attr.config |= IN_TX;
fd_ticks_in_transaction = start_counter(tid, fd_ticks_interrupt, &attr);
} else {
// Set up a separate counter for measuring ticks, which does not have
// a sample period and does not count events during aborted
// transactions.
// We have to use two separate counters here because the kernel does
// not support setting a sample_period with IN_TXCP, apparently for
// reasons related to this Intel note on IA32_PERFEVTSEL2:
// ``When IN_TXCP=1 & IN_TX=1 and in sampling, spurious PMI may
// occur and transactions may continuously abort near overflow
// conditions. Software should favor using IN_TXCP for counting over
      // sampling. If sampling, software should use large “sample-after”
// value after clearing the counter configured to use IN_TXCP and
// also always reset the counter even when no overflow condition
// was reported.''
attr.sample_period = 0;
attr.config |= IN_TXCP;
fd_ticks_measure = start_counter(tid, fd_ticks_interrupt, &attr);
}
}
}