[PATCH 08/11] fwts: Use linux perf counters for cpu benchmarking

Jeremy Kerr jk at ozlabs.org
Thu May 21 09:34:27 UTC 2015


Currently, we use a loop counter to measure cpu performance. There are a
couple of drawbacks of this:

1) the numbers are fairly arbitrary, and don't reflect any external
values; and

2) the results are subject to perturbation from other tasks.

This change uses the linux perf counters to measure actual CPU cycles,
where available. This means we get numbers that are meaningful, and
we'll get the performance value even if our process has been scheduled
off the CPU.

Now, the new results should match the actual cpu frequencies:

  CPU 0: 16 CPU frequency steps supported.
   Frequency | Relative Speed |  Cycles    | Bogo loops
  -----------+----------------+------------+-----------
   1.200 GHz |      36.5 %    | 1197262295 |    114643
   1.300 GHz |      39.5 %    | 1297048295 |    124214
   1.400 GHz |      42.6 %    | 1396813066 |    133889
   1.500 GHz |      45.6 %    | 1496592732 |    143456
   1.600 GHz |      48.6 %    | 1596393751 |    150124
   1.700 GHz |      51.7 %    | 1696137977 |    162350
   1.800 GHz |      54.7 %    | 1795918438 |    172071
   1.900 GHz |      57.8 %    | 1895685372 |    181249
   2.000 GHz |      60.8 %    | 1995453204 |    191176
   2.100 GHz |      63.8 %    | 2095225345 |    200753
   2.200 GHz |      66.9 %    | 2194993645 |    210282
   2.300 GHz |      69.9 %    | 2294780535 |    219945
   2.400 GHz |      73.0 %    | 2394544770 |    229664
   2.500 GHz |      76.0 %    | 2494315101 |    239121
   2.600 GHz |      79.0 %    | 2594055116 |    248536
   2.601 GHz |     100.0 %    | 3281891417 |    314546

  CPU 0 performance scaling OK

Because we still want benchmarks to work when perf counters are not
available, we still measure the loop count. This means we need to
represent test results as a struct, with cycles and loops members.

When tests need a scalar performance value,
fwts_cpu_benchmark_best_result will return the most accurate result
measured: the cycle count when it is valid, otherwise the loop count.

Signed-off-by: Jeremy Kerr <jk at ozlabs.org>

---
 src/acpi/cstates/cstates.c |    4 -
 src/cpu/cpufreq/cpufreq.c  |   63 +++++++++++++--------
 src/lib/include/fwts_cpu.h |   11 +++
 src/lib/src/fwts_cpu.c     |  107 ++++++++++++++++++++++++++++++++++---
 4 files changed, 153 insertions(+), 32 deletions(-)

diff --git a/src/acpi/cstates/cstates.c b/src/acpi/cstates/cstates.c
index b58f15c..42c634c 100644
--- a/src/acpi/cstates/cstates.c
+++ b/src/acpi/cstates/cstates.c
@@ -133,9 +133,9 @@ static void do_cpu(fwts_framework *fw, int nth, int cpus, int cpu, char *path)
 		if ((i & 7) < 4)
 			sleep(1);
 		else {
-			uint64_t loop_count;
+			fwts_cpu_benchmark_result result;
 
-			if (fwts_cpu_performance(fw, cpu, &loop_count) != FWTS_OK) {
+			if (fwts_cpu_benchmark(fw, cpu, &result) != FWTS_OK) {
 				fwts_failed(fw, LOG_LEVEL_HIGH, "CPUFailedPerformance",
 					"Could not determine the CPU performance, this "
 					"may be due to not being able to get or set the "
diff --git a/src/cpu/cpufreq/cpufreq.c b/src/cpu/cpufreq/cpufreq.c
index 979a9e1..e409375 100644
--- a/src/cpu/cpufreq/cpufreq.c
+++ b/src/cpu/cpufreq/cpufreq.c
@@ -45,7 +45,7 @@
 
 typedef struct {
 	uint64_t	Hz;
-	uint64_t	speed;
+	fwts_cpu_benchmark_result perf;
 } fwts_cpu_freq;
 
 struct cpu {
@@ -149,6 +149,7 @@ static int get_performance_repeat(
 	const int type,
 	uint64_t *retval)
 {
+	fwts_cpu_benchmark_result result;
 	int i;
 
 	uint64_t max = 0;
@@ -159,9 +160,10 @@ static int get_performance_repeat(
 	for (i = 0; i < count; i++) {
 		uint64_t temp;
 
-		if (fwts_cpu_performance(fw, cpu->idx, &temp) != FWTS_OK)
+		if (fwts_cpu_benchmark(fw, cpu->idx, &result) != FWTS_OK)
 			return FWTS_ERROR;
 
+		temp = fwts_cpu_benchmark_best_result(&result);
 		if (temp) {
 			if (temp < min)
 				min = temp;
@@ -276,17 +278,19 @@ static int test_one_cpu_performance(fwts_framework *fw, struct cpu *cpu,
 	int i;
 
 	for (i = 0; i < cpu->n_freqs; i++) {
+		uint64_t perf;
+
 		cpu_set_frequency(fw, cpu, cpu->freqs[i].Hz);
 
-		if (fwts_cpu_performance(fw, cpu->idx, &cpu->freqs[i].speed)
+		if (fwts_cpu_benchmark(fw, cpu->idx, &cpu->freqs[i].perf)
 				!= FWTS_OK) {
 			fwts_log_error(fw, "Failed to get CPU performance for "
 				"CPU frequency %" PRId64 " Hz.",
 				cpu->freqs[i].Hz);
-			cpu->freqs[i].speed = 0;
 		}
-		if (cpu->freqs[i].speed > cpu_top_perf)
-			cpu_top_perf = cpu->freqs[i].speed;
+		perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i].perf);
+		if (perf > cpu_top_perf)
+			cpu_top_perf = perf;
 
 		fwts_progress(fw, (100 * ((cpu_idx * cpu->n_freqs) + i)) /
 				(n_online_cpus * cpu->n_freqs));
@@ -294,37 +298,46 @@ static int test_one_cpu_performance(fwts_framework *fw, struct cpu *cpu,
 
 	fwts_log_info(fw, "CPU %d: %i CPU frequency steps supported.",
 			cpu->idx, cpu->n_freqs);
-	fwts_log_info_verbatum(fw, " Frequency | Relative Speed | Bogo loops");
-	fwts_log_info_verbatum(fw, "-----------+----------------+-----------");
+	fwts_log_info_verbatum(fw,
+		" Frequency | Relative Speed |  Cycles    | Bogo loops");
+	fwts_log_info_verbatum(fw,
+		"-----------+----------------+------------+-----------");
 	for (i = 0; i < cpu->n_freqs; i++) {
 		char *turbo = "";
 #ifdef FWTS_ARCH_INTEL
 		if ((i == 0) && (cpu->n_freqs > 1) &&
-		    (hz_almost_equal(cpu->freqs[i].Hz, cpu->freqs[i + 1].Hz)))
+			(hz_almost_equal(cpu->freqs[i].Hz, cpu->freqs[i + 1].Hz)))
 			turbo = " (Turbo Boost)";
 #endif
-		fwts_log_info_verbatum(fw, "%10s |     %5.1f %%    | %9" PRIu64
-				"%s",
+		uint64_t perf = fwts_cpu_benchmark_best_result(
+				&cpu->freqs[i].perf);
+		fwts_log_info_verbatum(fw,
+				"%10s |     %5.1f %%    "
+				"| %10" PRIu64 " | %9" PRIu64 "%s",
 			hz_to_human(cpu->freqs[i].Hz),
-			100.0 * cpu->freqs[i].speed / cpu_top_perf,
-			cpu->freqs[i].speed, turbo);
+			100.0 * perf / cpu_top_perf,
+			cpu->freqs[i].perf.cycles,
+			cpu->freqs[i].perf.loops,
+			turbo);
 	}
 
 	fwts_log_nl(fw);
 
 	/* now check for increasing performance */
 	for (i = 0; i < cpu->n_freqs - 1; i++) {
-		if (cpu->freqs[i].speed <= cpu->freqs[i+1].speed)
+		uint64_t perf, last_perf;
+
+		last_perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i].perf);
+		perf = fwts_cpu_benchmark_best_result(&cpu->freqs[i+1].perf);
+		if (last_perf <= perf)
 			continue;
 
 		fwts_log_warning(fw,
 			"Supposedly higher frequency %s is slower (%" PRIu64
-			" bogo loops) than frequency %s (%" PRIu64
-			" bogo loops) on CPU %i.",
-			hz_to_human(cpu->freqs[i+1].Hz),
-			cpu->freqs[i+1].speed,
-			hz_to_human(cpu->freqs[i].Hz),
-			cpu->freqs[i].speed,
+			") than frequency %s (%" PRIu64
+			") on CPU %i.",
+			hz_to_human(cpu->freqs[i+1].Hz), perf,
+			hz_to_human(cpu->freqs[i].Hz), last_perf,
 			cpu->idx);
 		return FWTS_ERROR;
 	}
@@ -459,6 +472,7 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
 {
 	uint64_t low_perf, high_perf, newhigh_perf;
 	int i, j, rc, n_tests, performed_tests;
+	fwts_cpu_benchmark_result result;
 	bool ok;
 
 	rc = sw_tests_possible(fw);
@@ -478,12 +492,13 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
 		cpu_set_lowest_frequency(fw, &cpus[i]);
 
 	/* assume that all processors have the same low performance */
-	if (fwts_cpu_performance(fw, cpus[0].idx, &low_perf) != FWTS_OK) {
+	if (fwts_cpu_benchmark(fw, cpus[0].idx, &result) != FWTS_OK) {
 		fwts_failed(fw, LOG_LEVEL_MEDIUM,
 			"CPUFreqCPsSetToSW_ANYGetPerf",
 			"Cannot get CPU performance.");
 		return FWTS_ERROR;
 	}
+	low_perf = fwts_cpu_benchmark_best_result(&result);
 
 	ok = true;
 
@@ -497,12 +512,13 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
 		if (!cpu->online)
 			continue;
 
-		if (fwts_cpu_performance(fw, cpu->idx, &high_perf) != FWTS_OK) {
+		if (fwts_cpu_benchmark(fw, cpu->idx, &result) != FWTS_OK) {
 			fwts_failed(fw, LOG_LEVEL_MEDIUM,
 				"CPUFreqCPsSetToSW_ANYGetPerf",
 				"Cannot get CPU performance.");
 			return FWTS_ERROR;
 		}
+		high_perf = fwts_cpu_benchmark_best_result(&result);
 
 		performed_tests++;
 		fwts_progress(fw, 100 * performed_tests/n_tests);
@@ -514,13 +530,14 @@ static int cpufreq_test_sw_any(fwts_framework *fw)
 		for (j = 0; j < num_cpus; j++)
 			if (i != j)
 				cpu_set_lowest_frequency(fw, &cpus[j]);
-		if (fwts_cpu_performance(fw, cpu->idx, &newhigh_perf)
+		if (fwts_cpu_benchmark(fw, cpu->idx, &result)
 				!= FWTS_OK) {
 			fwts_failed(fw, LOG_LEVEL_MEDIUM,
 				"CPUFreqCPsSetToSW_ANYGetPerf",
 				"Cannot get CPU performance.");
 			return FWTS_ERROR;
 		}
+		newhigh_perf = fwts_cpu_benchmark_best_result(&result);
 		if ((high_perf > newhigh_perf) &&
 		    (high_perf - newhigh_perf > (high_perf - low_perf)/4) &&
 		    (high_perf - low_perf > 20)) {
diff --git a/src/lib/include/fwts_cpu.h b/src/lib/include/fwts_cpu.h
index b132697..7162316 100644
--- a/src/lib/include/fwts_cpu.h
+++ b/src/lib/include/fwts_cpu.h
@@ -33,6 +33,12 @@ typedef struct cpuinfo_x86 {
 	char *flags;		/* String containing flags */
 } fwts_cpuinfo_x86;
 
+typedef struct cpu_benchmark_result {
+	bool		cycles_valid;
+	uint64_t	loops;
+	uint64_t	cycles;
+} fwts_cpu_benchmark_result;
+
 int fwts_cpu_readmsr(const int cpu, const uint32_t reg, uint64_t *val);
 
 int fwts_cpu_is_Intel(bool *is_intel);
@@ -46,6 +52,9 @@ int fwts_cpu_enumerate(void);
 int fwts_cpu_consume(const int seconds);
 int fwts_cpu_consume_start(void);
 void fwts_cpu_consume_complete(void);
-int fwts_cpu_performance(fwts_framework *fw, const int cpu, uint64_t *loop_count);
+int fwts_cpu_benchmark(fwts_framework *fw, const int cpu,
+		fwts_cpu_benchmark_result *result);
+
+uint64_t fwts_cpu_benchmark_best_result(fwts_cpu_benchmark_result *res);
 
 #endif
diff --git a/src/lib/src/fwts_cpu.c b/src/lib/src/fwts_cpu.c
index 75b1100..a7cfd3d 100644
--- a/src/lib/src/fwts_cpu.c
+++ b/src/lib/src/fwts_cpu.c
@@ -26,8 +26,10 @@
 #include <limits.h>
 #include <string.h>
 #include <dirent.h>
+#include <sys/ioctl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <sys/syscall.h>
 #include <sys/wait.h>
 #include <signal.h>
 #include <fcntl.h>
@@ -35,6 +37,8 @@
 #include <sched.h>
 #include <time.h>
 
+#include <linux/perf_event.h>
+
 #include "fwts_types.h"
 #include "fwts_cpu.h"
 #include "fwts_pipeio.h"
@@ -312,20 +316,73 @@ static void fwts_cpu_burn_cycles(void)
 	}
 }
 
+static int perf_setup_counter(int cpu)
+{
+	struct perf_event_attr attr;
+	int fd;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_HARDWARE;
+	attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	attr.disabled = 1;
+	attr.size = sizeof(attr);
+
+	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
+	return fd;
+}
+
+static int perf_start_counter(int fd)
+{
+	int rc;
+
+	rc = ioctl(fd, PERF_EVENT_IOC_ENABLE);
+	return rc == 0 ? FWTS_OK : FWTS_ERROR;
+}
+
+static int perf_stop_counter(int fd)
+{
+	int rc;
+
+	rc = ioctl(fd, PERF_EVENT_IOC_DISABLE);
+	return rc == 0 ? FWTS_OK : FWTS_ERROR;
+}
+
+static int perf_read_counter(int fd, unsigned long long *result)
+{
+	unsigned long long buf;
+	int rc;
+
+	rc = read(fd, &buf, sizeof(buf));
+	if (rc == sizeof(buf)) {
+		*result = buf;
+		rc = FWTS_OK;
+	} else {
+		rc = FWTS_ERROR;
+	}
+
+	close(fd);
+	return rc;
+}
+
 /*
- *  fwts_cpu_performance()
+ *  fwts_cpu_benchmark()
  *
  */
-int fwts_cpu_performance(
+int fwts_cpu_benchmark(
 	fwts_framework *fw,
 	const int cpu,		/* CPU we want to measure performance */
-	uint64_t *loop_count)	/* Returned measure of bogo compute power */
+	fwts_cpu_benchmark_result *result)
 {
+	unsigned long long perfctr_result;
+	fwts_cpu_benchmark_result tmp;
 	cpu_set_t mask, oldset;
+	int perfctr, ncpus, rc;
+	static bool warned;
 	time_t current;
-	int ncpus = fwts_cpu_enumerate();
+	bool perf_ok;
 
-	*loop_count = 0;
+	ncpus = fwts_cpu_enumerate();
+	memset(&tmp, 0, sizeof(tmp));
 
 	if (ncpus == FWTS_ERROR)
 		return FWTS_ERROR;
@@ -333,6 +390,20 @@ int fwts_cpu_performance(
 	if (cpu < 0 || cpu > ncpus)
 		return FWTS_ERROR;
 
+	/* setup perf counter */
+	perf_ok = true;
+	perfctr = perf_setup_counter(cpu);
+	if (perfctr < 0) {
+		if (!warned) {
+			fwts_log_warning(fw, "Can't use linux performance "
+					"counters (perf), falling back to "
+					"relative measurements");
+			warned = true;
+		}
+		perf_ok = false;
+	}
+
+
 	/* Pin to the specified CPU */
 
 	if (sched_getaffinity(0, sizeof(oldset), &oldset) < 0) {
@@ -352,6 +423,9 @@ int fwts_cpu_performance(
 	while (current == time(NULL))
 		sched_yield();
 
+	if (perf_ok)
+		perf_start_counter(perfctr);
+
 	current = time(NULL);
 
 	/*
@@ -360,17 +434,38 @@ int fwts_cpu_performance(
 	 */
 	do {
 		fwts_cpu_burn_cycles();
-		(*loop_count)++;
+		tmp.loops++;
 	} while (current == time(NULL));
 
+	if (perf_ok)
+		perf_stop_counter(perfctr);
+
 	if (sched_setaffinity(0, sizeof(oldset), &oldset) < 0) {
 		fwts_log_error(fw, "Cannot restore old CPU affinity settings.");
 		return FWTS_ERROR;
 	}
 
+	if (perf_ok) {
+		rc = perf_read_counter(perfctr, &perfctr_result);
+		if (rc == FWTS_OK) {
+			tmp.cycles = perfctr_result;
+			tmp.cycles_valid = true;
+		} else {
+			fwts_log_warning(fw, "failed to read perf counters");
+		}
+
+	}
+
+	*result = tmp;
+
 	return FWTS_OK;
 }
 
+uint64_t fwts_cpu_benchmark_best_result(fwts_cpu_benchmark_result *res)
+{
+	return res->cycles_valid ? res->cycles : res->loops;
+}
+
 /*
  *  fwts_cpu_consume_cycles()
  *	eat up CPU cycles



More information about the fwts-devel mailing list