[lucid PATCH] fix smp kvm guests on AMD

Serge E. Hallyn serge.hallyn at canonical.com
Tue Mar 1 16:25:00 UTC 2011


Hi,

attached is a patch which fixes bug 714335.  Here is the SRU
justification:

1. impact: KVM smp guests hang on AMD
2. how was the bug addressed: Two upstream commits (plus a third
auxiliary) were re-implemented on lucid's kernel.
3. patch: see below
4. TEST CASE: in lucid, on an AMD box, start a guest with '-smp 2'.
5. regression: Honestly this one scares me a bit because it was a
tricky cherrypick, and affects timekeeping for all kvm users on
x86 lucid.  In theory timekeeping on guests could be adversely
affected, or guests could fail to boot.  However, guests have
been tested on both AMD and intel hosts.

>From 2b08698ef4a73fd3fc025e3fa01dc0f524f591c6 Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serge.hallyn at canonical.com>
Date: Mon, 28 Feb 2011 20:48:41 +0000
Subject: [PATCH 1/1] kvm: cherrypick three kvm fixes

1d5f066e0b63271b67eac6d3752f8aa96adcbddb: KVM: x86: Fix a possible backwards warp of kvmclock
and
28e4639adf0c9f26f6bb56149b7ab547bf33bb95: KVM: x86: Fix kvmclock bug
and
347bb4448c2155eb2310923ccaa4be5677649003: x86: pvclock: Move scale_delta into common header

Hopefully this will fix qemu hangs on AMD systems which involve
-smp X (X >=2) and time.  (LP: #714335)

Signed-off-by: Serge E. Hallyn <serge.hallyn at canonical.com>
---
 arch/x86/include/asm/kvm_host.h |    3 ++
 arch/x86/include/asm/pvclock.h  |   38 ++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              |   48 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 600807b..08bc2ff 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -357,6 +357,9 @@ struct kvm_vcpu_arch {
 	struct page *time_page;
 
 	bool singlestep; /* guest is single stepped by KVM */
+	u64 last_guest_tsc;
+	u64 last_kernel_ns;
+
 	bool nmi_pending;
 	bool nmi_injected;
 
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd..44b9ae7 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,4 +11,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
 			    struct timespec *ts);
 
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ */
+static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+	u64 product;
+#ifdef __i386__
+	u32 tmp1, tmp2;
+#endif
+
+	if (shift < 0)
+		delta >>= -shift;
+	else
+		delta <<= shift;
+
+#ifdef __i386__
+	__asm__ (
+		"mul  %5       ; "
+		"mov  %4,%%eax ; "
+		"mov  %%edx,%4 ; "
+		"mul  %5       ; "
+		"xor  %5,%5    ; "
+		"add  %4,%%eax ; "
+		"adc  %5,%%edx ; "
+		: "=A" (product), "=r" (tmp1), "=r" (tmp2)
+		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif defined(__x86_64__)
+	__asm__ (
+		"mul %%rdx ; shrd $32,%%rdx,%%rax"
+		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+	return product;
+}
+
 #endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b2c02a2..a857d05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -47,6 +47,7 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/pvclock.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -633,6 +634,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
 	unsigned long this_tsc_khz;
+	s64 kernel_ns, max_kernel_ns;
+	u64 tsc_timestamp;
 
 	if ((!vcpu->time_page))
 		return;
@@ -646,15 +649,52 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	//kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
+	kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
 	ktime_get_ts(&ts);
 	monotonic_to_bootbased(&ts);
 	local_irq_restore(flags);
+	kernel_ns = timespec_to_ns(&ts);
+
+	/*
+	 * Time as measured by the TSC may go backwards when resetting the base
+	 * tsc_timestamp.  The reason for this is that the TSC resolution is
+	 * higher than the resolution of the other clock scales.  Thus, many
+	 * possible measurments of the TSC correspond to one measurement of any
+	 * other clock, and so a spread of values is possible.  This is not a
+	 * problem for the computation of the nanosecond clock; with TSC rates
+	 * around 1GHZ, there can only be a few cycles which correspond to one
+	 * nanosecond value, and any path through this code will inevitably
+	 * take longer than that.  However, with the kernel_ns value itself,
+	 * the precision may be much lower, down to HZ granularity.  If the
+	 * first sampling of TSC against kernel_ns ends in the low part of the
+	 * range, and the second in the high end of the range, we can get:
+	 *
+	 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
+	 *
+	 * As the sampling errors potentially range in the thousands of cycles,
+	 * it is possible such a time value has already been observed by the
+	 * guest.  To protect against this, we must compute the system time as
+	 * observed by the guest and ensure the new system time is greater.
+	 */
+	max_kernel_ns = 0;
+	if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
+		max_kernel_ns = vcpu->last_guest_tsc -
+			vcpu->hv_clock.tsc_timestamp;
+		max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
+				vcpu->hv_clock.tsc_to_system_mul,
+				vcpu->hv_clock.tsc_shift);
+		max_kernel_ns += vcpu->last_kernel_ns;
+	}
+
+	if (max_kernel_ns > kernel_ns)
+		kernel_ns = max_kernel_ns;
 
 	/* With all the info we got, fill in the values */
 
-	vcpu->hv_clock.system_time = ts.tv_nsec +
-				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
+	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
+	vcpu->last_kernel_ns = kernel_ns;
+	vcpu->last_guest_tsc = tsc_timestamp;
 
 	/*
 	 * The interface expects us to write an even number signaling that the
@@ -3695,6 +3735,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	kvm_load_guest_fpu(vcpu);
 
+	kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+
 	local_irq_disable();
 
 	clear_bit(KVM_REQ_KICK, &vcpu->requests);
-- 
1.7.0.4





More information about the kernel-team mailing list