From 70cd4c10b290dd77fff6dc702a9a2c8c679df121 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 27 Feb 2017 11:51:37 +1100
Subject: KVM: PPC: Book3S HV: Fix software walk of guest process page tables

This fixes some bugs in the code that walks the guest's page tables.
These bugs cause MMIO emulation to fail whenever the guest is in
virtial mode (MMU on), leading to the guest hanging if it tried to
access a virtio device.

The first bug was that when reading the guest's process table, we were
using the whole of arch->process_table, not just the field that contains
the process table base address.  The second bug was that the mask used
when reading the process table entry to get the radix tree base address,
RPDB_MASK, had the wrong value.

Fixes: 9e04ba69beec ("KVM: PPC: Book3S HV: Add basic infrastructure for radix guests")
Fixes: e99833448c5f ("powerpc/mm/radix: Add partition table format & callback")
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 3 ++-
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index d73e9dfa5237..440f3423e213 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -46,7 +46,7 @@ extern struct patb_entry *partition_tb;
 
 /* Bits in patb0 field */
 #define PATB_HR		(1UL << 63)
-#define RPDB_MASK	0x0ffffffffffff00fUL
+#define RPDB_MASK	0x0fffffffffffff00UL
 #define RPDB_SHIFT	(1UL << 8)
 #define RTS1_SHIFT	61		/* top 2 bits of radix tree size */
 #define RTS1_MASK	(3UL << RTS1_SHIFT)
@@ -57,6 +57,7 @@ extern struct patb_entry *partition_tb;
 /* Bits in patb1 field */
 #define PATB_GR		(1UL << 63)	/* guest uses radix; must match HR */
 #define PRTS_MASK	0x1f		/* process table size field */
+#define PRTB_MASK	0x0ffffffffffff000UL
 
 /*
  * Limit process table to PAGE_SIZE table. This
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 4344651f408c..f6b3e67c5762 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -32,6 +32,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	u32 pid;
 	int ret, level, ps;
 	__be64 prte, rpte;
+	unsigned long ptbl;
 	unsigned long root, pte, index;
 	unsigned long rts, bits, offset;
 	unsigned long gpa;
@@ -53,8 +54,8 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		return -EINVAL;
 
 	/* Read partition table to find root of tree for effective PID */
-	ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16,
-			     &prte, sizeof(prte));
+	ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
+	ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
 	if (ret)
 		return ret;
 
-- 
cgit v1.2.3


From 4e5acdc23a3dcbd6ad6dc93a9783dd9c838987c8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 28 Feb 2017 11:05:47 +1100
Subject: KVM: PPC: Book3S HV: Don't use ASDR for real-mode HPT faults on
 POWER9

In HPT mode on POWER9, the ASDR register is supposed to record
segment information for hypervisor page faults.  It turns out that
POWER9 DD1 does not record the page size information in the ASDR
for faults in guest real mode.  We have the necessary information
in memory already, so by moving the checks for real mode that already
existed, we can use the in-memory copy.  Since a load is likely to
be faster than reading an SPR, we do this unconditionally (not just
for POWER9 DD1).

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 47414a6fe2dd..7c6477d1840a 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1787,12 +1787,12 @@ kvmppc_hdsi:
 	/* HPTE not found fault or protection fault? */
 	andis.	r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h
 	beq	1f			/* if not, send it to the guest */
+	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
+	beq	3f
 BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
 	b	4f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	andi.	r0, r11, MSR_DR		/* data relocation enabled? */
-	beq	3f
 	clrrdi	r0, r4, 28
 	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
 	li	r0, BOOK3S_INTERRUPT_DATA_SEGMENT
@@ -1879,12 +1879,12 @@ kvmppc_hisi:
 	bne	.Lradix_hisi		/* for radix, just save ASDR */
 	andis.	r0, r11, SRR1_ISI_NOPT@h
 	beq	1f
+	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
+	beq	3f
 BEGIN_FTR_SECTION
 	mfspr	r5, SPRN_ASDR		/* on POWER9, use ASDR to get VSID */
 	b	4f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-	andi.	r0, r11, MSR_IR		/* instruction relocation enabled? */
-	beq	3f
 	clrrdi	r0, r10, 28
 	PPC_SLBFEE_DOT(R5, R0)		/* if so, look up SLB */
 	li	r0, BOOK3S_INTERRUPT_INST_SEGMENT
-- 
cgit v1.2.3


From bba82fd75694832b59433bf4e9d7ede8de1dfd42 Mon Sep 17 00:00:00 2001
From: Robert O'Callahan <robert@ocallahan.org>
Date: Wed, 1 Feb 2017 17:06:11 +1300
Subject: KVM: x86: never specify a sample period for virtualized in_tx_cp
 counters
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pmc_reprogram_counter() always sets a sample period based on the value of
pmc->counter. However, hsw_hw_config() rejects sample periods less than
2^31 - 1. So for example, if a KVM guest does

    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_RAW;
    attr.size = sizeof(attr);
    attr.config = 0x2005101c4; // conditional branches retired IN_TXCP
    attr.sample_period = 0;
    int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);

the guest kernel counts some conditional branch events, then updates the
virtual PMU register with a nonzero count. The host reaches
pmc_reprogram_counter() with nonzero pmc->counter, triggers EOPNOTSUPP
in hsw_hw_config(), prints "kvm_pmu: event creation failed" in
pmc_reprogram_counter(), and silently (from the guest's point of view) stops
counting events.

We fix event counting by forcing attr.sample_period to always be zero for
in_tx_cp counters. Sampling doesn't work, but it already didn't work and
can't be fixed without major changes to the approach in hsw_hw_config().

Signed-off-by: Robert O'Callahan <robert@ocallahan.org>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/pmu.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 06ce377dcbc9..026db42a86c3 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -113,12 +113,19 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.config = config,
 	};
 
+	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+
 	if (in_tx)
 		attr.config |= HSW_IN_TX;
-	if (in_tx_cp)
+	if (in_tx_cp) {
+		/*
+		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
+		 * period. Just clear the sample period so at least
+		 * allocating the counter doesn't fail.
+		 */
+		attr.sample_period = 0;
 		attr.config |= HSW_IN_TX_CHECKPOINTED;
-
-	attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc);
+	}
 
 	event = perf_event_create_kernel_counter(&attr, -1, current,
 						 intr ? kvm_perf_overflow_intr :
-- 
cgit v1.2.3


From e3736c3eb3a6f7c0966923b629c9f92b558aa9c7 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Mon, 20 Feb 2017 13:06:21 +0200
Subject: kvm: convert kvm.users_count from atomic_t to refcount_t
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/kvm_main.c      | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8d69d5150748..2c14ad9809da 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -26,6 +26,7 @@
 #include <linux/context_tracking.h>
 #include <linux/irqbypass.h>
 #include <linux/swait.h>
+#include <linux/refcount.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -401,7 +402,7 @@ struct kvm {
 #endif
 	struct kvm_vm_stat stat;
 	struct kvm_arch arch;
-	atomic_t users_count;
+	refcount_t users_count;
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 	spinlock_t ring_lock;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index cc4d6e0dd2a2..c6b7aff634be 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -617,7 +617,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 	mutex_init(&kvm->lock);
 	mutex_init(&kvm->irq_lock);
 	mutex_init(&kvm->slots_lock);
-	atomic_set(&kvm->users_count, 1);
+	refcount_set(&kvm->users_count, 1);
 	INIT_LIST_HEAD(&kvm->devices);
 
 	r = kvm_arch_init_vm(kvm, type);
@@ -747,13 +747,13 @@ static void kvm_destroy_vm(struct kvm *kvm)
 
 void kvm_get_kvm(struct kvm *kvm)
 {
-	atomic_inc(&kvm->users_count);
+	refcount_inc(&kvm->users_count);
 }
 EXPORT_SYMBOL_GPL(kvm_get_kvm);
 
 void kvm_put_kvm(struct kvm *kvm)
 {
-	if (atomic_dec_and_test(&kvm->users_count))
+	if (refcount_dec_and_test(&kvm->users_count))
 		kvm_destroy_vm(kvm);
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);
@@ -3639,7 +3639,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
 	 * To avoid the race between open and the removal of the debugfs
 	 * directory we test against the users count.
 	 */
-	if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0))
+	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
 		return -ENOENT;
 
 	if (simple_attr_open(inode, file, get, set, fmt)) {
-- 
cgit v1.2.3


From b7ceaec112aa35aa287325754d8c52b8304892cd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 22 Feb 2017 07:36:16 -0800
Subject: x86/asm: Tidy up TSS limit code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In an earlier version of the patch ("x86/kvm/vmx: Defer TR reload
after VM exit") that introduced TSS limit validity tracking, I
confused which helper was which.  On reflection, the names I chose
sucked.  Rename the helpers to make it more obvious what's going on
and add some comments.

While I'm at it, clear __tss_limit_invalid when force-reloading as
well as when contitionally reloading, since any TR reload fixes the
limit.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/include/asm/desc.h | 18 +++++++++++-------
 arch/x86/kernel/ioport.c    |  8 +++++++-
 arch/x86/kernel/process.c   |  6 +++---
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index cb8f9149f6c8..1548ca92ad3f 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -205,6 +205,8 @@ static inline void native_load_tr_desc(void)
 	asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
 }
 
+DECLARE_PER_CPU(bool, __tss_limit_invalid);
+
 static inline void force_reload_TR(void)
 {
 	struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
@@ -220,18 +222,20 @@ static inline void force_reload_TR(void)
 	write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
 
 	load_TR_desc();
+	this_cpu_write(__tss_limit_invalid, false);
 }
 
-DECLARE_PER_CPU(bool, need_tr_refresh);
-
-static inline void refresh_TR(void)
+/*
+ * Call this if you need the TSS limit to be correct, which should be the case
+ * if and only if you have TIF_IO_BITMAP set or you're switching to a task
+ * with TIF_IO_BITMAP set.
+ */
+static inline void refresh_tss_limit(void)
 {
 	DEBUG_LOCKS_WARN_ON(preemptible());
 
-	if (unlikely(this_cpu_read(need_tr_refresh))) {
+	if (unlikely(this_cpu_read(__tss_limit_invalid)))
 		force_reload_TR();
-		this_cpu_write(need_tr_refresh, false);
-	}
 }
 
 /*
@@ -250,7 +254,7 @@ static inline void invalidate_tss_limit(void)
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
 		force_reload_TR();
 	else
-		this_cpu_write(need_tr_refresh, true);
+		this_cpu_write(__tss_limit_invalid, true);
 }
 
 static inline void native_load_gdt(const struct desc_ptr *dtr)
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index b01bc8517450..875d3d25dd6a 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -47,8 +47,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 		t->io_bitmap_ptr = bitmap;
 		set_thread_flag(TIF_IO_BITMAP);
 
+		/*
+		 * Now that we have an IO bitmap, we need our TSS limit to be
+		 * correct.  It's fine if we are preempted after doing this:
+		 * with TIF_IO_BITMAP set, context switches will keep our TSS
+		 * limit correct.
+		 */
 		preempt_disable();
-		refresh_TR();
+		refresh_tss_limit();
 		preempt_enable();
 	}
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7780efa635b9..0b302591b51f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -65,8 +65,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 };
 EXPORT_PER_CPU_SYMBOL(cpu_tss);
 
-DEFINE_PER_CPU(bool, need_tr_refresh);
-EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
+DEFINE_PER_CPU(bool, __tss_limit_invalid);
+EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
 
 /*
  * this gets called so that we can store lazy state into memory and copy the
@@ -218,7 +218,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 * Make sure that the TSS limit is correct for the CPU
 		 * to notice the IO bitmap.
 		 */
-		refresh_TR();
+		refresh_tss_limit();
 	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
 		/*
 		 * Clear any possible leftover bits:
-- 
cgit v1.2.3


From 0eb1d0fa6a68324b51db4f7ae16d9b042c5b2de4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 22 Feb 2017 07:36:17 -0800
Subject: selftests/x86: Add a basic selftest for ioperm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This doesn't fully exercise the interaction between KVM and ioperm(),
but it does test basic functionality.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 tools/testing/selftests/x86/Makefile |   2 +-
 tools/testing/selftests/x86/ioperm.c | 170 +++++++++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/x86/ioperm.c

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 83d8b1c6cb0e..bed3e6086cb1 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,7 +5,7 @@ include ../lib.mk
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \
-			check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test \
+			check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \
 			protection_keys test_vdso
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c
new file mode 100644
index 000000000000..b77313ba2ab1
--- /dev/null
+++ b/tools/testing/selftests/x86/ioperm.c
@@ -0,0 +1,170 @@
+/*
+ * ioperm.c - Test case for ioperm(2)
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <sys/io.h>
+
+static int nerrs = 0;
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+
+}
+
+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+static bool try_outb(unsigned short port)
+{
+	sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+	if (sigsetjmp(jmpbuf, 1) != 0) {
+		return false;
+	} else {
+		asm volatile ("outb %%al, %w[port]"
+			      : : [port] "Nd" (port), "a" (0));
+		return true;
+	}
+	clearhandler(SIGSEGV);
+}
+
+static void expect_ok(unsigned short port)
+{
+	if (!try_outb(port)) {
+		printf("[FAIL]\toutb to 0x%02hx failed\n", port);
+		exit(1);
+	}
+
+	printf("[OK]\toutb to 0x%02hx worked\n", port);
+}
+
+static void expect_gp(unsigned short port)
+{
+	if (try_outb(port)) {
+		printf("[FAIL]\toutb to 0x%02hx worked\n", port);
+		exit(1);
+	}
+
+	printf("[OK]\toutb to 0x%02hx failed\n", port);
+}
+
+int main(void)
+{
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0");
+
+	expect_gp(0x80);
+	expect_gp(0xed);
+
+	/*
+	 * Probe for ioperm support.  Note that clearing ioperm bits
+	 * works even as nonroot.
+	 */
+	printf("[RUN]\tenable 0x80\n");
+	if (ioperm(0x80, 1, 1) != 0) {
+		printf("[OK]\tioperm(0x80, 1, 1) failed (%d) -- try running as root\n",
+		       errno);
+		return 0;
+	}
+	expect_ok(0x80);
+	expect_gp(0xed);
+
+	printf("[RUN]\tdisable 0x80\n");
+	if (ioperm(0x80, 1, 0) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+		return 1;
+	}
+	expect_gp(0x80);
+	expect_gp(0xed);
+
+	/* Make sure that fork() preserves ioperm. */
+	if (ioperm(0x80, 1, 1) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+		return 1;
+	}
+
+	pid_t child = fork();
+	if (child == -1)
+		err(1, "fork");
+
+	if (child == 0) {
+		printf("[RUN]\tchild: check that we inherited permissions\n");
+		expect_ok(0x80);
+		expect_gp(0xed);
+		return 0;
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+
+	/* Test the capability checks. */
+
+	printf("\tDrop privileges\n");
+	if (setresuid(1, 1, 1) != 0) {
+		printf("[WARN]\tDropping privileges failed\n");
+		return 0;
+	}
+
+	printf("[RUN]\tdisable 0x80\n");
+	if (ioperm(0x80, 1, 0) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno);
+		return 1;
+	}
+	printf("[OK]\tit worked\n");
+
+	printf("[RUN]\tenable 0x80 again\n");
+	if (ioperm(0x80, 1, 1) == 0) {
+		printf("[FAIL]\tit succeeded but should have failed.\n");
+		return 1;
+	}
+	printf("[OK]\tit failed\n");
+	return 0;
+}
-- 
cgit v1.2.3


From 0fce546f9f07b94ccc9de09cf48d35e18946d2fa Mon Sep 17 00:00:00 2001
From: Jérémy Lefaure <jeremy.lefaure@lse.epita.fr>
Date: Sat, 25 Feb 2017 17:46:53 -0500
Subject: x86/kvm/vmx: remove unused variable in segment_base()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pointer 'struct desc_struct *d' is unused since commit 8c2e41f7ae12
("x86/kvm/vmx: Simplify segment_base()") so let's remove it.

Signed-off-by: Jérémy Lefaure <jeremy.lefaure@lse.epita.fr>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ef4ba71dbb66..764f1f897847 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2053,7 +2053,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 static unsigned long segment_base(u16 selector)
 {
 	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-	struct desc_struct *d;
 	struct desc_struct *table;
 	unsigned long v;
 
-- 
cgit v1.2.3


From acc9ab60132739e0c5ab40644444cd7b22d12886 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Mon, 27 Feb 2017 04:24:39 -0800
Subject: KVM: nVMX: Fix pending events injection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

L2 fails to boot on a non-APICv box dues to 'commit 0ad3bed6c5ec
("kvm: nVMX: move nested events check to kvm_vcpu_running")'

KVM internal error. Suberror: 3
extra data[0]: 800000ef
extra data[1]: 1
RAX=0000000000000000 RBX=ffffffff81f36140 RCX=0000000000000000 RDX=0000000000000000
RSI=0000000000000000 RDI=0000000000000000 RBP=ffff88007c92fe90 RSP=ffff88007c92fe90
R8 =ffff88007fccdca0 R9 =0000000000000000 R10=00000000fffedb3d R11=0000000000000000
R12=0000000000000003 R13=0000000000000000 R14=0000000000000000 R15=ffff88007c92c000
RIP=ffffffff810645e6 RFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 0000000000000000 ffffffff 00c00000
CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
SS =0000 0000000000000000 ffffffff 00c00000
DS =0000 0000000000000000 ffffffff 00c00000
FS =0000 0000000000000000 ffffffff 00c00000
GS =0000 ffff88007fcc0000 ffffffff 00c00000
LDT=0000 0000000000000000 ffffffff 00c00000
TR =0040 ffff88007fcd4200 00002087 00008b00 DPL=0 TSS64-busy
GDT=     ffff88007fcc9000 0000007f
IDT=     ffffffffff578000 00000fff
CR0=80050033 CR2=00000000ffffffff CR3=0000000001e0a000 CR4=003406e0
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000
DR6=00000000fffe0ff0 DR7=0000000000000400
EFER=0000000000000d01

We should try to reinject previous events if any before trying to inject
new event if pending. If vmexit is triggered by L2 guest and L0 interested
in, we should reinject IDT-vectoring info to L2 through vmcs02 if any,
otherwise, we can consider new IRQs/NMIs which can be injected and call
nested events callback to switch from L2 to L1 if needed and inject the
proper vmexit events. However, 'commit 0ad3bed6c5ec ("kvm: nVMX: move
nested events check to kvm_vcpu_running")' results in the handle events
order reversely on non-APICv box. This patch fixes it by bailing out for
pending events and not consider new events in this scenario.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Fixes: 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
---
 arch/x86/kvm/vmx.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 764f1f897847..283aa8601833 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10641,6 +10641,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (vcpu->arch.exception.pending ||
+		vcpu->arch.nmi_injected ||
+		vcpu->arch.interrupt.pending)
+		return -EBUSY;
+
 	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
 	    vmx->nested.preemption_timer_expired) {
 		if (vmx->nested.nested_run_pending)
@@ -10650,8 +10655,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
 	}
 
 	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
-		if (vmx->nested.nested_run_pending ||
-		    vcpu->arch.interrupt.pending)
+		if (vmx->nested.nested_run_pending)
 			return -EBUSY;
 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
 				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
-- 
cgit v1.2.3