summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig19
-rw-r--r--arch/x86/Kconfig.debug10
-rw-r--r--arch/x86/Makefile6
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/acpi.c338
-rw-r--r--arch/x86/boot/compressed/cmdline.c4
-rw-r--r--arch/x86/boot/compressed/head_64.S21
-rw-r--r--arch/x86/boot/compressed/kaslr.c75
-rw-r--r--arch/x86/boot/compressed/misc.c3
-rw-r--r--arch/x86/boot/compressed/misc.h23
-rw-r--r--arch/x86/boot/compressed/pgtable.h2
-rw-r--r--arch/x86/boot/compressed/pgtable_64.c19
-rw-r--r--arch/x86/boot/compressed/vmlinux.lds.S2
-rw-r--r--arch/x86/boot/setup.ld2
-rw-r--r--arch/x86/boot/string.c141
-rw-r--r--arch/x86/boot/string.h1
-rw-r--r--arch/x86/configs/i386_defconfig3
-rw-r--r--arch/x86/configs/x86_64_defconfig4
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c38
-rw-r--r--arch/x86/crypto/aegis128l-aesni-glue.c38
-rw-r--r--arch/x86/crypto/aegis256-aesni-glue.c38
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c47
-rw-r--r--arch/x86/crypto/crct10dif-pcl-asm_64.S782
-rw-r--r--arch/x86/crypto/crct10dif-pclmul_glue.c12
-rw-r--r--arch/x86/crypto/morus1280_glue.c40
-rw-r--r--arch/x86/crypto/morus640_glue.c39
-rw-r--r--arch/x86/crypto/poly1305-sse2-x86_64.S4
-rw-r--r--arch/x86/entry/entry_64_compat.S6
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl89
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl10
-rw-r--r--arch/x86/events/amd/ibs.c13
-rw-r--r--arch/x86/events/amd/iommu.c6
-rw-r--r--arch/x86/events/amd/power.c10
-rw-r--r--arch/x86/events/amd/uncore.c7
-rw-r--r--arch/x86/events/core.c27
-rw-r--r--arch/x86/events/intel/bts.c4
-rw-r--r--arch/x86/events/intel/core.c312
-rw-r--r--arch/x86/events/intel/cstate.c12
-rw-r--r--arch/x86/events/intel/ds.c2
-rw-r--r--arch/x86/events/intel/lbr.c1
-rw-r--r--arch/x86/events/intel/pt.c14
-rw-r--r--arch/x86/events/intel/rapl.c9
-rw-r--r--arch/x86/events/intel/uncore.c10
-rw-r--r--arch/x86/events/intel/uncore.h12
-rw-r--r--arch/x86/events/intel/uncore_snb.c13
-rw-r--r--arch/x86/events/intel/uncore_snbep.c4
-rw-r--r--arch/x86/events/msr.c10
-rw-r--r--arch/x86/events/perf_event.h48
-rw-r--r--arch/x86/hyperv/hv_init.c8
-rw-r--r--arch/x86/ia32/ia32_aout.c157
-rw-r--r--arch/x86/include/asm/a.out-core.h67
-rw-r--r--arch/x86/include/asm/alternative.h39
-rw-r--r--arch/x86/include/asm/asm-prototypes.h1
-rw-r--r--arch/x86/include/asm/cpu_device_id.h28
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/fpu/internal.h57
-rw-r--r--arch/x86/include/asm/fpu/types.h7
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h2
-rw-r--r--arch/x86/include/asm/intel-family.h5
-rw-r--r--arch/x86/include/asm/kvm_host.h44
-rw-r--r--arch/x86/include/asm/kvm_vcpu_regs.h25
-rw-r--r--arch/x86/include/asm/mce.h7
-rw-r--r--arch/x86/include/asm/mmu_context.h18
-rw-r--r--arch/x86/include/asm/msr-index.h6
-rw-r--r--arch/x86/include/asm/msr.h16
-rw-r--r--arch/x86/include/asm/paravirt.h13
-rw-r--r--arch/x86/include/asm/paravirt_types.h5
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/refcount.h22
-rw-r--r--arch/x86/include/asm/resctrl_sched.h4
-rw-r--r--arch/x86/include/asm/uaccess.h36
-rw-r--r--arch/x86/include/asm/unistd.h8
-rw-r--r--arch/x86/include/asm/unwind.h6
-rw-r--r--arch/x86/include/asm/uv/bios.h13
-rw-r--r--arch/x86/include/asm/xen/hypercall.h13
-rw-r--r--arch/x86/include/uapi/asm/Kbuild1
-rw-r--r--arch/x86/include/uapi/asm/socket.h1
-rw-r--r--arch/x86/kernel/acpi/boot.c3
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S12
-rw-r--r--arch/x86/kernel/alternative.c7
-rw-r--r--arch/x86/kernel/apic/io_apic.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c7
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/amd.c8
-rw-r--r--arch/x86/kernel/cpu/bugs.c16
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c1
-rw-r--r--arch/x86/kernel/cpu/match.c31
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c62
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c10
-rw-r--r--arch/x86/kernel/cpu/mce/core.c31
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/Makefile4
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h16
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c7
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c185
-rw-r--r--arch/x86/kernel/crash.c1
-rw-r--r--arch/x86/kernel/e820.c19
-rw-r--r--arch/x86/kernel/early_printk.c4
-rw-r--r--arch/x86/kernel/fpu/xstate.c2
-rw-r--r--arch/x86/kernel/ftrace.c45
-rw-r--r--arch/x86/kernel/hpet.c4
-rw-r--r--arch/x86/kernel/hw_breakpoint.c5
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c23
-rw-r--r--arch/x86/kernel/kgdb.c1
-rw-r--r--arch/x86/kernel/kprobes/core.c7
-rw-r--r--arch/x86/kernel/kprobes/opt.c4
-rw-r--r--arch/x86/kernel/kvm.c7
-rw-r--r--arch/x86/kernel/kvmclock.c20
-rw-r--r--arch/x86/kernel/machine_kexec_64.c3
-rw-r--r--arch/x86/kernel/process.c12
-rw-r--r--arch/x86/kernel/setup_percpu.c12
-rw-r--r--arch/x86/kernel/smpboot.c10
-rw-r--r--arch/x86/kernel/traps.c6
-rw-r--r--arch/x86/kernel/tsc.c30
-rw-r--r--arch/x86/kernel/unwind_frame.c25
-rw-r--r--arch/x86/kernel/unwind_orc.c17
-rw-r--r--arch/x86/kernel/uprobes.c1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/cpuid.c6
-rw-r--r--arch/x86/kvm/hyperv.c9
-rw-r--r--arch/x86/kvm/i8254.c2
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/ioapic.c2
-rw-r--r--arch/x86/kvm/lapic.c9
-rw-r--r--arch/x86/kvm/mmu.c485
-rw-r--r--arch/x86/kvm/mmu.h1
-rw-r--r--arch/x86/kvm/mmutrace.h42
-rw-r--r--arch/x86/kvm/page_track.c2
-rw-r--r--arch/x86/kvm/svm.c154
-rw-r--r--arch/x86/kvm/trace.h2
-rw-r--r--arch/x86/kvm/vmx/evmcs.c7
-rw-r--r--arch/x86/kvm/vmx/nested.c153
-rw-r--r--arch/x86/kvm/vmx/vmcs.h1
-rw-r--r--arch/x86/kvm/vmx/vmenter.S167
-rw-r--r--arch/x86/kvm/vmx/vmx.c223
-rw-r--r--arch/x86/kvm/vmx/vmx.h28
-rw-r--r--arch/x86/kvm/x86.c49
-rw-r--r--arch/x86/kvm/x86.h7
-rw-r--r--arch/x86/lib/insn-eval.c2
-rw-r--r--arch/x86/lib/iomem.c33
-rw-r--r--arch/x86/lib/kaslr.c4
-rw-r--r--arch/x86/lib/usercopy_32.c8
-rw-r--r--arch/x86/mm/cpu_entry_area.c2
-rw-r--r--arch/x86/mm/dump_pagetables.c2
-rw-r--r--arch/x86/mm/extable.c59
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/ioremap.c4
-rw-r--r--arch/x86/mm/kasan_init_64.c14
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c4
-rw-r--r--arch/x86/mm/mpx.c2
-rw-r--r--arch/x86/mm/numa.c16
-rw-r--r--arch/x86/mm/pageattr.c54
-rw-r--r--arch/x86/mm/tlb.c3
-rw-r--r--arch/x86/net/bpf_jit_comp.c46
-rw-r--r--arch/x86/net/bpf_jit_comp32.c121
-rw-r--r--arch/x86/pci/fixup.c16
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/early_printk.c240
-rw-r--r--arch/x86/platform/efi/quirks.c6
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c1
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c3
-rw-r--r--arch/x86/platform/uv/bios_uv.c35
-rw-r--r--arch/x86/platform/uv/tlb_uv.c8
-rw-r--r--arch/x86/realmode/rm/Makefile5
-rw-r--r--arch/x86/realmode/rm/realmode.lds.S2
-rw-r--r--arch/x86/um/Kconfig2
-rw-r--r--arch/x86/xen/enlighten_pv.c5
-rw-r--r--arch/x86/xen/mmu.h4
-rw-r--r--arch/x86/xen/mmu_pv.c21
-rw-r--r--arch/x86/xen/p2m.c11
-rw-r--r--arch/x86/xen/setup.c13
-rw-r--r--arch/x86/xen/time.c12
182 files changed, 3095 insertions, 2672 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6185d4f332965..c1f9b3cf437c3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,8 +14,6 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION
select CLKSRC_I8253
select CLONE_BACKWARDS
- select HAVE_AOUT
- select HAVE_GENERIC_DMA_COHERENT
select MODULES_USE_ELF_REL
select OLD_SIGACTION
@@ -47,6 +45,7 @@ config X86
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ANON_INODES
+ select ARCH_32BIT_OFF_T if X86_32
select ARCH_CLOCKSOURCE_DATA
select ARCH_CLOCKSOURCE_INIT
select ARCH_DISCARD_MEMBLOCK
@@ -198,7 +197,7 @@ config X86
select IRQ_FORCED_THREADING
select NEED_SG_DMA_LENGTH
select PCI_DOMAINS if PCI
- select PCI_LOCKLESS_CONFIG
+ select PCI_LOCKLESS_CONFIG if PCI
select PERF_EVENTS
select RTC_LIB
select RTC_MC146818_LIB
@@ -446,12 +445,12 @@ config RETPOLINE
branches. Requires a compiler with -mindirect-branch=thunk-extern
support for full protection. The kernel may run slower.
-config RESCTRL
- bool "Resource Control support"
+config X86_CPU_RESCTRL
+ bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
help
- Enable Resource Control support.
+ Enable x86 CPU resource control support.
Provide support for the allocation and monitoring of system resources
usage by the CPU.
@@ -617,7 +616,7 @@ config X86_INTEL_QUARK
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
- depends on X86 && ACPI
+ depends on X86 && ACPI && PCI
select COMMON_CLK
select PINCTRL
select IOSF_MBI
@@ -1510,6 +1509,7 @@ config AMD_MEM_ENCRYPT
bool "AMD Secure Memory Encryption (SME) support"
depends on X86_64 && CPU_SUP_AMD
select DYNAMIC_PHYSICAL_MASK
+ select ARCH_USE_MEMREMAP_PROT
---help---
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1529,10 +1529,6 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
If set to N, then the encryption of system memory can be
activated with the mem_encrypt=on command line option.
-config ARCH_USE_MEMREMAP_PROT
- def_bool y
- depends on AMD_MEM_ENCRYPT
-
# Common NUMA Features
config NUMA
bool "Numa Memory Allocation and Scheduler Support"
@@ -2843,6 +2839,7 @@ config IA32_EMULATION
config IA32_AOUT
tristate "IA32 a.out support"
depends on IA32_EMULATION
+ depends on BROKEN
---help---
Support old a.out binaries in the 32bit emulation.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 0723dff17e6cb..15d0fbe278726 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -40,16 +40,6 @@ config EARLY_PRINTK_DBGP
with klogd/syslogd or the X server. You should normally say N here,
unless you want to debug such a crash. You need usb debug device.
-config EARLY_PRINTK_EFI
- bool "Early printk via the EFI framebuffer"
- depends on EFI && EARLY_PRINTK
- select FONT_SUPPORT
- ---help---
- Write kernel log output directly into the EFI framebuffer.
-
- This is useful for kernel debugging when your machine crashes very
- early before the console code is initialized.
-
config EARLY_PRINTK_USB_XDBC
bool "Early printk via the xHCI debug port"
depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9c5a67d1b9c1b..2d8b9d8ca4f87 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -187,7 +187,6 @@ cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,
cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
# does binutils support specific instructions?
-asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
@@ -217,6 +216,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
# Avoid indirect branches in kernel to deal with Spectre
ifdef CONFIG_RETPOLINE
KBUILD_CFLAGS += $(RETPOLINE_CFLAGS)
+ # Additionally, avoid generating expensive indirect jumps which
+ # are subject to retpolines for small number of switch cases.
+ # clang turns off jump table generation by default when under
+ # retpoline builds, however, gcc does not for x86.
+ KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20)
endif
archscripts: scripts_basic
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 9b5adae9cc40c..e2839b5c246c2 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
AFLAGS_header.o += -I$(objtree)/$(obj)
$(obj)/header.o: $(obj)/zoffset.h
-LDFLAGS_setup.elf := -T
+LDFLAGS_setup.elf := -m elf_i386 -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f0515ac895a43..6b84afdd75382 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
vmlinux-objs-y += $(obj)/pgtable_64.o
endif
+vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
+
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
new file mode 100644
index 0000000000000..0ef4ad55b29b2
--- /dev/null
+++ b/arch/x86/boot/compressed/acpi.c
@@ -0,0 +1,338 @@
+// SPDX-License-Identifier: GPL-2.0
+#define BOOT_CTYPE_H
+#include "misc.h"
+#include "error.h"
+#include "../string.h"
+
+#include <linux/numa.h>
+#include <linux/efi.h>
+#include <asm/efi.h>
+
+/*
+ * Longest parameter of 'acpi=' is 'copy_dsdt', plus an extra '\0'
+ * for termination.
+ */
+#define MAX_ACPI_ARG_LENGTH 10
+
+/*
+ * Immovable memory regions representation. Max amount of memory regions is
+ * MAX_NUMNODES*2.
+ */
+struct mem_vector immovable_mem[MAX_NUMNODES*2];
+
+/*
+ * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
+ * digits, and '\0' for termination.
+ */
+#define MAX_ADDR_LEN 19
+
+static acpi_physical_address get_acpi_rsdp(void)
+{
+ acpi_physical_address addr = 0;
+
+#ifdef CONFIG_KEXEC
+ char val[MAX_ADDR_LEN] = { };
+ int ret;
+
+ ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
+ if (ret < 0)
+ return 0;
+
+ if (kstrtoull(val, 16, &addr))
+ return 0;
+#endif
+ return addr;
+}
+
+/* Find the RSDP in the EFI config tables, preferring the ACPI 2.0 entry; 0 if none. */
+static acpi_physical_address efi_get_rsdp_addr(void)
+{
+ acpi_physical_address rsdp_addr = 0;
+
+#ifdef CONFIG_EFI
+ unsigned long systab, config_tables;
+ unsigned int nr_tables;
+ struct efi_info *ei;
+ bool efi_64;
+ int size, i;
+ char *sig;
+
+ ei = &boot_params->efi_info;
+ sig = (char *)&ei->efi_loader_signature;
+
+ if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4)) {
+ efi_64 = true;
+ } else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4)) {
+ efi_64 = false;
+ } else {
+ debug_putstr("Wrong EFI loader signature.\n");
+ return 0;
+ }
+
+ /* Get systab from boot params. */
+#ifdef CONFIG_X86_64
+ systab = ei->efi_systab | ((__u64)ei->efi_systab_hi << 32);
+#else
+ if (ei->efi_systab_hi || ei->efi_memmap_hi) {
+ debug_putstr("Error getting RSDP address: EFI system table located above 4GB.\n");
+ return 0;
+ }
+ systab = ei->efi_systab;
+#endif
+ if (!systab)
+ error("EFI system table not found.");
+
+ /* Pick the table layout and entry stride matching the firmware bitness. */
+ if (efi_64) {
+ efi_system_table_64_t *stbl = (efi_system_table_64_t *)systab;
+
+ config_tables = stbl->tables;
+ nr_tables = stbl->nr_tables;
+ size = sizeof(efi_config_table_64_t);
+ } else {
+ efi_system_table_32_t *stbl = (efi_system_table_32_t *)systab;
+
+ config_tables = stbl->tables;
+ nr_tables = stbl->nr_tables;
+ size = sizeof(efi_config_table_32_t);
+ }
+
+ if (!config_tables)
+ error("EFI config tables not found.");
+
+ /* Walk entries 0..nr_tables-1; the cursor advances at the end of each pass. */
+ for (i = 0; i < nr_tables; i++) {
+ acpi_physical_address table;
+ efi_guid_t guid;
+
+ if (efi_64) {
+ efi_config_table_64_t *tbl = (efi_config_table_64_t *)config_tables;
+
+ guid = tbl->guid;
+ table = tbl->table;
+
+ if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
+ debug_putstr("Error getting RSDP address: EFI config table located above 4GB.\n");
+ return 0;
+ }
+ } else {
+ efi_config_table_32_t *tbl = (efi_config_table_32_t *)config_tables;
+
+ guid = tbl->guid;
+ table = tbl->table;
+ }
+
+ if (!(efi_guidcmp(guid, ACPI_TABLE_GUID)))
+ rsdp_addr = table;
+ else if (!(efi_guidcmp(guid, ACPI_20_TABLE_GUID)))
+ return table;
+
+ config_tables += size;
+ }
+#endif
+ return rsdp_addr;
+}
+
+static u8 compute_checksum(u8 *buffer, u32 length)
+{
+ u8 *end = buffer + length;
+ u8 sum = 0;
+
+ while (buffer < end)
+ sum += *(buffer++);
+
+ return sum;
+}
+
+/* Search a block of memory for the RSDP signature. */
+static u8 *scan_mem_for_rsdp(u8 *start, u32 length)
+{
+ struct acpi_table_rsdp *rsdp;
+ u8 *address, *end;
+
+ end = start + length;
+
+ /* Search from given start address for the requested length */
+ for (address = start; address < end; address += ACPI_RSDP_SCAN_STEP) {
+ /*
+ * Both RSDP signature and checksum must be correct.
+ * Note: Sometimes there exists more than one RSDP in memory;
+ * the valid RSDP has a valid checksum, all others have an
+ * invalid checksum.
+ */
+ rsdp = (struct acpi_table_rsdp *)address;
+
+ /* BAD Signature */
+ if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature))
+ continue;
+
+ /* Check the standard checksum */
+ if (compute_checksum((u8 *)rsdp, ACPI_RSDP_CHECKSUM_LENGTH))
+ continue;
+
+ /* Check extended checksum if table version >= 2 */
+ if ((rsdp->revision >= 2) &&
+ (compute_checksum((u8 *)rsdp, ACPI_RSDP_XCHECKSUM_LENGTH)))
+ continue;
+
+ /* Signature and checksum valid, we have found a real RSDP */
+ return address;
+ }
+ return NULL;
+}
+
+/* Search RSDP address in EBDA. */
+static acpi_physical_address bios_get_rsdp_addr(void)
+{
+ unsigned long address;
+ u8 *rsdp;
+
+ /* Get the location of the Extended BIOS Data Area (EBDA) */
+ address = *(u16 *)ACPI_EBDA_PTR_LOCATION;
+ address <<= 4;
+
+ /*
+ * Search EBDA paragraphs (EBDA is required to be a minimum of
+ * 1K length)
+ */
+ if (address > 0x400) {
+ rsdp = scan_mem_for_rsdp((u8 *)address, ACPI_EBDA_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+ }
+
+ /* Search upper memory: 16-byte boundaries in E0000h-FFFFFh */
+ rsdp = scan_mem_for_rsdp((u8 *) ACPI_HI_RSDP_WINDOW_BASE,
+ ACPI_HI_RSDP_WINDOW_SIZE);
+ if (rsdp)
+ return (acpi_physical_address)(unsigned long)rsdp;
+
+ return 0;
+}
+
+/* Return RSDP address on success, otherwise 0. */
+acpi_physical_address get_rsdp_addr(void)
+{
+ acpi_physical_address pa;
+
+ pa = get_acpi_rsdp();
+
+ if (!pa)
+ pa = boot_params->acpi_rsdp_addr;
+
+ if (!pa)
+ pa = efi_get_rsdp_addr();
+
+ if (!pa)
+ pa = bios_get_rsdp_addr();
+
+ return pa;
+}
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
+/* Compute SRAT address from RSDP. */
+static unsigned long get_acpi_srat_table(void)
+{
+ unsigned long root_table, acpi_table;
+ struct acpi_table_header *header;
+ struct acpi_table_rsdp *rsdp;
+ u32 num_entries, size, len;
+ char arg[10];
+ u8 *entry;
+
+ rsdp = (struct acpi_table_rsdp *)(long)boot_params->acpi_rsdp_addr;
+ if (!rsdp)
+ return 0;
+
+ /* Get ACPI root table from RSDP.*/
+ if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
+ !strncmp(arg, "rsdt", 4)) &&
+ rsdp->xsdt_physical_address &&
+ rsdp->revision > 1) {
+ root_table = rsdp->xsdt_physical_address;
+ size = ACPI_XSDT_ENTRY_SIZE;
+ } else {
+ root_table = rsdp->rsdt_physical_address;
+ size = ACPI_RSDT_ENTRY_SIZE;
+ }
+
+ if (!root_table)
+ return 0;
+
+ header = (struct acpi_table_header *)root_table;
+ len = header->length;
+ if (len < sizeof(struct acpi_table_header) + size)
+ return 0;
+
+ num_entries = (len - sizeof(struct acpi_table_header)) / size;
+ entry = (u8 *)(root_table + sizeof(struct acpi_table_header));
+
+ while (num_entries--) {
+ if (size == ACPI_RSDT_ENTRY_SIZE)
+ acpi_table = *(u32 *)entry;
+ else
+ acpi_table = *(u64 *)entry;
+
+ if (acpi_table) {
+ header = (struct acpi_table_header *)acpi_table;
+
+ if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT))
+ return acpi_table;
+ }
+ entry += size;
+ }
+ return 0;
+}
+
+/**
+ * count_immovable_mem_regions - Parse SRAT and cache the immovable
+ * memory regions into the immovable_mem array.
+ *
+ * Return the number of immovable memory regions on success, 0 on failure:
+ *
+ * - Too many immovable memory regions
+ * - ACPI off or no SRAT found
+ * - No immovable memory region found.
+ */
+int count_immovable_mem_regions(void)
+{
+ unsigned long table_addr, table_end, table;
+ struct acpi_subtable_header *sub_table;
+ struct acpi_table_header *table_header;
+ char arg[MAX_ACPI_ARG_LENGTH];
+ int num = 0;
+
+ if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
+ !strncmp(arg, "off", 3))
+ return 0;
+
+ table_addr = get_acpi_srat_table();
+ if (!table_addr)
+ return 0;
+
+ table_header = (struct acpi_table_header *)table_addr;
+ table_end = table_addr + table_header->length;
+ table = table_addr + sizeof(struct acpi_table_srat);
+
+ while (table + sizeof(struct acpi_subtable_header) < table_end) {
+ sub_table = (struct acpi_subtable_header *)table;
+ if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
+ struct acpi_srat_mem_affinity *ma;
+
+ ma = (struct acpi_srat_mem_affinity *)sub_table;
+ if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
+ immovable_mem[num].start = ma->base_address;
+ immovable_mem[num].size = ma->length;
+ num++;
+ }
+
+ if (num >= MAX_NUMNODES*2) {
+ debug_putstr("Too many immovable memory regions, aborting.\n");
+ return 0;
+ }
+ }
+ table += sub_table->length;
+ }
+ return num;
+}
+#endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
diff --git a/arch/x86/boot/compressed/cmdline.c b/arch/x86/boot/compressed/cmdline.c
index af6cda0b7900f..f1add5d85da9d 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "misc.h"
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE || CONFIG_X86_5LEVEL
-
static unsigned long fs;
static inline void set_fs(unsigned long seg)
{
@@ -30,5 +28,3 @@ int cmdline_find_option_bool(const char *option)
{
return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
}
-
-#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 64037895b0859..fafb75c6c5925 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -358,8 +358,11 @@ ENTRY(startup_64)
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
*
- * Address of the trampoline is returned in RAX.
- * Non zero RDX on return means we need to enable 5-level paging.
+ * paging_prepare() returns a two-quadword structure which lands
+ * into RDX:RAX:
+ * - Address of the trampoline is returned in RAX.
+ * - Non zero RDX means trampoline needs to enable 5-level
+ * paging.
*
* RSI holds real mode data and needs to be preserved across
* this function call.
@@ -565,7 +568,7 @@ adjust_got:
*
* RDI contains the return address (might be above 4G).
* ECX contains the base address of the trampoline memory.
- * Non zero RDX on return means we need to enable 5-level paging.
+ * Non zero RDX means trampoline needs to enable 5-level paging.
*/
ENTRY(trampoline_32bit_src)
/* Set up data and stack segments */
@@ -600,6 +603,16 @@ ENTRY(trampoline_32bit_src)
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
movl %eax, %cr3
3:
+ /* Set EFER.LME=1 as a precaution in case hypervisor pulls the rug */
+ pushl %ecx
+ pushl %edx
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+ popl %edx
+ popl %ecx
+
/* Enable PAE and LA57 (if required) paging modes */
movl $X86_CR4_PAE, %eax
cmpl $0, %edx
@@ -645,8 +658,6 @@ no_longmode:
.data
gdt64:
.word gdt_end - gdt
- .long 0
- .word 0
.quad 0
gdt:
.word gdt_end - gdt
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 9ed9709d9947a..2e53c056ba20c 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -87,10 +87,6 @@ static unsigned long get_boot_seed(void)
#define KASLR_COMPRESSED_BOOT
#include "../../lib/kaslr.c"
-struct mem_vector {
- unsigned long long start;
- unsigned long long size;
-};
/* Only supporting at most 4 unusable memmap regions with kaslr */
#define MAX_MEMMAP_REGIONS 4
@@ -101,6 +97,8 @@ static bool memmap_too_large;
/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
static unsigned long long mem_limit = ULLONG_MAX;
+/* Number of immovable memory regions */
+static int num_immovable_mem;
enum mem_avoid_index {
MEM_AVOID_ZO_RANGE = 0,
@@ -417,6 +415,9 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* Mark the memmap regions we need to avoid */
handle_mem_options();
+ /* Enumerate the immovable memory regions */
+ num_immovable_mem = count_immovable_mem_regions();
+
#ifdef CONFIG_X86_VERBOSE_BOOTUP
/* Make sure video RAM can be used. */
add_identity_map(0, PMD_SIZE);
@@ -572,9 +573,9 @@ static unsigned long slots_fetch_random(void)
return 0;
}
-static void process_mem_region(struct mem_vector *entry,
- unsigned long minimum,
- unsigned long image_size)
+static void __process_mem_region(struct mem_vector *entry,
+ unsigned long minimum,
+ unsigned long image_size)
{
struct mem_vector region, overlap;
unsigned long start_orig, end;
@@ -650,6 +651,56 @@ static void process_mem_region(struct mem_vector *entry,
}
}
+static bool process_mem_region(struct mem_vector *region,
+ unsigned long long minimum,
+ unsigned long long image_size)
+{
+ int i;
+ /*
+ * If no immovable memory found, or MEMORY_HOTREMOVE disabled,
+ * use @region directly.
+ */
+ if (!num_immovable_mem) {
+ __process_mem_region(region, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
+ return 1;
+ }
+ return 0;
+ }
+
+#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+ /*
+ * If immovable memory found, filter the intersection between
+ * immovable memory and @region.
+ */
+ for (i = 0; i < num_immovable_mem; i++) {
+ unsigned long long start, end, entry_end, region_end;
+ struct mem_vector entry;
+
+ if (!mem_overlaps(region, &immovable_mem[i]))
+ continue;
+
+ start = immovable_mem[i].start;
+ end = start + immovable_mem[i].size;
+ region_end = region->start + region->size;
+
+ entry.start = clamp(region->start, start, end);
+ entry_end = clamp(region_end, start, end);
+ entry.size = entry_end - entry.start;
+
+ __process_mem_region(&entry, minimum, image_size);
+
+ if (slot_area_index == MAX_SLOT_AREA) {
+ debug_putstr("Aborted e820/efi memmap scan when walking immovable regions(slot_areas full)!\n");
+ return 1;
+ }
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_EFI
/*
* Returns true if mirror region found (and must have been processed
@@ -715,11 +766,8 @@ process_efi_entries(unsigned long minimum, unsigned long image_size)
region.start = md->phys_addr;
region.size = md->num_pages << EFI_PAGE_SHIFT;
- process_mem_region(&region, minimum, image_size);
- if (slot_area_index == MAX_SLOT_AREA) {
- debug_putstr("Aborted EFI scan (slot_areas full)!\n");
+ if (process_mem_region(&region, minimum, image_size))
break;
- }
}
return true;
}
@@ -746,11 +794,8 @@ static void process_e820_entries(unsigned long minimum,
continue;
region.start = entry->addr;
region.size = entry->size;
- process_mem_region(&region, minimum, image_size);
- if (slot_area_index == MAX_SLOT_AREA) {
- debug_putstr("Aborted e820 scan (slot_areas full)!\n");
+ if (process_mem_region(&region, minimum, image_size))
break;
- }
}
}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8dd1d5ccae580..c0d6c560df69e 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -351,6 +351,9 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
/* Clear flags intended for solely in-kernel use. */
boot_params->hdr.loadflags &= ~KASLR_FLAG;
+ /* Save RSDP address for later use. */
+ boot_params->acpi_rsdp_addr = get_rsdp_addr();
+
sanitize_boot_params(boot_params);
if (boot_params->screen_info.orig_video_mode == 7) {
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index a1d5918765f36..fd13655e0f9b0 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -25,6 +25,9 @@
#include <asm/bootparam.h>
#include <asm/bootparam_utils.h>
+#define BOOT_CTYPE_H
+#include <linux/acpi.h>
+
#define BOOT_BOOT_H
#include "../ctype.h"
@@ -63,12 +66,14 @@ static inline void debug_puthex(const char *s)
#endif
-#if CONFIG_EARLY_PRINTK || CONFIG_RANDOMIZE_BASE
/* cmdline.c */
int cmdline_find_option(const char *option, char *buffer, int bufsize);
int cmdline_find_option_bool(const char *option);
-#endif
+struct mem_vector {
+ unsigned long long start;
+ unsigned long long size;
+};
#if CONFIG_RANDOMIZE_BASE
/* kaslr.c */
@@ -116,3 +121,17 @@ static inline void console_init(void)
void set_sev_encryption_mask(void);
#endif
+
+/* acpi.c */
+#ifdef CONFIG_ACPI
+acpi_physical_address get_rsdp_addr(void);
+#else
+static inline acpi_physical_address get_rsdp_addr(void) { return 0; }
+#endif
+
+#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
+extern struct mem_vector immovable_mem[MAX_NUMNODES*2];
+int count_immovable_mem_regions(void);
+#else
+static inline int count_immovable_mem_regions(void) { return 0; }
+#endif
diff --git a/arch/x86/boot/compressed/pgtable.h b/arch/x86/boot/compressed/pgtable.h
index 91f75638f6e68..6ff7e81b56284 100644
--- a/arch/x86/boot/compressed/pgtable.h
+++ b/arch/x86/boot/compressed/pgtable.h
@@ -6,7 +6,7 @@
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE 0x60
+#define TRAMPOLINE_32BIT_CODE_SIZE 0x70
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c
index 9e21573714910..f8debf7aeb4c1 100644
--- a/arch/x86/boot/compressed/pgtable_64.c
+++ b/arch/x86/boot/compressed/pgtable_64.c
@@ -1,5 +1,7 @@
+#include <linux/efi.h>
#include <asm/e820/types.h>
#include <asm/processor.h>
+#include <asm/efi.h>
#include "pgtable.h"
#include "../string.h"
@@ -37,9 +39,10 @@ int cmdline_find_option_bool(const char *option);
static unsigned long find_trampoline_placement(void)
{
- unsigned long bios_start, ebda_start;
+ unsigned long bios_start = 0, ebda_start = 0;
unsigned long trampoline_start;
struct boot_e820_entry *entry;
+ char *signature;
int i;
/*
@@ -47,8 +50,18 @@ static unsigned long find_trampoline_placement(void)
* This code is based on reserve_bios_regions().
*/
- ebda_start = *(unsigned short *)0x40e << 4;
- bios_start = *(unsigned short *)0x413 << 10;
+ /*
+ * EFI systems may not provide legacy ROM. The memory may not be mapped
+ * at all.
+ *
+ * Only look for values in the legacy ROM for non-EFI system.
+ */
+ signature = (char *)&boot_params->efi_info.efi_loader_signature;
+ if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
+ strncmp(signature, EFI64_LOADER_SIGNATURE, 4)) {
+ ebda_start = *(unsigned short *)0x40e << 4;
+ bios_start = *(unsigned short *)0x413 << 10;
+ }
if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX)
bios_start = BIOS_START_MAX;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f491bbde84931..508cfa6828c5d 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm-generic/vmlinux.lds.h>
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#undef i386
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 96a6c75635383..0149e41d42c27 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -3,7 +3,7 @@
*
* Linker script for the i386 setup code
*/
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_FORMAT("elf32-i386")
OUTPUT_ARCH(i386)
ENTRY(_start)
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index c4428a1769733..315a67b8896b9 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -13,10 +13,14 @@
*/
#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
#include <asm/asm.h>
#include "ctype.h"
#include "string.h"
+#define KSTRTOX_OVERFLOW (1U << 31)
+
/*
* Undef these macros so that the functions that we provide
* here will have the correct names regardless of how string.h
@@ -187,3 +191,140 @@ char *strchr(const char *s, int c)
return NULL;
return (char *)s;
}
+
+static inline u64 __div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
+{
+ union {
+ u64 v64;
+ u32 v32[2];
+ } d = { dividend };
+ u32 upper;
+
+ upper = d.v32[1];
+ d.v32[1] = 0;
+ if (upper >= divisor) {
+ d.v32[1] = upper / divisor;
+ upper %= divisor;
+ }
+ asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
+ "rm" (divisor), "0" (d.v32[0]), "1" (upper));
+ return d.v64;
+}
+
+static inline u64 __div_u64(u64 dividend, u32 divisor)
+{
+ u32 remainder;
+
+ return __div_u64_rem(dividend, divisor, &remainder);
+}
+
+static inline char _tolower(const char c)
+{
+ return c | 0x20;
+}
+
+static const char *_parse_integer_fixup_radix(const char *s, unsigned int *base)
+{
+ if (*base == 0) {
+ if (s[0] == '0') {
+ if (_tolower(s[1]) == 'x' && isxdigit(s[2]))
+ *base = 16;
+ else
+ *base = 8;
+ } else
+ *base = 10;
+ }
+ if (*base == 16 && s[0] == '0' && _tolower(s[1]) == 'x')
+ s += 2;
+ return s;
+}
+
+/*
+ * Convert non-negative integer string representation in explicitly given radix
+ * to an integer.
+ * Return the number of characters consumed, possibly or-ed with the overflow bit.
+ * If overflow occurs, result integer (incorrect) is still returned.
+ *
+ * Don't you dare use this function.
+ */
+static unsigned int _parse_integer(const char *s,
+ unsigned int base,
+ unsigned long long *p)
+{
+ unsigned long long res;
+ unsigned int rv;
+
+ res = 0;
+ rv = 0;
+ while (1) {
+ unsigned int c = *s;
+ unsigned int lc = c | 0x20; /* don't tolower() this line */
+ unsigned int val;
+
+ if ('0' <= c && c <= '9')
+ val = c - '0';
+ else if ('a' <= lc && lc <= 'f')
+ val = lc - 'a' + 10;
+ else
+ break;
+
+ if (val >= base)
+ break;
+ /*
+ * Check for overflow only if we are within range of
+ * it in the max base we support (16)
+ */
+ if (unlikely(res & (~0ull << 60))) {
+ if (res > __div_u64(ULLONG_MAX - val, base))
+ rv |= KSTRTOX_OVERFLOW;
+ }
+ res = res * base + val;
+ rv++;
+ s++;
+ }
+ *p = res;
+ return rv;
+}
+
+static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ unsigned long long _res;
+ unsigned int rv;
+
+ s = _parse_integer_fixup_radix(s, &base);
+ rv = _parse_integer(s, base, &_res);
+ if (rv & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+ if (rv == 0)
+ return -EINVAL;
+ s += rv;
+ if (*s == '\n')
+ s++;
+ if (*s)
+ return -EINVAL;
+ *res = _res;
+ return 0;
+}
+
+/**
+ * kstrtoull - convert a string to an unsigned long long
+ * @s: The start of the string. The string must be null-terminated, and may also
+ * include a single newline before its terminating null. The first character
+ * may also be a plus sign, but not a minus sign.
+ * @base: The number base to use. The maximum supported base is 16. If base is
+ * given as 0, then the base of the string is automatically detected with the
+ * conventional semantics - If it begins with 0x the number will be parsed as a
+ * hexadecimal (case insensitive), if it otherwise begins with 0, it will be
+ * parsed as an octal number. Otherwise it will be parsed as a decimal.
+ * @res: Where to write the result of the conversion on success.
+ *
+ * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error.
+ * Used as a replacement for the obsolete simple_strtoull. Return code must
+ * be checked.
+ */
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res)
+{
+ if (s[0] == '+')
+ s++;
+ return _kstrtoull(s, base, res);
+}
diff --git a/arch/x86/boot/string.h b/arch/x86/boot/string.h
index 3d78e27077f41..38d8f2f5e47e2 100644
--- a/arch/x86/boot/string.h
+++ b/arch/x86/boot/string.h
@@ -29,4 +29,5 @@ extern unsigned int atou(const char *s);
extern unsigned long long simple_strtoull(const char *cp, char **endp,
unsigned int base);
+int kstrtoull(const char *s, unsigned int base, unsigned long long *res);
#endif /* BOOT_STRING_H */
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 4bb95d7ad9475..9f908112bbb97 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -287,7 +287,6 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_FRAME_WARN=1024
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
@@ -310,3 +309,5 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DISABLE=y
CONFIG_CRYPTO_AES_586=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_EFI_STUB=y
+CONFIG_ACPI_BGRT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 0fed049422a8f..1d3badfda09ee 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -286,7 +286,6 @@ CONFIG_NLS_ASCII=y
CONFIG_NLS_ISO8859_1=y
CONFIG_NLS_UTF8=y
CONFIG_PRINTK_TIME=y
-# CONFIG_ENABLE_WARN_DEPRECATED is not set
CONFIG_MAGIC_SYSRQ=y
# CONFIG_UNUSED_SYMBOLS is not set
CONFIG_DEBUG_KERNEL=y
@@ -308,3 +307,6 @@ CONFIG_SECURITY_SELINUX=y
CONFIG_SECURITY_SELINUX_BOOTPARAM=y
CONFIG_SECURITY_SELINUX_DISABLE=y
# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_EFI_STUB=y
+CONFIG_EFI_MIXED=y
+CONFIG_ACPI_BGRT=y
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 2a356b948720e..3ea71b8718135 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis128_aesni_process_ad(
}
static void crypto_aegis128_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
+ struct aegis_state *state, struct skcipher_walk *walk,
const struct aegis_crypt_ops *ops)
{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
-
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS128_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS128_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
+ while (walk->nbytes >= AEGIS128_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS128_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE);
+ }
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
}
}
@@ -186,13 +175,16 @@ static void crypto_aegis128_aesni_crypt(struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm);
+ struct skcipher_walk walk;
struct aegis_state state;
+ ops->skcipher_walk_init(&walk, req, true);
+
kernel_fpu_begin();
crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv);
crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128_aesni_process_crypt(&state, req, ops);
+ crypto_aegis128_aesni_process_crypt(&state, &walk, ops);
crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c
index dbe8bb980da15..1b1b39c66c5e2 100644
--- a/arch/x86/crypto/aegis128l-aesni-glue.c
+++ b/arch/x86/crypto/aegis128l-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis128l_aesni_process_ad(
}
static void crypto_aegis128l_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
+ struct aegis_state *state, struct skcipher_walk *walk,
const struct aegis_crypt_ops *ops)
{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
-
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS128L_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS128L_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
+ while (walk->nbytes >= AEGIS128L_BLOCK_SIZE) {
+ ops->crypt_blocks(state, round_down(walk->nbytes,
+ AEGIS128L_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS128L_BLOCK_SIZE);
+ }
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
}
}
@@ -186,13 +175,16 @@ static void crypto_aegis128l_aesni_crypt(struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis128l_aesni_ctx(tfm);
+ struct skcipher_walk walk;
struct aegis_state state;
+ ops->skcipher_walk_init(&walk, req, true);
+
kernel_fpu_begin();
crypto_aegis128l_aesni_init(&state, ctx->key.bytes, req->iv);
crypto_aegis128l_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis128l_aesni_process_crypt(&state, req, ops);
+ crypto_aegis128l_aesni_process_crypt(&state, &walk, ops);
crypto_aegis128l_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c
index 8bebda2de92fe..6227ca3220a05 100644
--- a/arch/x86/crypto/aegis256-aesni-glue.c
+++ b/arch/x86/crypto/aegis256-aesni-glue.c
@@ -119,31 +119,20 @@ static void crypto_aegis256_aesni_process_ad(
}
static void crypto_aegis256_aesni_process_crypt(
- struct aegis_state *state, struct aead_request *req,
+ struct aegis_state *state, struct skcipher_walk *walk,
const struct aegis_crypt_ops *ops)
{
- struct skcipher_walk walk;
- u8 *src, *dst;
- unsigned int chunksize, base;
-
- ops->skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- src = walk.src.virt.addr;
- dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops->crypt_blocks(state, chunksize, src, dst);
-
- base = chunksize & ~(AEGIS256_BLOCK_SIZE - 1);
- src += base;
- dst += base;
- chunksize &= AEGIS256_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops->crypt_tail(state, chunksize, src, dst);
+ while (walk->nbytes >= AEGIS256_BLOCK_SIZE) {
+ ops->crypt_blocks(state,
+ round_down(walk->nbytes, AEGIS256_BLOCK_SIZE),
+ walk->src.virt.addr, walk->dst.virt.addr);
+ skcipher_walk_done(walk, walk->nbytes % AEGIS256_BLOCK_SIZE);
+ }
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr,
+ walk->dst.virt.addr);
+ skcipher_walk_done(walk, 0);
}
}
@@ -186,13 +175,16 @@ static void crypto_aegis256_aesni_crypt(struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct aegis_ctx *ctx = crypto_aegis256_aesni_ctx(tfm);
+ struct skcipher_walk walk;
struct aegis_state state;
+ ops->skcipher_walk_init(&walk, req, true);
+
kernel_fpu_begin();
crypto_aegis256_aesni_init(&state, ctx->key, req->iv);
crypto_aegis256_aesni_process_ad(&state, req->src, req->assoclen);
- crypto_aegis256_aesni_process_crypt(&state, req, ops);
+ crypto_aegis256_aesni_process_crypt(&state, &walk, ops);
crypto_aegis256_aesni_final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 1321700d6647f..1e3d2102033a0 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -175,26 +175,18 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
-static struct aesni_gcm_tfm_s {
-void (*init)(void *ctx,
- struct gcm_context_data *gdata,
- u8 *iv,
- u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len);
-void (*enc_update)(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long plaintext_len);
-void (*dec_update)(void *ctx,
- struct gcm_context_data *gdata, u8 *out,
- const u8 *in,
- unsigned long ciphertext_len);
-void (*finalize)(void *ctx,
- struct gcm_context_data *gdata,
- u8 *auth_tag, unsigned long auth_tag_len);
+static const struct aesni_gcm_tfm_s {
+ void (*init)(void *ctx, struct gcm_context_data *gdata, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len);
+ void (*enc_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+ void (*dec_update)(void *ctx, struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long ciphertext_len);
+ void (*finalize)(void *ctx, struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
} *aesni_gcm_tfm;
-struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
.init = &aesni_gcm_init,
.enc_update = &aesni_gcm_enc_update,
.dec_update = &aesni_gcm_dec_update,
@@ -243,7 +235,7 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
.init = &aesni_gcm_init_avx_gen2,
.enc_update = &aesni_gcm_enc_update_avx_gen2,
.dec_update = &aesni_gcm_dec_update_avx_gen2,
@@ -288,7 +280,7 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+static const struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
.init = &aesni_gcm_init_avx_gen4,
.enc_update = &aesni_gcm_enc_update_avx_gen4,
.dec_update = &aesni_gcm_dec_update_avx_gen4,
@@ -778,7 +770,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
+ const struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
struct gcm_context_data data AESNI_ALIGN_ATTR;
struct scatter_walk dst_sg_walk = {};
unsigned long left = req->cryptlen;
@@ -821,11 +813,14 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
}
- src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
- scatterwalk_start(&src_sg_walk, src_sg);
- if (req->src != req->dst) {
- dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
- scatterwalk_start(&dst_sg_walk, dst_sg);
+ if (left) {
+ src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+ scatterwalk_start(&src_sg_walk, src_sg);
+ if (req->src != req->dst) {
+ dst_sg = scatterwalk_ffwd(dst_start, req->dst,
+ req->assoclen);
+ scatterwalk_start(&dst_sg_walk, dst_sg);
+ }
}
kernel_fpu_begin();
diff --git a/arch/x86/crypto/crct10dif-pcl-asm_64.S b/arch/x86/crypto/crct10dif-pcl-asm_64.S
index de04d3e98d8d3..3d873e67749d7 100644
--- a/arch/x86/crypto/crct10dif-pcl-asm_64.S
+++ b/arch/x86/crypto/crct10dif-pcl-asm_64.S
@@ -43,609 +43,291 @@
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-########################################################################
-# Function API:
-# UINT16 crc_t10dif_pcl(
-# UINT16 init_crc, //initial CRC value, 16 bits
-# const unsigned char *buf, //buffer pointer to calculate CRC on
-# UINT64 len //buffer length in bytes (64-bit data)
-# );
#
# Reference paper titled "Fast CRC Computation for Generic
# Polynomials Using PCLMULQDQ Instruction"
# URL: http://www.intel.com/content/dam/www/public/us/en/documents
# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#
-#
#include <linux/linkage.h>
.text
-#define arg1 %rdi
-#define arg2 %rsi
-#define arg3 %rdx
-
-#define arg1_low32 %edi
+#define init_crc %edi
+#define buf %rsi
+#define len %rdx
+
+#define FOLD_CONSTS %xmm10
+#define BSWAP_MASK %xmm11
+
+# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
+# reg1, reg2.
+.macro fold_32_bytes offset, reg1, reg2
+ movdqu \offset(buf), %xmm9
+ movdqu \offset+16(buf), %xmm12
+ pshufb BSWAP_MASK, %xmm9
+ pshufb BSWAP_MASK, %xmm12
+ movdqa \reg1, %xmm8
+ movdqa \reg2, %xmm13
+ pclmulqdq $0x00, FOLD_CONSTS, \reg1
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm8
+ pclmulqdq $0x00, FOLD_CONSTS, \reg2
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm13
+ pxor %xmm9 , \reg1
+ xorps %xmm8 , \reg1
+ pxor %xmm12, \reg2
+ xorps %xmm13, \reg2
+.endm
+
+# Fold src_reg into dst_reg.
+.macro fold_16_bytes src_reg, dst_reg
+ movdqa \src_reg, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, \src_reg
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
+ pxor %xmm8, \dst_reg
+ xorps \src_reg, \dst_reg
+.endm
-ENTRY(crc_t10dif_pcl)
+#
+# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
+#
+# Assumes len >= 16.
+#
.align 16
+ENTRY(crc_t10dif_pcl)
- # adjust the 16-bit initial_crc value, scale it to 32 bits
- shl $16, arg1_low32
-
- # Allocate Stack Space
- mov %rsp, %rcx
- sub $16*2, %rsp
- # align stack to 16 byte boundary
- and $~(0x10 - 1), %rsp
-
- # check if smaller than 256
- cmp $256, arg3
-
- # for sizes less than 128, we can't fold 64B at a time...
- jl _less_than_128
-
-
- # load the initial crc value
- movd arg1_low32, %xmm10 # initial crc
-
- # crc value does not need to be byte-reflected, but it needs
- # to be moved to the high part of the register.
- # because data will be byte-reflected and will align with
- # initial crc at correct place.
- pslldq $12, %xmm10
-
- movdqa SHUF_MASK(%rip), %xmm11
- # receive the initial 64B data, xor the initial crc value
- movdqu 16*0(arg2), %xmm0
- movdqu 16*1(arg2), %xmm1
- movdqu 16*2(arg2), %xmm2
- movdqu 16*3(arg2), %xmm3
- movdqu 16*4(arg2), %xmm4
- movdqu 16*5(arg2), %xmm5
- movdqu 16*6(arg2), %xmm6
- movdqu 16*7(arg2), %xmm7
-
- pshufb %xmm11, %xmm0
- # XOR the initial_crc value
- pxor %xmm10, %xmm0
- pshufb %xmm11, %xmm1
- pshufb %xmm11, %xmm2
- pshufb %xmm11, %xmm3
- pshufb %xmm11, %xmm4
- pshufb %xmm11, %xmm5
- pshufb %xmm11, %xmm6
- pshufb %xmm11, %xmm7
-
- movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4
- #imm value of pclmulqdq instruction
- #will determine which constant to use
-
- #################################################################
- # we subtract 256 instead of 128 to save one instruction from the loop
- sub $256, arg3
-
- # at this section of the code, there is 64*x+y (0<=y<64) bytes of
- # buffer. The _fold_64_B_loop will fold 64B at a time
- # until we have 64+y Bytes of buffer
-
-
- # fold 64B at a time. This section of the code folds 4 xmm
- # registers in parallel
-_fold_64_B_loop:
-
- # update the buffer pointer
- add $128, arg2 # buf += 64#
-
- movdqu 16*0(arg2), %xmm9
- movdqu 16*1(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm0, %xmm8
- movdqa %xmm1, %xmm13
- pclmulqdq $0x0 , %xmm10, %xmm0
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0 , %xmm10, %xmm1
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm0
- xorps %xmm8 , %xmm0
- pxor %xmm12, %xmm1
- xorps %xmm13, %xmm1
-
- movdqu 16*2(arg2), %xmm9
- movdqu 16*3(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm2, %xmm8
- movdqa %xmm3, %xmm13
- pclmulqdq $0x0, %xmm10, %xmm2
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0, %xmm10, %xmm3
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm2
- xorps %xmm8 , %xmm2
- pxor %xmm12, %xmm3
- xorps %xmm13, %xmm3
-
- movdqu 16*4(arg2), %xmm9
- movdqu 16*5(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm4, %xmm8
- movdqa %xmm5, %xmm13
- pclmulqdq $0x0, %xmm10, %xmm4
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0, %xmm10, %xmm5
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm4
- xorps %xmm8 , %xmm4
- pxor %xmm12, %xmm5
- xorps %xmm13, %xmm5
-
- movdqu 16*6(arg2), %xmm9
- movdqu 16*7(arg2), %xmm12
- pshufb %xmm11, %xmm9
- pshufb %xmm11, %xmm12
- movdqa %xmm6 , %xmm8
- movdqa %xmm7 , %xmm13
- pclmulqdq $0x0 , %xmm10, %xmm6
- pclmulqdq $0x11, %xmm10, %xmm8
- pclmulqdq $0x0 , %xmm10, %xmm7
- pclmulqdq $0x11, %xmm10, %xmm13
- pxor %xmm9 , %xmm6
- xorps %xmm8 , %xmm6
- pxor %xmm12, %xmm7
- xorps %xmm13, %xmm7
-
- sub $128, arg3
-
- # check if there is another 64B in the buffer to be able to fold
- jge _fold_64_B_loop
- ##################################################################
-
-
- add $128, arg2
- # at this point, the buffer pointer is pointing at the last y Bytes
- # of the buffer the 64B of folded data is in 4 of the xmm
- # registers: xmm0, xmm1, xmm2, xmm3
-
-
- # fold the 8 xmm registers to 1 xmm register with different constants
-
- movdqa rk9(%rip), %xmm10
- movdqa %xmm0, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm0
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm0, %xmm7
-
- movdqa rk11(%rip), %xmm10
- movdqa %xmm1, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm1
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm1, %xmm7
-
- movdqa rk13(%rip), %xmm10
- movdqa %xmm2, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm2
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm2, %xmm7
-
- movdqa rk15(%rip), %xmm10
- movdqa %xmm3, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm3
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm3, %xmm7
-
- movdqa rk17(%rip), %xmm10
- movdqa %xmm4, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm4
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm4, %xmm7
-
- movdqa rk19(%rip), %xmm10
- movdqa %xmm5, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm5
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- xorps %xmm5, %xmm7
-
- movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2
- #imm value of pclmulqdq instruction
- #will determine which constant to use
- movdqa %xmm6, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm6
- pclmulqdq $0x0 , %xmm10, %xmm8
- pxor %xmm8, %xmm7
- pxor %xmm6, %xmm7
-
-
- # instead of 64, we add 48 to the loop counter to save 1 instruction
- # from the loop instead of a cmp instruction, we use the negative
- # flag with the jl instruction
- add $128-16, arg3
- jl _final_reduction_for_128
-
- # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7
- # and the rest is in memory. We can fold 16 bytes at a time if y>=16
- # continue folding 16B at a time
-
-_16B_reduction_loop:
+ movdqa .Lbswap_mask(%rip), BSWAP_MASK
+
+ # For sizes less than 256 bytes, we can't fold 128 bytes at a time.
+ cmp $256, len
+ jl .Lless_than_256_bytes
+
+ # Load the first 128 data bytes. Byte swapping is necessary to make the
+ # bit order match the polynomial coefficient order.
+ movdqu 16*0(buf), %xmm0
+ movdqu 16*1(buf), %xmm1
+ movdqu 16*2(buf), %xmm2
+ movdqu 16*3(buf), %xmm3
+ movdqu 16*4(buf), %xmm4
+ movdqu 16*5(buf), %xmm5
+ movdqu 16*6(buf), %xmm6
+ movdqu 16*7(buf), %xmm7
+ add $128, buf
+ pshufb BSWAP_MASK, %xmm0
+ pshufb BSWAP_MASK, %xmm1
+ pshufb BSWAP_MASK, %xmm2
+ pshufb BSWAP_MASK, %xmm3
+ pshufb BSWAP_MASK, %xmm4
+ pshufb BSWAP_MASK, %xmm5
+ pshufb BSWAP_MASK, %xmm6
+ pshufb BSWAP_MASK, %xmm7
+
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm8, %xmm8
+ pinsrw $7, init_crc, %xmm8
+ pxor %xmm8, %xmm0
+
+ movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS
+
+ # Subtract 128 for the 128 data bytes just consumed. Subtract another
+ # 128 to simplify the termination condition of the following loop.
+ sub $256, len
+
+ # While >= 128 data bytes remain (not counting xmm0-7), fold the 128
+ # bytes xmm0-7 into them, storing the result back into xmm0-7.
+.Lfold_128_bytes_loop:
+ fold_32_bytes 0, %xmm0, %xmm1
+ fold_32_bytes 32, %xmm2, %xmm3
+ fold_32_bytes 64, %xmm4, %xmm5
+ fold_32_bytes 96, %xmm6, %xmm7
+ add $128, buf
+ sub $128, len
+ jge .Lfold_128_bytes_loop
+
+ # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.
+
+ # Fold across 64 bytes.
+ movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm0, %xmm4
+ fold_16_bytes %xmm1, %xmm5
+ fold_16_bytes %xmm2, %xmm6
+ fold_16_bytes %xmm3, %xmm7
+ # Fold across 32 bytes.
+ movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm4, %xmm6
+ fold_16_bytes %xmm5, %xmm7
+ # Fold across 16 bytes.
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ fold_16_bytes %xmm6, %xmm7
+
+ # Add 128 to get the correct number of data bytes remaining in 0...127
+ # (not counting xmm7), following the previous extra subtraction by 128.
+ # Then subtract 16 to simplify the termination condition of the
+ # following loop.
+ add $128-16, len
+
+ # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
+ # xmm7 into them, storing the result back into xmm7.
+ jl .Lfold_16_bytes_loop_done
+.Lfold_16_bytes_loop:
movdqa %xmm7, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm7
- pclmulqdq $0x0 , %xmm10, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
pxor %xmm8, %xmm7
- movdqu (arg2), %xmm0
- pshufb %xmm11, %xmm0
+ movdqu (buf), %xmm0
+ pshufb BSWAP_MASK, %xmm0
pxor %xmm0 , %xmm7
- add $16, arg2
- sub $16, arg3
- # instead of a cmp instruction, we utilize the flags with the
- # jge instruction equivalent of: cmp arg3, 16-16
- # check if there is any more 16B in the buffer to be able to fold
- jge _16B_reduction_loop
-
- #now we have 16+z bytes left to reduce, where 0<= z < 16.
- #first, we reduce the data in the xmm7 register
-
-
-_final_reduction_for_128:
- # check if any more data to fold. If not, compute the CRC of
- # the final 128 bits
- add $16, arg3
- je _128_done
-
- # here we are getting data that is less than 16 bytes.
- # since we know that there was data before the pointer, we can
- # offset the input pointer before the actual point, to receive
- # exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_xmms:
+ add $16, buf
+ sub $16, len
+ jge .Lfold_16_bytes_loop
+
+.Lfold_16_bytes_loop_done:
+ # Add 16 to get the correct number of data bytes remaining in 0...15
+ # (not counting xmm7), following the previous extra subtraction by 16.
+ add $16, len
+ je .Lreduce_final_16_bytes
+
+.Lhandle_partial_segment:
+ # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
+ # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do
+ # this without needing a fold constant for each possible 'len', redivide
+ # the bytes into a first chunk of 'len' bytes and a second chunk of 16
+ # bytes, then fold the first chunk into the second.
+
movdqa %xmm7, %xmm2
- movdqu -16(arg2, arg3), %xmm1
- pshufb %xmm11, %xmm1
+ # xmm1 = last 16 original data bytes
+ movdqu -16(buf, len), %xmm1
+ pshufb BSWAP_MASK, %xmm1
- # get rid of the extra data that was loaded before
- # load the shift constant
- lea pshufb_shf_table+16(%rip), %rax
- sub arg3, %rax
+ # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
+ lea .Lbyteshift_table+16(%rip), %rax
+ sub len, %rax
movdqu (%rax), %xmm0
-
- # shift xmm2 to the left by arg3 bytes
pshufb %xmm0, %xmm2
- # shift xmm7 to the right by 16-arg3 bytes
- pxor mask1(%rip), %xmm0
+ # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
+ pxor .Lmask1(%rip), %xmm0
pshufb %xmm0, %xmm7
+
+ # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
+ # then '16-len' bytes from xmm2 (high-order bytes).
pblendvb %xmm2, %xmm1 #xmm0 is implicit
- # fold 16 Bytes
- movdqa %xmm1, %xmm2
+ # Fold the first chunk into the second chunk, storing the result in xmm7.
movdqa %xmm7, %xmm8
- pclmulqdq $0x11, %xmm10, %xmm7
- pclmulqdq $0x0 , %xmm10, %xmm8
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm8
pxor %xmm8, %xmm7
- pxor %xmm2, %xmm7
+ pxor %xmm1, %xmm7
-_128_done:
- # compute crc of a 128-bit value
- movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10
- movdqa %xmm7, %xmm0
+.Lreduce_final_16_bytes:
+ # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC
- #64b fold
- pclmulqdq $0x1, %xmm10, %xmm7
- pslldq $8 , %xmm0
- pxor %xmm0, %xmm7
+ # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+ movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS
- #32b fold
+ # Fold the high 64 bits into the low 64 bits, while also multiplying by
+ # x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+ # whose low 48 bits are 0.
movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
+ pslldq $8, %xmm0
+ pxor %xmm0, %xmm7 # + low bits * x^64
- pand mask2(%rip), %xmm0
-
- psrldq $12, %xmm7
- pclmulqdq $0x10, %xmm10, %xmm7
- pxor %xmm0, %xmm7
-
- #barrett reduction
-_barrett:
- movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10
+ # Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+ # value congruent to x^64 * M(x) and whose low 48 bits are 0.
movdqa %xmm7, %xmm0
- pclmulqdq $0x01, %xmm10, %xmm7
- pslldq $4, %xmm7
- pclmulqdq $0x11, %xmm10, %xmm7
+ pand .Lmask2(%rip), %xmm0 # zero high 32 bits
+ psrldq $12, %xmm7 # extract high 32 bits
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
+ pxor %xmm0, %xmm7 # + low bits
- pslldq $4, %xmm7
- pxor %xmm0, %xmm7
- pextrd $1, %xmm7, %eax
+ # Load G(x) and floor(x^48 / G(x)).
+ movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS
-_cleanup:
- # scale the result back to 16 bits
- shr $16, %eax
- mov %rcx, %rsp
+ # Use Barrett reduction to compute the final CRC value.
+ movdqa %xmm7, %xmm0
+ pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
+ psrlq $32, %xmm7 # /= x^32
+ pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x)
+ psrlq $48, %xmm0
+ pxor %xmm7, %xmm0 # + low 16 nonzero bits
+ # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.
+
+ pextrw $0, %xmm0, %eax
ret
-########################################################################
-
.align 16
-_less_than_128:
-
- # check if there is enough buffer to be able to fold 16B at a time
- cmp $32, arg3
- jl _less_than_32
- movdqa SHUF_MASK(%rip), %xmm11
+.Lless_than_256_bytes:
+ # Checksumming a buffer of length 16...255 bytes
- # now if there is, load the constants
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
+ # Load the first 16 data bytes.
+ movdqu (buf), %xmm7
+ pshufb BSWAP_MASK, %xmm7
+ add $16, buf
- movd arg1_low32, %xmm0 # get the initial crc value
- pslldq $12, %xmm0 # align it to its correct place
- movdqu (arg2), %xmm7 # load the plaintext
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
+ # XOR the first 16 data *bits* with the initial CRC value.
+ pxor %xmm0, %xmm0
+ pinsrw $7, init_crc, %xmm0
pxor %xmm0, %xmm7
-
- # update the buffer pointer
- add $16, arg2
-
- # update the counter. subtract 32 instead of 16 to save one
- # instruction from the loop
- sub $32, arg3
-
- jmp _16B_reduction_loop
-
-
-.align 16
-_less_than_32:
- # mov initial crc to the return value. this is necessary for
- # zero-length buffers.
- mov arg1_low32, %eax
- test arg3, arg3
- je _cleanup
-
- movdqa SHUF_MASK(%rip), %xmm11
-
- movd arg1_low32, %xmm0 # get the initial crc value
- pslldq $12, %xmm0 # align it to its correct place
-
- cmp $16, arg3
- je _exact_16_left
- jl _less_than_16_left
-
- movdqu (arg2), %xmm7 # load the plaintext
- pshufb %xmm11, %xmm7 # byte-reflect the plaintext
- pxor %xmm0 , %xmm7 # xor the initial crc value
- add $16, arg2
- sub $16, arg3
- movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10
- jmp _get_last_two_xmms
-
-
-.align 16
-_less_than_16_left:
- # use stack space to load data less than 16 bytes, zero-out
- # the 16B in memory first.
-
- pxor %xmm1, %xmm1
- mov %rsp, %r11
- movdqa %xmm1, (%r11)
-
- cmp $4, arg3
- jl _only_less_than_4
-
- # backup the counter value
- mov arg3, %r9
- cmp $8, arg3
- jl _less_than_8_left
-
- # load 8 Bytes
- mov (arg2), %rax
- mov %rax, (%r11)
- add $8, %r11
- sub $8, arg3
- add $8, arg2
-_less_than_8_left:
-
- cmp $4, arg3
- jl _less_than_4_left
-
- # load 4 Bytes
- mov (arg2), %eax
- mov %eax, (%r11)
- add $4, %r11
- sub $4, arg3
- add $4, arg2
-_less_than_4_left:
-
- cmp $2, arg3
- jl _less_than_2_left
-
- # load 2 Bytes
- mov (arg2), %ax
- mov %ax, (%r11)
- add $2, %r11
- sub $2, arg3
- add $2, arg2
-_less_than_2_left:
- cmp $1, arg3
- jl _zero_left
-
- # load 1 Byte
- mov (arg2), %al
- mov %al, (%r11)
-_zero_left:
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- # shl r9, 4
- lea pshufb_shf_table+16(%rip), %rax
- sub %r9, %rax
- movdqu (%rax), %xmm0
- pxor mask1(%rip), %xmm0
-
- pshufb %xmm0, %xmm7
- jmp _128_done
-
-.align 16
-_exact_16_left:
- movdqu (arg2), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- jmp _128_done
-
-_only_less_than_4:
- cmp $3, arg3
- jl _only_less_than_3
-
- # load 3 Bytes
- mov (arg2), %al
- mov %al, (%r11)
-
- mov 1(arg2), %al
- mov %al, 1(%r11)
-
- mov 2(arg2), %al
- mov %al, 2(%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $5, %xmm7
-
- jmp _barrett
-_only_less_than_3:
- cmp $2, arg3
- jl _only_less_than_2
-
- # load 2 Bytes
- mov (arg2), %al
- mov %al, (%r11)
-
- mov 1(arg2), %al
- mov %al, 1(%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $6, %xmm7
-
- jmp _barrett
-_only_less_than_2:
-
- # load 1 Byte
- mov (arg2), %al
- mov %al, (%r11)
-
- movdqa (%rsp), %xmm7
- pshufb %xmm11, %xmm7
- pxor %xmm0 , %xmm7 # xor the initial crc value
-
- psrldq $7, %xmm7
-
- jmp _barrett
-
+ movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
+ cmp $16, len
+ je .Lreduce_final_16_bytes # len == 16
+ sub $32, len
+ jge .Lfold_16_bytes_loop # 32 <= len <= 255
+ add $16, len
+ jmp .Lhandle_partial_segment # 17 <= len <= 31
ENDPROC(crc_t10dif_pcl)
.section .rodata, "a", @progbits
.align 16
-# precomputed constants
-# these constants are precomputed from the poly:
-# 0x8bb70000 (0x8bb7 scaled to 32 bits)
-# Q = 0x18BB70000
-# rk1 = 2^(32*3) mod Q << 32
-# rk2 = 2^(32*5) mod Q << 32
-# rk3 = 2^(32*15) mod Q << 32
-# rk4 = 2^(32*17) mod Q << 32
-# rk5 = 2^(32*3) mod Q << 32
-# rk6 = 2^(32*2) mod Q << 32
-# rk7 = floor(2^64/Q)
-# rk8 = Q
-rk1:
-.quad 0x2d56000000000000
-rk2:
-.quad 0x06df000000000000
-rk3:
-.quad 0x9d9d000000000000
-rk4:
-.quad 0x7cf5000000000000
-rk5:
-.quad 0x2d56000000000000
-rk6:
-.quad 0x1368000000000000
-rk7:
-.quad 0x00000001f65a57f8
-rk8:
-.quad 0x000000018bb70000
-
-rk9:
-.quad 0xceae000000000000
-rk10:
-.quad 0xbfd6000000000000
-rk11:
-.quad 0x1e16000000000000
-rk12:
-.quad 0x713c000000000000
-rk13:
-.quad 0xf7f9000000000000
-rk14:
-.quad 0x80a6000000000000
-rk15:
-.quad 0x044c000000000000
-rk16:
-.quad 0xe658000000000000
-rk17:
-.quad 0xad18000000000000
-rk18:
-.quad 0xa497000000000000
-rk19:
-.quad 0x6ee3000000000000
-rk20:
-.quad 0xe7b5000000000000
-
+# Fold constants precomputed from the polynomial 0x18bb7
+# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
+.Lfold_across_128_bytes_consts:
+ .quad 0x0000000000006123 # x^(8*128) mod G(x)
+ .quad 0x0000000000002295 # x^(8*128+64) mod G(x)
+.Lfold_across_64_bytes_consts:
+ .quad 0x0000000000001069 # x^(4*128) mod G(x)
+ .quad 0x000000000000dd31 # x^(4*128+64) mod G(x)
+.Lfold_across_32_bytes_consts:
+ .quad 0x000000000000857d # x^(2*128) mod G(x)
+ .quad 0x0000000000007acc # x^(2*128+64) mod G(x)
+.Lfold_across_16_bytes_consts:
+ .quad 0x000000000000a010 # x^(1*128) mod G(x)
+ .quad 0x0000000000001faa # x^(1*128+64) mod G(x)
+.Lfinal_fold_consts:
+ .quad 0x1368000000000000 # x^48 * (x^48 mod G(x))
+ .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x))
+.Lbarrett_reduction_consts:
+ .quad 0x0000000000018bb7 # G(x)
+ .quad 0x00000001f65a57f8 # floor(x^48 / G(x))
.section .rodata.cst16.mask1, "aM", @progbits, 16
.align 16
-mask1:
-.octa 0x80808080808080808080808080808080
+.Lmask1:
+ .octa 0x80808080808080808080808080808080
.section .rodata.cst16.mask2, "aM", @progbits, 16
.align 16
-mask2:
-.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+.Lmask2:
+ .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF
+
+.section .rodata.cst16.bswap_mask, "aM", @progbits, 16
+.align 16
+.Lbswap_mask:
+ .octa 0x000102030405060708090A0B0C0D0E0F
-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
+.section .rodata.cst32.byteshift_table, "aM", @progbits, 32
.align 16
-SHUF_MASK:
-.octa 0x000102030405060708090A0B0C0D0E0F
-
-.section .rodata.cst32.pshufb_shf_table, "aM", @progbits, 32
-.align 32
-pshufb_shf_table:
-# use these values for shift constants for the pshufb instruction
-# different alignments result in values as shown:
-# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
-# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2
-# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3
-# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
-# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
-# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
-# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
-# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
-# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
-# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
-# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
-# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
-# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
-# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
-# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
-.octa 0x8f8e8d8c8b8a89888786858483828100
-.octa 0x000e0d0c0b0a09080706050403020100
+# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
+# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
+# 0x80} XOR the index vector to shift right by '16 - len' bytes.
+.Lbyteshift_table:
+ .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
+ .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
+ .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
+ .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index cd4df93225014..0e785c0b23542 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -33,18 +33,12 @@
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
-asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf,
- size_t len);
+asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
struct chksum_desc_ctx {
__u16 crc;
};
-/*
- * Steps through buffer one byte at at time, calculates reflected
- * crc using table.
- */
-
static int chksum_init(struct shash_desc *desc)
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -59,7 +53,7 @@ static int chksum_update(struct shash_desc *desc, const u8 *data,
{
struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
- if (irq_fpu_usable()) {
+ if (length >= 16 && irq_fpu_usable()) {
kernel_fpu_begin();
ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
kernel_fpu_end();
@@ -79,7 +73,7 @@ static int chksum_final(struct shash_desc *desc, u8 *out)
static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
- if (irq_fpu_usable()) {
+ if (len >= 16 && irq_fpu_usable()) {
kernel_fpu_begin();
*(__u16 *)out = crc_t10dif_pcl(*crcp, data, len);
kernel_fpu_end();
diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c
index 0dccdda1eb3a1..7e600f8bcdad8 100644
--- a/arch/x86/crypto/morus1280_glue.c
+++ b/arch/x86/crypto/morus1280_glue.c
@@ -85,31 +85,20 @@ static void crypto_morus1280_glue_process_ad(
static void crypto_morus1280_glue_process_crypt(struct morus1280_state *state,
struct morus1280_ops ops,
- struct aead_request *req)
+ struct skcipher_walk *walk)
{
- struct skcipher_walk walk;
- u8 *cursor_src, *cursor_dst;
- unsigned int chunksize, base;
-
- ops.skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- cursor_src = walk.src.virt.addr;
- cursor_dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
-
- base = chunksize & ~(MORUS1280_BLOCK_SIZE - 1);
- cursor_src += base;
- cursor_dst += base;
- chunksize &= MORUS1280_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops.crypt_tail(state, cursor_src, cursor_dst,
- chunksize);
+ while (walk->nbytes >= MORUS1280_BLOCK_SIZE) {
+ ops.crypt_blocks(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes,
+ MORUS1280_BLOCK_SIZE));
+ skcipher_walk_done(walk, walk->nbytes % MORUS1280_BLOCK_SIZE);
+ }
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
+ walk->nbytes);
+ skcipher_walk_done(walk, 0);
}
}
@@ -147,12 +136,15 @@ static void crypto_morus1280_glue_crypt(struct aead_request *req,
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct morus1280_ctx *ctx = crypto_aead_ctx(tfm);
struct morus1280_state state;
+ struct skcipher_walk walk;
+
+ ops.skcipher_walk_init(&walk, req, true);
kernel_fpu_begin();
ctx->ops->init(&state, &ctx->key, req->iv);
crypto_morus1280_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
- crypto_morus1280_glue_process_crypt(&state, ops, req);
+ crypto_morus1280_glue_process_crypt(&state, ops, &walk);
ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c
index 7b58fe4d9bd1a..cb3a817320160 100644
--- a/arch/x86/crypto/morus640_glue.c
+++ b/arch/x86/crypto/morus640_glue.c
@@ -85,31 +85,19 @@ static void crypto_morus640_glue_process_ad(
static void crypto_morus640_glue_process_crypt(struct morus640_state *state,
struct morus640_ops ops,
- struct aead_request *req)
+ struct skcipher_walk *walk)
{
- struct skcipher_walk walk;
- u8 *cursor_src, *cursor_dst;
- unsigned int chunksize, base;
-
- ops.skcipher_walk_init(&walk, req, false);
-
- while (walk.nbytes) {
- cursor_src = walk.src.virt.addr;
- cursor_dst = walk.dst.virt.addr;
- chunksize = walk.nbytes;
-
- ops.crypt_blocks(state, cursor_src, cursor_dst, chunksize);
-
- base = chunksize & ~(MORUS640_BLOCK_SIZE - 1);
- cursor_src += base;
- cursor_dst += base;
- chunksize &= MORUS640_BLOCK_SIZE - 1;
-
- if (chunksize > 0)
- ops.crypt_tail(state, cursor_src, cursor_dst,
- chunksize);
+ while (walk->nbytes >= MORUS640_BLOCK_SIZE) {
+ ops.crypt_blocks(state, walk->src.virt.addr,
+ walk->dst.virt.addr,
+ round_down(walk->nbytes, MORUS640_BLOCK_SIZE));
+ skcipher_walk_done(walk, walk->nbytes % MORUS640_BLOCK_SIZE);
+ }
- skcipher_walk_done(&walk, 0);
+ if (walk->nbytes) {
+ ops.crypt_tail(state, walk->src.virt.addr, walk->dst.virt.addr,
+ walk->nbytes);
+ skcipher_walk_done(walk, 0);
}
}
@@ -143,12 +131,15 @@ static void crypto_morus640_glue_crypt(struct aead_request *req,
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
struct morus640_ctx *ctx = crypto_aead_ctx(tfm);
struct morus640_state state;
+ struct skcipher_walk walk;
+
+ ops.skcipher_walk_init(&walk, req, true);
kernel_fpu_begin();
ctx->ops->init(&state, &ctx->key, req->iv);
crypto_morus640_glue_process_ad(&state, ctx->ops, req->src, req->assoclen);
- crypto_morus640_glue_process_crypt(&state, ops, req);
+ crypto_morus640_glue_process_crypt(&state, ops, &walk);
ctx->ops->final(&state, tag_xor, req->assoclen, cryptlen);
kernel_fpu_end();
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
index c88c670cb5fc6..e6add74d78a59 100644
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -272,6 +272,10 @@ ENTRY(poly1305_block_sse2)
dec %rcx
jnz .Ldoblock
+ # Zeroing of key material
+ mov %rcx,0x00(%rsp)
+ mov %rcx,0x08(%rsp)
+
add $0x10,%rsp
pop %r12
pop %rbx
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 8eaf8952c408c..39913770a44d5 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -361,7 +361,8 @@ ENTRY(entry_INT80_compat)
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
+ /* In the Xen PV case we already run on the thread stack. */
+ ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -370,8 +371,9 @@ ENTRY(entry_INT80_compat)
pushq 3*8(%rdi) /* regs->cs */
pushq 2*8(%rdi) /* regs->ip */
pushq 1*8(%rdi) /* regs->orig_ax */
-
pushq (%rdi) /* pt_regs->di */
+.Lint80_keep_stack:
+
pushq %rsi /* pt_regs->si */
xorl %esi, %esi /* nospec si */
pushq %rdx /* pt_regs->dx */
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 3cf7b533b3d13..1f9607ed087c0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -24,7 +24,7 @@
10 i386 unlink sys_unlink __ia32_sys_unlink
11 i386 execve sys_execve __ia32_compat_sys_execve
12 i386 chdir sys_chdir __ia32_sys_chdir
-13 i386 time sys_time __ia32_compat_sys_time
+13 i386 time sys_time32 __ia32_sys_time32
14 i386 mknod sys_mknod __ia32_sys_mknod
15 i386 chmod sys_chmod __ia32_sys_chmod
16 i386 lchown sys_lchown16 __ia32_sys_lchown16
@@ -36,12 +36,12 @@
22 i386 umount sys_oldumount __ia32_sys_oldumount
23 i386 setuid sys_setuid16 __ia32_sys_setuid16
24 i386 getuid sys_getuid16 __ia32_sys_getuid16
-25 i386 stime sys_stime __ia32_compat_sys_stime
+25 i386 stime sys_stime32 __ia32_sys_stime32
26 i386 ptrace sys_ptrace __ia32_compat_sys_ptrace
27 i386 alarm sys_alarm __ia32_sys_alarm
28 i386 oldfstat sys_fstat __ia32_sys_fstat
29 i386 pause sys_pause __ia32_sys_pause
-30 i386 utime sys_utime __ia32_compat_sys_utime
+30 i386 utime sys_utime32 __ia32_sys_utime32
31 i386 stty
32 i386 gtty
33 i386 access sys_access __ia32_sys_access
@@ -135,7 +135,7 @@
121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname
122 i386 uname sys_newuname __ia32_sys_newuname
123 i386 modify_ldt sys_modify_ldt __ia32_sys_modify_ldt
-124 i386 adjtimex sys_adjtimex __ia32_compat_sys_adjtimex
+124 i386 adjtimex sys_adjtimex_time32 __ia32_sys_adjtimex_time32
125 i386 mprotect sys_mprotect __ia32_sys_mprotect
126 i386 sigprocmask sys_sigprocmask __ia32_compat_sys_sigprocmask
127 i386 create_module
@@ -172,8 +172,8 @@
158 i386 sched_yield sys_sched_yield __ia32_sys_sched_yield
159 i386 sched_get_priority_max sys_sched_get_priority_max __ia32_sys_sched_get_priority_max
160 i386 sched_get_priority_min sys_sched_get_priority_min __ia32_sys_sched_get_priority_min
-161 i386 sched_rr_get_interval sys_sched_rr_get_interval __ia32_compat_sys_sched_rr_get_interval
-162 i386 nanosleep sys_nanosleep __ia32_compat_sys_nanosleep
+161 i386 sched_rr_get_interval sys_sched_rr_get_interval_time32 __ia32_sys_sched_rr_get_interval_time32
+162 i386 nanosleep sys_nanosleep_time32 __ia32_sys_nanosleep_time32
163 i386 mremap sys_mremap __ia32_sys_mremap
164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16
165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16
@@ -188,7 +188,7 @@
174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction
175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_sys_rt_sigprocmask
176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending
-177 i386 rt_sigtimedwait sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait
+177 i386 rt_sigtimedwait sys_rt_sigtimedwait_time32 __ia32_compat_sys_rt_sigtimedwait_time32
178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo __ia32_compat_sys_rt_sigqueueinfo
179 i386 rt_sigsuspend sys_rt_sigsuspend __ia32_sys_rt_sigsuspend
180 i386 pread64 sys_pread64 __ia32_compat_sys_x86_pread
@@ -251,14 +251,14 @@
237 i386 fremovexattr sys_fremovexattr __ia32_sys_fremovexattr
238 i386 tkill sys_tkill __ia32_sys_tkill
239 i386 sendfile64 sys_sendfile64 __ia32_sys_sendfile64
-240 i386 futex sys_futex __ia32_compat_sys_futex
+240 i386 futex sys_futex_time32 __ia32_sys_futex_time32
241 i386 sched_setaffinity sys_sched_setaffinity __ia32_compat_sys_sched_setaffinity
242 i386 sched_getaffinity sys_sched_getaffinity __ia32_compat_sys_sched_getaffinity
243 i386 set_thread_area sys_set_thread_area __ia32_sys_set_thread_area
244 i386 get_thread_area sys_get_thread_area __ia32_sys_get_thread_area
245 i386 io_setup sys_io_setup __ia32_compat_sys_io_setup
246 i386 io_destroy sys_io_destroy __ia32_sys_io_destroy
-247 i386 io_getevents sys_io_getevents __ia32_compat_sys_io_getevents
+247 i386 io_getevents sys_io_getevents_time32 __ia32_sys_io_getevents_time32
248 i386 io_submit sys_io_submit __ia32_compat_sys_io_submit
249 i386 io_cancel sys_io_cancel __ia32_sys_io_cancel
250 i386 fadvise64 sys_fadvise64 __ia32_compat_sys_x86_fadvise64
@@ -271,18 +271,18 @@
257 i386 remap_file_pages sys_remap_file_pages __ia32_sys_remap_file_pages
258 i386 set_tid_address sys_set_tid_address __ia32_sys_set_tid_address
259 i386 timer_create sys_timer_create __ia32_compat_sys_timer_create
-260 i386 timer_settime sys_timer_settime __ia32_compat_sys_timer_settime
-261 i386 timer_gettime sys_timer_gettime __ia32_compat_sys_timer_gettime
+260 i386 timer_settime sys_timer_settime32 __ia32_sys_timer_settime32
+261 i386 timer_gettime sys_timer_gettime32 __ia32_sys_timer_gettime32
262 i386 timer_getoverrun sys_timer_getoverrun __ia32_sys_timer_getoverrun
263 i386 timer_delete sys_timer_delete __ia32_sys_timer_delete
-264 i386 clock_settime sys_clock_settime __ia32_compat_sys_clock_settime
-265 i386 clock_gettime sys_clock_gettime __ia32_compat_sys_clock_gettime
-266 i386 clock_getres sys_clock_getres __ia32_compat_sys_clock_getres
-267 i386 clock_nanosleep sys_clock_nanosleep __ia32_compat_sys_clock_nanosleep
+264 i386 clock_settime sys_clock_settime32 __ia32_sys_clock_settime32
+265 i386 clock_gettime sys_clock_gettime32 __ia32_sys_clock_gettime32
+266 i386 clock_getres sys_clock_getres_time32 __ia32_sys_clock_getres_time32
+267 i386 clock_nanosleep sys_clock_nanosleep_time32 __ia32_sys_clock_nanosleep_time32
268 i386 statfs64 sys_statfs64 __ia32_compat_sys_statfs64
269 i386 fstatfs64 sys_fstatfs64 __ia32_compat_sys_fstatfs64
270 i386 tgkill sys_tgkill __ia32_sys_tgkill
-271 i386 utimes sys_utimes __ia32_compat_sys_utimes
+271 i386 utimes sys_utimes_time32 __ia32_sys_utimes_time32
272 i386 fadvise64_64 sys_fadvise64_64 __ia32_compat_sys_x86_fadvise64_64
273 i386 vserver
274 i386 mbind sys_mbind __ia32_sys_mbind
@@ -290,8 +290,8 @@
276 i386 set_mempolicy sys_set_mempolicy __ia32_sys_set_mempolicy
277 i386 mq_open sys_mq_open __ia32_compat_sys_mq_open
278 i386 mq_unlink sys_mq_unlink __ia32_sys_mq_unlink
-279 i386 mq_timedsend sys_mq_timedsend __ia32_compat_sys_mq_timedsend
-280 i386 mq_timedreceive sys_mq_timedreceive __ia32_compat_sys_mq_timedreceive
+279 i386 mq_timedsend sys_mq_timedsend_time32 __ia32_sys_mq_timedsend_time32
+280 i386 mq_timedreceive sys_mq_timedreceive_time32 __ia32_sys_mq_timedreceive_time32
281 i386 mq_notify sys_mq_notify __ia32_compat_sys_mq_notify
282 i386 mq_getsetattr sys_mq_getsetattr __ia32_compat_sys_mq_getsetattr
283 i386 kexec_load sys_kexec_load __ia32_compat_sys_kexec_load
@@ -310,7 +310,7 @@
296 i386 mkdirat sys_mkdirat __ia32_sys_mkdirat
297 i386 mknodat sys_mknodat __ia32_sys_mknodat
298 i386 fchownat sys_fchownat __ia32_sys_fchownat
-299 i386 futimesat sys_futimesat __ia32_compat_sys_futimesat
+299 i386 futimesat sys_futimesat_time32 __ia32_sys_futimesat_time32
300 i386 fstatat64 sys_fstatat64 __ia32_compat_sys_x86_fstatat
301 i386 unlinkat sys_unlinkat __ia32_sys_unlinkat
302 i386 renameat sys_renameat __ia32_sys_renameat
@@ -319,8 +319,8 @@
305 i386 readlinkat sys_readlinkat __ia32_sys_readlinkat
306 i386 fchmodat sys_fchmodat __ia32_sys_fchmodat
307 i386 faccessat sys_faccessat __ia32_sys_faccessat
-308 i386 pselect6 sys_pselect6 __ia32_compat_sys_pselect6
-309 i386 ppoll sys_ppoll __ia32_compat_sys_ppoll
+308 i386 pselect6 sys_pselect6_time32 __ia32_compat_sys_pselect6_time32
+309 i386 ppoll sys_ppoll_time32 __ia32_compat_sys_ppoll_time32
310 i386 unshare sys_unshare __ia32_sys_unshare
311 i386 set_robust_list sys_set_robust_list __ia32_compat_sys_set_robust_list
312 i386 get_robust_list sys_get_robust_list __ia32_compat_sys_get_robust_list
@@ -331,13 +331,13 @@
317 i386 move_pages sys_move_pages __ia32_compat_sys_move_pages
318 i386 getcpu sys_getcpu __ia32_sys_getcpu
319 i386 epoll_pwait sys_epoll_pwait __ia32_sys_epoll_pwait
-320 i386 utimensat sys_utimensat __ia32_compat_sys_utimensat
+320 i386 utimensat sys_utimensat_time32 __ia32_sys_utimensat_time32
321 i386 signalfd sys_signalfd __ia32_compat_sys_signalfd
322 i386 timerfd_create sys_timerfd_create __ia32_sys_timerfd_create
323 i386 eventfd sys_eventfd __ia32_sys_eventfd
324 i386 fallocate sys_fallocate __ia32_compat_sys_x86_fallocate
-325 i386 timerfd_settime sys_timerfd_settime __ia32_compat_sys_timerfd_settime
-326 i386 timerfd_gettime sys_timerfd_gettime __ia32_compat_sys_timerfd_gettime
+325 i386 timerfd_settime sys_timerfd_settime32 __ia32_sys_timerfd_settime32
+326 i386 timerfd_gettime sys_timerfd_gettime32 __ia32_sys_timerfd_gettime32
327 i386 signalfd4 sys_signalfd4 __ia32_compat_sys_signalfd4
328 i386 eventfd2 sys_eventfd2 __ia32_sys_eventfd2
329 i386 epoll_create1 sys_epoll_create1 __ia32_sys_epoll_create1
@@ -348,13 +348,13 @@
334 i386 pwritev sys_pwritev __ia32_compat_sys_pwritev
335 i386 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo __ia32_compat_sys_rt_tgsigqueueinfo
336 i386 perf_event_open sys_perf_event_open __ia32_sys_perf_event_open
-337 i386 recvmmsg sys_recvmmsg __ia32_compat_sys_recvmmsg
+337 i386 recvmmsg sys_recvmmsg_time32 __ia32_compat_sys_recvmmsg_time32
338 i386 fanotify_init sys_fanotify_init __ia32_sys_fanotify_init
339 i386 fanotify_mark sys_fanotify_mark __ia32_compat_sys_fanotify_mark
340 i386 prlimit64 sys_prlimit64 __ia32_sys_prlimit64
341 i386 name_to_handle_at sys_name_to_handle_at __ia32_sys_name_to_handle_at
342 i386 open_by_handle_at sys_open_by_handle_at __ia32_compat_sys_open_by_handle_at
-343 i386 clock_adjtime sys_clock_adjtime __ia32_compat_sys_clock_adjtime
+343 i386 clock_adjtime sys_clock_adjtime32 __ia32_sys_clock_adjtime32
344 i386 syncfs sys_syncfs __ia32_sys_syncfs
345 i386 sendmmsg sys_sendmmsg __ia32_compat_sys_sendmmsg
346 i386 setns sys_setns __ia32_sys_setns
@@ -396,5 +396,40 @@
382 i386 pkey_free sys_pkey_free __ia32_sys_pkey_free
383 i386 statx sys_statx __ia32_sys_statx
384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl
-385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents
+385 i386 io_pgetevents sys_io_pgetevents_time32 __ia32_compat_sys_io_pgetevents
386 i386 rseq sys_rseq __ia32_sys_rseq
+# don't use numbers 387 through 392, add new calls at the end
+393 i386 semget sys_semget __ia32_sys_semget
+394 i386 semctl sys_semctl __ia32_compat_sys_semctl
+395 i386 shmget sys_shmget __ia32_sys_shmget
+396 i386 shmctl sys_shmctl __ia32_compat_sys_shmctl
+397 i386 shmat sys_shmat __ia32_compat_sys_shmat
+398 i386 shmdt sys_shmdt __ia32_sys_shmdt
+399 i386 msgget sys_msgget __ia32_sys_msgget
+400 i386 msgsnd sys_msgsnd __ia32_compat_sys_msgsnd
+401 i386 msgrcv sys_msgrcv __ia32_compat_sys_msgrcv
+402 i386 msgctl sys_msgctl __ia32_compat_sys_msgctl
+403 i386 clock_gettime64 sys_clock_gettime __ia32_sys_clock_gettime
+404 i386 clock_settime64 sys_clock_settime __ia32_sys_clock_settime
+405 i386 clock_adjtime64 sys_clock_adjtime __ia32_sys_clock_adjtime
+406 i386 clock_getres_time64 sys_clock_getres __ia32_sys_clock_getres
+407 i386 clock_nanosleep_time64 sys_clock_nanosleep __ia32_sys_clock_nanosleep
+408 i386 timer_gettime64 sys_timer_gettime __ia32_sys_timer_gettime
+409 i386 timer_settime64 sys_timer_settime __ia32_sys_timer_settime
+410 i386 timerfd_gettime64 sys_timerfd_gettime __ia32_sys_timerfd_gettime
+411 i386 timerfd_settime64 sys_timerfd_settime __ia32_sys_timerfd_settime
+412 i386 utimensat_time64 sys_utimensat __ia32_sys_utimensat
+413 i386 pselect6_time64 sys_pselect6 __ia32_compat_sys_pselect6_time64
+414 i386 ppoll_time64 sys_ppoll __ia32_compat_sys_ppoll_time64
+416 i386 io_pgetevents_time64 sys_io_pgetevents __ia32_sys_io_pgetevents
+417 i386 recvmmsg_time64 sys_recvmmsg __ia32_compat_sys_recvmmsg_time64
+418 i386 mq_timedsend_time64 sys_mq_timedsend __ia32_sys_mq_timedsend
+419 i386 mq_timedreceive_time64 sys_mq_timedreceive __ia32_sys_mq_timedreceive
+420 i386 semtimedop_time64 sys_semtimedop __ia32_sys_semtimedop
+421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64
+422 i386 futex_time64 sys_futex __ia32_sys_futex
+423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval
+424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal
+425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
+426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
+427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb2..92ee0b4378d4c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,12 @@
332 common statx __x64_sys_statx
333 common io_pgetevents __x64_sys_io_pgetevents
334 common rseq __x64_sys_rseq
+# don't use numbers 387 through 423, add new calls after the last
+# 'common' entry
+424 common pidfd_send_signal __x64_sys_pidfd_send_signal
+425 common io_uring_setup __x64_sys_io_uring_setup
+426 common io_uring_enter __x64_sys_io_uring_enter
+427 common io_uring_register __x64_sys_io_uring_register
#
# x32-specific system call numbers start at 512 to avoid cache impact
@@ -361,7 +367,7 @@
520 x32 execve __x32_compat_sys_execve/ptregs
521 x32 ptrace __x32_compat_sys_ptrace
522 x32 rt_sigpending __x32_compat_sys_rt_sigpending
-523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait
+523 x32 rt_sigtimedwait __x32_compat_sys_rt_sigtimedwait_time64
524 x32 rt_sigqueueinfo __x32_compat_sys_rt_sigqueueinfo
525 x32 sigaltstack __x32_compat_sys_sigaltstack
526 x32 timer_create __x32_compat_sys_timer_create
@@ -375,7 +381,7 @@
534 x32 preadv __x32_compat_sys_preadv64
535 x32 pwritev __x32_compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo __x32_compat_sys_rt_tgsigqueueinfo
-537 x32 recvmmsg __x32_compat_sys_recvmmsg
+537 x32 recvmmsg __x32_compat_sys_recvmmsg_time64
538 x32 sendmmsg __x32_compat_sys_sendmmsg
539 x32 process_vm_readv __x32_compat_sys_process_vm_readv
540 x32 process_vm_writev __x32_compat_sys_process_vm_writev
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index d50bb4dc06503..62f317c9113af 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -253,15 +253,6 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
return -EOPNOTSUPP;
}
-static const struct perf_event_attr ibs_notsupp = {
- .exclude_user = 1,
- .exclude_kernel = 1,
- .exclude_hv = 1,
- .exclude_idle = 1,
- .exclude_host = 1,
- .exclude_guest = 1,
-};
-
static int perf_ibs_init(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -282,9 +273,6 @@ static int perf_ibs_init(struct perf_event *event)
if (event->pmu != &perf_ibs->pmu)
return -ENOENT;
- if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
- return -EINVAL;
-
if (config & ~perf_ibs->config_mask)
return -EINVAL;
@@ -537,6 +525,7 @@ static struct perf_ibs perf_ibs_fetch = {
.start = perf_ibs_start,
.stop = perf_ibs_stop,
.read = perf_ibs_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
},
.msr = MSR_AMD64_IBSFETCHCTL,
.config_mask = IBS_FETCH_CONFIG_MASK,
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 3210fee27e7f9..7635c23f7d82e 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -223,11 +223,6 @@ static int perf_iommu_event_init(struct perf_event *event)
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
- /* IOMMU counters do not have usr/os/guest/host bits */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_host || event->attr.exclude_guest)
- return -EINVAL;
-
if (event->cpu < 0)
return -EINVAL;
@@ -414,6 +409,7 @@ static const struct pmu iommu_pmu __initconst = {
.read = perf_iommu_read,
.task_ctx_nr = perf_invalid_context,
.attr_groups = amd_iommu_attr_groups,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
static __init int init_one_iommu(unsigned int idx)
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
index 2aefacf5c5b2a..c5ff084551c6f 100644
--- a/arch/x86/events/amd/power.c
+++ b/arch/x86/events/amd/power.c
@@ -136,14 +136,7 @@ static int pmu_event_init(struct perf_event *event)
return -ENOENT;
/* Unsupported modes and filters. */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- /* no sampling */
- event->attr.sample_period)
+ if (event->attr.sample_period)
return -EINVAL;
if (cfg != AMD_POWER_EVENTSEL_PKG)
@@ -226,6 +219,7 @@ static struct pmu pmu_class = {
.start = pmu_event_start,
.stop = pmu_event_stop,
.read = pmu_event_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
static int power_cpu_exit(unsigned int cpu)
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 398df6eaa1094..79cfd3b30ceb4 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -201,11 +201,6 @@ static int amd_uncore_event_init(struct perf_event *event)
if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
return -EINVAL;
- /* NB and Last level cache counters do not have usr/os/guest/host bits */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_host || event->attr.exclude_guest)
- return -EINVAL;
-
/* and we do not enable counter overflow interrupts */
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
@@ -307,6 +302,7 @@ static struct pmu amd_nb_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
static struct pmu amd_llc_pmu = {
@@ -317,6 +313,7 @@ static struct pmu amd_llc_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 374a19712e200..e2b1447192a88 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1995,7 +1995,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
*/
static void free_fake_cpuc(struct cpu_hw_events *cpuc)
{
- kfree(cpuc->shared_regs);
+ intel_cpuc_finish(cpuc);
kfree(cpuc);
}
@@ -2007,14 +2007,11 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
if (!cpuc)
return ERR_PTR(-ENOMEM);
-
- /* only needed, if we have extra_regs */
- if (x86_pmu.extra_regs) {
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- goto error;
- }
cpuc->is_fake = 1;
+
+ if (intel_cpuc_prepare(cpuc, cpu))
+ goto error;
+
return cpuc;
error:
free_fake_cpuc(cpuc);
@@ -2278,6 +2275,19 @@ void perf_check_microcode(void)
x86_pmu.check_microcode();
}
+static int x86_pmu_check_period(struct perf_event *event, u64 value)
+{
+ if (x86_pmu.check_period && x86_pmu.check_period(event, value))
+ return -EINVAL;
+
+ if (value && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, value) > value)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
@@ -2302,6 +2312,7 @@ static struct pmu pmu = {
.event_idx = x86_pmu_event_idx,
.sched_task = x86_pmu_sched_task,
.task_ctx_size = sizeof(struct x86_perf_task_context),
+ .check_period = x86_pmu_check_period,
};
void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index a01ef1b0f8833..7cdd7b13bbda6 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -77,10 +77,12 @@ static size_t buf_size(struct page *page)
}
static void *
-bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+bts_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool overwrite)
{
struct bts_buffer *buf;
struct page *page;
+ int cpu = event->cpu;
int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
unsigned long offset;
size_t size = nr_pages << PAGE_SHIFT;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 40e12cfc87f62..8baa441d8000f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -18,6 +18,7 @@
#include <asm/hardirq.h>
#include <asm/intel-family.h>
#include <asm/apic.h>
+#include <asm/cpu_device_id.h>
#include "../perf_event.h"
@@ -1999,6 +2000,39 @@ static void intel_pmu_nhm_enable_all(int added)
intel_pmu_enable_all(added);
}
+static void intel_set_tfa(struct cpu_hw_events *cpuc, bool on)
+{
+ u64 val = on ? MSR_TFA_RTM_FORCE_ABORT : 0;
+
+ if (cpuc->tfa_shadow != val) {
+ cpuc->tfa_shadow = val;
+ wrmsrl(MSR_TSX_FORCE_ABORT, val);
+ }
+}
+
+static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+ /*
+ * We're going to use PMC3, make sure TFA is set before we touch it.
+ */
+ if (cntr == 3 && !cpuc->is_fake)
+ intel_set_tfa(cpuc, true);
+}
+
+static void intel_tfa_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * If we find PMC3 is no longer used when we enable the PMU, we can
+ * clear TFA.
+ */
+ if (!test_bit(3, cpuc->active_mask))
+ intel_set_tfa(cpuc, false);
+
+ intel_pmu_enable_all(added);
+}
+
static void enable_counter_freeze(void)
{
update_debugctlmsr(get_debugctlmsr() |
@@ -2769,6 +2803,35 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
}
static struct event_constraint *
+dyn_constraint(struct cpu_hw_events *cpuc, struct event_constraint *c, int idx)
+{
+ WARN_ON_ONCE(!cpuc->constraint_list);
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+ struct event_constraint *cx;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ *cx = *c;
+
+ /*
+ * mark constraint as dynamic
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ c = cx;
+ }
+
+ return c;
+}
+
+static struct event_constraint *
intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
int idx, struct event_constraint *c)
{
@@ -2798,27 +2861,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
* only needed when constraint has not yet
* been cloned (marked dynamic)
*/
- if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
- struct event_constraint *cx;
-
- /*
- * grab pre-allocated constraint entry
- */
- cx = &cpuc->constraint_list[idx];
-
- /*
- * initialize dynamic constraint
- * with static constraint
- */
- *cx = *c;
-
- /*
- * mark constraint as dynamic, so we
- * can free it later on
- */
- cx->flags |= PERF_X86_EVENT_DYNAMIC;
- c = cx;
- }
+ c = dyn_constraint(cpuc, c, idx);
/*
* From here on, the constraint is dynamic.
@@ -3206,16 +3249,27 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
- /*
- * If PMU counter has PEBS enabled it is not enough to disable counter
- * on a guest entry since PEBS memory write can overshoot guest entry
- * and corrupt guest memory. Disabling PEBS solves the problem.
- */
- arr[1].msr = MSR_IA32_PEBS_ENABLE;
- arr[1].host = cpuc->pebs_enabled;
- arr[1].guest = 0;
+ if (x86_pmu.flags & PMU_FL_PEBS_ALL)
+ arr[0].guest &= ~cpuc->pebs_enabled;
+ else
+ arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK);
+ *nr = 1;
+
+ if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) {
+ /*
+ * If PMU counter has PEBS enabled it is not enough to
+ * disable counter on a guest entry since PEBS memory
+ * write can overshoot guest entry and corrupt guest
+ * memory. Disabling PEBS solves the problem.
+ *
+ * Don't do this if the CPU already enforces it.
+ */
+ arr[1].msr = MSR_IA32_PEBS_ENABLE;
+ arr[1].host = cpuc->pebs_enabled;
+ arr[1].guest = 0;
+ *nr = 2;
+ }
- *nr = 2;
return arr;
}
@@ -3345,6 +3399,26 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return c;
}
+static bool allow_tsx_force_abort = true;
+
+static struct event_constraint *
+tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c = hsw_get_event_constraints(cpuc, idx, event);
+
+ /*
+ * Without TFA we must not use PMC3.
+ */
+ if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) {
+ c = dyn_constraint(cpuc, c, idx);
+ c->idxmsk64 &= ~(1ULL << 3);
+ c->weight--;
+ }
+
+ return c;
+}
+
/*
* Broadwell:
*
@@ -3398,7 +3472,7 @@ ssize_t intel_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
-struct intel_shared_regs *allocate_shared_regs(int cpu)
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
{
struct intel_shared_regs *regs;
int i;
@@ -3430,23 +3504,24 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
return c;
}
-static int intel_pmu_cpu_prepare(int cpu)
-{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
}
- if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ if (x86_pmu.flags & (PMU_FL_EXCL_CNTRS | PMU_FL_TFA)) {
size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
- cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ cpuc->constraint_list = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu));
if (!cpuc->constraint_list)
goto err_shared_regs;
+ }
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
if (!cpuc->excl_cntrs)
goto err_constraint_list;
@@ -3468,6 +3543,11 @@ err:
return -ENOMEM;
}
+static int intel_pmu_cpu_prepare(int cpu)
+{
+ return intel_cpuc_prepare(&per_cpu(cpu_hw_events, cpu), cpu);
+}
+
static void flip_smm_bit(void *data)
{
unsigned long set = *(unsigned long *)data;
@@ -3542,9 +3622,8 @@ static void intel_pmu_cpu_starting(int cpu)
}
}
-static void free_excl_cntrs(int cpu)
+static void free_excl_cntrs(struct cpu_hw_events *cpuc)
{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
struct intel_excl_cntrs *c;
c = cpuc->excl_cntrs;
@@ -3552,14 +3631,22 @@ static void free_excl_cntrs(int cpu)
if (c->core_id == -1 || --c->refcnt == 0)
kfree(c);
cpuc->excl_cntrs = NULL;
- kfree(cpuc->constraint_list);
- cpuc->constraint_list = NULL;
}
+
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
}
static void intel_pmu_cpu_dying(int cpu)
{
- struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ fini_debug_store_on_cpu(cpu);
+
+ if (x86_pmu.counter_freezing)
+ disable_counter_freeze();
+}
+
+void intel_cpuc_finish(struct cpu_hw_events *cpuc)
+{
struct intel_shared_regs *pc;
pc = cpuc->shared_regs;
@@ -3569,12 +3656,12 @@ static void intel_pmu_cpu_dying(int cpu)
cpuc->shared_regs = NULL;
}
- free_excl_cntrs(cpu);
-
- fini_debug_store_on_cpu(cpu);
+ free_excl_cntrs(cpuc);
+}
- if (x86_pmu.counter_freezing)
- disable_counter_freeze();
+static void intel_pmu_cpu_dead(int cpu)
+{
+ intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3584,6 +3671,11 @@ static void intel_pmu_sched_task(struct perf_event_context *ctx,
intel_pmu_lbr_sched_task(ctx, sched_in);
}
+static int intel_pmu_check_period(struct perf_event *event, u64 value)
+{
+ return intel_pmu_has_bts_period(event, value) ? -EINVAL : 0;
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -3663,6 +3755,9 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
+ .check_period = intel_pmu_check_period,
};
static struct attribute *intel_pmu_attrs[];
@@ -3703,8 +3798,12 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_prepare = intel_pmu_cpu_prepare,
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
+ .cpu_dead = intel_pmu_cpu_dead,
+
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
+
+ .check_period = intel_pmu_check_period,
};
static __init void intel_clovertown_quirk(void)
@@ -3733,36 +3832,62 @@ static __init void intel_clovertown_quirk(void)
x86_pmu.pebs_constraints = NULL;
}
-static int intel_snb_pebs_broken(int cpu)
+static const struct x86_cpu_desc isolation_ucodes[] = {
+ INTEL_CPU_DESC(INTEL_FAM6_HASWELL_CORE, 3, 0x0000001f),
+ INTEL_CPU_DESC(INTEL_FAM6_HASWELL_ULT, 1, 0x0000001e),
+ INTEL_CPU_DESC(INTEL_FAM6_HASWELL_GT3E, 1, 0x00000015),
+ INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 2, 0x00000037),
+ INTEL_CPU_DESC(INTEL_FAM6_HASWELL_X, 4, 0x0000000a),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_CORE, 4, 0x00000023),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_GT3E, 1, 0x00000014),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 2, 0x00000010),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 3, 0x07000009),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 4, 0x0f000009),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_XEON_D, 5, 0x0e000002),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_MOBILE, 3, 0x0000007c),
+ INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_DESKTOP, 3, 0x0000007c),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 9, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 9, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 10, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 11, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_MOBILE, 12, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 10, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 11, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 12, 0x0000004e),
+ INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE_DESKTOP, 13, 0x0000004e),
+ {}
+};
+
+static void intel_check_pebs_isolation(void)
{
- u32 rev = UINT_MAX; /* default to broken for unknown models */
+ x86_pmu.pebs_no_isolation = !x86_cpu_has_min_microcode_rev(isolation_ucodes);
+}
- switch (cpu_data(cpu).x86_model) {
- case INTEL_FAM6_SANDYBRIDGE:
- rev = 0x28;
- break;
+static __init void intel_pebs_isolation_quirk(void)
+{
+ WARN_ON_ONCE(x86_pmu.check_microcode);
+ x86_pmu.check_microcode = intel_check_pebs_isolation;
+ intel_check_pebs_isolation();
+}
- case INTEL_FAM6_SANDYBRIDGE_X:
- switch (cpu_data(cpu).x86_stepping) {
- case 6: rev = 0x618; break;
- case 7: rev = 0x70c; break;
- }
- }
+static const struct x86_cpu_desc pebs_ucodes[] = {
+ INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE, 7, 0x00000028),
+ INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 6, 0x00000618),
+ INTEL_CPU_DESC(INTEL_FAM6_SANDYBRIDGE_X, 7, 0x0000070c),
+ {}
+};
- return (cpu_data(cpu).microcode < rev);
+static bool intel_snb_pebs_broken(void)
+{
+ return !x86_cpu_has_min_microcode_rev(pebs_ucodes);
}
static void intel_snb_check_microcode(void)
{
- int pebs_broken = 0;
- int cpu;
-
- for_each_online_cpu(cpu) {
- if ((pebs_broken = intel_snb_pebs_broken(cpu)))
- break;
- }
-
- if (pebs_broken == x86_pmu.pebs_broken)
+ if (intel_snb_pebs_broken() == x86_pmu.pebs_broken)
return;
/*
@@ -3879,23 +4004,22 @@ static __init void intel_nehalem_quirk(void)
}
}
-static bool intel_glp_counter_freezing_broken(int cpu)
-{
- u32 rev = UINT_MAX; /* default to broken for unknown stepping */
-
- switch (cpu_data(cpu).x86_stepping) {
- case 1:
- rev = 0x28;
- break;
- case 8:
- rev = 0x6;
- break;
- }
+static const struct x86_cpu_desc counter_freezing_ucodes[] = {
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e),
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e),
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008),
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_X, 1, 0x00000028),
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028),
+ INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006),
+ {}
+};
- return (cpu_data(cpu).microcode < rev);
+static bool intel_counter_freezing_broken(void)
+{
+ return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes);
}
-static __init void intel_glp_counter_freezing_quirk(void)
+static __init void intel_counter_freezing_quirk(void)
{
/* Check if it's already disabled */
if (disable_counter_freezing)
@@ -3905,7 +4029,7 @@ static __init void intel_glp_counter_freezing_quirk(void)
* If the system starts with the wrong ucode, leave the
* counter-freezing feature permanently disabled.
*/
- if (intel_glp_counter_freezing_broken(raw_smp_processor_id())) {
+ if (intel_counter_freezing_broken()) {
pr_info("PMU counter freezing disabled due to CPU errata,"
"please upgrade microcode\n");
x86_pmu.counter_freezing = false;
@@ -4055,8 +4179,11 @@ static struct attribute *intel_pmu_caps_attrs[] = {
NULL
};
+static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort);
+
static struct attribute *intel_pmu_attrs[] = {
&dev_attr_freeze_on_smi.attr,
+ NULL, /* &dev_attr_allow_tsx_force_abort.attr.attr */
NULL,
};
@@ -4168,6 +4295,8 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_CORE2_MEROM:
x86_add_quirk(intel_clovertown_quirk);
+ /* fall through */
+
case INTEL_FAM6_CORE2_MEROM_L:
case INTEL_FAM6_CORE2_PENRYN:
case INTEL_FAM6_CORE2_DUNNINGTON:
@@ -4256,6 +4385,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_X:
+ x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
@@ -4282,7 +4412,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
- x86_add_quirk(intel_glp_counter_freezing_quirk);
+ x86_add_quirk(intel_counter_freezing_quirk);
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs,
@@ -4425,6 +4555,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_HASWELL_ULT:
case INTEL_FAM6_HASWELL_GT3E:
x86_add_quirk(intel_ht_bug);
+ x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -4456,6 +4587,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_BROADWELL_XEON_D:
case INTEL_FAM6_BROADWELL_GT3E:
case INTEL_FAM6_BROADWELL_X:
+ x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -4518,6 +4650,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_MOBILE:
case INTEL_FAM6_KABYLAKE_DESKTOP:
+ x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -4549,6 +4682,15 @@ __init int intel_pmu_init(void)
tsx_attr = hsw_tsx_events_attrs;
intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
+
+ if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) {
+ x86_pmu.flags |= PMU_FL_TFA;
+ x86_pmu.get_event_constraints = tfa_get_event_constraints;
+ x86_pmu.enable_all = intel_tfa_pmu_enable_all;
+ x86_pmu.commit_scheduling = intel_tfa_commit_scheduling;
+ intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr;
+ }
+
pr_cont("Skylake events, ");
name = "skylake";
break;
@@ -4700,7 +4842,7 @@ static __init int fixup_ht_bug(void)
hardlockup_detector_perf_restart();
for_each_online_cpu(c)
- free_excl_cntrs(c);
+ free_excl_cntrs(&per_cpu(cpu_hw_events, c));
cpus_read_unlock();
pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index d2e780705c5a2..94a4b7fc75d0e 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -280,13 +280,7 @@ static int cstate_pmu_event_init(struct perf_event *event)
return -ENOENT;
/* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
+ if (event->attr.sample_period) /* no sampling */
return -EINVAL;
if (event->cpu < 0)
@@ -437,7 +431,7 @@ static struct pmu cstate_core_pmu = {
.start = cstate_pmu_event_start,
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
.module = THIS_MODULE,
};
@@ -451,7 +445,7 @@ static struct pmu cstate_pkg_pmu = {
.start = cstate_pmu_event_start,
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
.module = THIS_MODULE,
};
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index e9acf1d2e7b28..10c99ce1feadd 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1628,6 +1628,8 @@ void __init intel_ds_init(void)
x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE;
+ if (x86_pmu.version <= 4)
+ x86_pmu.pebs_no_isolation = 1;
if (x86_pmu.pebs) {
char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
int format = x86_pmu.intel_cap.pebs_format;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index c88ed39582a10..580c1b91c4540 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -931,6 +931,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_ZERO_CALL;
break;
}
+ /* fall through */
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 9494ca68fd9df..fb3a2f13fc709 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1114,10 +1114,11 @@ static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
* Return: Our private PT buffer structure.
*/
static void *
-pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+pt_buffer_setup_aux(struct perf_event *event, void **pages,
+ int nr_pages, bool snapshot)
{
struct pt_buffer *buf;
- int node, ret;
+ int node, ret, cpu = event->cpu;
if (!nr_pages)
return NULL;
@@ -1222,7 +1223,8 @@ static int pt_event_addr_filters_validate(struct list_head *filters)
static void pt_event_addr_filters_sync(struct perf_event *event)
{
struct perf_addr_filters_head *head = perf_event_addr_filters(event);
- unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
+ unsigned long msr_a, msr_b;
+ struct perf_addr_filter_range *fr = event->addr_filter_ranges;
struct pt_filters *filters = event->hw.addr_filters;
struct perf_addr_filter *filter;
int range = 0;
@@ -1231,12 +1233,12 @@ static void pt_event_addr_filters_sync(struct perf_event *event)
return;
list_for_each_entry(filter, &head->list, entry) {
- if (filter->path.dentry && !offs[range]) {
+ if (filter->path.dentry && !fr[range].start) {
msr_a = msr_b = 0;
} else {
/* apply the offset */
- msr_a = filter->offset + offs[range];
- msr_b = filter->size + msr_a - 1;
+ msr_a = fr[range].start;
+ msr_b = msr_a + fr[range].size - 1;
}
filters->filter[range].msr_a = msr_a;
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 91039ffed6333..94dc564146ca8 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -397,13 +397,7 @@ static int rapl_pmu_event_init(struct perf_event *event)
return -EINVAL;
/* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
+ if (event->attr.sample_period) /* no sampling */
return -EINVAL;
/* must be done before validate_group */
@@ -699,6 +693,7 @@ static int __init init_rapl_pmus(void)
rapl_pmus->pmu.stop = rapl_pmu_event_stop;
rapl_pmus->pmu.read = rapl_pmu_event_read;
rapl_pmus->pmu.module = THIS_MODULE;
+ rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
return 0;
}
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 27a461414b306..9fe64c01a2e5a 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -695,14 +695,6 @@ static int uncore_pmu_event_init(struct perf_event *event)
if (pmu->func_id < 0)
return -ENOENT;
- /*
- * Uncore PMU does measure at all privilege level all the time.
- * So it doesn't make sense to specify any exclude bits.
- */
- if (event->attr.exclude_user || event->attr.exclude_kernel ||
- event->attr.exclude_hv || event->attr.exclude_idle)
- return -EINVAL;
-
/* Sampling not supported yet */
if (hwc->sample_period)
return -EINVAL;
@@ -740,6 +732,7 @@ static int uncore_pmu_event_init(struct perf_event *event)
/* fixed counters have event field hardcoded to zero */
hwc->config = 0ULL;
} else if (is_freerunning_event(event)) {
+ hwc->config = event->attr.config;
if (!check_valid_freerunning_event(box, event))
return -EINVAL;
event->hw.idx = UNCORE_PMC_IDX_FREERUNNING;
@@ -800,6 +793,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
.stop = uncore_pmu_event_stop,
.read = uncore_pmu_event_read,
.module = THIS_MODULE,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
} else {
pmu->pmu = *pmu->type->pmu;
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index cb46d602a6b8b..853a49a8ccf67 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -292,8 +292,8 @@ static inline
unsigned int uncore_freerunning_counter(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
- unsigned int idx = uncore_freerunning_idx(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
struct intel_uncore_pmu *pmu = box->pmu;
return pmu->type->freerunning[type].counter_base +
@@ -377,7 +377,7 @@ static inline
unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
return box->pmu->type->freerunning[type].bits;
}
@@ -385,7 +385,7 @@ unsigned int uncore_freerunning_bits(struct intel_uncore_box *box,
static inline int uncore_num_freerunning(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
return box->pmu->type->freerunning[type].num_counters;
}
@@ -399,8 +399,8 @@ static inline int uncore_num_freerunning_types(struct intel_uncore_box *box,
static inline bool check_valid_freerunning_event(struct intel_uncore_box *box,
struct perf_event *event)
{
- unsigned int type = uncore_freerunning_type(event->attr.config);
- unsigned int idx = uncore_freerunning_idx(event->attr.config);
+ unsigned int type = uncore_freerunning_type(event->hw.config);
+ unsigned int idx = uncore_freerunning_idx(event->hw.config);
return (type < uncore_num_freerunning_types(box, event)) &&
(idx < uncore_num_freerunning(box, event));
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 2593b0d7aeee6..13493f43b2473 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -397,13 +397,7 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
return -EINVAL;
/* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
+ if (event->attr.sample_period) /* no sampling */
return -EINVAL;
/*
@@ -448,9 +442,11 @@ static int snb_uncore_imc_event_init(struct perf_event *event)
/* must be done before validate_group */
event->hw.event_base = base;
- event->hw.config = cfg;
event->hw.idx = idx;
+ /* Convert to standard encoding format for freerunning counters */
+ event->hw.config = ((cfg - 1) << 8) | 0x10ff;
+
/* no group validation needed, we have free running counters */
return 0;
@@ -497,6 +493,7 @@ static struct pmu snb_uncore_imc_pmu = {
.start = uncore_pmu_event_start,
.stop = uncore_pmu_event_stop,
.read = uncore_pmu_event_read,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
};
static struct intel_uncore_ops snb_uncore_imc_ops = {
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index c07bee31abe85..b10e04387f380 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1222,6 +1222,8 @@ static struct pci_driver snbep_uncore_pci_driver = {
.id_table = snbep_uncore_pci_ids,
};
+#define NODE_ID_MASK 0x7
+
/*
* build pci bus to socket mapping
*/
@@ -1243,7 +1245,7 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool
err = pci_read_config_dword(ubox_dev, nodeid_loc, &config);
if (err)
break;
- nodeid = config;
+ nodeid = config & NODE_ID_MASK;
/* get the Node ID mapping */
err = pci_read_config_dword(ubox_dev, idmap_loc, &config);
if (err)
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 1b9f85abf9bc1..a878e6286e4af 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -160,13 +160,7 @@ static int msr_event_init(struct perf_event *event)
return -ENOENT;
/* unsupported modes and filters */
- if (event->attr.exclude_user ||
- event->attr.exclude_kernel ||
- event->attr.exclude_hv ||
- event->attr.exclude_idle ||
- event->attr.exclude_host ||
- event->attr.exclude_guest ||
- event->attr.sample_period) /* no sampling */
+ if (event->attr.sample_period) /* no sampling */
return -EINVAL;
if (cfg >= PERF_MSR_EVENT_MAX)
@@ -256,7 +250,7 @@ static struct pmu pmu_msr = {
.start = msr_event_start,
.stop = msr_event_stop,
.read = msr_event_update,
- .capabilities = PERF_PMU_CAP_NO_INTERRUPT,
+ .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
};
static int __init msr_init(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 78d7b7031bfcc..a75955741c504 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -243,6 +243,11 @@ struct cpu_hw_events {
int excl_thread_id; /* 0 or 1 */
/*
+ * SKL TSX_FORCE_ABORT shadow
+ */
+ u64 tfa_shadow;
+
+ /*
* AMD specific bits
*/
struct amd_nb *amd_nb;
@@ -601,13 +606,14 @@ struct x86_pmu {
/*
* Intel DebugStore bits
*/
- unsigned int bts :1,
- bts_active :1,
- pebs :1,
- pebs_active :1,
- pebs_broken :1,
- pebs_prec_dist :1,
- pebs_no_tlb :1;
+ unsigned int bts :1,
+ bts_active :1,
+ pebs :1,
+ pebs_active :1,
+ pebs_broken :1,
+ pebs_prec_dist :1,
+ pebs_no_tlb :1,
+ pebs_no_isolation :1;
int pebs_record_size;
int pebs_buffer_size;
void (*drain_pebs)(struct pt_regs *regs);
@@ -646,6 +652,11 @@ struct x86_pmu {
* Intel host/guest support (KVM)
*/
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
+
+ /*
+ * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
+ */
+ int (*check_period) (struct perf_event *event, u64 period);
};
struct x86_perf_task_context {
@@ -676,6 +687,7 @@ do { \
#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
+#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -857,7 +869,7 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
-static inline bool intel_pmu_has_bts(struct perf_event *event)
+static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int hw_event, bts_event;
@@ -868,7 +880,14 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
- return hw_event == bts_event && hwc->sample_period == 1;
+ return hw_event == bts_event && period == 1;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ return intel_pmu_has_bts_period(event, hwc->sample_period);
}
int intel_pmu_save_and_restart(struct perf_event *event);
@@ -877,7 +896,8 @@ struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event);
-struct intel_shared_regs *allocate_shared_regs(int cpu);
+extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu);
+extern void intel_cpuc_finish(struct cpu_hw_events *cpuc);
int intel_pmu_init(void);
@@ -1013,9 +1033,13 @@ static inline int intel_pmu_init(void)
return 0;
}
-static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
+static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
+{
+ return 0;
+}
+
+static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
{
- return NULL;
}
static inline int is_ht_workaround_enabled(void)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index 7abb09e2eeb81..6461a16b45594 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -96,6 +96,7 @@ void __percpu **hyperv_pcpu_input_arg;
EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg);
u32 hv_max_vp_index;
+EXPORT_SYMBOL_GPL(hv_max_vp_index);
static int hv_cpu_init(unsigned int cpu)
{
@@ -406,6 +407,13 @@ void hyperv_cleanup(void)
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
+ /*
+ * Reset the hypercall page reference before resetting the page,
+ * so that hypercall operations fail safely rather than
+ * panicking the kernel by using an invalid hypercall page.
+ */
+ hv_hypercall_pg = NULL;
+
/* Reset the hypercall page */
hypercall_msr.as_uint64 = 0;
wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f65b78d32f5eb..3c135084e1eb9 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -39,82 +39,10 @@
static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file *);
-#ifdef CONFIG_COREDUMP
-static int aout_core_dump(struct coredump_params *);
-
-static unsigned long get_dr(int n)
-{
- struct perf_event *bp = current->thread.ptrace_bps[n];
- return bp ? bp->hw.info.address : 0;
-}
-
-/*
- * fill in the user structure for a core dump..
- */
-static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
-{
- u32 fs, gs;
- memset(dump, 0, sizeof(*dump));
-
-/* changed the size calculations - should hopefully work better. lbt */
- dump->magic = CMAGIC;
- dump->start_code = 0;
- dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
- dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
- dump->u_dsize = ((unsigned long)
- (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
- dump->u_dsize -= dump->u_tsize;
- dump->u_debugreg[0] = get_dr(0);
- dump->u_debugreg[1] = get_dr(1);
- dump->u_debugreg[2] = get_dr(2);
- dump->u_debugreg[3] = get_dr(3);
- dump->u_debugreg[6] = current->thread.debugreg6;
- dump->u_debugreg[7] = current->thread.ptrace_dr7;
-
- if (dump->start_stack < 0xc0000000) {
- unsigned long tmp;
-
- tmp = (unsigned long) (0xc0000000 - dump->start_stack);
- dump->u_ssize = tmp >> PAGE_SHIFT;
- }
-
- dump->regs.ebx = regs->bx;
- dump->regs.ecx = regs->cx;
- dump->regs.edx = regs->dx;
- dump->regs.esi = regs->si;
- dump->regs.edi = regs->di;
- dump->regs.ebp = regs->bp;
- dump->regs.eax = regs->ax;
- dump->regs.ds = current->thread.ds;
- dump->regs.es = current->thread.es;
- savesegment(fs, fs);
- dump->regs.fs = fs;
- savesegment(gs, gs);
- dump->regs.gs = gs;
- dump->regs.orig_eax = regs->orig_ax;
- dump->regs.eip = regs->ip;
- dump->regs.cs = regs->cs;
- dump->regs.eflags = regs->flags;
- dump->regs.esp = regs->sp;
- dump->regs.ss = regs->ss;
-
-#if 1 /* FIXME */
- dump->u_fpvalid = 0;
-#else
- dump->u_fpvalid = dump_fpu(regs, &dump->i387);
-#endif
-}
-
-#endif
-
static struct linux_binfmt aout_format = {
.module = THIS_MODULE,
.load_binary = load_aout_binary,
.load_shlib = load_aout_library,
-#ifdef CONFIG_COREDUMP
- .core_dump = aout_core_dump,
-#endif
- .min_coredump = PAGE_SIZE
};
static int set_brk(unsigned long start, unsigned long end)
@@ -126,91 +54,6 @@ static int set_brk(unsigned long start, unsigned long end)
return vm_brk(start, end - start);
}
-#ifdef CONFIG_COREDUMP
-/*
- * These are the only things you should do on a core-file: use only these
- * macros to write out all the necessary info.
- */
-
-#include <linux/coredump.h>
-
-#define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
-#define START_STACK(u) (u.start_stack)
-
-/*
- * Routine writes a core dump image in the current directory.
- * Currently only a stub-function.
- *
- * Note that setuid/setgid files won't make a core-dump if the uid/gid
- * changed due to the set[u|g]id. It's enforced by the "current->mm->dumpable"
- * field, which also makes sure the core-dumps won't be recursive if the
- * dumping of the process results in another error..
- */
-
-static int aout_core_dump(struct coredump_params *cprm)
-{
- mm_segment_t fs;
- int has_dumped = 0;
- unsigned long dump_start, dump_size;
- struct user32 dump;
-
- fs = get_fs();
- set_fs(KERNEL_DS);
- has_dumped = 1;
- strncpy(dump.u_comm, current->comm, sizeof(current->comm));
- dump.u_ar0 = offsetof(struct user32, regs);
- dump.signal = cprm->siginfo->si_signo;
- dump_thread32(cprm->regs, &dump);
-
- /*
- * If the size of the dump file exceeds the rlimit, then see
- * what would happen if we wrote the stack, but not the data
- * area.
- */
- if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_dsize = 0;
-
- /* Make sure we have enough room to write the stack and data areas. */
- if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
- dump.u_ssize = 0;
-
- /* make sure we actually have a data and stack area to dump */
- set_fs(USER_DS);
- if (!access_ok((void *) (unsigned long)START_DATA(dump),
- dump.u_dsize << PAGE_SHIFT))
- dump.u_dsize = 0;
- if (!access_ok((void *) (unsigned long)START_STACK(dump),
- dump.u_ssize << PAGE_SHIFT))
- dump.u_ssize = 0;
-
- set_fs(KERNEL_DS);
- /* struct user */
- if (!dump_emit(cprm, &dump, sizeof(dump)))
- goto end_coredump;
- /* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
- goto end_coredump;
- /* now we start writing out the user space info */
- set_fs(USER_DS);
- /* Dump the data area */
- if (dump.u_dsize != 0) {
- dump_start = START_DATA(dump);
- dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_emit(cprm, (void *)dump_start, dump_size))
- goto end_coredump;
- }
- /* Now prepare to dump the stack area */
- if (dump.u_ssize != 0) {
- dump_start = START_STACK(dump);
- dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_emit(cprm, (void *)dump_start, dump_size))
- goto end_coredump;
- }
-end_coredump:
- set_fs(fs);
- return has_dumped;
-}
-#endif
/*
* create_aout_tables() parses the env- and arg-strings in new user
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
deleted file mode 100644
index 7d3ece8bfb616..0000000000000
--- a/arch/x86/include/asm/a.out-core.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* a.out coredump register dumper
- *
- * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _ASM_X86_A_OUT_CORE_H
-#define _ASM_X86_A_OUT_CORE_H
-
-#ifdef __KERNEL__
-#ifdef CONFIG_X86_32
-
-#include <linux/user.h>
-#include <linux/elfcore.h>
-#include <linux/mm_types.h>
-
-#include <asm/debugreg.h>
-
-/*
- * fill in the user structure for an a.out core dump
- */
-static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
-{
-/* changed the size calculations - should hopefully work better. lbt */
- dump->magic = CMAGIC;
- dump->start_code = 0;
- dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
- dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
- dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
- >> PAGE_SHIFT;
- dump->u_dsize -= dump->u_tsize;
- dump->u_ssize = 0;
- aout_dump_debugregs(dump);
-
- if (dump->start_stack < TASK_SIZE)
- dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
- >> PAGE_SHIFT;
-
- dump->regs.bx = regs->bx;
- dump->regs.cx = regs->cx;
- dump->regs.dx = regs->dx;
- dump->regs.si = regs->si;
- dump->regs.di = regs->di;
- dump->regs.bp = regs->bp;
- dump->regs.ax = regs->ax;
- dump->regs.ds = (u16)regs->ds;
- dump->regs.es = (u16)regs->es;
- dump->regs.fs = (u16)regs->fs;
- dump->regs.gs = get_user_gs(regs);
- dump->regs.orig_ax = regs->orig_ax;
- dump->regs.ip = regs->ip;
- dump->regs.cs = (u16)regs->cs;
- dump->regs.flags = regs->flags;
- dump->regs.sp = regs->sp;
- dump->regs.ss = (u16)regs->ss;
-
- dump->u_fpvalid = dump_fpu(regs, &dump->i387);
-}
-
-#endif /* CONFIG_X86_32 */
-#endif /* __KERNEL__ */
-#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0660e14690c8b..4c74073a19ccd 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -94,13 +94,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alt_total_slen alt_end_marker"b-661b"
#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
-#define __OLDINSTR(oldinstr, num) \
+#define OLDINSTR(oldinstr, num) \
+ "# ALT: oldnstr\n" \
"661:\n\t" oldinstr "\n662:\n" \
+ "# ALT: padding\n" \
".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
- "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
-
-#define OLDINSTR(oldinstr, num) \
- __OLDINSTR(oldinstr, num) \
+ "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" \
alt_end_marker ":\n"
/*
@@ -116,11 +115,23 @@ static inline int alternatives_text_reserved(void *start, void *end)
* additionally longer than the first replacement alternative.
*/
#define OLDINSTR_2(oldinstr, num1, num2) \
+ "# ALT: oldinstr2\n" \
"661:\n\t" oldinstr "\n662:\n" \
+ "# ALT: padding2\n" \
".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \
"(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \
alt_end_marker ":\n"
+#define OLDINSTR_3(oldinsn, n1, n2, n3) \
+ "# ALT: oldinstr3\n" \
+ "661:\n\t" oldinsn "\n662:\n" \
+ "# ALT: padding3\n" \
+ ".skip -((" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \
+ " - (" alt_slen ")) > 0) * " \
+ "(" alt_max_short(alt_max_short(alt_rlen(n1), alt_rlen(n2)), alt_rlen(n3)) \
+ " - (" alt_slen ")), 0x90\n" \
+ alt_end_marker ":\n"
+
#define ALTINSTR_ENTRY(feature, num) \
" .long 661b - .\n" /* label */ \
" .long " b_replacement(num)"f - .\n" /* new instruction */ \
@@ -129,8 +140,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
" .byte " alt_rlen(num) "\n" /* replacement len */ \
" .byte " alt_pad_len "\n" /* pad len */
-#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
- b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+ "# ALT: replacement " #num "\n" \
+ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, feature) \
@@ -153,6 +165,19 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
".popsection\n"
+#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
+ OLDINSTR_3(oldinsn, 1, 2, 3) \
+ ".pushsection .altinstructions,\"a\"\n" \
+ ALTINSTR_ENTRY(feat1, 1) \
+ ALTINSTR_ENTRY(feat2, 2) \
+ ALTINSTR_ENTRY(feat3, 3) \
+ ".popsection\n" \
+ ".pushsection .altinstr_replacement, \"ax\"\n" \
+ ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \
+ ".popsection\n"
+
/*
* Alternative instructions for different CPU types or capabilities.
*
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 1908214b91257..ce92c4acc9133 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -7,7 +7,6 @@
#include <asm-generic/asm-prototypes.h>
-#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/special_insns.h>
#include <asm/preempt.h>
diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index baeba05671268..3417110574c12 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -11,4 +11,32 @@
extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match);
+/*
+ * Match specific microcode revisions.
+ *
+ * vendor/family/model/stepping must all be set.
+ *
+ * Only checks against the boot CPU. When mixed-stepping configs are
+ * valid for a CPU model, add a quirk for every valid stepping and
+ * do the fine-tuning in the quirk handler.
+ */
+
+struct x86_cpu_desc {
+ __u8 x86_family;
+ __u8 x86_vendor;
+ __u8 x86_model;
+ __u8 x86_stepping;
+ __u32 x86_microcode_rev;
+};
+
+#define INTEL_CPU_DESC(mod, step, rev) { \
+ .x86_family = 6, \
+ .x86_vendor = X86_VENDOR_INTEL, \
+ .x86_model = mod, \
+ .x86_stepping = step, \
+ .x86_microcode_rev = rev, \
+}
+
+extern bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table);
+
#endif
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 6d61225247114..981ff94796484 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -344,6 +344,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 107283b1eb1e4..606a4b6a9812c 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -170,7 +170,6 @@ static inline bool efi_runtime_supported(void)
return false;
}
-extern struct console early_efi_console;
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index fa2c93cb42a27..fb04a3ded7ddb 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -137,37 +137,25 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
- else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ else
return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
- /* See comment in copy_fxregs_to_kernel() below. */
- return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
}
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
- if (IS_ENABLED(CONFIG_X86_32)) {
+ if (IS_ENABLED(CONFIG_X86_32))
kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- } else {
- if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
- kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
- } else {
- /* See comment in copy_fxregs_to_kernel() below. */
- kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
- }
- }
+ else
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
{
if (IS_ENABLED(CONFIG_X86_32))
return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
- else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ else
return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
- /* See comment in copy_fxregs_to_kernel() below. */
- return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
- "m" (*fx));
}
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
@@ -184,34 +172,8 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
if (IS_ENABLED(CONFIG_X86_32))
asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
- else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
+ else
asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
- else {
- /* Using "rex64; fxsave %0" is broken because, if the memory
- * operand uses any extended registers for addressing, a second
- * REX prefix will be generated (to the assembler, rex64
- * followed by semicolon is a separate instruction), and hence
- * the 64-bitness is lost.
- *
- * Using "fxsaveq %0" would be the ideal choice, but is only
- * supported starting with gas 2.16.
- *
- * Using, as a workaround, the properly prefixed form below
- * isn't accepted by any binutils version so far released,
- * complaining that the same type of prefix is used twice if
- * an extended register is needed for addressing (fix submitted
- * to mainline 2005-11-21).
- *
- * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
- *
- * This, however, we can work around by forcing the compiler to
- * select an addressing mode that doesn't require extended
- * registers.
- */
- asm volatile( "rex64/fxsave (%[fx])"
- : "=m" (fpu->state.fxsave)
- : [fx] "R" (&fpu->state.fxsave));
- }
}
/* These macros all use (%edi)/(%rdi) as the single memory argument. */
@@ -414,6 +376,13 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
copy_xregs_to_kernel(&fpu->state.xsave);
+
+ /*
+ * AVX512 state is tracked here because its use is
+ * known to slow the max clock speed of the core.
+ */
+ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+ fpu->avx512_timestamp = jiffies;
return 1;
}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 202c53918ecfa..2e32e178e0645 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -303,6 +303,13 @@ struct fpu {
unsigned char initialized;
/*
+ * @avx512_timestamp:
+ *
+ * Records the timestamp of AVX512 use during last context switch.
+ */
+ unsigned long avx512_timestamp;
+
+ /*
* @state:
*
* In-memory copy of all FPU registers that we save/restore
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 705dafc2d11ab..2bdbbbcfa393f 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -841,7 +841,7 @@ union hv_gpa_page_range {
* count is equal with how many entries of union hv_gpa_page_range can
* be populated into the input parameter page.
*/
-#define HV_MAX_FLUSH_REP_COUNT (PAGE_SIZE - 2 * sizeof(u64) / \
+#define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) / \
sizeof(union hv_gpa_page_range))
struct hv_guest_mapping_flush_list {
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 0dd6b0f4000e8..9f15384c504a4 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -6,7 +6,7 @@
* "Big Core" Processors (Branded as Core, Xeon, etc...)
*
* The "_X" parts are generally the EP and EX Xeons, or the
- * "Extreme" ones, like Broadwell-E.
+ * "Extreme" ones, like Broadwell-E, or Atom microserver.
*
* While adding a new CPUID for a new microarchitecture, add a new
* group to keep logically sorted out in chronological order. Within
@@ -52,6 +52,8 @@
#define INTEL_FAM6_CANNONLAKE_MOBILE 0x66
+#define INTEL_FAM6_ICELAKE_MOBILE 0x7E
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
@@ -71,6 +73,7 @@
#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
+#define INTEL_FAM6_ATOM_TREMONT_X 0x86 /* Jacobsville */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4660ce90de7ff..a5db4475e72db 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
+#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>
#define KVM_MAX_VCPUS 288
@@ -137,23 +138,23 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define ASYNC_PF_PER_VCPU 64
enum kvm_reg {
- VCPU_REGS_RAX = 0,
- VCPU_REGS_RCX = 1,
- VCPU_REGS_RDX = 2,
- VCPU_REGS_RBX = 3,
- VCPU_REGS_RSP = 4,
- VCPU_REGS_RBP = 5,
- VCPU_REGS_RSI = 6,
- VCPU_REGS_RDI = 7,
+ VCPU_REGS_RAX = __VCPU_REGS_RAX,
+ VCPU_REGS_RCX = __VCPU_REGS_RCX,
+ VCPU_REGS_RDX = __VCPU_REGS_RDX,
+ VCPU_REGS_RBX = __VCPU_REGS_RBX,
+ VCPU_REGS_RSP = __VCPU_REGS_RSP,
+ VCPU_REGS_RBP = __VCPU_REGS_RBP,
+ VCPU_REGS_RSI = __VCPU_REGS_RSI,
+ VCPU_REGS_RDI = __VCPU_REGS_RDI,
#ifdef CONFIG_X86_64
- VCPU_REGS_R8 = 8,
- VCPU_REGS_R9 = 9,
- VCPU_REGS_R10 = 10,
- VCPU_REGS_R11 = 11,
- VCPU_REGS_R12 = 12,
- VCPU_REGS_R13 = 13,
- VCPU_REGS_R14 = 14,
- VCPU_REGS_R15 = 15,
+ VCPU_REGS_R8 = __VCPU_REGS_R8,
+ VCPU_REGS_R9 = __VCPU_REGS_R9,
+ VCPU_REGS_R10 = __VCPU_REGS_R10,
+ VCPU_REGS_R11 = __VCPU_REGS_R11,
+ VCPU_REGS_R12 = __VCPU_REGS_R12,
+ VCPU_REGS_R13 = __VCPU_REGS_R13,
+ VCPU_REGS_R14 = __VCPU_REGS_R14,
+ VCPU_REGS_R15 = __VCPU_REGS_R15,
#endif
VCPU_REGS_RIP,
NR_VCPU_REGS
@@ -299,6 +300,7 @@ union kvm_mmu_extended_role {
unsigned int cr4_smap:1;
unsigned int cr4_smep:1;
unsigned int cr4_la57:1;
+ unsigned int maxphyaddr:6;
};
};
@@ -318,6 +320,7 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
bool unsync;
+ bool mmio_cached;
/*
* The following two entries are used to key the shadow page in the
@@ -332,10 +335,6 @@ struct kvm_mmu_page {
int root_count; /* Currently serving as active root */
unsigned int unsync_children;
struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-
- /* The page is obsolete if mmu_valid_gen != kvm->arch.mmu_valid_gen. */
- unsigned long mmu_valid_gen;
-
DECLARE_BITMAP(unsync_child_bitmap, 512);
#ifdef CONFIG_X86_32
@@ -397,6 +396,7 @@ struct kvm_mmu {
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte);
hpa_t root_hpa;
+ gpa_t root_cr3;
union kvm_mmu_role mmu_role;
u8 root_level;
u8 shadow_root_level;
@@ -846,13 +846,11 @@ struct kvm_arch {
unsigned int n_requested_mmu_pages;
unsigned int n_max_mmu_pages;
unsigned int indirect_shadow_pages;
- unsigned long mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
- struct list_head zapped_obsolete_pages;
struct kvm_page_track_notifier_node mmu_sp_tracker;
struct kvm_page_track_notifier_head track_notifier_head;
@@ -1253,7 +1251,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
diff --git a/arch/x86/include/asm/kvm_vcpu_regs.h b/arch/x86/include/asm/kvm_vcpu_regs.h
new file mode 100644
index 0000000000000..1af2cb59233b5
--- /dev/null
+++ b/arch/x86/include/asm/kvm_vcpu_regs.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_KVM_VCPU_REGS_H
+#define _ASM_X86_KVM_VCPU_REGS_H
+
+#define __VCPU_REGS_RAX 0
+#define __VCPU_REGS_RCX 1
+#define __VCPU_REGS_RDX 2
+#define __VCPU_REGS_RBX 3
+#define __VCPU_REGS_RSP 4
+#define __VCPU_REGS_RBP 5
+#define __VCPU_REGS_RSI 6
+#define __VCPU_REGS_RDI 7
+
+#ifdef CONFIG_X86_64
+#define __VCPU_REGS_R8 8
+#define __VCPU_REGS_R9 9
+#define __VCPU_REGS_R10 10
+#define __VCPU_REGS_R11 11
+#define __VCPU_REGS_R12 12
+#define __VCPU_REGS_R13 13
+#define __VCPU_REGS_R14 14
+#define __VCPU_REGS_R15 15
+#endif
+
+#endif /* _ASM_X86_KVM_VCPU_REGS_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c1a812bd5a27d..22d05e3835f0b 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -48,6 +48,7 @@
#define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */
#define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */
+#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */
/*
* McaX field if set indicates a given bank supports MCA extensions:
@@ -307,11 +308,17 @@ enum smca_bank_types {
SMCA_FP, /* Floating Point */
SMCA_L3_CACHE, /* L3 Cache */
SMCA_CS, /* Coherent Slave */
+ SMCA_CS_V2, /* Coherent Slave */
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
+ SMCA_PSP_V2, /* Platform Security Processor */
SMCA_SMU, /* System Management Unit */
+ SMCA_SMU_V2, /* System Management Unit */
+ SMCA_MP5, /* Microprocessor 5 Unit */
+ SMCA_NBIO, /* Northbridge IO Unit */
+ SMCA_PCIE, /* PCI Express Unit */
N_SMCA_BANK_TYPES
};
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 0ca50611e8cec..19d18fae6ec66 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -178,6 +178,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+/*
+ * Init a new mm. Used on mm copies, like at fork()
+ * and on mm's that are brand-new, like at execve().
+ */
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
@@ -228,8 +232,22 @@ do { \
} while (0)
#endif
+static inline void arch_dup_pkeys(struct mm_struct *oldmm,
+ struct mm_struct *mm)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
+ return;
+
+ /* Duplicate the oldmm pkey state in mm: */
+ mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
+ mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+#endif
+}
+
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
+ arch_dup_pkeys(oldmm, mm);
paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8e40c2446fd19..ca5bc0eacb95f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -666,6 +666,12 @@
#define MSR_IA32_TSC_DEADLINE 0x000006E0
+
+#define MSR_TSX_FORCE_ABORT 0x0000010F
+
+#define MSR_TFA_RTM_FORCE_ABORT_BIT 0
+#define MSR_TFA_RTM_FORCE_ABORT BIT_ULL(MSR_TFA_RTM_FORCE_ABORT_BIT)
+
/* P4/Xeon+ specific */
#define MSR_IA32_MCG_EAX 0x00000180
#define MSR_IA32_MCG_EBX 0x00000181
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 91e4cf189914a..5cc3930cb465e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -217,6 +217,8 @@ static __always_inline unsigned long long rdtsc(void)
*/
static __always_inline unsigned long long rdtsc_ordered(void)
{
+ DECLARE_ARGS(val, low, high);
+
/*
* The RDTSC instruction is not ordered relative to memory
* access. The Intel SDM and the AMD APM are both vague on this
@@ -227,9 +229,19 @@ static __always_inline unsigned long long rdtsc_ordered(void)
* ordering guarantees as reading from a global memory location
* that some other imaginary CPU is updating continuously with a
* time stamp.
+ *
+ * Thus, use the preferred barrier on the respective CPU, aiming for
+ * RDTSCP as the default.
*/
- barrier_nospec();
- return rdtsc();
+ asm volatile(ALTERNATIVE_3("rdtsc",
+ "mfence; rdtsc", X86_FEATURE_MFENCE_RDTSC,
+ "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
+ "rdtscp", X86_FEATURE_RDTSCP)
+ : EAX_EDX_RET(val, low, high)
+ /* RDTSCP clobbers ECX with MSR_TSC_AUX. */
+ :: "ecx");
+
+ return EAX_EDX_VAL(val, low, high);
}
static inline unsigned long long native_read_pmc(int counter)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a97f28d914d56..c25c38a05c1c9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd)
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep)
{
pteval_t ret;
- ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep);
+ ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep);
return (pte_t) { .pte = ret };
}
-static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep, pte_t old_pte, pte_t pte)
{
+
if (sizeof(pteval_t) > sizeof(long))
/* 5 arg words */
- pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte);
+ pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte);
else
PVOP_VCALL4(mmu.ptep_modify_prot_commit,
- mm, addr, ptep, pte.pte);
+ vma, addr, ptep, pte.pte);
}
static inline void set_pte(pte_t *ptep, pte_t pte)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 488c59686a733..2474e434a6f72 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -55,6 +55,7 @@ struct task_struct;
struct cpumask;
struct flush_tlb_info;
struct mmu_gather;
+struct vm_area_struct;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -254,9 +255,9 @@ struct pv_mmu_ops {
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
- pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
+ pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep);
- void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
+ void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
struct paravirt_callee_save pte_val;
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 662963681ea6c..e662f987dfa2c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/pat.h>
#include <asm/x86_init.h>
@@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus)
int node;
node = __pcibus_to_node(bus);
- return (node == -1) ? cpu_online_mask :
+ return (node == NUMA_NO_NODE) ? cpu_online_mask :
cpumask_of_node(node);
}
#endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 40616e8052924..2779ace16d23f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1065,7 +1065,7 @@ static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- native_set_pmd(pmdp, pmd);
+ set_pmd(pmdp, pmd);
}
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9c85b54bf03ca..0bb5663156218 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -259,8 +259,7 @@ extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
#define gup_fast_permitted gup_fast_permitted
-static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
- int write)
+static inline bool gup_fast_permitted(unsigned long start, int nr_pages)
{
unsigned long len, end;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 33051436c8645..2bb3a648fc12c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -742,7 +742,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
-void early_trap_pf_init(void);
/* Defined in head.S */
extern struct desc_ptr early_gdt_descr;
diff --git a/arch/x86/include/asm/refcount.h b/arch/x86/include/asm/refcount.h
index dbaed55c1c244..232f856e0db06 100644
--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -67,16 +67,30 @@ static __always_inline void refcount_dec(refcount_t *r)
static __always_inline __must_check
bool refcount_sub_and_test(unsigned int i, refcount_t *r)
{
- return GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl",
+ bool ret = GEN_BINARY_SUFFIXED_RMWcc(LOCK_PREFIX "subl",
REFCOUNT_CHECK_LT_ZERO,
r->refs.counter, e, "er", i, "cx");
+
+ if (ret) {
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ return false;
}
static __always_inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
- return GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl",
- REFCOUNT_CHECK_LT_ZERO,
- r->refs.counter, e, "cx");
+ bool ret = GEN_UNARY_SUFFIXED_RMWcc(LOCK_PREFIX "decl",
+ REFCOUNT_CHECK_LT_ZERO,
+ r->refs.counter, e, "cx");
+
+ if (ret) {
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ return false;
}
static __always_inline __must_check
diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl_sched.h
index 54990fe2a3ae8..f6b7fe2833cc7 100644
--- a/arch/x86/include/asm/resctrl_sched.h
+++ b/arch/x86/include/asm/resctrl_sched.h
@@ -2,7 +2,7 @@
#ifndef _ASM_X86_RESCTRL_SCHED_H
#define _ASM_X86_RESCTRL_SCHED_H
-#ifdef CONFIG_RESCTRL
+#ifdef CONFIG_X86_CPU_RESCTRL
#include <linux/sched.h>
#include <linux/jump_label.h>
@@ -88,6 +88,6 @@ static inline void resctrl_sched_in(void)
static inline void resctrl_sched_in(void) {}
-#endif /* CONFIG_RESCTRL */
+#endif /* CONFIG_X86_CPU_RESCTRL */
#endif /* _ASM_X86_RESCTRL_SCHED_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index a77445d1b0348..1954dd5552a2e 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -25,7 +25,6 @@
#define KERNEL_DS MAKE_MM_SEG(-1UL)
#define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX)
-#define get_ds() (KERNEL_DS)
#define get_fs() (current->thread.addr_limit)
static inline void set_fs(mm_segment_t fs)
{
@@ -35,10 +34,7 @@ static inline void set_fs(mm_segment_t fs)
}
#define segment_eq(a, b) ((a).seg == (b).seg)
-
#define user_addr_max() (current->thread.addr_limit.seg)
-#define __addr_ok(addr) \
- ((unsigned long __force)(addr) < user_addr_max())
/*
* Test whether a block of memory is a valid user space address.
@@ -76,7 +72,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
#endif
/**
- * access_ok: - Checks if a user space pointer is valid
+ * access_ok - Checks if a user space pointer is valid
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
@@ -85,12 +81,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
*
* Checks if a pointer to a block of memory in user space is valid.
*
- * Returns true (nonzero) if the memory block may be valid, false (zero)
- * if it is definitely invalid.
- *
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
+ *
+ * Return: true (nonzero) if the memory block may be valid, false (zero)
+ * if it is definitely invalid.
*/
#define access_ok(addr, size) \
({ \
@@ -135,7 +131,7 @@ extern int __get_user_bad(void);
__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
/**
- * get_user: - Get a simple variable from user space.
+ * get_user - Get a simple variable from user space.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
@@ -149,7 +145,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
* @ptr must have pointer-to-simple-variable type, and the result of
* dereferencing @ptr must be assignable to @x without a cast.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
/*
@@ -227,7 +223,7 @@ extern void __put_user_4(void);
extern void __put_user_8(void);
/**
- * put_user: - Write a simple value into user space.
+ * put_user - Write a simple value into user space.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
@@ -241,7 +237,7 @@ extern void __put_user_8(void);
* @ptr must have pointer-to-simple-variable type, and @x must be assignable
* to the result of dereferencing @ptr.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define put_user(x, ptr) \
({ \
@@ -284,7 +280,7 @@ do { \
__put_user_goto(x, ptr, "l", "k", "ir", label); \
break; \
case 8: \
- __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \
+ __put_user_goto_u64(x, ptr, label); \
break; \
default: \
__put_user_bad(); \
@@ -431,8 +427,10 @@ do { \
({ \
__label__ __pu_label; \
int __pu_err = -EFAULT; \
+ __typeof__(*(ptr)) __pu_val; \
+ __pu_val = x; \
__uaccess_begin(); \
- __put_user_size((x), (ptr), (size), __pu_label); \
+ __put_user_size(__pu_val, (ptr), (size), __pu_label); \
__pu_err = 0; \
__pu_label: \
__uaccess_end(); \
@@ -501,7 +499,7 @@ struct __large_struct { unsigned long buf[100]; };
} while (0)
/**
- * __get_user: - Get a simple variable from user space, with less checking.
+ * __get_user - Get a simple variable from user space, with less checking.
* @x: Variable to store result.
* @ptr: Source address, in user space.
*
@@ -518,7 +516,7 @@ struct __large_struct { unsigned long buf[100]; };
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
* On error, the variable @x is set to zero.
*/
@@ -526,7 +524,7 @@ struct __large_struct { unsigned long buf[100]; };
__get_user_nocheck((x), (ptr), sizeof(*(ptr)))
/**
- * __put_user: - Write a simple value into user space, with less checking.
+ * __put_user - Write a simple value into user space, with less checking.
* @x: Value to copy to user space.
* @ptr: Destination address, in user space.
*
@@ -543,7 +541,7 @@ struct __large_struct { unsigned long buf[100]; };
* Caller must check the pointer with access_ok() before calling this
* function.
*
- * Returns zero on success, or -EFAULT on error.
+ * Return: zero on success, or -EFAULT on error.
*/
#define __put_user(x, ptr) \
@@ -711,7 +709,7 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t
{
if (unlikely(!access_ok(ptr,len)))
return 0;
- __uaccess_begin();
+ __uaccess_begin_nospec();
return 1;
}
#define user_access_begin(a,b) user_access_begin(a,b)
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
index dc4ed8bc2382c..146859efd83c6 100644
--- a/arch/x86/include/asm/unistd.h
+++ b/arch/x86/include/asm/unistd.h
@@ -23,8 +23,8 @@
# include <asm/unistd_64.h>
# include <asm/unistd_64_x32.h>
-# define __ARCH_WANT_COMPAT_SYS_TIME
-# define __ARCH_WANT_SYS_UTIME32
+# define __ARCH_WANT_SYS_TIME
+# define __ARCH_WANT_SYS_UTIME
# define __ARCH_WANT_COMPAT_SYS_PREADV64
# define __ARCH_WANT_COMPAT_SYS_PWRITEV64
# define __ARCH_WANT_COMPAT_SYS_PREADV64V2
@@ -48,8 +48,8 @@
# define __ARCH_WANT_SYS_SIGPENDING
# define __ARCH_WANT_SYS_SIGPROCMASK
# define __ARCH_WANT_SYS_SOCKETCALL
-# define __ARCH_WANT_SYS_TIME
-# define __ARCH_WANT_SYS_UTIME
+# define __ARCH_WANT_SYS_TIME32
+# define __ARCH_WANT_SYS_UTIME32
# define __ARCH_WANT_SYS_WAITPID
# define __ARCH_WANT_SYS_FORK
# define __ARCH_WANT_SYS_VFORK
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 1f86e1b0a5cdc..499578f7e6d7b 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -23,6 +23,12 @@ struct unwind_state {
#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
bool got_irq;
unsigned long *bp, *orig_sp, ip;
+ /*
+ * If non-NULL: The current frame is incomplete and doesn't contain a
+ * valid BP. When looking for the next frame, use this instead of the
+ * non-existent saved BP.
+ */
+ unsigned long *next_bp;
struct pt_regs *regs;
#else
unsigned long *sp;
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index e652a7cc61863..8cfccc3cbbf42 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -48,7 +48,8 @@ enum {
BIOS_STATUS_SUCCESS = 0,
BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
BIOS_STATUS_EINVAL = -EINVAL,
- BIOS_STATUS_UNAVAIL = -EBUSY
+ BIOS_STATUS_UNAVAIL = -EBUSY,
+ BIOS_STATUS_ABORT = -EINTR,
};
/* Address map parameters */
@@ -140,7 +141,6 @@ enum uv_memprotect {
*/
extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
-extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
extern s64 uv_bios_freq_base(u64, u64 *);
@@ -151,11 +151,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
-#ifdef CONFIG_EFI
extern void uv_bios_init(void);
-#else
-void uv_bios_init(void) { }
-#endif
extern unsigned long sn_rtc_cycles_per_second;
extern int uv_type;
@@ -167,4 +163,9 @@ extern long system_serial_number;
extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
+/*
+ * EFI runtime lock; cf. firmware/efi/runtime-wrappers.c for details
+ */
+extern struct semaphore __efi_uv_runtime_lock;
+
#endif /* _ASM_X86_UV_BIOS_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index ef05bea7010de..de6f0d59a24f4 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -332,15 +332,11 @@ HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
return _hypercall4(int, update_va_mapping, va,
new_val.pte, new_val.pte >> 32, flags);
}
-extern int __must_check xen_event_channel_op_compat(int, void *);
static inline int
HYPERVISOR_event_channel_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, event_channel_op, cmd, arg);
- if (unlikely(rc == -ENOSYS))
- rc = xen_event_channel_op_compat(cmd, arg);
- return rc;
+ return _hypercall2(int, event_channel_op, cmd, arg);
}
static inline int
@@ -355,15 +351,10 @@ HYPERVISOR_console_io(int cmd, int count, char *str)
return _hypercall3(int, console_io, cmd, count, str);
}
-extern int __must_check xen_physdev_op_compat(int, void *);
-
static inline int
HYPERVISOR_physdev_op(int cmd, void *arg)
{
- int rc = _hypercall2(int, physdev_op, cmd, arg);
- if (unlikely(rc == -ENOSYS))
- rc = xen_physdev_op_compat(cmd, arg);
- return rc;
+ return _hypercall2(int, physdev_op, cmd, arg);
}
static inline int
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
index f6648e9928b31..efe701b7c6ceb 100644
--- a/arch/x86/include/uapi/asm/Kbuild
+++ b/arch/x86/include/uapi/asm/Kbuild
@@ -3,3 +3,4 @@ include include/uapi/asm-generic/Kbuild.asm
generated-y += unistd_32.h
generated-y += unistd_64.h
generated-y += unistd_x32.h
+generic-y += socket.h
diff --git a/arch/x86/include/uapi/asm/socket.h b/arch/x86/include/uapi/asm/socket.h
deleted file mode 100644
index 6b71384b9d8b4..0000000000000
--- a/arch/x86/include/uapi/asm/socket.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/socket.h>
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2624de16cd7ab..8dcbf68907146 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -935,6 +935,9 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
#define HPET_RESOURCE_NAME_SIZE 9
hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE,
SMP_CACHE_BYTES);
+ if (!hpet_res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 0c26b1b44e51a..4203d4f0c68d0 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -90,7 +90,7 @@ ret_point:
.data
ALIGN
ENTRY(saved_magic) .long 0
-ENTRY(saved_eip) .long 0
+saved_eip: .long 0
# saved registers
saved_idt: .long 0,0
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 50b8ed0317a34..510fa12aab73a 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -125,12 +125,12 @@ ENTRY(do_suspend_lowlevel)
ENDPROC(do_suspend_lowlevel)
.data
-ENTRY(saved_rbp) .quad 0
-ENTRY(saved_rsi) .quad 0
-ENTRY(saved_rdi) .quad 0
-ENTRY(saved_rbx) .quad 0
+saved_rbp: .quad 0
+saved_rsi: .quad 0
+saved_rdi: .quad 0
+saved_rbx: .quad 0
-ENTRY(saved_rip) .quad 0
-ENTRY(saved_rsp) .quad 0
+saved_rip: .quad 0
+saved_rsp: .quad 0
ENTRY(saved_magic) .quad 0
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ebeac487a20c7..9a79c7808f9cc 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,7 @@
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
+#include <linux/kprobes.h>
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
@@ -393,10 +394,10 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
continue;
}
- DPRINTK("feat: %d*32+%d, old: (%px len: %d), repl: (%px, len: %d), pad: %d",
+ DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
a->cpuid >> 5,
a->cpuid & 0x1f,
- instr, a->instrlen,
+ instr, instr, a->instrlen,
replacement, a->replacementlen, a->padlen);
DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
@@ -764,8 +765,8 @@ int poke_int3_handler(struct pt_regs *regs)
regs->ip = (unsigned long) bp_int3_handler;
return 1;
-
}
+NOKPROBE_SYMBOL(poke_int3_handler);
/**
* text_poke_bp() -- update instructions on live kernel on SMP
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2953bbf05c085..53aa234a6803f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -812,6 +812,7 @@ static int irq_polarity(int idx)
return IOAPIC_POL_HIGH;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+ /* fall through */
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to do gcc stupidity */
return IOAPIC_POL_LOW;
@@ -859,6 +860,7 @@ static int irq_trigger(int idx)
return IOAPIC_EDGE;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+ /* fall through */
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to do gcc stupidity */
return IOAPIC_LEVEL;
@@ -2579,6 +2581,8 @@ static struct resource * __init ioapic_setup_resources(void)
n *= nr_ioapics;
mem = memblock_alloc(n, SMP_CACHE_BYTES);
+ if (!mem)
+ panic("%s: Failed to allocate %lu bytes\n", __func__, n);
res = (void *)mem;
mem += sizeof(struct resource) * nr_ioapics;
@@ -2623,6 +2627,9 @@ fake_ioapic_page:
#endif
ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE,
PAGE_SIZE);
+ if (!ioapic_phys)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index a555da0941570..1e225528f0d7f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -27,6 +27,7 @@
#include <linux/crash_dump.h>
#include <linux/reboot.h>
#include <linux/memory.h>
+#include <linux/numa.h>
#include <asm/uv/uv_mmrs.h>
#include <asm/uv/uv_hub.h>
@@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void)
}
/* Set socket -> node values: */
- lnid = -1;
+ lnid = NUMA_NO_NODE;
for_each_present_cpu(cpu) {
int nid = cpu_to_node(cpu);
int apicid, sockid;
@@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void)
new_hub->pnode = 0xffff;
new_hub->numa_blade_id = uv_node_to_blade_id(nodeid);
- new_hub->memory_nid = -1;
+ new_hub->memory_nid = NUMA_NO_NODE;
new_hub->nr_possible_cpus = 0;
new_hub->nr_online_cpus = 0;
}
@@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void)
uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid);
uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++;
- if (uv_cpu_hub_info(cpu)->memory_nid == -1)
+ if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE)
uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu);
/* Init memoryless node: */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ac78f90aea562..cfd24f9f76144 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
-obj-$(CONFIG_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 69f6bbb41be0b..01004bfb1a1bc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -819,11 +819,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
static void init_amd_zn(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_ZEN);
- /*
- * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
- * all up to and including B1.
- */
- if (c->x86_model <= 1 && c->x86_stepping <= 1)
+
+ /* Fix erratum 1076: CPB feature bit not being set in CPUID. */
+ if (!cpu_has(c, X86_FEATURE_CPB))
set_cpu_cap(c, X86_FEATURE_CPB);
}
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 8654b8b0c8484..2da82eff0eb4f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -71,7 +71,7 @@ void __init check_bugs(void)
* identify_boot_cpu() initialized SMT support information, let the
* core code know.
*/
- cpu_smt_check_topology_early();
+ cpu_smt_check_topology();
if (!IS_ENABLED(CONFIG_SMP)) {
pr_info("CPU: ");
@@ -215,7 +215,7 @@ static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
SPECTRE_V2_USER_NONE;
-#ifdef RETPOLINE
+#ifdef CONFIG_RETPOLINE
static bool spectre_v2_bad_module;
bool retpoline_module_ok(bool has_retpoline)
@@ -798,15 +798,25 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
if (task_spec_ssb_force_disable(task))
return -EPERM;
task_clear_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
task_update_spec_tif(task);
break;
case PR_SPEC_DISABLE:
task_set_spec_ssb_disable(task);
+ task_clear_spec_ssb_noexec(task);
task_update_spec_tif(task);
break;
case PR_SPEC_FORCE_DISABLE:
task_set_spec_ssb_disable(task);
task_set_spec_ssb_force_disable(task);
+ task_clear_spec_ssb_noexec(task);
+ task_update_spec_tif(task);
+ break;
+ case PR_SPEC_DISABLE_NOEXEC:
+ if (task_spec_ssb_force_disable(task))
+ return -EPERM;
+ task_set_spec_ssb_disable(task);
+ task_set_spec_ssb_noexec(task);
task_update_spec_tif(task);
break;
default:
@@ -885,6 +895,8 @@ static int ssb_prctl_get(struct task_struct *task)
case SPEC_STORE_BYPASS_PRCTL:
if (task_spec_ssb_force_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
+ if (task_spec_ssb_noexec(task))
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;
if (task_spec_ssb_disable(task))
return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index c4d1023fb0abc..395d46f78582b 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -248,6 +248,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
switch (leaf) {
case 1:
l1 = &l1i;
+ /* fall through */
case 0:
if (!l1->val)
return;
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 3fed38812eea3..6dd78d8235e44 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -48,3 +48,34 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
return NULL;
}
EXPORT_SYMBOL(x86_match_cpu);
+
+static const struct x86_cpu_desc *
+x86_match_cpu_with_stepping(const struct x86_cpu_desc *match)
+{
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ const struct x86_cpu_desc *m;
+
+ for (m = match; m->x86_family | m->x86_model; m++) {
+ if (c->x86_vendor != m->x86_vendor)
+ continue;
+ if (c->x86 != m->x86_family)
+ continue;
+ if (c->x86_model != m->x86_model)
+ continue;
+ if (c->x86_stepping != m->x86_stepping)
+ continue;
+ return m;
+ }
+ return NULL;
+}
+
+bool x86_cpu_has_min_microcode_rev(const struct x86_cpu_desc *table)
+{
+ const struct x86_cpu_desc *res = x86_match_cpu_with_stepping(table);
+
+ if (!res || res->x86_microcode_rev > boot_cpu_data.microcode)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(x86_cpu_has_min_microcode_rev);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 89298c83de53b..e64de5149e50e 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -88,11 +88,17 @@ static struct smca_bank_name smca_names[] = {
[SMCA_FP] = { "floating_point", "Floating Point Unit" },
[SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" },
[SMCA_CS] = { "coherent_slave", "Coherent Slave" },
+ [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" },
[SMCA_PIE] = { "pie", "Power, Interrupts, etc." },
[SMCA_UMC] = { "umc", "Unified Memory Controller" },
[SMCA_PB] = { "param_block", "Parameter Block" },
[SMCA_PSP] = { "psp", "Platform Security Processor" },
+ [SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
+ [SMCA_SMU_V2] = { "smu", "System Management Unit" },
+ [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" },
+ [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" },
+ [SMCA_PCIE] = { "pcie", "PCI Express Unit" },
};
static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init =
@@ -138,30 +144,42 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 },
/* ZN Core (HWID=0xB0) MCA types */
- { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF },
+ { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
/* HWID 0xB0 MCATYPE 0x4 is Reserved */
- { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF },
+ { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF },
{ SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F },
{ SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF },
/* Data Fabric MCA types */
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF },
- { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF },
+ { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F },
+ { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF },
/* Unified Memory Controller MCA type */
- { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F },
+ { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF },
/* Parameter Block MCA type */
{ SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 },
/* Platform Security Processor MCA type */
{ SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 },
+ { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF },
/* System Management Unit MCA type */
{ SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
+ { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF },
+
+ /* Microprocessor 5 Unit MCA type */
+ { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF },
+
+ /* Northbridge IO Unit MCA type */
+ { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F },
+
+ /* PCI Express Unit MCA type */
+ { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F },
};
struct smca_bank smca_banks[MAX_NR_BANKS];
@@ -545,6 +563,40 @@ out:
return offset;
}
+/*
+ * Turn off MC4_MISC thresholding banks on all family 0x15 models since
+ * they're not supported there.
+ */
+void disable_err_thresholding(struct cpuinfo_x86 *c)
+{
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
+ 0x00000413, /* MC4_MISC0 */
+ 0xc0000408, /* MC4_MISC1 */
+ };
+
+ if (c->x86 != 0x15)
+ return;
+
+ rdmsrl(MSR_K7_HWCR, hwcr);
+
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
+
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
+
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -552,6 +604,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;
+ disable_err_thresholding(c);
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 1d9b3ce662a0b..c038e5c00a59f 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -64,11 +64,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
#define CPER_CREATOR_MCE \
- UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
#define CPER_SECTION_TYPE_MCE \
- UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
/*
* CPER specification (in UEFI specification 2.3 appendix N) requires
@@ -135,7 +135,7 @@ retry:
goto out;
/* try to skip other type records in storage */
else if (rc != sizeof(rcd) ||
- uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
+ !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE))
goto retry;
memcpy(m, &rcd.mce, sizeof(*m));
rc = sizeof(*m);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 672c7225cb1b8..b7fb541a4873f 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -784,6 +784,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
quirk_no_way_out(i, m, regs);
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ m->bank = i;
mce_read_aux(m, i);
*msg = tmp;
return 1;
@@ -1611,36 +1612,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
if (c->x86 == 0x15 && c->x86_model <= 0xf)
mce_flags.overflow_recov = 1;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 hwcr;
- bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
-
- rdmsrl(MSR_K7_HWCR, hwcr);
-
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
-
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
-
- /* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
- msr_clear_bit(msrs[i], 62);
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index dc3e26e905a32..65201e180fe0e 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -165,6 +165,11 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
),
+ MCESEV(
+ PANIC, "Instruction fetch error in kernel",
+ SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
+ KERNEL
+ ),
#endif
MCESEV(
PANIC, "Action required: unknown MCACOD",
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 51adde0a0f1a0..e1f3ba19ba54f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -855,7 +855,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
if (!p) {
return ret;
} else {
- if (boot_cpu_data.microcode == p->patch_id)
+ if (boot_cpu_data.microcode >= p->patch_id)
return ret;
ret = UCODE_NEW;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e81a2db42df7b..3fa238a137d2d 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -328,6 +328,18 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
# endif
+
+ /*
+ * Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
+ * set x2apic destination mode to physical mode when x2apic is available
+ * and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
+ * have 8-bit APIC id.
+ */
+# ifdef CONFIG_X86_X2APIC
+ if (x2apic_supported())
+ x2apic_phys = 1;
+# endif
+
#endif
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 3668c5df90c69..5bd011737272d 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -296,7 +296,7 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
unsigned long sizek)
{
unsigned long hole_basek, hole_sizek;
- unsigned long second_basek, second_sizek;
+ unsigned long second_sizek;
unsigned long range0_basek, range0_sizek;
unsigned long range_basek, range_sizek;
unsigned long chunk_sizek;
@@ -304,7 +304,6 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
hole_basek = 0;
hole_sizek = 0;
- second_basek = 0;
second_sizek = 0;
chunk_sizek = state->chunk_sizek;
gran_sizek = state->gran_sizek;
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index 6895049ceef73..4a06c37b9cf11 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,4 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_RESCTRL) += core.o rdtgroup.o monitor.o
-obj-$(CONFIG_RESCTRL) += ctrlmondata.o pseudo_lock.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o
+obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o pseudo_lock.o
CFLAGS_pseudo_lock.o = -I$(src)
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 822b7db634ee7..e49b77283924a 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/kernfs.h>
+#include <linux/fs_context.h>
#include <linux/jump_label.h>
#define MSR_IA32_L3_QOS_CFG 0xc81
@@ -40,6 +41,21 @@
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
+
+struct rdt_fs_context {
+ struct kernfs_fs_context kfc;
+ bool enable_cdpl2;
+ bool enable_cdpl3;
+ bool enable_mba_mbps;
+};
+
+static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct rdt_fs_context, kfc);
+}
+
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
/**
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 14bed6af83773..604c0e3bcc830 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -34,13 +34,6 @@
#include "pseudo_lock_event.h"
/*
- * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
- * prefetcher state. Details about this register can be found in the MSR
- * tables for specific platforms found in Intel's SDM.
- */
-#define MSR_MISC_FEATURE_CONTROL 0x000001a4
-
-/*
* The bits needed to disable hardware prefetching varies based on the
* platform. During initialization we will discover which bits to use.
*/
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 8388adf241b22..399601eda8e43 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -24,6 +24,7 @@
#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
+#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
#include <linux/seq_buf.h>
@@ -32,6 +33,7 @@
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/task_work.h>
+#include <linux/user_namespace.h>
#include <uapi/linux/magic.h>
@@ -1858,46 +1860,6 @@ static void cdp_disable_all(void)
cdpl2_disable();
}
-static int parse_rdtgroupfs_options(char *data)
-{
- char *token, *o = data;
- int ret = 0;
-
- while ((token = strsep(&o, ",")) != NULL) {
- if (!*token) {
- ret = -EINVAL;
- goto out;
- }
-
- if (!strcmp(token, "cdp")) {
- ret = cdpl3_enable();
- if (ret)
- goto out;
- } else if (!strcmp(token, "cdpl2")) {
- ret = cdpl2_enable();
- if (ret)
- goto out;
- } else if (!strcmp(token, "mba_MBps")) {
- if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- ret = set_mba_sc(true);
- else
- ret = -EINVAL;
- if (ret)
- goto out;
- } else {
- ret = -EINVAL;
- goto out;
- }
- }
-
- return 0;
-
-out:
- pr_err("Invalid mount option \"%s\"\n", token);
-
- return ret;
-}
-
/*
* We don't allow rdtgroup directories to be created anywhere
* except the root directory. Thus when looking for the rdtgroup
@@ -1969,13 +1931,27 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
struct rdtgroup *prgrp,
struct kernfs_node **mon_data_kn);
-static struct dentry *rdt_mount(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data)
+static int rdt_enable_ctx(struct rdt_fs_context *ctx)
+{
+ int ret = 0;
+
+ if (ctx->enable_cdpl2)
+ ret = cdpl2_enable();
+
+ if (!ret && ctx->enable_cdpl3)
+ ret = cdpl3_enable();
+
+ if (!ret && ctx->enable_mba_mbps)
+ ret = set_mba_sc(true);
+
+ return ret;
+}
+
+static int rdt_get_tree(struct fs_context *fc)
{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct rdt_domain *dom;
struct rdt_resource *r;
- struct dentry *dentry;
int ret;
cpus_read_lock();
@@ -1984,53 +1960,42 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
* resctrl file system can only be mounted once.
*/
if (static_branch_unlikely(&rdt_enable_key)) {
- dentry = ERR_PTR(-EBUSY);
+ ret = -EBUSY;
goto out;
}
- ret = parse_rdtgroupfs_options(data);
- if (ret) {
- dentry = ERR_PTR(ret);
+ ret = rdt_enable_ctx(ctx);
+ if (ret < 0)
goto out_cdp;
- }
closid_init();
ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
- if (ret) {
- dentry = ERR_PTR(ret);
- goto out_cdp;
- }
+ if (ret < 0)
+ goto out_mba;
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
NULL, "mon_groups",
&kn_mongrp);
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret < 0)
goto out_info;
- }
kernfs_get(kn_mongrp);
ret = mkdir_mondata_all(rdtgroup_default.kn,
&rdtgroup_default, &kn_mondata);
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret < 0)
goto out_mongrp;
- }
kernfs_get(kn_mondata);
rdtgroup_default.mon.mon_data_kn = kn_mondata;
}
ret = rdt_pseudo_lock_init();
- if (ret) {
- dentry = ERR_PTR(ret);
+ if (ret)
goto out_mondata;
- }
- dentry = kernfs_mount(fs_type, flags, rdt_root,
- RDTGROUP_SUPER_MAGIC, NULL);
- if (IS_ERR(dentry))
+ ret = kernfs_get_tree(fc);
+ if (ret < 0)
goto out_psl;
if (rdt_alloc_capable)
@@ -2059,14 +2024,95 @@ out_mongrp:
kernfs_remove(kn_mongrp);
out_info:
kernfs_remove(kn_info);
+out_mba:
+ if (ctx->enable_mba_mbps)
+ set_mba_sc(false);
out_cdp:
cdp_disable_all();
out:
rdt_last_cmd_clear();
mutex_unlock(&rdtgroup_mutex);
cpus_read_unlock();
+ return ret;
+}
+
+enum rdt_param {
+ Opt_cdp,
+ Opt_cdpl2,
+ Opt_mba_mpbs,
+ nr__rdt_params
+};
+
+/*
+ * Mount options understood by resctrl. The option strings are user-visible
+ * mount ABI, so they must stay identical to the names accepted before the
+ * fs_context conversion: "cdp", "cdpl2" and "mba_MBps".
+ */
+static const struct fs_parameter_spec rdt_param_specs[] = {
+ fsparam_flag("cdp", Opt_cdp),
+ fsparam_flag("cdpl2", Opt_cdpl2),
+ fsparam_flag("mba_MBps", Opt_mba_mpbs),
+ {}
+};
+
+static const struct fs_parameter_description rdt_fs_parameters = {
+ .name = "rdt",
+ .specs = rdt_param_specs,
+};
+
+static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- return dentry;
+ switch (opt) {
+ case Opt_cdp:
+ ctx->enable_cdpl3 = true;
+ return 0;
+ case Opt_cdpl2:
+ ctx->enable_cdpl2 = true;
+ return 0;
+ case Opt_mba_mpbs:
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ return -EINVAL;
+ ctx->enable_mba_mbps = true;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void rdt_fs_context_free(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations rdt_fs_context_ops = {
+ .free = rdt_fs_context_free,
+ .parse_param = rdt_parse_param,
+ .get_tree = rdt_get_tree,
+};
+
+static int rdt_init_fs_context(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->kfc.root = rdt_root;
+ ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
+ fc->fs_private = &ctx->kfc;
+ fc->ops = &rdt_fs_context_ops;
+ if (fc->user_ns)
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(&init_user_ns);
+ fc->global = true;
+ return 0;
}
static int reset_all_ctrls(struct rdt_resource *r)
@@ -2239,9 +2285,10 @@ static void rdt_kill_sb(struct super_block *sb)
}
static struct file_system_type rdt_fs_type = {
- .name = "resctrl",
- .mount = rdt_mount,
- .kill_sb = rdt_kill_sb,
+ .name = "resctrl",
+ .init_fs_context = rdt_init_fs_context,
+ .parameters = &rdt_fs_parameters,
+ .kill_sb = rdt_kill_sb,
};
static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c8b07d8ea5a2b..17ffc869cab82 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -470,6 +470,7 @@ int crash_load_segments(struct kimage *image)
kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret) {
vfree((void *)image->arch.elf_headers);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 50895c2f937d1..2879e234e1936 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/sort.h>
+#include <linux/memory_hotplug.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
@@ -671,21 +672,18 @@ __init void e820__reallocate_tables(void)
int size;
size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table->nr_entries;
- n = kmalloc(size, GFP_KERNEL);
+ n = kmemdup(e820_table, size, GFP_KERNEL);
BUG_ON(!n);
- memcpy(n, e820_table, size);
e820_table = n;
size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_kexec->nr_entries;
- n = kmalloc(size, GFP_KERNEL);
+ n = kmemdup(e820_table_kexec, size, GFP_KERNEL);
BUG_ON(!n);
- memcpy(n, e820_table_kexec, size);
e820_table_kexec = n;
size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry)*e820_table_firmware->nr_entries;
- n = kmalloc(size, GFP_KERNEL);
+ n = kmemdup(e820_table_firmware, size, GFP_KERNEL);
BUG_ON(!n);
- memcpy(n, e820_table_firmware, size);
e820_table_firmware = n;
}
@@ -778,7 +776,7 @@ u64 __init e820__memblock_alloc_reserved(u64 size, u64 align)
{
u64 addr;
- addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
+ addr = memblock_phys_alloc(size, align);
if (addr) {
e820__range_update_kexec(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
pr_info("update e820_table_kexec for e820__memblock_alloc_reserved()\n");
@@ -881,6 +879,10 @@ static int __init parse_memopt(char *p)
e820__range_remove(mem_size, ULLONG_MAX - mem_size, E820_TYPE_RAM, 1);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ max_mem_size = mem_size;
+#endif
+
return 0;
}
early_param("mem", parse_memopt);
@@ -1095,6 +1097,9 @@ void __init e820__reserve_resources(void)
res = memblock_alloc(sizeof(*res) * e820_table->nr_entries,
SMP_CACHE_BYTES);
+ if (!res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(*res) * e820_table->nr_entries);
e820_res = res;
for (i = 0; i < e820_table->nr_entries; i++) {
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 374a52fa52969..9b33904251a9f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -388,10 +388,6 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_EARLY_PRINTK_EFI
- if (!strncmp(buf, "efi", 3))
- early_console_register(&early_efi_console, keep);
-#endif
#ifdef CONFIG_EARLY_PRINTK_USB_XDBC
if (!strncmp(buf, "xdbc", 4))
early_xdbc_parse_parameter(buf + 4);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 9cc108456d0be..d7432c2b10514 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -669,7 +669,7 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size)
return false;
}
-static int init_xstate_size(void)
+static int __init init_xstate_size(void)
{
/* Recompute the context size for enabled features: */
unsigned int possible_xstate_size;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 8257a59704ae9..ef49517f6bb24 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -49,7 +49,7 @@ int ftrace_arch_code_modify_post_process(void)
union ftrace_code_union {
char code[MCOUNT_INSN_SIZE];
struct {
- unsigned char e8;
+ unsigned char op;
int offset;
} __attribute__((packed));
};
@@ -59,20 +59,23 @@ static int ftrace_calc_offset(long ip, long addr)
return (int)(addr - ip);
}
-static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *
+ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
{
static union ftrace_code_union calc;
- calc.e8 = 0xe8;
+ calc.op = op;
calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
- /*
- * No locking needed, this must be called via kstop_machine
- * which in essence is like running on a uniprocessor machine.
- */
return calc.code;
}
+static unsigned char *
+ftrace_call_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe8, ip, addr);
+}
+
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
@@ -269,7 +272,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
return ret;
}
-static int is_ftrace_caller(unsigned long ip)
+static nokprobe_inline int is_ftrace_caller(unsigned long ip)
{
if (ip == ftrace_update_func)
return 1;
@@ -299,6 +302,7 @@ int ftrace_int3_handler(struct pt_regs *regs)
return 1;
}
+NOKPROBE_SYMBOL(ftrace_int3_handler);
static int ftrace_write(unsigned long ip, const char *val, int size)
{
@@ -664,22 +668,6 @@ int __init ftrace_dyn_arch_init(void)
return 0;
}
-#if defined(CONFIG_X86_64) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
-{
- static union ftrace_code_union calc;
-
- /* Jmp not a call (ignore the .e8) */
- calc.e8 = 0xe9;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
-
- /*
- * ftrace external locks synchronize the access to the static variable.
- */
- return calc.code;
-}
-#endif
-
/* Currently only x86_64 supports dynamic trampolines */
#ifdef CONFIG_X86_64
@@ -891,8 +879,8 @@ static void *addr_from_call(void *ptr)
return NULL;
/* Make sure this is a call */
- if (WARN_ON_ONCE(calc.e8 != 0xe8)) {
- pr_warn("Expected e8, got %x\n", calc.e8);
+ if (WARN_ON_ONCE(calc.op != 0xe8)) {
+ pr_warn("Expected e8, got %x\n", calc.op);
return NULL;
}
@@ -963,6 +951,11 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
+static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+{
+ return ftrace_text_replace(0xe9, ip, addr);
+}
+
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
unsigned char *new;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index b0acb22e5a465..dfd3aca82c61c 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -21,10 +21,6 @@
#define HPET_MASK CLOCKSOURCE_MASK(32)
-/* FSEC = 10^-15
- NSEC = 10^-9 */
-#define FSEC_PER_NSEC 1000000L
-
#define HPET_DEV_USED_BIT 2
#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
#define HPET_DEV_VALID 0x8
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 34a5c17151487..ff9bfd40429ef 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -261,12 +261,8 @@ static int arch_build_bp_info(struct perf_event *bp,
* allow kernel breakpoints at all.
*/
if (attr->bp_addr >= TASK_SIZE_MAX) {
-#ifdef CONFIG_KPROBES
if (within_kprobe_blacklist(attr->bp_addr))
return -EINVAL;
-#else
- return -EINVAL;
-#endif
}
hw->type = X86_BREAKPOINT_EXECUTE;
@@ -279,6 +275,7 @@ static int arch_build_bp_info(struct perf_event *bp,
hw->len = X86_BREAKPOINT_LEN_X;
return 0;
}
+ /* fall through */
default:
return -EINVAL;
}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 278cd07228dd8..22f60dd26460c 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -167,6 +167,9 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
struct efi_info *current_ei = &boot_params.efi_info;
struct efi_info *ei = &params->efi_info;
+ if (!efi_enabled(EFI_RUNTIME_SERVICES))
+ return 0;
+
if (!current_ei->efi_memmap_size)
return 0;
@@ -215,6 +218,9 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
params->screen_info.ext_mem_k = 0;
params->alt_mem_k = 0;
+ /* Always fill in RSDP: it is either 0 or a valid value */
+ params->acpi_rsdp_addr = boot_params.acpi_rsdp_addr;
+
/* Default APM info */
memset(&params->apm_bios_info, 0, sizeof(params->apm_bios_info));
@@ -253,7 +259,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
efi_setup_data_offset);
#endif
-
/* Setup EDD info */
memcpy(params->eddbuf, boot_params.eddbuf,
EDDMAXNR * sizeof(struct edd_info));
@@ -434,6 +439,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.memsz = PAGE_ALIGN(header->init_size);
kbuf.buf_align = header->kernel_alignment;
kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
@@ -448,6 +454,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
kbuf.bufsz = kbuf.memsz = initrd_len;
kbuf.buf_align = PAGE_SIZE;
kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
@@ -531,9 +538,17 @@ static int bzImage64_cleanup(void *loader_data)
#ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
{
- return verify_pefile_signature(kernel, kernel_len,
- VERIFY_USE_SECONDARY_KEYRING,
- VERIFYING_KEXEC_PE_SIGNATURE);
+ int ret;
+
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_SECONDARY_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ if (ret == -ENOKEY && IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING)) {
+ ret = verify_pefile_signature(kernel, kernel_len,
+ VERIFY_USE_PLATFORM_KEYRING,
+ VERIFYING_KEXEC_PE_SIGNATURE);
+ }
+ return ret;
}
#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5db08425063ed..4ff6b4cdb9419 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -467,6 +467,7 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
ptr = &remcomInBuffer[1];
if (kgdb_hex2long(&ptr, &addr))
linux_regs->ip = addr;
+ /* fall through */
case 'D':
case 'k':
/* clear the trace bit */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4ba75afba5271..a034cb808e7eb 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -1028,6 +1028,13 @@ NOKPROBE_SYMBOL(kprobe_fault_handler);
int __init arch_populate_kprobe_blacklist(void)
{
+ int ret;
+
+ ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
+ (unsigned long)__irqentry_text_end);
+ if (ret)
+ return ret;
+
return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
(unsigned long)__entry_text_end);
}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 6adf6e6c29339..f142629520158 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -97,6 +97,7 @@ static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
}
asm (
+ ".pushsection .rodata\n"
"optprobe_template_func:\n"
".global optprobe_template_entry\n"
"optprobe_template_entry:\n"
@@ -136,8 +137,7 @@ asm (
#endif
".global optprobe_template_end\n"
"optprobe_template_end:\n"
- ".type optprobe_template_func, @function\n"
- ".size optprobe_template_func, .-optprobe_template_func\n");
+ ".popsection\n");
void optprobe_template_func(void);
STACK_FRAME_NON_STANDARD(optprobe_template_func);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index ba4bfb7f6a369..5c93a65ee1e5c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -457,6 +457,7 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
#else
u64 ipi_bitmap = 0;
#endif
+ long ret;
if (cpumask_empty(mask))
return;
@@ -482,8 +483,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
max = apic_id < max ? max : apic_id;
} else {
- kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
min = max = apic_id;
ipi_bitmap = 0;
}
@@ -491,8 +493,9 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector)
}
if (ipi_bitmap) {
- kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+ ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+ WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
}
local_irq_restore(flags);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e811d4d1c8247..904494b924c13 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -104,12 +104,8 @@ static u64 kvm_sched_clock_read(void)
static inline void kvm_sched_clock_init(bool stable)
{
- if (!stable) {
- pv_ops.time.sched_clock = kvm_clock_read;
+ if (!stable)
clear_sched_clock_stable();
- return;
- }
-
kvm_sched_clock_offset = kvm_clock_read();
pv_ops.time.sched_clock = kvm_sched_clock_read;
@@ -355,6 +351,20 @@ void __init kvmclock_init(void)
machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
kvm_get_preset_lpj();
+
+ /*
+ * X86_FEATURE_NONSTOP_TSC means the TSC runs at a constant rate
+ * across P/T states and does not stop in deep C-states.
+ *
+ * Invariant TSC exposed by host means kvmclock is not necessary:
+ * can use TSC as clocksource.
+ *
+ */
+ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+ boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+ !check_tsc_unstable())
+ kvm_clock.rating = 299;
+
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
pv_info.name = "KVM";
}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 4c8acdfdc5a74..ceba408ea9824 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -352,6 +352,8 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
+ u64 sme_mask = sme_me_mask;
+
VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_top_pgt);
vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
@@ -364,6 +366,7 @@ void arch_crash_save_vmcoreinfo(void)
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_NUMBER(sme_mask);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 90ae0ca510837..58ac7be52c7a6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -255,6 +255,18 @@ void arch_setup_new_exec(void)
/* If cpuid was previously disabled for this task, re-enable it. */
if (test_thread_flag(TIF_NOCPUID))
enable_cpuid();
+
+ /*
+ * Don't inherit TIF_SSBD across exec boundary when
+ * PR_SPEC_DISABLE_NOEXEC is used.
+ */
+ if (test_thread_flag(TIF_SSBD) &&
+ task_spec_ssb_noexec(current)) {
+ clear_thread_flag(TIF_SSBD);
+ task_clear_spec_ssb_disable(current);
+ task_clear_spec_ssb_noexec(current);
+ speculation_ctrl_update(task_thread_info(current)->flags);
+ }
}
static inline void switch_to_bitmap(struct thread_struct *prev,
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index e8796fcd7e5a5..4bf46575568a2 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -106,22 +106,22 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
void *ptr;
if (!node_online(node) || !NODE_DATA(node)) {
- ptr = memblock_alloc_from_nopanic(size, align, goal);
+ ptr = memblock_alloc_from(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
cpu, size, __pa(ptr));
} else {
- ptr = memblock_alloc_try_nid_nopanic(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE,
- node);
+ ptr = memblock_alloc_try_nid(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ node);
pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
cpu, size, node, __pa(ptr));
}
return ptr;
#else
- return memblock_alloc_from_nopanic(size, align, goal);
+ return memblock_alloc_from(size, align, goal);
#endif
}
@@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void)
unsigned long delta;
int rc;
- pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n",
+ pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
/*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ccd1f2a8e5577..ce1a67b70168e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
#include <linux/stackprotector.h>
#include <linux/gfp.h>
#include <linux/cpuidle.h>
+#include <linux/numa.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -149,7 +150,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*/
static void smp_callin(void)
{
- int cpuid, phys_id;
+ int cpuid;
/*
* If waken up by an INIT in an 82489DX configuration
@@ -160,11 +161,6 @@ static void smp_callin(void)
cpuid = smp_processor_id();
/*
- * (This works even if the APIC is not enabled.)
- */
- phys_id = read_apic_id();
-
- /*
* the boot CPU has finished the init stage and is spinning
* on callin_map until we finish. We are free to set up this
* CPU, first the APIC. (this is probably redundant on most
@@ -841,7 +837,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
- static int current_node = -1;
+ static int current_node = NUMA_NO_NODE;
int node = early_cpu_to_node(cpu);
static int width, node_width;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9b7c4ca8f0a73..d26f9e9c3d830 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -111,6 +111,7 @@ void ist_enter(struct pt_regs *regs)
/* This code is a bit fragile. Test it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
}
+NOKPROBE_SYMBOL(ist_enter);
void ist_exit(struct pt_regs *regs)
{
@@ -880,12 +881,12 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
- unsigned long cr0;
+ unsigned long cr0 = read_cr0();
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
- if (!boot_cpu_has(X86_FEATURE_FPU) && (read_cr0() & X86_CR0_EM)) {
+ if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
cond_local_irq_enable(regs);
@@ -897,7 +898,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
#endif
/* This should not happen. */
- cr0 = read_cr0();
if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
/* Try to fix it up and carry on. */
write_cr0(cr0 & ~X86_CR0_TS);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e9f777bfed404..3fae238340699 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -297,15 +297,16 @@ static int __init tsc_setup(char *str)
__setup("tsc=", tsc_setup);
-#define MAX_RETRIES 5
-#define SMI_TRESHOLD 50000
+#define MAX_RETRIES 5
+#define TSC_DEFAULT_THRESHOLD 0x20000
/*
- * Read TSC and the reference counters. Take care of SMI disturbance
+ * Read TSC and the reference counters. Take care of any disturbances
*/
static u64 tsc_read_refs(u64 *p, int hpet)
{
u64 t1, t2;
+ u64 thresh = tsc_khz ? tsc_khz >> 5 : TSC_DEFAULT_THRESHOLD;
int i;
for (i = 0; i < MAX_RETRIES; i++) {
@@ -315,7 +316,7 @@ static u64 tsc_read_refs(u64 *p, int hpet)
else
*p = acpi_pm_read_early();
t2 = get_cycles();
- if ((t2 - t1) < SMI_TRESHOLD)
+ if ((t2 - t1) < thresh)
return t2;
}
return ULLONG_MAX;
@@ -703,15 +704,15 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
* zero. In each wait loop iteration we read the TSC and check
* the delta to the previous read. We keep track of the min
* and max values of that delta. The delta is mostly defined
- * by the IO time of the PIT access, so we can detect when a
- * SMI/SMM disturbance happened between the two reads. If the
+ * by the IO time of the PIT access, so we can detect when
+ * any disturbance happened between the two reads. If the
* maximum time is significantly larger than the minimum time,
* then we discard the result and have another try.
*
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for a SMI/SMM disturbance. We dicard
+ * reference read for any possible disturbance. We discard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -744,7 +745,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
if (ref1 == ref2)
continue;
- /* Check, whether the sampling was disturbed by an SMI */
+ /* Check, whether the sampling was disturbed */
if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
continue;
@@ -1268,7 +1269,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
*/
static void tsc_refine_calibration_work(struct work_struct *work)
{
- static u64 tsc_start = -1, ref_start;
+ static u64 tsc_start = ULLONG_MAX, ref_start;
static int hpet;
u64 tsc_stop, ref_stop, delta;
unsigned long freq;
@@ -1283,14 +1284,15 @@ static void tsc_refine_calibration_work(struct work_struct *work)
* delayed the first time we expire. So set the workqueue
* again once we know timers are working.
*/
- if (tsc_start == -1) {
+ if (tsc_start == ULLONG_MAX) {
+restart:
/*
* Only set hpet once, to avoid mixing hardware
* if the hpet becomes enabled later.
*/
hpet = is_hpet_enabled();
- schedule_delayed_work(&tsc_irqwork, HZ);
tsc_start = tsc_read_refs(&ref_start, hpet);
+ schedule_delayed_work(&tsc_irqwork, HZ);
return;
}
@@ -1300,9 +1302,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
if (ref_start == ref_stop)
goto out;
- /* Check, whether the sampling was disturbed by an SMI */
- if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
- goto out;
+ /* Check, whether the sampling was disturbed */
+ if (tsc_stop == ULLONG_MAX)
+ goto restart;
delta = tsc_stop - tsc_start;
delta *= 1000000LL;
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index 3dc26f95d46e8..9b9fd4826e7ab 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state)
}
/* Get the next frame pointer: */
- if (state->regs)
+ if (state->next_bp) {
+ next_bp = state->next_bp;
+ state->next_bp = NULL;
+ } else if (state->regs) {
next_bp = (unsigned long *)state->regs->bp;
- else
+ } else {
next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp);
+ }
/* Move to the next frame if it's safe: */
if (!update_stack_state(state, next_bp))
@@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
bp = get_frame_pointer(task, regs);
+ /*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer.
+ * That means that SP points into the middle of an incomplete frame:
+ * *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we
+ * would have written a frame pointer if we hadn't crashed.
+ * Pretend that the frame is complete and that BP points to it, but save
+ * the real BP so that we can use it when looking for the next frame.
+ */
+ if (regs && regs->ip == 0 &&
+ (unsigned long *)kernel_stack_pointer(regs) >= first_frame) {
+ state->next_bp = bp;
+ bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1;
+ }
+
/* Initialize stack info and make sure the frame data is accessible: */
get_stack_info(bp, state->task, &state->stack_info,
&state->stack_mask);
@@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
*/
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
- state->bp < first_frame))
+ (state->next_bp == NULL && state->bp < first_frame)))
unwind_next_frame(state);
}
EXPORT_SYMBOL_GPL(__unwind_start);
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 26038eacf74a7..89be1be1790c4 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip)
}
#endif
+/*
+ * If we crash with IP==0, the last successfully executed instruction
+ * was probably an indirect function call with a NULL function pointer,
+ * and we don't have unwind information for NULL.
+ * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function
+ * pointer into its parent and then continue normally from there.
+ */
+static struct orc_entry null_orc_entry = {
+ .sp_offset = sizeof(long),
+ .sp_reg = ORC_REG_SP,
+ .bp_reg = ORC_REG_UNDEFINED,
+ .type = ORC_TYPE_CALL
+};
+
static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
@@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip)
if (!orc_init)
return NULL;
+ if (ip == 0)
+ return &null_orc_entry;
+
/* For non-init vmlinux addresses, use the fast lookup table: */
if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) {
unsigned int idx, start, stop;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 843feb94a9501..ccf03416e4342 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -745,6 +745,7 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
* OPCODE1() of the "short" jmp which checks the same condition.
*/
opc1 = OPCODE2(insn) - 0x10;
+ /* fall through */
default:
if (!is_cond_jmp_opcode(opc1))
return -ENOSYS;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0d618ee634ac4..bad8c51fee6ee 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -31,7 +31,7 @@
#undef i386 /* in case the preprocessor is a 32bit one */
-OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT)
#ifdef CONFIG_X86_32
OUTPUT_ARCH(i386)
@@ -401,7 +401,7 @@ SECTIONS
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
*/
-#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
+#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 69b3a7c300139..31ecf7a76d5a4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,10 +2,6 @@
ccflags-y += -Iarch/x86/kvm
-CFLAGS_x86.o := -I.
-CFLAGS_svm.o := -I.
-CFLAGS_vmx.o := -I.
-
KVM := ../../../virt/kvm
kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index bbffa6c54697a..fd3951638ae45 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -335,6 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
+ unsigned f_la57 = 0;
/* cpuid 1.edx */
const u32 kvm_cpuid_1_edx_x86_features =
@@ -404,7 +405,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
- F(CLDEMOTE);
+ F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -489,7 +490,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ f_la57 = entry->ecx & F(LA57);
cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* Set LA57 based on hardware capability. */
+ entry->ecx |= f_la57;
entry->ecx |= f_umip;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c90a5352d158f..27c43525a05f1 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1636,7 +1636,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
ret = kvm_hvcall_signal_event(vcpu, fast, ingpa);
if (ret != HV_STATUS_INVALID_PORT_ID)
break;
- /* maybe userspace knows this conn_id: fall through */
+ /* fall through - maybe userspace knows this conn_id. */
case HVCALL_POST_MESSAGE:
/* don't bother userspace if it has no way to handle it */
if (unlikely(rep || !vcpu_to_synic(vcpu)->active)) {
@@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
mutex_lock(&hv->hv_lock);
ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
mutex_unlock(&hv->hv_lock);
if (ret >= 0)
@@ -1832,7 +1832,6 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
ent->eax |= HV_X64_MSR_VP_INDEX_AVAILABLE;
ent->eax |= HV_X64_MSR_RESET_AVAILABLE;
ent->eax |= HV_MSR_REFERENCE_TSC_AVAILABLE;
- ent->eax |= HV_X64_MSR_GUEST_IDLE_AVAILABLE;
ent->eax |= HV_X64_ACCESS_FREQUENCY_MSRS;
ent->eax |= HV_X64_ACCESS_REENLIGHTENMENT;
@@ -1848,11 +1847,11 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_ENLIGHTMENT_INFO:
ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED;
ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED;
- ent->eax |= HV_X64_SYSTEM_RESET_RECOMMENDED;
ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED;
ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED;
ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED;
- ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
+ if (evmcs_ver)
+ ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED;
/*
* Default number of spinlock retry attempts, matches
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895b1fc6..4a6dc54cc12be 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_t pid_nr;
int ret;
- pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
if (!pit)
return NULL;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bdcd4139eca92..8b38bb4868a65 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
struct kvm_pic *s;
int ret;
- s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+ s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
if (!s)
return -ENOMEM;
spin_lock_init(&s->lock);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 4e822ad363f37..1add1bc881e22 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
struct kvm_ioapic *ioapic;
int ret;
- ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
if (!ioapic)
return -ENOMEM;
spin_lock_init(&ioapic->lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9f089e2e09d02..991fdf7fc17fb 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvzalloc(sizeof(struct kvm_apic_map) +
- sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+ GFP_KERNEL_ACCOUNT);
if (!new)
goto out;
@@ -1035,6 +1036,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
switch (delivery_mode) {
case APIC_DM_LOWEST:
vcpu->arch.apic_arb_prio++;
+ /* fall through */
case APIC_DM_FIXED:
if (unlikely(trig_mode && !level))
break;
@@ -1874,6 +1876,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
+ /* fall through */
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
@@ -2257,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
ASSERT(vcpu != NULL);
apic_debug("apic_init %d\n", vcpu->vcpu_id);
- apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+ apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
if (!apic)
goto nomem;
vcpu->arch.apic = apic;
- apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+ apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ce770b4462385..7837ab001d806 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -109,9 +109,11 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
-#define PT64_DIR_BASE_ADDR_MASK \
- (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
#define PT64_LVL_ADDR_MASK(level) \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT64_LEVEL_BITS))) - 1))
@@ -330,53 +332,56 @@ static inline bool is_access_track_spte(u64 spte)
}
/*
- * the low bit of the generation number is always presumed to be zero.
- * This disables mmio caching during memslot updates. The concept is
- * similar to a seqcount but instead of retrying the access we just punt
- * and ignore the cache.
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * the memslots generation and is derived as follows:
*
- * spte bits 3-11 are used as bits 1-9 of the generation number,
- * the bits 52-61 are used as bits 10-19 of the generation number.
+ * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
+ * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+ *
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap). The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_LOW_SHIFT 2
-#define MMIO_SPTE_GEN_HIGH_SHIFT 52
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
-#define MMIO_GEN_SHIFT 20
-#define MMIO_GEN_LOW_SHIFT 10
-#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
-#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_SPTE_GEN_LOW_START 3
+#define MMIO_SPTE_GEN_LOW_END 11
+#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+ MMIO_SPTE_GEN_LOW_START)
+/* Number of MMIO generation bits stored in the low spte field. */
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - \
+ MMIO_SPTE_GEN_LOW_START + 1)
-static u64 generation_mmio_spte_mask(unsigned int gen)
+#define MMIO_SPTE_GEN_HIGH_START 52
+#define MMIO_SPTE_GEN_HIGH_END 61
+#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+ MMIO_SPTE_GEN_HIGH_START)
+/*
+ * Encode an MMIO generation into its spte bit positions: the low
+ * MMIO_SPTE_GEN_LOW_BITS bits of @gen go to the low spte field and the
+ * remaining bits go to the high spte field. Warns if @gen has bits set
+ * outside MMIO_SPTE_GEN_MASK. Returns the spte mask to OR into the spte.
+ */
+static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
- WARN_ON(gen & ~MMIO_GEN_MASK);
+ WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
- mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
- mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+ mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
+ /* Drop the bits already stored in the low field before placing the rest. */
+ mask |= ((gen >> MMIO_SPTE_GEN_LOW_BITS) << MMIO_SPTE_GEN_HIGH_START) &
+ MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
-static unsigned int get_mmio_spte_generation(u64 spte)
+/*
+ * Decode the MMIO generation stored in @spte; exact inverse of
+ * generation_mmio_spte_mask().
+ */
+static u64 get_mmio_spte_generation(u64 spte)
{
- unsigned int gen;
+ u64 gen;
spte &= ~shadow_mmio_mask;
- gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
- gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
+ /* The high field holds the generation bits above the low field. */
+ gen |= ((spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START) <<
+ MMIO_SPTE_GEN_LOW_BITS;
return gen;
}
-static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
-{
- return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
-}
-
static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned access)
{
- unsigned int gen = kvm_current_mmio_generation(vcpu);
+ u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
u64 mask = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
@@ -386,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
+ page_header(__pa(sptep))->mmio_cached = true;
+
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
}
@@ -407,7 +414,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
static unsigned get_mmio_spte_access(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) & ~PAGE_MASK;
}
@@ -424,9 +431,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
- unsigned int kvm_gen, spte_gen;
+ u64 kvm_gen, spte_gen, gen;
- kvm_gen = kvm_current_mmio_generation(vcpu);
+ gen = kvm_vcpu_memslots(vcpu)->generation;
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return false;
+
+ kvm_gen = gen & MMIO_SPTE_GEN_MASK;
spte_gen = get_mmio_spte_generation(spte);
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -959,7 +970,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
if (cache->nobjs >= min)
return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
if (!obj)
return cache->nobjs >= min ? 0 : -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -2049,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
if (!direct)
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-
- /*
- * The active_mmu_pages list is the FIFO list, do not move the
- * page until it is zapped. kvm_zap_obsolete_pages depends on
- * this feature. See the comments in kvm_zap_obsolete_pages().
- */
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
@@ -2195,23 +2200,15 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
--kvm->stat.mmu_unsync;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list);
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list);
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
-/*
- * NOTE: we should pay more attention on the zapped-obsolete page
- * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
- * since it has been deleted from active_mmu_pages but still can be found
- * at hast list.
- *
- * for_each_valid_sp() has skipped that kind of pages.
- */
#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ if ((_sp)->role.invalid) { \
} else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2231,18 +2228,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return true;
}
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+ struct list_head *invalid_list,
+ bool remote_flush)
+{
+ if (!remote_flush && !list_empty(invalid_list))
+ return false;
+
+ if (!list_empty(invalid_list))
+ kvm_mmu_commit_zap_page(kvm, invalid_list);
+ else
+ kvm_flush_remote_tlbs(kvm);
+ return true;
+}
+
static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
struct list_head *invalid_list,
bool remote_flush, bool local_flush)
{
- if (!list_empty(invalid_list)) {
- kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
return;
- }
- if (remote_flush)
- kvm_flush_remote_tlbs(vcpu->kvm);
- else if (local_flush)
+ if (local_flush)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
@@ -2253,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
static void mmu_audit_disable(void) { }
#endif
-static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
@@ -2482,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
}
- sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
clear_page(sp->spt);
trace_kvm_mmu_get_page(sp, true);
@@ -2668,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
return zapped;
}
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
- struct list_head *invalid_list)
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+ struct kvm_mmu_page *sp,
+ struct list_head *invalid_list,
+ int *nr_zapped)
{
- int ret;
+ bool list_unstable;
trace_kvm_mmu_prepare_zap_page(sp);
++kvm->stat.mmu_shadow_zapped;
- ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+ *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp);
+ /* Zapping children means active_mmu_pages has become unstable. */
+ list_unstable = *nr_zapped;
+
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp);
@@ -2686,22 +2692,27 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
/* Count self */
- ret++;
+ (*nr_zapped)++;
list_move(&sp->link, invalid_list);
kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
- /*
- * The obsolete pages can not be used on any vcpus.
- * See the comments in kvm_mmu_invalidate_zap_all_pages().
- */
- if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+ if (!sp->role.invalid)
kvm_reload_remote_mmus(kvm);
}
sp->role.invalid = 1;
- return ret;
+ return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+ struct list_head *invalid_list)
+{
+ int nr_zapped;
+
+ __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+ return nr_zapped;
}
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
@@ -3555,6 +3566,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
&invalid_list);
mmu->root_hpa = INVALID_PAGE;
}
+ mmu->root_cr3 = 0;
}
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3610,6 +3622,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
} else
BUG();
+ vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
return 0;
}
@@ -3618,10 +3631,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
u64 pdptr, pm_mask;
- gfn_t root_gfn;
+ gfn_t root_gfn, root_cr3;
int i;
- root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT;
+ root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
+ root_gfn = root_cr3 >> PAGE_SHIFT;
if (mmu_check_root(vcpu, root_gfn))
return 1;
@@ -3646,7 +3660,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
++sp->root_count;
spin_unlock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu->root_hpa = root;
- return 0;
+ goto set_root_cr3;
}
/*
@@ -3700,7 +3714,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
u64 *lm_root;
- lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+ lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
if (lm_root == NULL)
return 1;
@@ -3712,6 +3726,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
}
+set_root_cr3:
+ vcpu->arch.mmu->root_cr3 = root_cr3;
+
return 0;
}
@@ -4163,7 +4180,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
struct kvm_mmu_root_info root;
struct kvm_mmu *mmu = vcpu->arch.mmu;
- root.cr3 = mmu->get_cr3(vcpu);
+ root.cr3 = mmu->root_cr3;
root.hpa = mmu->root_hpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
@@ -4176,6 +4193,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
}
mmu->root_hpa = root.hpa;
+ mmu->root_cr3 = root.cr3;
return i < KVM_MMU_NUM_PREV_ROOTS;
}
@@ -4197,14 +4215,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
return false;
if (cached_root_available(vcpu, new_cr3, new_role)) {
- /*
- * It is possible that the cached previous root page is
- * obsolete because of a change in the MMU
- * generation number. However, that is accompanied by
- * KVM_REQ_MMU_RELOAD, which will free the root that we
- * have set here and allocate a new one.
- */
-
kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
if (!skip_tlb_flush) {
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -4371,6 +4381,7 @@ __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[1][4] =
rsvd_check->rsvd_bits_mask[0][4];
+ /* fall through */
case PT64_ROOT_4LEVEL:
rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
@@ -4769,6 +4780,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
ext.cr4_pse = !!is_pse(vcpu);
ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
+ ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
ext.valid = 1;
@@ -5477,6 +5489,76 @@ void kvm_disable_tdp(void)
}
EXPORT_SYMBOL_GPL(kvm_disable_tdp);
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+
+/* The caller should hold mmu-lock before calling this function. */
+static __always_inline bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+{
+ struct slot_rmap_walk_iterator iterator;
+ bool flush = false;
+
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
+
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
+
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+
+ return flush;
+}
+
+static __always_inline bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu->pae_root);
@@ -5496,7 +5578,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
* Therefore we need to allocate shadow page tables in the first
* 4GB of memory, which happens to fit the DMA32 zone.
*/
- page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
if (!page)
return -ENOMEM;
@@ -5515,11 +5597,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.root_mmu.root_cr3 = 0;
vcpu->arch.root_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
+ vcpu->arch.guest_mmu.root_cr3 = 0;
vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
@@ -5532,105 +5616,62 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot,
struct kvm_page_track_notifier_node *node)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
-}
-
-void kvm_mmu_init_vm(struct kvm *kvm)
-{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-
- node->track_write = kvm_mmu_pte_write;
- node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
- kvm_page_track_register_notifier(kvm, node);
-}
+ struct kvm_mmu_page *sp;
+ LIST_HEAD(invalid_list);
+ unsigned long i;
+ bool flush;
+ gfn_t gfn;
-void kvm_mmu_uninit_vm(struct kvm *kvm)
-{
- struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ spin_lock(&kvm->mmu_lock);
- kvm_page_track_unregister_notifier(kvm, node);
-}
+ if (list_empty(&kvm->arch.active_mmu_pages))
+ goto out_unlock;
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+ flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
-{
- struct slot_rmap_walk_iterator iterator;
- bool flush = false;
+ for (i = 0; i < slot->npages; i++) {
+ gfn = slot->base_gfn + i;
- for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
- end_gfn, &iterator) {
- if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap);
+ for_each_valid_sp(kvm, sp, gfn) {
+ if (sp->gfn != gfn)
+ continue;
+ kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+ }
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+ flush = false;
cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
- if (flush && lock_flush_tlb) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
-
- return flush;
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
}
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+void kvm_mmu_init_vm(struct kvm *kvm)
{
- return slot_handle_level_range(kvm, memslot, fn, start_level,
- end_level, memslot->base_gfn,
- memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
-}
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-static __always_inline bool
-slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ node->track_write = kvm_mmu_pte_write;
+ node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
+ kvm_page_track_register_notifier(kvm, node);
}
-static __always_inline bool
-slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+void kvm_mmu_uninit_vm(struct kvm *kvm)
{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
+ struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
-static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
-{
- return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+ kvm_page_track_unregister_notifier(kvm, node);
}
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- bool flush_tlb = true;
- bool flush = false;
int i;
- if (kvm_available_flush_tlb_with_range())
- flush_tlb = false;
-
spin_lock(&kvm->mmu_lock);
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
slots = __kvm_memslots(kvm, i);
@@ -5642,17 +5683,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (start >= end)
continue;
- flush |= slot_handle_level_range(kvm, memslot,
- kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, start,
- end - 1, flush_tlb);
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
}
}
- if (flush)
- kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
- gfn_end - gfn_start + 1);
-
spin_unlock(&kvm->mmu_lock);
}
@@ -5804,101 +5840,58 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
-#define BATCH_ZAP_PAGES 10
-static void kvm_zap_obsolete_pages(struct kvm *kvm)
+static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
{
struct kvm_mmu_page *sp, *node;
- int batch = 0;
+ LIST_HEAD(invalid_list);
+ int ign;
+ spin_lock(&kvm->mmu_lock);
restart:
- list_for_each_entry_safe_reverse(sp, node,
- &kvm->arch.active_mmu_pages, link) {
- int ret;
-
- /*
- * No obsolete page exists before new created page since
- * active_mmu_pages is the FIFO list.
- */
- if (!is_obsolete_sp(kvm, sp))
- break;
-
- /*
- * Since we are reversely walking the list and the invalid
- * list will be moved to the head, skip the invalid page
- * can help us to avoid the infinity list walking.
- */
- if (sp->role.invalid)
+ list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+ if (mmio_only && !sp->mmio_cached)
continue;
-
- /*
- * Need not flush tlb since we only zap the sp with invalid
- * generation number.
- */
- if (batch >= BATCH_ZAP_PAGES &&
- cond_resched_lock(&kvm->mmu_lock)) {
- batch = 0;
+ if (sp->role.invalid && sp->root_count)
+ continue;
+ if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
+ WARN_ON_ONCE(mmio_only);
goto restart;
}
-
- ret = kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages);
- batch += ret;
-
- if (ret)
+ if (cond_resched_lock(&kvm->mmu_lock))
goto restart;
}
- /*
- * Should flush tlb before free page tables since lockless-walking
- * may use the pages.
- */
- kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
-}
-
-/*
- * Fast invalidate all shadow pages and use lock-break technique
- * to zap obsolete pages.
- *
- * It's required when memslot is being deleted or VM is being
- * destroyed, in these cases, we should ensure that KVM MMU does
- * not use any resource of the being-deleted slot or all slots
- * after calling the function.
- */
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
-{
- spin_lock(&kvm->mmu_lock);
- trace_kvm_mmu_invalidate_zap_all_pages(kvm);
- kvm->arch.mmu_valid_gen++;
-
- /*
- * Notify all vcpus to reload its shadow page table
- * and flush TLB. Then all vcpus will switch to new
- * shadow page table with the new mmu_valid_gen.
- *
- * Note: we should do this under the protection of
- * mmu-lock, otherwise, vcpu would purge shadow page
- * but miss tlb flush.
- */
- kvm_reload_remote_mmus(kvm);
-
- kvm_zap_obsolete_pages(kvm);
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
}
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+/*
+ * Zap the VM's shadow pages without restricting the walk to pages that
+ * cache MMIO sptes (mmio_only == false).
+ */
+void kvm_mmu_zap_all(struct kvm *kvm)
{
- return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+ __kvm_mmu_zap_all(kvm, false);
}
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
{
+ WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
+
+ gen &= MMIO_SPTE_GEN_MASK;
+
/*
- * The very rare case: if the generation-number is round,
+ * Generation numbers are incremented in multiples of the number of
+ * address spaces in order to provide unique generations across all
+ * address spaces. Strip what is effectively the address space
+ * modifier prior to checking for a wrap of the MMIO generation so
+ * that a wrap in any address space is detected.
+ */
+ gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
+ /*
+ * The very rare case: if the MMIO generation number has wrapped,
* zap all shadow pages.
*/
- if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
+ if (unlikely(gen == 0)) {
kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ __kvm_mmu_zap_all(kvm, true);
}
}
@@ -5929,24 +5922,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
* want to shrink a VM that only started to populate its MMU
* anyway.
*/
- if (!kvm->arch.n_used_mmu_pages &&
- !kvm_has_zapped_obsolete_pages(kvm))
+ if (!kvm->arch.n_used_mmu_pages)
continue;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
- if (kvm_has_zapped_obsolete_pages(kvm)) {
- kvm_mmu_commit_zap_page(kvm,
- &kvm->arch.zapped_obsolete_pages);
- goto unlock;
- }
-
if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
freed++;
kvm_mmu_commit_zap_page(kvm, &invalid_list);
-unlock:
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7b333147c4a6..bbdc60f2fae89 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return -(u32)fault & errcode;
}
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c73bf4e4988cb..9f6c855a00439 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -8,18 +8,16 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
-#define KVM_MMU_PAGE_FIELDS \
- __field(unsigned long, mmu_valid_gen) \
- __field(__u64, gfn) \
- __field(__u32, role) \
- __field(__u32, root_count) \
+#define KVM_MMU_PAGE_FIELDS \
+ __field(__u64, gfn) \
+ __field(__u32, role) \
+ __field(__u32, root_count) \
__field(bool, unsync)
-#define KVM_MMU_PAGE_ASSIGN(sp) \
- __entry->mmu_valid_gen = sp->mmu_valid_gen; \
- __entry->gfn = sp->gfn; \
- __entry->role = sp->role.word; \
- __entry->root_count = sp->root_count; \
+#define KVM_MMU_PAGE_ASSIGN(sp) \
+ __entry->gfn = sp->gfn; \
+ __entry->role = sp->role.word; \
+ __entry->root_count = sp->root_count; \
__entry->unsync = sp->unsync;
#define KVM_MMU_PAGE_PRINTK() ({ \
@@ -31,9 +29,8 @@
\
role.word = __entry->role; \
\
- trace_seq_printf(p, "sp gen %lx gfn %llx l%u%s q%u%s %s%s" \
+ trace_seq_printf(p, "sp gfn %llx l%u%s q%u%s %s%s" \
" %snxe %sad root %u %s%c", \
- __entry->mmu_valid_gen, \
__entry->gfn, role.level, \
role.cr4_pae ? " pae" : "", \
role.quadrant, \
@@ -283,27 +280,6 @@ TRACE_EVENT(
);
TRACE_EVENT(
- kvm_mmu_invalidate_zap_all_pages,
- TP_PROTO(struct kvm *kvm),
- TP_ARGS(kvm),
-
- TP_STRUCT__entry(
- __field(unsigned long, mmu_valid_gen)
- __field(unsigned int, mmu_used_pages)
- ),
-
- TP_fast_assign(
- __entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
- __entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
- ),
-
- TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
- __entry->mmu_valid_gen, __entry->mmu_used_pages
- )
-);
-
-
-TRACE_EVENT(
check_mmio_spte,
TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
TP_ARGS(spte, kvm_gen, spte_gen),
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a30655..fd04d462fdaee 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
slot->arch.gfn_track[i] =
kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.gfn_track[i])
goto track_free;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 307e5bddb6d97..b5b128a0a0512 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -145,7 +145,6 @@ struct kvm_svm {
/* Struct members for AVIC */
u32 avic_vm_id;
- u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
struct hlist_node hnode;
@@ -236,6 +235,7 @@ struct vcpu_svm {
bool nrips_enabled : 1;
u32 ldr_reg;
+ u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
@@ -1795,9 +1795,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
- pages = vmalloc(size);
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
else
- pages = kmalloc(size, GFP_KERNEL);
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
return NULL;
@@ -1865,7 +1866,9 @@ static void __unregister_enc_region_locked(struct kvm *kvm,
static struct kvm *svm_vm_alloc(void)
{
- struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
+ struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
return &kvm_svm->kvm;
}
@@ -1940,7 +1943,7 @@ static int avic_vm_init(struct kvm *kvm)
return 0;
/* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL);
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!p_page)
goto free_avic;
@@ -1948,7 +1951,7 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL);
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!l_page)
goto free_avic;
@@ -2106,6 +2109,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
return ret;
}
@@ -2119,13 +2123,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct page *nested_msrpm_pages;
int err;
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!svm) {
err = -ENOMEM;
goto out;
}
- svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+ svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
if (!svm->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -2137,19 +2142,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto free_svm;
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
goto uninit;
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;
- hsave_page = alloc_page(GFP_KERNEL);
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!hsave_page)
goto free_page3;
@@ -3414,6 +3419,14 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
return 0;
}
@@ -4395,7 +4408,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
- /* Follow through */
+ /* Fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -4504,28 +4517,19 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
- int i;
- struct kvm_vcpu *vcpu;
- struct kvm *kvm = svm->vcpu.kvm;
struct kvm_lapic *apic = svm->vcpu.arch.apic;
/*
- * At this point, we expect that the AVIC HW has already
- * set the appropriate IRR bits on the valid target
- * vcpus. So, we just need to kick the appropriate vcpu.
+ * Update ICR high and low, then emulate sending IPI,
+ * which is handled when writing APIC_ICR.
*/
- kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & KVM_APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & KVM_APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
- kvm_vcpu_wake_up(vcpu);
- }
+ kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
+ kvm_lapic_reg_write(apic, APIC_ICR, icrl);
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
+ WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
+ index, svm->vcpu.vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -4566,8 +4570,7 @@ static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
- bool valid)
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -4580,31 +4583,39 @@ static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
- if (valid)
- new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
- else
- new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
return 0;
}
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+
+ if (entry)
+ WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK);
+}
+
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret;
+ int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
- if (!ldr)
- return 1;
+ if (ldr == svm->ldr_reg)
+ return 0;
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
- if (ret && svm->ldr_reg) {
- avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
- svm->ldr_reg = 0;
- } else {
+ avic_invalidate_logical_id_entry(vcpu);
+
+ if (ldr)
+ ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
+
+ if (!ret)
svm->ldr_reg = ldr;
- }
+
return ret;
}
@@ -4638,27 +4649,16 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
return 0;
}
-static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
- u32 mod = (dfr >> 28) & 0xf;
-
- /*
- * We assume that all local APICs are using the same type.
- * If this changes, we need to flush the AVIC logical
- * APID id table.
- */
- if (kvm_svm->ldr_mode == mod)
- return 0;
- clear_page(page_address(kvm_svm->avic_logical_id_table_page));
- kvm_svm->ldr_mode = mod;
+ if (svm->dfr_reg == dfr)
+ return;
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
- return 0;
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
}
static int avic_unaccel_trap_write(struct vcpu_svm *svm)
@@ -5126,11 +5126,11 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
- return;
-
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- mark_dirty(vmcb, VMCB_INTR);
+ if (kvm_vcpu_apicv_active(vcpu))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_AVIC);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5196,7 +5196,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -6164,8 +6164,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
- if (avic_handle_dfr_update(vcpu) != 0)
- return;
+ avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -6278,6 +6277,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
int asid, ret;
ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
asid = sev_asid_new();
if (asid < 0)
return ret;
@@ -6309,7 +6311,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
if (ret)
return ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6359,7 +6361,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL);
+ start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
if (!start)
return -ENOMEM;
@@ -6456,7 +6458,7 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6533,7 +6535,7 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (copy_from_user(&params, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6595,7 +6597,7 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6616,7 +6618,7 @@ static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6644,7 +6646,7 @@ static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
struct sev_data_dbg *data;
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6899,7 +6901,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
}
ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
goto e_unpin_memory;
@@ -7005,7 +7007,7 @@ static int svm_register_enc_region(struct kvm *kvm,
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
- region = kzalloc(sizeof(*region), GFP_KERNEL);
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 705f40ae25329..6432d08c7de79 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1465,7 +1465,7 @@ TRACE_EVENT(kvm_hv_send_ipi_ex,
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH arch/x86/kvm
+#define TRACE_INCLUDE_PATH ../../arch/x86/kvm
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 95bc2247478d9..5466c6d85cf3e 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -332,16 +332,17 @@ int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled;
+
+ vmx->nested.enlightened_vmcs_enabled = true;
if (vmcs_version)
*vmcs_version = nested_get_evmcs_version(vcpu);
/* We don't support disabling the feature for simplicity. */
- if (vmx->nested.enlightened_vmcs_enabled)
+ if (evmcs_already_enabled)
return 0;
- vmx->nested.enlightened_vmcs_enabled = true;
-
vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3170e291215d0..f24a2c2250706 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -55,7 +55,7 @@ static u16 shadow_read_write_fields[] = {
static int max_shadow_read_write_fields =
ARRAY_SIZE(shadow_read_write_fields);
-void init_vmcs_shadow_fields(void)
+static void init_vmcs_shadow_fields(void)
{
int i, j;
@@ -273,6 +273,7 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
+ vmx_leave_nested(vcpu);
vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
free_nested(vcpu);
vcpu_put(vcpu);
@@ -1979,17 +1980,6 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
prepare_vmcs02_early_full(vmx, vmcs12);
/*
- * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
- * entry, but only if the current (host) sp changed from the value
- * we wrote last (vmx->host_rsp). This cache is no longer relevant
- * if we switch vmcs, and rather than hold a separate cache per vmcs,
- * here we just force the write to happen on entry. host_rsp will
- * also be written unconditionally by nested_vmx_check_vmentry_hw()
- * if we are doing early consistency checks via hardware.
- */
- vmx->host_rsp = 0;
-
- /*
* PIN CONTROLS
*/
exec_control = vmcs12->pin_based_vm_exec_control;
@@ -2288,10 +2278,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
- vmx->nested.preemption_timer_expired = false;
- if (nested_cpu_has_preemption_timer(vmcs12))
- vmx_start_preemption_timer(vcpu);
-
/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
@@ -2472,6 +2458,10 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
return -EINVAL;
+ if (!nested_cpu_has_preemption_timer(vmcs12) &&
+ nested_cpu_has_save_preemption_timer(vmcs12))
+ return -EINVAL;
+
if (nested_cpu_has_ept(vmcs12) &&
!valid_ept_address(vcpu, vmcs12->ept_pointer))
return -EINVAL;
@@ -2717,6 +2707,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
+ bool vm_fail;
if (!nested_early_check)
return 0;
@@ -2750,29 +2741,34 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- vmx->__launched = vmx->loaded_vmcs->launched;
-
asm(
- /* Set HOST_RSP */
"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
- "mov %%" _ASM_SP ", %c[host_rsp](%1)\n\t"
+ "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+ "je 1f \n\t"
+ __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
+ "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
+ "1: \n\t"
"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
/* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%% " _ASM_CX")\n\t"
+ "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
+ /*
+ * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
+ * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
+ * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
+ * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
+ */
"call vmx_vmenter\n\t"
- /* Set vmx->fail accordingly */
- "setbe %c[fail](%% " _ASM_CX")\n\t"
- : ASM_CALL_CONSTRAINT
- : "c"(vmx), "d"((unsigned long)HOST_RSP),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
+ CC_SET(be)
+ : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
+ : [HOST_RSP]"r"((unsigned long)HOST_RSP),
+ [loaded_vmcs]"r"(vmx->loaded_vmcs),
+ [launched]"i"(offsetof(struct loaded_vmcs, launched)),
+ [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
[wordsize]"i"(sizeof(ulong))
- : "rax", "cc", "memory"
+ : "cc", "memory"
);
preempt_enable();
@@ -2782,10 +2778,9 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
if (vmx->msr_autoload.guest.nr)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
- if (vmx->fail) {
+ if (vm_fail) {
WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- vmx->fail = 0;
return 1;
}
@@ -2808,8 +2803,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
-STACK_FRAME_NON_STANDARD(nested_vmx_check_vmentry_hw);
-
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
@@ -3026,6 +3019,15 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
kvm_make_request(KVM_REQ_EVENT, vcpu);
/*
+ * Do not start the preemption timer hrtimer until after we know
+ * we are successful, so that only nested_vmx_vmexit needs to cancel
+ * the timer.
+ */
+ vmx->nested.preemption_timer_expired = false;
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ vmx_start_preemption_timer(vcpu);
+
+ /*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
@@ -3445,13 +3447,10 @@ static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
else
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
- if (nested_cpu_has_preemption_timer(vmcs12)) {
- if (vmcs12->vm_exit_controls &
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
vmcs12->vmx_preemption_timer_value =
vmx_get_preemption_timer_value(vcpu);
- hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- }
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3859,6 +3858,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
+ if (nested_cpu_has_preemption_timer(vmcs12))
+ hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
+
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
@@ -3910,9 +3912,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_flush_tlb(vcpu, true);
}
- /* This is needed for same reason as it was needed in prepare_vmcs02 */
- vmx->host_rsp = 0;
-
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -4030,25 +4029,50 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Addr = segment_base + offset */
/* offset = base + [index * scale] + displacement */
off = exit_qualification; /* holds the displacement */
+ if (addr_size == 1)
+ off = (gva_t)sign_extend64(off, 31);
+ else if (addr_size == 0)
+ off = (gva_t)sign_extend64(off, 15);
if (base_is_valid)
off += kvm_register_read(vcpu, base_reg);
if (index_is_valid)
off += kvm_register_read(vcpu, index_reg)<<scaling;
vmx_get_segment(vcpu, &s, seg_reg);
- *ret = s.base + off;
+ /*
+ * The effective address, i.e. @off, of a memory operand is truncated
+ * based on the address size of the instruction. Note that this is
+ * the *effective address*, i.e. the address prior to accounting for
+ * the segment's base.
+ */
if (addr_size == 1) /* 32 bit */
- *ret &= 0xffffffff;
+ off &= 0xffffffff;
+ else if (addr_size == 0) /* 16 bit */
+ off &= 0xffff;
/* Checks for #GP/#SS exceptions. */
exn = false;
if (is_long_mode(vcpu)) {
+ /*
+ * The virtual/linear address is never truncated in 64-bit
+ * mode, e.g. a 32-bit address size can yield a 64-bit virtual
+ * address when using FS/GS with a non-zero base.
+ */
+ *ret = s.base + off;
+
/* Long mode: #GP(0)/#SS(0) if the memory address is in a
* non-canonical form. This is the only check on the memory
* destination for long mode!
*/
exn = is_noncanonical_address(*ret, vcpu);
- } else if (is_protmode(vcpu)) {
+ } else {
+ /*
+ * When not in long mode, the virtual/linear address is
+ * unconditionally truncated to 32 bits regardless of the
+ * address size.
+ */
+ *ret = (s.base + off) & 0xffffffff;
+
/* Protected mode: apply checks for segment validity in the
* following order:
* - segment type check (#GP(0) may be thrown)
@@ -4072,10 +4096,16 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
*/
exn = (s.unusable != 0);
- /* Protected mode: #GP(0)/#SS(0) if the memory
- * operand is outside the segment limit.
+
+ /*
+ * Protected mode: #GP(0)/#SS(0) if the memory operand is
+ * outside the segment limit. All CPUs that support VMX ignore
+ * limit checks for flat segments, i.e. segments with base==0,
+ * limit==0xffffffff and of type expand-up data or code.
*/
- exn = exn || (off + sizeof(u64) > s.limit);
+ if (!(s.base == 0 && s.limit == 0xffffffff &&
+ ((s.type & 8) || !(s.type & 4))))
+ exn = exn || (off + sizeof(u64) > s.limit);
}
if (exn) {
kvm_queue_exception_e(vcpu,
@@ -4140,11 +4170,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (r < 0)
goto out_vmcs02;
- vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
- vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -4540,9 +4570,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
* given physical address won't match the required
* VMCS12_REVISION identifier.
*/
- nested_vmx_failValid(vcpu,
+ return nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
@@ -5264,13 +5293,17 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
copy_shadow_to_vmcs12(vmx);
}
- if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+ /*
+ * Copy over the full allocated size of vmcs12 rather than just the size
+ * of the struct.
+ */
+ if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
return -EFAULT;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
- get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
+ get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
}
@@ -5553,9 +5586,11 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
* secondary cpu-based controls. Do not include those that
* depend on CPUID bits, they are added later by vmx_cpuid_update.
*/
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- msrs->secondary_ctls_low,
- msrs->secondary_ctls_high);
+ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+ msrs->secondary_ctls_low,
+ msrs->secondary_ctls_high);
+
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
@@ -5686,6 +5721,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {
for (i = 0; i < VMX_BITMAP_NR; i++) {
+ /*
+ * The vmx_bitmap is not tied to a VM and so should
+ * not be charged to a memcg.
+ */
vmx_bitmap[i] = (unsigned long *)
__get_free_page(GFP_KERNEL);
if (!vmx_bitmap[i]) {
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 6def3ba88e3b3..cb6079f8a227f 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -34,6 +34,7 @@ struct vmcs_host_state {
unsigned long cr4; /* May not match real cr4 */
unsigned long gs_base;
unsigned long fs_base;
+ unsigned long rsp;
u16 fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index bcef2c7e9bc48..7b272738c5768 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -1,6 +1,30 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/linkage.h>
#include <asm/asm.h>
+#include <asm/bitsperlong.h>
+#include <asm/kvm_vcpu_regs.h>
+
+#define WORD_SIZE (BITS_PER_LONG / 8)
+
+#define VCPU_RAX __VCPU_REGS_RAX * WORD_SIZE
+#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE
+#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE
+#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE
+/* Intentionally omit RSP as it's context switched by hardware */
+#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE
+#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE
+#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE
+
+#ifdef CONFIG_X86_64
+#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE
+#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE
+#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE
+#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE
+#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE
+#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE
+#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE
+#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
+#endif
.text
@@ -55,3 +79,146 @@ ENDPROC(vmx_vmenter)
ENTRY(vmx_vmexit)
ret
ENDPROC(vmx_vmexit)
+
+/**
+ * __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
+ * @vmx: struct vcpu_vmx *
+ * @regs: unsigned long * (to guest registers)
+ * @launched: %true if the VMCS has been launched
+ *
+ * Returns:
+ * 0 on VM-Exit, 1 on VM-Fail
+ */
+ENTRY(__vmx_vcpu_run)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+#ifdef CONFIG_X86_64
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+#else
+ push %edi
+ push %esi
+#endif
+ push %_ASM_BX
+
+ /*
+ * Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
+ * @regs is needed after VM-Exit to save the guest's register values.
+ */
+ push %_ASM_ARG2
+
+ /* Copy @launched to BL, _ASM_ARG3 is volatile. */
+ mov %_ASM_ARG3B, %bl
+
+ /* Adjust RSP to account for the CALL to vmx_vmenter(). */
+ lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
+ call vmx_update_host_rsp
+
+ /* Load @regs to RAX. */
+ mov (%_ASM_SP), %_ASM_AX
+
+ /* Check if vmlaunch or vmresume is needed */
+ cmpb $0, %bl
+
+ /* Load guest registers. Don't clobber flags. */
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RCX(%_ASM_AX), %_ASM_CX
+ mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RSI(%_ASM_AX), %_ASM_SI
+ mov VCPU_RDI(%_ASM_AX), %_ASM_DI
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
+#ifdef CONFIG_X86_64
+ mov VCPU_R8 (%_ASM_AX), %r8
+ mov VCPU_R9 (%_ASM_AX), %r9
+ mov VCPU_R10(%_ASM_AX), %r10
+ mov VCPU_R11(%_ASM_AX), %r11
+ mov VCPU_R12(%_ASM_AX), %r12
+ mov VCPU_R13(%_ASM_AX), %r13
+ mov VCPU_R14(%_ASM_AX), %r14
+ mov VCPU_R15(%_ASM_AX), %r15
+#endif
+ /* Load guest RAX. This kills the vmx_vcpu pointer! */
+ mov VCPU_RAX(%_ASM_AX), %_ASM_AX
+
+ /* Enter guest mode */
+ call vmx_vmenter
+
+ /* Jump on VM-Fail. */
+ jbe 2f
+
+ /* Temporarily save guest's RAX. */
+ push %_ASM_AX
+
+ /* Reload @regs to RAX. */
+ mov WORD_SIZE(%_ASM_SP), %_ASM_AX
+
+ /* Save all guest registers, including RAX from the stack */
+ __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+#ifdef CONFIG_X86_64
+ mov %r8, VCPU_R8 (%_ASM_AX)
+ mov %r9, VCPU_R9 (%_ASM_AX)
+ mov %r10, VCPU_R10(%_ASM_AX)
+ mov %r11, VCPU_R11(%_ASM_AX)
+ mov %r12, VCPU_R12(%_ASM_AX)
+ mov %r13, VCPU_R13(%_ASM_AX)
+ mov %r14, VCPU_R14(%_ASM_AX)
+ mov %r15, VCPU_R15(%_ASM_AX)
+#endif
+
+ /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
+ xor %eax, %eax
+
+ /*
+ * Clear all general purpose registers except RSP and RAX to prevent
+ * speculative use of the guest's values, even those that are reloaded
+ * via the stack. In theory, an L1 cache miss when restoring registers
+ * could lead to speculative execution with the guest's values.
+ * Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
+ * free. RSP and RAX are exempt as RSP is restored by hardware during
+ * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
+ */
+1: xor %ebx, %ebx
+ xor %ecx, %ecx
+ xor %edx, %edx
+ xor %esi, %esi
+ xor %edi, %edi
+ xor %ebp, %ebp
+#ifdef CONFIG_X86_64
+ xor %r8d, %r8d
+ xor %r9d, %r9d
+ xor %r10d, %r10d
+ xor %r11d, %r11d
+ xor %r12d, %r12d
+ xor %r13d, %r13d
+ xor %r14d, %r14d
+ xor %r15d, %r15d
+#endif
+
+ /* "POP" @regs. */
+ add $WORD_SIZE, %_ASM_SP
+ pop %_ASM_BX
+
+#ifdef CONFIG_X86_64
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+#else
+ pop %esi
+ pop %edi
+#endif
+ pop %_ASM_BP
+ ret
+
+ /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
+2: mov $1, %eax
+ jmp 1b
+ENDPROC(__vmx_vcpu_run)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4d39f731bc332..c73375e01ab8c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -26,6 +26,7 @@
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
#include <linux/sched.h>
+#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
@@ -245,6 +246,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+ /*
+ * This allocation for vmx_l1d_flush_pages is not tied to a VM
+ * lifetime and so should not be charged to a memcg.
+ */
page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
if (!page)
return -ENOMEM;
@@ -423,7 +428,7 @@ static void check_ept_pointer_match(struct kvm *kvm)
to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
}
-int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
+static int kvm_fill_hv_flush_list_func(struct hv_guest_mapping_flush_list *flush,
void *data)
{
struct kvm_tlb_range *range = data;
@@ -453,7 +458,7 @@ static int hv_remote_flush_tlb_with_range(struct kvm *kvm,
struct kvm_tlb_range *range)
{
struct kvm_vcpu *vcpu;
- int ret = -ENOTSUPP, i;
+ int ret = 0, i;
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
@@ -862,7 +867,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
if (!entry_only)
j = find_msr(&m->host, msr);
- if (i == NR_AUTOLOAD_MSRS || j == NR_AUTOLOAD_MSRS) {
+ if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
+ (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -1192,21 +1198,6 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
return;
- /*
- * First handle the simple case where no cmpxchg is necessary; just
- * allow posting non-urgent interrupts.
- *
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block will do it for us and the wakeup_handler
- * expects the VCPU to be on the blocked_vcpu_list that matches
- * PI.NDST.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR ||
- vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- return;
- }
-
/* The full case. */
do {
old.control = new.control = pi_desc->control;
@@ -1221,6 +1212,17 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
new.sn = 0;
} while (cmpxchg64(&pi_desc->control, old.control,
new.control) != old.control);
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it has no memory barrier for this one to pair
+ * with; we cannot do the same atomically, so we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS))
+ pi_set_on(pi_desc);
}
/*
@@ -1773,7 +1775,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- /* Otherwise falls through */
+ /* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_info->index);
if (msr) {
@@ -2014,7 +2016,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- /* Otherwise falls through */
+ /* Else, falls through */
default:
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -2344,7 +2346,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
case 37: /* AAT100 */
case 44: /* BC86,AAY89,BD102 */
case 46: /* BA97 */
- _vmexit_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
_vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
"does not work properly. Using workaround\n");
@@ -2389,13 +2391,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return 0;
}
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
{
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2442,7 +2444,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
loaded_vmcs_init(loaded_vmcs);
if (cpu_has_vmx_msr_bitmap()) {
- loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ loaded_vmcs->msr_bitmap = (unsigned long *)
+ __get_free_page(GFP_KERNEL_ACCOUNT);
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
@@ -2483,7 +2486,7 @@ static __init int alloc_kvm_area(void)
for_each_possible_cpu(cpu) {
struct vmcs *vmcs;
- vmcs = alloc_vmcs_cpu(false, cpu);
+ vmcs = alloc_vmcs_cpu(false, cpu, GFP_KERNEL);
if (!vmcs) {
free_kvm_area();
return -ENOMEM;
@@ -6362,10 +6365,20 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->hv_timer_armed = false;
}
+void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+{
+ if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
+ vmx->loaded_vmcs->host_state.rsp = host_rsp;
+ vmcs_writel(HOST_RSP, host_rsp);
+ }
+}
+
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
+
static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long cr3, cr4, evmcs_rsp;
+ unsigned long cr3, cr4;
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
@@ -6429,144 +6442,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- vmx->__launched = vmx->loaded_vmcs->launched;
-
- evmcs_rsp = static_branch_unlikely(&enable_evmcs) ?
- (unsigned long)&current_evmcs->host_rsp : 0;
-
if (static_branch_unlikely(&vmx_l1d_should_flush))
vmx_l1d_flush(vcpu);
- asm(
- /* Store host registers */
- "push %%" _ASM_DX "; push %%" _ASM_BP ";"
- "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
- "push %%" _ASM_CX " \n\t"
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- "je 1f \n\t"
- "mov %%" _ASM_SP ", %c[host_rsp](%%" _ASM_CX ") \n\t"
- /* Avoid VMWRITE when Enlightened VMCS is in use */
- "test %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "jz 2f \n\t"
- "mov %%" _ASM_SP ", (%%" _ASM_SI ") \n\t"
- "jmp 1f \n\t"
- "2: \n\t"
- __ex("vmwrite %%" _ASM_SP ", %%" _ASM_DX) "\n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Reload cr2 if changed */
- "mov %c[cr2](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %%cr2, %%" _ASM_DX " \n\t"
- "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
- "je 3f \n\t"
- "mov %%" _ASM_AX", %%cr2 \n\t"
- "3: \n\t"
- /* Check if vmlaunch or vmresume is needed */
- "cmpl $0, %c[launched](%%" _ASM_CX ") \n\t"
- /* Load guest registers. Don't clobber flags. */
- "mov %c[rax](%%" _ASM_CX "), %%" _ASM_AX " \n\t"
- "mov %c[rbx](%%" _ASM_CX "), %%" _ASM_BX " \n\t"
- "mov %c[rdx](%%" _ASM_CX "), %%" _ASM_DX " \n\t"
- "mov %c[rsi](%%" _ASM_CX "), %%" _ASM_SI " \n\t"
- "mov %c[rdi](%%" _ASM_CX "), %%" _ASM_DI " \n\t"
- "mov %c[rbp](%%" _ASM_CX "), %%" _ASM_BP " \n\t"
-#ifdef CONFIG_X86_64
- "mov %c[r8](%%" _ASM_CX "), %%r8 \n\t"
- "mov %c[r9](%%" _ASM_CX "), %%r9 \n\t"
- "mov %c[r10](%%" _ASM_CX "), %%r10 \n\t"
- "mov %c[r11](%%" _ASM_CX "), %%r11 \n\t"
- "mov %c[r12](%%" _ASM_CX "), %%r12 \n\t"
- "mov %c[r13](%%" _ASM_CX "), %%r13 \n\t"
- "mov %c[r14](%%" _ASM_CX "), %%r14 \n\t"
- "mov %c[r15](%%" _ASM_CX "), %%r15 \n\t"
-#endif
- /* Load guest RCX. This kills the vmx_vcpu pointer! */
- "mov %c[rcx](%%" _ASM_CX "), %%" _ASM_CX " \n\t"
-
- /* Enter guest mode */
- "call vmx_vmenter\n\t"
-
- /* Save guest's RCX to the stack placeholder (see above) */
- "mov %%" _ASM_CX ", %c[wordsize](%%" _ASM_SP ") \n\t"
+ if (vcpu->arch.cr2 != read_cr2())
+ write_cr2(vcpu->arch.cr2);
- /* Load host's RCX, i.e. the vmx_vcpu pointer */
- "pop %%" _ASM_CX " \n\t"
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
- /* Set vmx->fail based on EFLAGS.{CF,ZF} */
- "setbe %c[fail](%%" _ASM_CX ")\n\t"
-
- /* Save all guest registers, including RCX from the stack */
- "mov %%" _ASM_AX ", %c[rax](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BX ", %c[rbx](%%" _ASM_CX ") \n\t"
- __ASM_SIZE(pop) " %c[rcx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DX ", %c[rdx](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_SI ", %c[rsi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_DI ", %c[rdi](%%" _ASM_CX ") \n\t"
- "mov %%" _ASM_BP ", %c[rbp](%%" _ASM_CX ") \n\t"
-#ifdef CONFIG_X86_64
- "mov %%r8, %c[r8](%%" _ASM_CX ") \n\t"
- "mov %%r9, %c[r9](%%" _ASM_CX ") \n\t"
- "mov %%r10, %c[r10](%%" _ASM_CX ") \n\t"
- "mov %%r11, %c[r11](%%" _ASM_CX ") \n\t"
- "mov %%r12, %c[r12](%%" _ASM_CX ") \n\t"
- "mov %%r13, %c[r13](%%" _ASM_CX ") \n\t"
- "mov %%r14, %c[r14](%%" _ASM_CX ") \n\t"
- "mov %%r15, %c[r15](%%" _ASM_CX ") \n\t"
- /*
- * Clear host registers marked as clobbered to prevent
- * speculative use.
- */
- "xor %%r8d, %%r8d \n\t"
- "xor %%r9d, %%r9d \n\t"
- "xor %%r10d, %%r10d \n\t"
- "xor %%r11d, %%r11d \n\t"
- "xor %%r12d, %%r12d \n\t"
- "xor %%r13d, %%r13d \n\t"
- "xor %%r14d, %%r14d \n\t"
- "xor %%r15d, %%r15d \n\t"
-#endif
- "mov %%cr2, %%" _ASM_AX " \n\t"
- "mov %%" _ASM_AX ", %c[cr2](%%" _ASM_CX ") \n\t"
-
- "xor %%eax, %%eax \n\t"
- "xor %%ebx, %%ebx \n\t"
- "xor %%esi, %%esi \n\t"
- "xor %%edi, %%edi \n\t"
- "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
- : ASM_CALL_CONSTRAINT
- : "c"(vmx), "d"((unsigned long)HOST_RSP), "S"(evmcs_rsp),
- [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
- [fail]"i"(offsetof(struct vcpu_vmx, fail)),
- [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
- [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
- [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
- [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
- [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
- [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
- [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
- [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
-#ifdef CONFIG_X86_64
- [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
- [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
- [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
- [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
- [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
- [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
- [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
- [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
-#endif
- [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
- [wordsize]"i"(sizeof(ulong))
- : "cc", "memory"
-#ifdef CONFIG_X86_64
- , "rax", "rbx", "rdi"
- , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
-#else
- , "eax", "ebx", "edi"
-#endif
- );
+ vcpu->arch.cr2 = read_cr2();
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -6648,11 +6533,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
}
-STACK_FRAME_NON_STANDARD(vmx_vcpu_run);
static struct kvm *vmx_vm_alloc(void)
{
- struct kvm_vmx *kvm_vmx = vzalloc(sizeof(struct kvm_vmx));
+ struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
return &kvm_vmx->kvm;
}
@@ -6668,7 +6554,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
if (enable_pml)
vmx_destroy_pml_buffer(vmx);
free_vpid(vmx->vpid);
- leave_guest_mode(vcpu);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
kfree(vmx->guest_msrs);
@@ -6680,14 +6565,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
int err;
- struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
int cpu;
+ vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!vmx)
return ERR_PTR(-ENOMEM);
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+ vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
if (!vmx->vcpu.arch.guest_fpu) {
printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
err = -ENOMEM;
@@ -6709,12 +6596,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
* for the guest, etc.
*/
if (enable_pml) {
- vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
goto uninit_vcpu;
}
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
> PAGE_SIZE);
@@ -6816,7 +6703,7 @@ static int vmx_vm_init(struct kvm *kvm)
* Warn upon starting the first VM in a potentially
* insecure environment.
*/
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (sched_smt_active())
pr_warn_once(L1TF_MSG_SMT);
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
pr_warn_once(L1TF_MSG_L1D);
@@ -7044,7 +6931,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
/* unmask address range configure area */
for (i = 0; i < vmx->pt_desc.addr_range; i++)
- vmx->pt_desc.ctl_bitmask &= ~(0xf << (32 + i * 4));
+ vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 99328954c2fc1..1554cb45b3931 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -175,7 +175,6 @@ struct nested_vmx {
struct vcpu_vmx {
struct kvm_vcpu vcpu;
- unsigned long host_rsp;
u8 fail;
u8 msr_bitmap_mode;
u32 exit_intr_info;
@@ -209,7 +208,7 @@ struct vcpu_vmx {
struct loaded_vmcs vmcs01;
struct loaded_vmcs *loaded_vmcs;
struct loaded_vmcs *loaded_cpu_state;
- bool __launched; /* temporary, used in vmx_vcpu_run */
+
struct msr_autoload {
struct vmx_msrs guest;
struct vmx_msrs host;
@@ -337,16 +336,16 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
+static inline void pi_set_sn(struct pi_desc *pi_desc)
{
- return clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
}
-static inline void pi_set_sn(struct pi_desc *pi_desc)
+static inline void pi_set_on(struct pi_desc *pi_desc)
{
- return set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
}
static inline void pi_clear_on(struct pi_desc *pi_desc)
@@ -445,7 +444,8 @@ static inline u32 vmx_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
- vmentry_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL);
+ vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
+ VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmentry_ctrl &
~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
@@ -455,9 +455,10 @@ static inline u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
if (pt_mode == PT_MODE_SYSTEM)
- vmexit_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL);
+ vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
+ VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmcs_config.vmexit_ctrl &
+ return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
}
@@ -478,7 +479,7 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
return &(to_vmx(vcpu)->pi_desc);
}
-struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu);
+struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
@@ -487,7 +488,8 @@ void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
{
- return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
+ return alloc_vmcs_cpu(shadow, raw_smp_processor_id(),
+ GFP_KERNEL_ACCOUNT);
}
u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02c8e095a2390..65e4559eef2fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3834,6 +3834,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
+ /* fall through */
+
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
@@ -3877,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+ GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.lapic)
@@ -4064,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -4088,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -5114,6 +5117,13 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ /*
+ * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+ * is returned, but our callers are not ready for that and they blindly
+ * call kvm_inject_page_fault. Ensure that they at least do not leak
+ * uninitialized kernel stack memory into cr2 and error code.
+ */
+ memset(exception, 0, sizeof(*exception));
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
exception);
}
@@ -6480,8 +6490,7 @@ restart:
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE &&
- (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ if (r == EMULATE_DONE && ctxt->tf)
kvm_vcpu_do_singlestep(vcpu, &r);
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP)
@@ -7047,6 +7056,13 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
{
+ if (!lapic_in_kernel(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.apicv_active);
+ return;
+ }
+ if (!vcpu->arch.apicv_active)
+ return;
+
vcpu->arch.apicv_active = false;
kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
}
@@ -7093,10 +7109,10 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
+#endif
case KVM_HC_SEND_IPI:
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
-#endif
default:
ret = -KVM_ENOSYS;
break;
@@ -7793,7 +7809,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* 1) We should set ->mode before checking ->requests. Please see
* the comment in kvm_vcpu_exiting_guest_mode().
*
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
* pairs with the memory barrier implicit in pi_test_and_set_on
* (see vmx_deliver_posted_interrupt).
*
@@ -7937,6 +7953,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
+ /* fall through */
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
@@ -8996,7 +9013,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
struct page *page;
int r;
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -9017,6 +9033,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_pio_data;
if (irqchip_in_kernel(vcpu->kvm)) {
+ vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -9024,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
static_key_slow_inc(&kvm_no_apic_vcpu);
vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!vcpu->arch.mce_banks) {
r = -ENOMEM;
goto fail_free_lapic;
}
vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT)) {
r = -ENOMEM;
goto fail_free_mce_banks;
}
@@ -9095,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
- INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -9290,13 +9307,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
slot->arch.rmap[i] =
kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i])
goto out_free;
if (i == 0)
continue;
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -9339,13 +9356,13 @@ out_free:
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
- kvm_mmu_invalidate_mmio_sptes(kvm, slots);
+ kvm_mmu_invalidate_mmio_sptes(kvm, gen);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9453,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 224cd0a475684..28406aa1136d7 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -181,6 +181,11 @@ static inline bool emul_is_noncanonical_address(u64 la,
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
gva_t gva, gfn_t gfn, unsigned access)
{
+ u64 gen = kvm_memslots(vcpu->kvm)->generation;
+
+ if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+ return;
+
/*
* If this is a shadow nested page table, the "GVA" is
* actually a nGPA.
@@ -188,7 +193,7 @@ static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
vcpu->arch.mmio_gva = mmu_is_nested(vcpu) ? 0 : gva & PAGE_MASK;
vcpu->arch.access = access;
vcpu->arch.mmio_gfn = gfn;
- vcpu->arch.mmio_gen = kvm_memslots(vcpu->kvm)->generation;
+ vcpu->arch.mmio_gen = gen;
}
static inline bool vcpu_match_mmio_gen(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 9119d8e41f1ff..cf00ab6c66210 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -179,6 +179,8 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
if (insn->addr_bytes == 2)
return -EINVAL;
+ /* fall through */
+
case -EDOM:
case offsetof(struct pt_regs, bx):
case offsetof(struct pt_regs, si):
diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c
index 66894675f3c8d..df50451d94ef7 100644
--- a/arch/x86/lib/iomem.c
+++ b/arch/x86/lib/iomem.c
@@ -2,8 +2,11 @@
#include <linux/module.h>
#include <linux/io.h>
+#define movs(type,to,from) \
+ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory")
+
/* Originally from i386/string.h */
-static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
+static __always_inline void rep_movs(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
@@ -21,13 +24,37 @@ static __always_inline void __iomem_memcpy(void *to, const void *from, size_t n)
void memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
{
- __iomem_memcpy(to, (const void *)from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned source IO */
+ if (unlikely(1 & (unsigned long)from)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)from)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs(to, (const void *)from, n);
}
EXPORT_SYMBOL(memcpy_fromio);
void memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
{
- __iomem_memcpy((void *)to, (const void *) from, n);
+ if (unlikely(!n))
+ return;
+
+ /* Align any unaligned destination IO */
+ if (unlikely(1 & (unsigned long)to)) {
+ movs("b", to, from);
+ n--;
+ }
+ if (n > 1 && unlikely(2 & (unsigned long)to)) {
+ movs("w", to, from);
+ n-=2;
+ }
+ rep_movs((void *)to, (const void *) from, n);
}
EXPORT_SYMBOL(memcpy_toio);
diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index 79778ab200e49..a536651164584 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -36,8 +36,8 @@ static inline u16 i8254(void)
u16 status, timer;
do {
- outb(I8254_PORT_CONTROL,
- I8254_CMD_READBACK | I8254_SELECT_COUNTER0);
+ outb(I8254_CMD_READBACK | I8254_SELECT_COUNTER0,
+ I8254_PORT_CONTROL);
status = inb(I8254_PORT_COUNTER0);
timer = inb(I8254_PORT_COUNTER0);
timer |= inb(I8254_PORT_COUNTER0) << 8;
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index bfd94e7812fcb..7d290777246d2 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -54,13 +54,13 @@ do { \
} while (0)
/**
- * clear_user: - Zero a block of memory in user space.
+ * clear_user - Zero a block of memory in user space.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
@@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n)
EXPORT_SYMBOL(clear_user);
/**
- * __clear_user: - Zero a block of memory in user space, with less checking.
+ * __clear_user - Zero a block of memory in user space, with less checking.
* @to: Destination address, in user space.
* @n: Number of bytes to zero.
*
* Zero a block of memory in user space. Caller must check
* the specified block with access_ok() before calling this function.
*
- * Returns number of bytes that could not be cleared.
+ * Return: number of bytes that could not be cleared.
* On success, this will be zero.
*/
unsigned long
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 12d7e7fb4efdf..19c6abf9ea317 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -52,7 +52,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
}
-static void percpu_setup_debug_store(int cpu)
+static void __init percpu_setup_debug_store(int cpu)
{
#ifdef CONFIG_CPU_SUP_INTEL
int npages;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e3cdc85ce5b6e..ee8f8ab469417 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -444,7 +444,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
int i;
pud_t *start, *pud_start;
pgprotval_t prot, eff;
- pud_t *prev_pud = NULL;
pud_start = start = (pud_t *)p4d_page_vaddr(addr);
@@ -462,7 +461,6 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
} else
note_page(m, st, __pgprot(0), 0, 3);
- prev_pud = start;
start++;
}
}
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 6521134057e8f..3c4568f8fb28e 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -117,67 +117,12 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_fprestore);
-/* Helper to check whether a uaccess fault indicates a kernel bug. */
-static bool bogus_uaccess(struct pt_regs *regs, int trapnr,
- unsigned long fault_addr)
-{
- /* This is the normal case: #PF with a fault address in userspace. */
- if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX)
- return false;
-
- /*
- * This code can be reached for machine checks, but only if the #MC
- * handler has already decided that it looks like a candidate for fixup.
- * This e.g. happens when attempting to access userspace memory which
- * the CPU can't access because of uncorrectable bad memory.
- */
- if (trapnr == X86_TRAP_MC)
- return false;
-
- /*
- * There are two remaining exception types we might encounter here:
- * - #PF for faulting accesses to kernel addresses
- * - #GP for faulting accesses to noncanonical addresses
- * Complain about anything else.
- */
- if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) {
- WARN(1, "unexpected trap %d in uaccess\n", trapnr);
- return false;
- }
-
- /*
- * This is a faulting memory access in kernel space, on a kernel
- * address, in a usercopy function. This can e.g. be caused by improper
- * use of helpers like __put_user and by improper attempts to access
- * userspace addresses in KERNEL_DS regions.
- * The one (semi-)legitimate exception are probe_kernel_{read,write}(),
- * which can be invoked from places like kgdb, /dev/mem (for reading)
- * and privileged BPF code (for reading).
- * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag
- * to tell us that faulting on kernel addresses, and even noncanonical
- * addresses, in a userspace accessor does not necessarily imply a
- * kernel bug, root might just be doing weird stuff.
- */
- if (current->kernel_uaccess_faults_ok)
- return false;
-
- /* This is bad. Refuse the fixup so that we go into die(). */
- if (trapnr == X86_TRAP_PF) {
- pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n",
- fault_addr);
- } else {
- pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n");
- }
- return true;
-}
-
__visible bool ex_handler_uaccess(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr,
unsigned long error_code,
unsigned long fault_addr)
{
- if (bogus_uaccess(regs, trapnr, fault_addr))
- return false;
+ WARN_ONCE(trapnr == X86_TRAP_GP, "General protection fault in user access. Non-canonical address?");
regs->ip = ex_fixup_addr(fixup);
return true;
}
@@ -188,8 +133,6 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup,
unsigned long error_code,
unsigned long fault_addr)
{
- if (bogus_uaccess(regs, trapnr, fault_addr))
- return false;
/* Special hack for uaccess_err */
current->thread.uaccess_err = 1;
regs->ip = ex_fixup_addr(fixup);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2ff25ad332338..667f1da36208e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -595,7 +595,7 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
return;
}
- addr = desc.base0 | (desc.base1 << 16) | (desc.base2 << 24);
+ addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
#ifdef CONFIG_X86_64
addr |= ((u64)desc.base3 << 32);
#endif
@@ -1031,7 +1031,7 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ vm_fault_t fault)
{
struct task_struct *tsk = current;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 5378d10f1d31d..0029604af8a41 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -705,7 +705,7 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
return arch_memremap_can_ram_remap(phys_addr, size, 0);
}
-#ifdef CONFIG_ARCH_USE_MEMREMAP_PROT
+#ifdef CONFIG_AMD_MEM_ENCRYPT
/* Remap memory with encryption */
void __init *early_memremap_encrypted(resource_size_t phys_addr,
unsigned long size)
@@ -747,7 +747,7 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP);
}
-#endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 462fde83b515e..8dc0fc0b1382b 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -24,14 +24,16 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
static p4d_t tmp_p4d_table[MAX_PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
-static __init void *early_alloc(size_t size, int nid, bool panic)
+static __init void *early_alloc(size_t size, int nid, bool should_panic)
{
- if (panic)
- return memblock_alloc_try_nid(size, size,
- __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
- else
- return memblock_alloc_try_nid_nopanic(size, size,
+ void *ptr = memblock_alloc_try_nid(size, size,
__pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+
+ if (!ptr && should_panic)
+ panic("%pS: Failed to allocate page, nid=%d from=%lx\n",
+ (void *)_RET_IP_, nid, __pa(MAX_DMA_ADDRESS));
+
+ return ptr;
}
static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index a19ef1a416ff6..4aa9b1480866b 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -158,8 +158,8 @@ static void __init sme_populate_pgd(struct sme_populate_pgd_data *ppd)
pmd = pmd_offset(pud, ppd->vaddr);
if (pmd_none(*pmd)) {
pte = ppd->pgtable_area;
- memset(pte, 0, sizeof(pte) * PTRS_PER_PTE);
- ppd->pgtable_area += sizeof(pte) * PTRS_PER_PTE;
+ memset(pte, 0, sizeof(*pte) * PTRS_PER_PTE);
+ ppd->pgtable_area += sizeof(*pte) * PTRS_PER_PTE;
set_pmd(pmd, __pmd(PMD_FLAGS | __pa(pte)));
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index de1851d156997..c805db6236b47 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -9,12 +9,12 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/mm_types.h>
+#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/sched/sysctl.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
-#include <asm/mman.h>
#include <asm/mmu_context.h>
#include <asm/mpx.h>
#include <asm/processor.h>
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf74..dfb6c4df639ab 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
- pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+ pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
@@ -195,15 +195,11 @@ static void __init alloc_node_data(int nid)
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
- nd_pa = memblock_phys_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
- nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
- MEMBLOCK_ALLOC_ACCESSIBLE);
- if (!nd_pa) {
- pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
- nd_size, nid);
- return;
- }
+ pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
+ nd_size, nid);
+ return;
}
nd = __va(nd_pa);
@@ -866,7 +862,7 @@ const struct cpumask *cpumask_of_node(int node)
{
if (node >= nr_node_ids) {
printk(KERN_WARNING
- "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
+ "cpumask_of_node(%d): node > nr_node_ids(%u)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4f8972311a77e..4c570612e24ee 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
#endif
+/*
+ * See set_mce_nospec().
+ *
+ * Machine check recovery code needs to change cache mode of poisoned pages to
+ * UC to avoid speculative access logging another error. But passing the
+ * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
+ * speculative access. So we cheat and flip the top bit of the address. This
+ * works fine for the code that updates the page tables. But at the end of the
+ * process we need to flush the TLB and cache and the non-canonical address
+ * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long fix_addr(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+ return (long)(addr << 1) >> 1;
+#else
+ return addr;
+#endif
+}
+
static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
{
if (cpa->flags & CPA_PAGES_ARRAY) {
@@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data)
unsigned int i;
for (i = 0; i < cpa->numpages; i++)
- __flush_tlb_one_kernel(__cpa_addr(cpa, i));
+ __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
}
static void cpa_flush(struct cpa_data *data, int cache)
@@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
* Only flush present addresses:
*/
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
- clflush_cache_range_opt((void *)addr, PAGE_SIZE);
+ clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
}
mb();
}
@@ -715,7 +738,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
{
unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
pgprot_t old_prot, new_prot, req_prot, chk_prot;
- pte_t new_pte, old_pte, *tmp;
+ pte_t new_pte, *tmp;
enum pg_level level;
/*
@@ -758,7 +781,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address,
* Convert protection attributes to 4k-format, as cpa->mask* are set
* up accordingly.
*/
- old_pte = *kpte;
+
/* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */
req_prot = pgprot_large_2_4k(old_prot);
@@ -1627,29 +1650,6 @@ out:
return ret;
}
-/*
- * Machine check recovery code needs to change cache mode of poisoned
- * pages to UC to avoid speculative access logging another error. But
- * passing the address of the 1:1 mapping to set_memory_uc() is a fine
- * way to encourage a speculative access. So we cheat and flip the top
- * bit of the address. This works fine for the code that updates the
- * page tables. But at the end of the process we need to flush the cache
- * and the non-canonical address causes a #GP fault when used by the
- * CLFLUSH instruction.
- *
- * But in the common case we already have a canonical address. This code
- * will fix the top bit if needed and is a no-op otherwise.
- */
-static inline unsigned long make_addr_canonical_again(unsigned long addr)
-{
-#ifdef CONFIG_X86_64
- return (long)(addr << 1) >> 1;
-#else
- return addr;
-#endif
-}
-
-
static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 999d6d8f0beff..bc4bc7b2f075d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
- unsigned int cpu;
-
- cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func_remote,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 5542303c43d9c..afabf597c8557 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -881,20 +881,41 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_JSLT | BPF_X:
case BPF_JMP | BPF_JSGE | BPF_X:
case BPF_JMP | BPF_JSLE | BPF_X:
+ case BPF_JMP32 | BPF_JEQ | BPF_X:
+ case BPF_JMP32 | BPF_JNE | BPF_X:
+ case BPF_JMP32 | BPF_JGT | BPF_X:
+ case BPF_JMP32 | BPF_JLT | BPF_X:
+ case BPF_JMP32 | BPF_JGE | BPF_X:
+ case BPF_JMP32 | BPF_JLE | BPF_X:
+ case BPF_JMP32 | BPF_JSGT | BPF_X:
+ case BPF_JMP32 | BPF_JSLT | BPF_X:
+ case BPF_JMP32 | BPF_JSGE | BPF_X:
+ case BPF_JMP32 | BPF_JSLE | BPF_X:
/* cmp dst_reg, src_reg */
- EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x39,
- add_2reg(0xC0, dst_reg, src_reg));
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x39, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP32 | BPF_JSET | BPF_X:
/* test dst_reg, src_reg */
- EMIT3(add_2mod(0x48, dst_reg, src_reg), 0x85,
- add_2reg(0xC0, dst_reg, src_reg));
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ EMIT1(add_2mod(0x48, dst_reg, src_reg));
+ else if (is_ereg(dst_reg) || is_ereg(src_reg))
+ EMIT1(add_2mod(0x40, dst_reg, src_reg));
+ EMIT2(0x85, add_2reg(0xC0, dst_reg, src_reg));
goto emit_cond_jmp;
case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K:
/* test dst_reg, imm32 */
- EMIT1(add_1mod(0x48, dst_reg));
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
EMIT2_off32(0xF7, add_1reg(0xC0, dst_reg), imm32);
goto emit_cond_jmp;
@@ -908,8 +929,21 @@ xadd: if (is_imm8(insn->off))
case BPF_JMP | BPF_JSLT | BPF_K:
case BPF_JMP | BPF_JSGE | BPF_K:
case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
/* cmp dst_reg, imm8/32 */
- EMIT1(add_1mod(0x48, dst_reg));
+ if (BPF_CLASS(insn->code) == BPF_JMP)
+ EMIT1(add_1mod(0x48, dst_reg));
+ else if (is_ereg(dst_reg))
+ EMIT1(add_1mod(0x40, dst_reg));
if (is_imm8(imm32))
EMIT3(0x83, add_1reg(0xF8, dst_reg), imm32);
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 8f6cc71e08482..0d9cdffce6ac0 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2072,7 +2072,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_JMP | BPF_JSGT | BPF_X:
case BPF_JMP | BPF_JSLE | BPF_X:
case BPF_JMP | BPF_JSLT | BPF_X:
- case BPF_JMP | BPF_JSGE | BPF_X: {
+ case BPF_JMP | BPF_JSGE | BPF_X:
+ case BPF_JMP32 | BPF_JEQ | BPF_X:
+ case BPF_JMP32 | BPF_JNE | BPF_X:
+ case BPF_JMP32 | BPF_JGT | BPF_X:
+ case BPF_JMP32 | BPF_JLT | BPF_X:
+ case BPF_JMP32 | BPF_JGE | BPF_X:
+ case BPF_JMP32 | BPF_JLE | BPF_X:
+ case BPF_JMP32 | BPF_JSGT | BPF_X:
+ case BPF_JMP32 | BPF_JSLE | BPF_X:
+ case BPF_JMP32 | BPF_JSLT | BPF_X:
+ case BPF_JMP32 | BPF_JSGE | BPF_X: {
+ bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
u8 sreg_lo = sstk ? IA32_ECX : src_lo;
@@ -2081,25 +2092,35 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (dstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
STACK_VAR(dst_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
- STACK_VAR(dst_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
}
if (sstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
STACK_VAR(src_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
- STACK_VAR(src_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EBX),
+ STACK_VAR(src_hi));
}
- /* cmp dreg_hi,sreg_hi */
- EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
- EMIT2(IA32_JNE, 2);
+ if (is_jmp64) {
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ }
/* cmp dreg_lo,sreg_lo */
EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
goto emit_cond_jmp;
}
- case BPF_JMP | BPF_JSET | BPF_X: {
+ case BPF_JMP | BPF_JSET | BPF_X:
+ case BPF_JMP32 | BPF_JSET | BPF_X: {
+ bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
u8 sreg_lo = sstk ? IA32_ECX : src_lo;
@@ -2108,15 +2129,21 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (dstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
STACK_VAR(dst_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
- STACK_VAR(dst_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
}
if (sstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
STACK_VAR(src_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
- STACK_VAR(src_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EBX),
+ STACK_VAR(src_hi));
}
/* and dreg_lo,sreg_lo */
EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
@@ -2126,32 +2153,39 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
goto emit_cond_jmp;
}
- case BPF_JMP | BPF_JSET | BPF_K: {
- u32 hi;
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP32 | BPF_JSET | BPF_K: {
+ bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
u8 sreg_lo = IA32_ECX;
u8 sreg_hi = IA32_EBX;
+ u32 hi;
if (dstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
STACK_VAR(dst_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
- STACK_VAR(dst_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
}
- hi = imm32 & (1<<31) ? (u32)~0 : 0;
/* mov ecx,imm32 */
- EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
- /* mov ebx,imm32 */
- EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+ EMIT2_off32(0xC7, add_1reg(0xC0, sreg_lo), imm32);
/* and dreg_lo,sreg_lo */
EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
- /* and dreg_hi,sreg_hi */
- EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
- /* or dreg_lo,dreg_hi */
- EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ if (is_jmp64) {
+ hi = imm32 & (1 << 31) ? (u32)~0 : 0;
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, sreg_hi), hi);
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ }
goto emit_cond_jmp;
}
case BPF_JMP | BPF_JEQ | BPF_K:
@@ -2163,29 +2197,44 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
case BPF_JMP | BPF_JSGT | BPF_K:
case BPF_JMP | BPF_JSLE | BPF_K:
case BPF_JMP | BPF_JSLT | BPF_K:
- case BPF_JMP | BPF_JSGE | BPF_K: {
- u32 hi;
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP32 | BPF_JEQ | BPF_K:
+ case BPF_JMP32 | BPF_JNE | BPF_K:
+ case BPF_JMP32 | BPF_JGT | BPF_K:
+ case BPF_JMP32 | BPF_JLT | BPF_K:
+ case BPF_JMP32 | BPF_JGE | BPF_K:
+ case BPF_JMP32 | BPF_JLE | BPF_K:
+ case BPF_JMP32 | BPF_JSGT | BPF_K:
+ case BPF_JMP32 | BPF_JSLE | BPF_K:
+ case BPF_JMP32 | BPF_JSLT | BPF_K:
+ case BPF_JMP32 | BPF_JSGE | BPF_K: {
+ bool is_jmp64 = BPF_CLASS(insn->code) == BPF_JMP;
u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
u8 sreg_lo = IA32_ECX;
u8 sreg_hi = IA32_EBX;
+ u32 hi;
if (dstk) {
EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
STACK_VAR(dst_lo));
- EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
- STACK_VAR(dst_hi));
+ if (is_jmp64)
+ EMIT3(0x8B,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
}
- hi = imm32 & (1<<31) ? (u32)~0 : 0;
/* mov ecx,imm32 */
EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
- /* mov ebx,imm32 */
- EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
-
- /* cmp dreg_hi,sreg_hi */
- EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
- EMIT2(IA32_JNE, 2);
+ if (is_jmp64) {
+ hi = imm32 & (1 << 31) ? (u32)~0 : 0;
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ }
/* cmp dreg_lo,sreg_lo */
EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 30a5111ae5fd9..527e69b120025 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -635,6 +635,22 @@ static void quirk_no_aersid(struct pci_dev *pdev)
DECLARE_PCI_FIXUP_CLASS_EARLY(PCI_VENDOR_ID_INTEL, PCI_ANY_ID,
PCI_CLASS_BRIDGE_PCI, 8, quirk_no_aersid);
+static void quirk_intel_th_dnv(struct pci_dev *dev)
+{
+ struct resource *r = &dev->resource[4];
+
+ /*
+ * Denverton reports 2k of RTIT_BAR (intel_th resource 4), which
+ * appears to be 4 MB in reality.
+ */
+ if (r->end == r->start + 0x7ff) {
+ r->start = 0;
+ r->end = 0x3fffff;
+ r->flags |= IORESOURCE_UNSET;
+ }
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x19e1, quirk_intel_th_dnv);
+
#ifdef CONFIG_PHYS_ADDR_T_64BIT
#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8)
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index e4dc3862d423c..fe29f3f5d384f 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -3,5 +3,4 @@ OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
-obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/early_printk.c b/arch/x86/platform/efi/early_printk.c
deleted file mode 100644
index 7138bc7a265c0..0000000000000
--- a/arch/x86/platform/efi/early_printk.c
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (C) 2013 Intel Corporation; author Matt Fleming
- *
- * This file is part of the Linux kernel, and is made available under
- * the terms of the GNU General Public License version 2.
- */
-
-#include <linux/console.h>
-#include <linux/efi.h>
-#include <linux/font.h>
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <asm/setup.h>
-
-static const struct font_desc *font;
-static u32 efi_x, efi_y;
-static void *efi_fb;
-static bool early_efi_keep;
-
-/*
- * efi earlyprintk need use early_ioremap to map the framebuffer.
- * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should
- * be used instead. ioremap will be available after paging_init() which is
- * earlier than initcall callbacks. Thus adding this early initcall function
- * early_efi_map_fb to map the whole efi framebuffer.
- */
-static __init int early_efi_map_fb(void)
-{
- u64 base, size;
-
- if (!early_efi_keep)
- return 0;
-
- base = boot_params.screen_info.lfb_base;
- if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
- base |= (u64)boot_params.screen_info.ext_lfb_base << 32;
- size = boot_params.screen_info.lfb_size;
- efi_fb = ioremap(base, size);
-
- return efi_fb ? 0 : -ENOMEM;
-}
-early_initcall(early_efi_map_fb);
-
-/*
- * early_efi_map maps efi framebuffer region [start, start + len -1]
- * In case earlyprintk=efi,keep we have the whole framebuffer mapped already
- * so just return the offset efi_fb + start.
- */
-static __ref void *early_efi_map(unsigned long start, unsigned long len)
-{
- u64 base;
-
- base = boot_params.screen_info.lfb_base;
- if (boot_params.screen_info.capabilities & VIDEO_CAPABILITY_64BIT_BASE)
- base |= (u64)boot_params.screen_info.ext_lfb_base << 32;
-
- if (efi_fb)
- return (efi_fb + start);
- else
- return early_ioremap(base + start, len);
-}
-
-static __ref void early_efi_unmap(void *addr, unsigned long len)
-{
- if (!efi_fb)
- early_iounmap(addr, len);
-}
-
-static void early_efi_clear_scanline(unsigned int y)
-{
- unsigned long *dst;
- u16 len;
-
- len = boot_params.screen_info.lfb_linelength;
- dst = early_efi_map(y*len, len);
- if (!dst)
- return;
-
- memset(dst, 0, len);
- early_efi_unmap(dst, len);
-}
-
-static void early_efi_scroll_up(void)
-{
- unsigned long *dst, *src;
- u16 len;
- u32 i, height;
-
- len = boot_params.screen_info.lfb_linelength;
- height = boot_params.screen_info.lfb_height;
-
- for (i = 0; i < height - font->height; i++) {
- dst = early_efi_map(i*len, len);
- if (!dst)
- return;
-
- src = early_efi_map((i + font->height) * len, len);
- if (!src) {
- early_efi_unmap(dst, len);
- return;
- }
-
- memmove(dst, src, len);
-
- early_efi_unmap(src, len);
- early_efi_unmap(dst, len);
- }
-}
-
-static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h)
-{
- const u32 color_black = 0x00000000;
- const u32 color_white = 0x00ffffff;
- const u8 *src;
- u8 s8;
- int m;
-
- src = font->data + c * font->height;
- s8 = *(src + h);
-
- for (m = 0; m < 8; m++) {
- if ((s8 >> (7 - m)) & 1)
- *dst = color_white;
- else
- *dst = color_black;
- dst++;
- }
-}
-
-static void
-early_efi_write(struct console *con, const char *str, unsigned int num)
-{
- struct screen_info *si;
- unsigned int len;
- const char *s;
- void *dst;
-
- si = &boot_params.screen_info;
- len = si->lfb_linelength;
-
- while (num) {
- unsigned int linemax;
- unsigned int h, count = 0;
-
- for (s = str; *s && *s != '\n'; s++) {
- if (count == num)
- break;
- count++;
- }
-
- linemax = (si->lfb_width - efi_x) / font->width;
- if (count > linemax)
- count = linemax;
-
- for (h = 0; h < font->height; h++) {
- unsigned int n, x;
-
- dst = early_efi_map((efi_y + h) * len, len);
- if (!dst)
- return;
-
- s = str;
- n = count;
- x = efi_x;
-
- while (n-- > 0) {
- early_efi_write_char(dst + x*4, *s, h);
- x += font->width;
- s++;
- }
-
- early_efi_unmap(dst, len);
- }
-
- num -= count;
- efi_x += count * font->width;
- str += count;
-
- if (num > 0 && *s == '\n') {
- efi_x = 0;
- efi_y += font->height;
- str++;
- num--;
- }
-
- if (efi_x + font->width > si->lfb_width) {
- efi_x = 0;
- efi_y += font->height;
- }
-
- if (efi_y + font->height > si->lfb_height) {
- u32 i;
-
- efi_y -= font->height;
- early_efi_scroll_up();
-
- for (i = 0; i < font->height; i++)
- early_efi_clear_scanline(efi_y + i);
- }
- }
-}
-
-static __init int early_efi_setup(struct console *con, char *options)
-{
- struct screen_info *si;
- u16 xres, yres;
- u32 i;
-
- si = &boot_params.screen_info;
- xres = si->lfb_width;
- yres = si->lfb_height;
-
- /*
- * early_efi_write_char() implicitly assumes a framebuffer with
- * 32-bits per pixel.
- */
- if (si->lfb_depth != 32)
- return -ENODEV;
-
- font = get_default_font(xres, yres, -1, -1);
- if (!font)
- return -ENODEV;
-
- efi_y = rounddown(yres, font->height) - font->height;
- for (i = 0; i < (yres - efi_y) / font->height; i++)
- early_efi_scroll_up();
-
- /* early_console_register will unset CON_BOOT in case ,keep */
- if (!(con->flags & CON_BOOT))
- early_efi_keep = true;
- return 0;
-}
-
-struct console early_efi_console = {
- .name = "earlyefi",
- .write = early_efi_write,
- .setup = early_efi_setup,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 17456a1d3f049..458a0e2bcc57c 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -304,7 +304,7 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
* - Not within any part of the kernel
* - Not the BIOS reserved area (E820_TYPE_RESERVED, E820_TYPE_NVS, etc)
*/
-static bool can_free_region(u64 start, u64 size)
+static __init bool can_free_region(u64 start, u64 size)
{
if (start + size > __pa_symbol(_text) && start <= __pa_symbol(_end))
return false;
@@ -717,7 +717,7 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
* "efi_mm" cannot be used to check if the page fault had occurred
* in the firmware context because efi=old_map doesn't use efi_pgd.
*/
- if (efi_rts_work.efi_rts_id == NONE)
+ if (efi_rts_work.efi_rts_id == EFI_NONE)
return;
/*
@@ -742,7 +742,7 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
* because this case occurs *very* rarely and hence could be improved
* on a need by basis.
*/
- if (efi_rts_work.efi_rts_id == RESET_SYSTEM) {
+ if (efi_rts_work.efi_rts_id == EFI_RESET_SYSTEM) {
pr_info("efi_reset_system() buggy! Reboot through BIOS\n");
machine_real_restart(MRR_BIOS);
return;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
index 96f438d4b026f..1421d5330b2ce 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bcm43xx.c
@@ -44,7 +44,6 @@ static struct fixed_voltage_config bcm43xx_vmmc = {
*/
.microvolts = 2000000, /* 1.8V */
.startup_delay = 250 * 1000, /* 250ms */
- .enable_high = 1, /* active high */
.enabled_at_boot = 0, /* disabled at boot */
.init_data = &bcm43xx_vmmc_data,
};
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index b4ab779f1d47a..ac9e7bf49b667 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -141,6 +141,9 @@ void * __init prom_early_alloc(unsigned long size)
* wasted bootmem) and hand off chunks of it to callers.
*/
res = memblock_alloc(chunk_size, SMP_CACHE_BYTES);
+ if (!res)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ chunk_size);
BUG_ON(!res);
prom_early_allocated += chunk_size;
memset(res, 0, chunk_size);
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 4a6a5a26c5829..ef60d789c76ed 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -29,7 +29,8 @@
struct uv_systab *uv_systab;
-s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
+static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+ u64 a4, u64 a5)
{
struct uv_systab *tab = uv_systab;
s64 ret;
@@ -44,36 +45,42 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
* If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
* callback method, which uses efi_call() directly, with the kernel page tables:
*/
- if (unlikely(test_bit(EFI_OLD_MEMMAP, &efi.flags)))
+ if (unlikely(efi_enabled(EFI_OLD_MEMMAP)))
ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
else
ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
return ret;
}
-EXPORT_SYMBOL_GPL(uv_bios_call);
-s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
- u64 a4, u64 a5)
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
- unsigned long bios_flags;
s64 ret;
- local_irq_save(bios_flags);
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- local_irq_restore(bios_flags);
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
+ up(&__efi_uv_runtime_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(uv_bios_call);
-s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
u64 a4, u64 a5)
{
+ unsigned long bios_flags;
s64 ret;
- preempt_disable();
- ret = uv_bios_call(which, a1, a2, a3, a4, a5);
- preempt_enable();
+ if (down_interruptible(&__efi_uv_runtime_lock))
+ return BIOS_STATUS_ABORT;
+
+ local_irq_save(bios_flags);
+ ret = __uv_bios_call(which, a1, a2, a3, a4, a5);
+ local_irq_restore(bios_flags);
+
+ up(&__efi_uv_runtime_lock);
return ret;
}
@@ -188,7 +195,6 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
}
EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
-#ifdef CONFIG_EFI
void uv_bios_init(void)
{
uv_systab = NULL;
@@ -218,4 +224,3 @@ void uv_bios_init(void)
}
pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision);
}
-#endif
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index a4130b84d1ff5..2c53b0f19329a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -2010,8 +2010,7 @@ static void make_per_cpu_thp(struct bau_control *smaster)
int cpu;
size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
- smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
- memset(smaster->thp, 0, hpsz);
+ smaster->thp = kzalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
for_each_present_cpu(cpu) {
smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
@@ -2135,15 +2134,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
{
unsigned char *uvhub_mask;
- void *vp;
struct uvhub_desc *uvhub_descs;
if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub())
timeout_us = calculate_destination_timeout();
- vp = kmalloc_array(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
- uvhub_descs = (struct uvhub_desc *)vp;
- memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL);
uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 4463fa72db945..f60501a384f94 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -37,8 +37,7 @@ REALMODE_OBJS = $(addprefix $(obj)/,$(realmode-y))
sed-pasyms := -n -r -e 's/^([0-9a-fA-F]+) [ABCDGRSTVW] (.+)$$/pa_\2 = \2;/p'
quiet_cmd_pasyms = PASYMS $@
- cmd_pasyms = $(NM) $(filter-out FORCE,$^) | \
- sed $(sed-pasyms) | sort | uniq > $@
+ cmd_pasyms = $(NM) $(real-prereqs) | sed $(sed-pasyms) | sort | uniq > $@
targets += pasyms.h
$(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
@@ -47,7 +46,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE
targets += realmode.lds
$(obj)/realmode.lds: $(obj)/pasyms.h
-LDFLAGS_realmode.elf := --emit-relocs -T
+LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T
CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
targets += realmode.elf
diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S
index df8e11e26bc39..3bb980800c581 100644
--- a/arch/x86/realmode/rm/realmode.lds.S
+++ b/arch/x86/realmode/rm/realmode.lds.S
@@ -9,7 +9,7 @@
#undef i386
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_FORMAT("elf32-i386")
OUTPUT_ARCH(i386)
SECTIONS
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index f518b4744ff89..a9e80e44178c7 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -16,7 +16,7 @@ config 64BIT
config X86_32
def_bool !64BIT
- select HAVE_AOUT
+ select ARCH_32BIT_OFF_T
select ARCH_WANT_IPC_PARSE_VERSION
select MODULES_USE_ELF_REL
select CLONE_BACKWARDS
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 2f6787fc71066..c54a493e139a7 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -898,10 +898,7 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
val = native_read_msr_safe(msr, err);
switch (msr) {
case MSR_IA32_APICBASE:
-#ifdef CONFIG_X86_X2APIC
- if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
-#endif
- val &= ~X2APIC_ENABLE;
+ val &= ~X2APIC_ENABLE;
break;
}
return val;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index a7e47cf7ec6cd..6e4c6bd622033 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 0f4fe206dcc20..a21e1734fc1f0 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
__xen_set_pte(ptep, pteval);
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
- trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
+ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
return *ptep;
}
-void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
+void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
pte_t *ptep, pte_t pte)
{
struct mmu_update u;
- trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
+ trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte);
xen_mc_batch();
u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@ -2114,10 +2114,10 @@ void __init xen_relocate_p2m(void)
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ pt[idx_pte] = pfn_pte(p2m_pfn,
+ PAGE_KERNEL);
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
@@ -2125,8 +2125,7 @@ void __init xen_relocate_p2m(void)
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
+ pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
@@ -2134,7 +2133,7 @@ void __init xen_relocate_p2m(void)
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
pmd_phys += PAGE_SIZE;
}
n_pmd -= PTRS_PER_PUD;
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 055e37e43541e..95ce9b5be4112 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -181,8 +181,15 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
static void * __ref alloc_p2m_page(void)
{
- if (unlikely(!slab_is_available()))
- return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ if (unlikely(!slab_is_available())) {
+ void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, PAGE_SIZE, PAGE_SIZE);
+
+ return ptr;
+ }
return (void *)__get_free_page(GFP_KERNEL);
}
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index d5f303c0e6563..548d1e0a5ba10 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -12,6 +12,7 @@
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
+#include <linux/memory_hotplug.h>
#include <asm/elf.h>
#include <asm/vdso.h>
@@ -589,6 +590,14 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
if (type == E820_TYPE_RAM) {
start = PAGE_ALIGN(start);
end &= ~((phys_addr_t)PAGE_SIZE - 1);
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Don't allow adding memory not in E820 map while booting the
+ * system. Once the balloon driver is up it will remove that
+ * restriction again.
+ */
+ max_mem_size = end;
+#endif
}
e820__range_add(start, end - start, type);
@@ -748,6 +757,10 @@ char * __init xen_memory_setup(void)
memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
+#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_XEN_BALLOON)
+ xen_saved_max_mem_size = max_mem_size;
+#endif
+
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
XENMEM_memory_map;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 72bf446c3fee3..6e29794573b72 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -361,8 +361,6 @@ void xen_timer_resume(void)
{
int cpu;
- pvclock_resume();
-
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
@@ -379,12 +377,15 @@ static const struct pv_time_ops xen_time_ops __initconst = {
};
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+static u64 xen_clock_value_saved;
void xen_save_time_memory_area(void)
{
struct vcpu_register_time_memory_area t;
int ret;
+ xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;
+
if (!xen_clock)
return;
@@ -404,7 +405,7 @@ void xen_restore_time_memory_area(void)
int ret;
if (!xen_clock)
- return;
+ goto out;
t.addr.v = &xen_clock->pvti;
@@ -421,6 +422,11 @@ void xen_restore_time_memory_area(void)
if (ret != 0)
pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
ret);
+
+out:
+ /* Need pvclock_resume() before using xen_clocksource_read(). */
+ pvclock_resume();
+ xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
static void xen_setup_vsyscall_time_info(void)