From c662f77331c98018ed256501557b4dd67133fbd7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Feb 2018 15:16:01 +1100 Subject: KVM: PPC: Fix compile error that occurs when CONFIG_ALTIVEC=n Commit accb757d798c ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run", 2017-12-04) added a "goto out" statement and an "out:" label to kvm_arch_vcpu_ioctl_run(). Since the only "goto out" is inside a CONFIG_VSX block, compiling with CONFIG_VSX=n gives a warning that label "out" is defined but not used, and because arch/powerpc is compiled with -Werror, that becomes a compile error that makes the kernel build fail. Merge commit 1ab03c072feb ("Merge tag 'kvm-ppc-next-4.16-2' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc", 2018-02-09) added a similar block of code inside a #ifdef CONFIG_ALTIVEC, with a "goto out" statement. In order to make the build succeed, this adds a #ifdef around the "out:" label. This is a minimal, ugly fix, to be replaced later by a refactoring of the code. Since CONFIG_VSX depends on CONFIG_ALTIVEC, it is sufficient to use #ifdef CONFIG_ALTIVEC here. Fixes: accb757d798c ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run") Reported-by: Christian Zigotzky Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 403e642c78f51..0083142c2f848 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1608,7 +1608,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +#ifdef CONFIG_ALTIVEC out: +#endif vcpu_put(vcpu); return r; } -- cgit v1.2.3 From 6df3877fc962c2bb3d0438633dfd24a185af6838 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Feb 2018 15:45:21 +1100 Subject: KVM: PPC: Book3S: Fix compile error that occurs with some gcc versions Some versions of gcc generate a warning that the variable "emulated" may be used uninitialized in function kvmppc_handle_load128_by2x64(). It would be used uninitialized if kvmppc_handle_load128_by2x64 was ever called with vcpu->arch.mmio_vmx_copy_nums == 0, but neither of the callers ever do that, so there is no actual bug. When gcc generates a warning, it causes the build to fail because arch/powerpc is compiled with -Werror. This silences the warning by initializing "emulated" to EMULATE_DONE. 
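For readers without the tree at hand, a minimal standalone sketch of the pattern gcc warns about (hypothetical names, not the actual KVM code): if the loop body can execute zero times, the result variable reaches the return statement uninitialized, and initializing it up front silences the warning without changing behavior for any real caller.

/* pattern.c - hypothetical reduction of the "may be used
 * uninitialized" warning; build with gcc -Wall -O2 and remove the
 * "= EMULATE_DONE" initializer to reproduce it. */
#include <stdio.h>

enum emulation_result { EMULATE_DONE, EMULATE_FAIL };

static enum emulation_result handle_all(unsigned int copy_nums)
{
	enum emulation_result emulated = EMULATE_DONE;	/* the fix */

	while (copy_nums--)	/* body never runs when copy_nums == 0 */
		emulated = EMULATE_FAIL;

	return emulated;	/* uninitialized without the initializer */
}

int main(void)
{
	printf("%d\n", handle_all(0));
	return 0;
}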
Fixes: 09f984961c13 ("KVM: PPC: Book3S: Add MMIO emulation for VMX instructions") Reported-by: Michael Ellerman Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0083142c2f848..52c2053739862 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1345,7 +1345,7 @@ static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, int kvmppc_handle_load128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian) { - enum emulation_result emulated; + enum emulation_result emulated = EMULATE_DONE; while (vcpu->arch.mmio_vmx_copy_nums) { emulated = __kvmppc_handle_load(run, vcpu, rt, 8, -- cgit v1.2.3 From 2cb370d615e9fbed9e95ed222c2c8f337181aa90 Mon Sep 17 00:00:00 2001 From: Eugeniu Rosca Date: Sun, 18 Feb 2018 00:10:29 +0100 Subject: s390: Replace IS_ENABLED(EXPOLINE_*) with IS_ENABLED(CONFIG_EXPOLINE_*) I've accidentally stumbled upon the IS_ENABLED(EXPOLINE_*) lines, which obviously always evaluate to false. Fix this. Fixes: f19fbd5ed642 ("s390: introduce execute-trampolines for branches") Signed-off-by: Eugeniu Rosca Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/nospec-branch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 69d7fcf481588..9aff72d3abda3 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -2,8 +2,8 @@ #include #include -int nospec_call_disable = IS_ENABLED(EXPOLINE_OFF); -int nospec_return_disable = !IS_ENABLED(EXPOLINE_FULL); +int nospec_call_disable = IS_ENABLED(CONFIG_EXPOLINE_OFF); +int nospec_return_disable = !IS_ENABLED(CONFIG_EXPOLINE_FULL); static int __init nospectre_v2_setup_early(char *str) { -- cgit v1.2.3 From dc24b7b49a53c7ee5502c877b133558acec0b3f8 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Mon, 19 Feb 2018 11:27:09 +0100 Subject: s390/clean-up: use CFI_* macros in entry.S Commit f19fbd5ed642 ("s390: introduce execute-trampolines for branches") introduces .cfi_* assembler directives. Instead of using the directives directly, use the macros from asm/dwarf.h. This also ensures that the dwarf debug information is created in the .debug_frame section. Fixes: f19fbd5ed642 ("s390: introduce execute-trampolines for branches") Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 13a133a6015c9..9ec728fa832c9 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -14,6 +14,7 @@ #include #include #include +#include <asm/dwarf.h> #include #include #include @@ -230,7 +231,7 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) .hidden \name .type \name,@function \name: - .cfi_startproc + CFI_STARTPROC #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES exrl 0,0f #else @@ -239,7 +240,7 @@ _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) #endif j .
0: br \reg - .cfi_endproc + CFI_ENDPROC .endm GEN_BR_THUNK __s390x_indirect_jump_r1use_r9,%r9,%r1 -- cgit v1.2.3 From d5feec04fe578c8dbd9e2e1439afc2f0af761ed4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 22 Feb 2018 13:42:29 +0100 Subject: s390: do not bypass BPENTER for interrupt system calls The system call path can be interrupted before the switch back to the standard branch prediction with BPENTER has been done. The critical section cleanup code skips forward to .Lsysc_do_svc and bypasses the BPENTER. In this case the kernel and all subsequent code will run with the limited branch prediction. Fixes: eacf67eb9b32 ("s390: run user space and KVM guests with modified branch prediction") Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 9ec728fa832c9..73492461c4549 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -1440,6 +1440,7 @@ cleanup_critical: stg %r15,__LC_SYSTEM_TIMER 0: # update accounting time stamp mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP # set up saved register r11 lg %r15,__LC_KERNEL_STACK la %r9,STACK_FRAME_OVERHEAD(%r15) -- cgit v1.2.3 From 1b22b4b28fd5fbc51855219e3238b3ab81da8466 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Feb 2018 17:50:12 +0000 Subject: MIPS: ath25: Check for kzalloc allocation failure Currently there is no null check on a failed allocation of board_data, and hence a null pointer dereference will occur. Fix this by adding a check for the out-of-memory null pointer. Fixes: a7473717483e ("MIPS: ath25: add board configuration detection") Signed-off-by: Colin Ian King Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: # 3.19+ Patchwork: https://patchwork.linux-mips.org/patch/18657/ Signed-off-by: James Hogan --- arch/mips/ath25/board.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/ath25/board.c b/arch/mips/ath25/board.c index 9ab48ff80c1c8..6d11ae581ea77 100644 --- a/arch/mips/ath25/board.c +++ b/arch/mips/ath25/board.c @@ -135,6 +135,8 @@ int __init ath25_find_config(phys_addr_t base, unsigned long size) } board_data = kzalloc(BOARD_CONFIG_BUFSZ, GFP_KERNEL); + if (!board_data) + goto error; ath25_board.config = (struct ath25_boarddata *)board_data; memcpy_fromio(board_data, bcfg, 0x100); if (broken_boarddata) { -- cgit v1.2.3 From 902f4d067a50ccf645a58dd5fb1d113b6e0f9b5b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Feb 2018 18:08:53 +0000 Subject: MIPS: OCTEON: irq: Check for null return on kzalloc allocation The allocation of host_data is not null checked, leading to a null pointer dereference if the allocation fails. Fix this by adding a null check and returning -ENOMEM. Fixes: 64b139f97c01 ("MIPS: OCTEON: irq: add CIB and other fixes") Signed-off-by: Colin Ian King Acked-by: David Daney Cc: Ralf Baechle Cc: "Steven J.
Hill" Cc: linux-mips@linux-mips.org Cc: # 4.0+ Patchwork: https://patchwork.linux-mips.org/patch/18658/ Signed-off-by: James Hogan --- arch/mips/cavium-octeon/octeon-irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 5b3a3f6a9ad31..d99f5242169e7 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -2277,6 +2277,8 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, } host_data = kzalloc(sizeof(*host_data), GFP_KERNEL); + if (!host_data) + return -ENOMEM; raw_spin_lock_init(&host_data->lock); addr = of_get_address(ciu_node, 0, NULL, NULL); -- cgit v1.2.3 From 753e8abc36b2c966caea075db0c845563c8a19bf Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 23 Feb 2018 18:04:48 +0000 Subject: arm64: mm: fix thinko in non-global page table attribute check The routine pgattr_change_is_safe() was extended in commit 4e6020565596 ("arm64: mm: Permit transitioning from Global to Non-Global without BBM") to permit changing the nG attribute from not set to set, but did so in a way that inadvertently disallows such changes if other permitted attribute changes take place at the same time. So update the code to take this into account. Fixes: 4e6020565596 ("arm64: mm: Permit transitioning from Global to ...") Cc: # 4.14.x- Acked-by: Mark Rutland Reviewed-by: Marc Zyngier Acked-by: Will Deacon Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 84a019f550229..8c704f1e53c22 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -108,7 +108,7 @@ static bool pgattr_change_is_safe(u64 old, u64 new) * The following mapping attributes may be updated in live * kernel mappings without the need for break-before-make. */ - static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; + static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; /* creating or taking down mappings is always safe */ if (old == 0 || new == 0) @@ -118,9 +118,9 @@ static bool pgattr_change_is_safe(u64 old, u64 new) if ((old | new) & PTE_CONT) return false; - /* Transitioning from Global to Non-Global is safe */ - if (((old ^ new) == PTE_NG) && (new & PTE_NG)) - return true; + /* Transitioning from Non-Global to Global is unsafe */ + if (old & ~new & PTE_NG) + return false; return ((old ^ new) & ~mask) == 0; } -- cgit v1.2.3 From 13a55372b64e00e564a08d785ca87bd9d454ba30 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 26 Feb 2018 13:41:47 -0500 Subject: ARM: orion5x: Revert commit 4904dbda41c8. It is not valid for orion5x to use mac_pton(). First of all, the orion5x buffer is not NULL terminated. mac_pton() has no business operating on non-NULL terminated buffers because only the caller can know that this is valid and in what manner it is ok to parse this NULL'less buffer. Second of all, orion5x operates on an __iomem pointer, which cannot be dereferenced using normal C pointer operations. Accesses to such areas much be performed with the proper iomem accessors. Fixes: 4904dbda41c8 ("ARM: orion5x: use mac_pton() helper") Signed-off-by: David S. 
Miller --- arch/arm/mach-orion5x/Kconfig | 3 -- arch/arm/mach-orion5x/dns323-setup.c | 53 ++++++++++++++++++++++++++++++++++-- arch/arm/mach-orion5x/tsx09-common.c | 49 ++++++++++++++++++++++++++++++--- 3 files changed, 95 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-orion5x/Kconfig b/arch/arm/mach-orion5x/Kconfig index 2a7bb6ccdcb7e..a810f4dd34b1e 100644 --- a/arch/arm/mach-orion5x/Kconfig +++ b/arch/arm/mach-orion5x/Kconfig @@ -58,7 +58,6 @@ config MACH_KUROBOX_PRO config MACH_DNS323 bool "D-Link DNS-323" - select GENERIC_NET_UTILS select I2C_BOARDINFO if I2C help Say 'Y' here if you want your kernel to support the @@ -66,7 +65,6 @@ config MACH_DNS323 config MACH_TS209 bool "QNAP TS-109/TS-209" - select GENERIC_NET_UTILS help Say 'Y' here if you want your kernel to support the QNAP TS-109/TS-209 platform. @@ -101,7 +99,6 @@ config MACH_LINKSTATION_LS_HGL config MACH_TS409 bool "QNAP TS-409" - select GENERIC_NET_UTILS help Say 'Y' here if you want your kernel to support the QNAP TS-409 platform. diff --git a/arch/arm/mach-orion5x/dns323-setup.c b/arch/arm/mach-orion5x/dns323-setup.c index cd483bfb5ca82..d13344b2ddcd4 100644 --- a/arch/arm/mach-orion5x/dns323-setup.c +++ b/arch/arm/mach-orion5x/dns323-setup.c @@ -173,10 +173,42 @@ static struct mv643xx_eth_platform_data dns323_eth_data = { .phy_addr = MV643XX_ETH_PHY_ADDR(8), }; +/* dns323_parse_hex_*() taken from tsx09-common.c; should a common copy of these + * functions be kept somewhere? + */ +static int __init dns323_parse_hex_nibble(char n) +{ + if (n >= '0' && n <= '9') + return n - '0'; + + if (n >= 'A' && n <= 'F') + return n - 'A' + 10; + + if (n >= 'a' && n <= 'f') + return n - 'a' + 10; + + return -1; +} + +static int __init dns323_parse_hex_byte(const char *b) +{ + int hi; + int lo; + + hi = dns323_parse_hex_nibble(b[0]); + lo = dns323_parse_hex_nibble(b[1]); + + if (hi < 0 || lo < 0) + return -1; + + return (hi << 4) | lo; +} + static int __init dns323_read_mac_addr(void) { u_int8_t addr[6]; - void __iomem *mac_page; + int i; + char *mac_page; /* MAC address is stored as a regular ol' string in /dev/mtdblock4 * (0x007d0000-0x00800000) starting at offset 196480 (0x2ff80). 
@@ -185,8 +217,23 @@ static int __init dns323_read_mac_addr(void) if (!mac_page) return -ENOMEM; - if (!mac_pton((__force const char *) mac_page, addr)) - goto error_fail; + /* Sanity check the string we're looking at */ + for (i = 0; i < 5; i++) { + if (*(mac_page + (i * 3) + 2) != ':') { + goto error_fail; + } + } + + for (i = 0; i < 6; i++) { + int byte; + + byte = dns323_parse_hex_byte(mac_page + (i * 3)); + if (byte < 0) { + goto error_fail; + } + + addr[i] = byte; + } iounmap(mac_page); printk("DNS-323: Found ethernet MAC address: %pM\n", addr); diff --git a/arch/arm/mach-orion5x/tsx09-common.c b/arch/arm/mach-orion5x/tsx09-common.c index 89774985d3803..905d4f2dd0b82 100644 --- a/arch/arm/mach-orion5x/tsx09-common.c +++ b/arch/arm/mach-orion5x/tsx09-common.c @@ -53,12 +53,53 @@ struct mv643xx_eth_platform_data qnap_tsx09_eth_data = { .phy_addr = MV643XX_ETH_PHY_ADDR(8), }; +static int __init qnap_tsx09_parse_hex_nibble(char n) +{ + if (n >= '0' && n <= '9') + return n - '0'; + + if (n >= 'A' && n <= 'F') + return n - 'A' + 10; + + if (n >= 'a' && n <= 'f') + return n - 'a' + 10; + + return -1; +} + +static int __init qnap_tsx09_parse_hex_byte(const char *b) +{ + int hi; + int lo; + + hi = qnap_tsx09_parse_hex_nibble(b[0]); + lo = qnap_tsx09_parse_hex_nibble(b[1]); + + if (hi < 0 || lo < 0) + return -1; + + return (hi << 4) | lo; +} + static int __init qnap_tsx09_check_mac_addr(const char *addr_str) { u_int8_t addr[6]; + int i; - if (!mac_pton(addr_str, addr)) - return -1; + for (i = 0; i < 6; i++) { + int byte; + + /* + * Enforce "xx:xx:xx:xx:xx:xx\n" format. + */ + if (addr_str[(i * 3) + 2] != ((i < 5) ? ':' : '\n')) + return -1; + + byte = qnap_tsx09_parse_hex_byte(addr_str + (i * 3)); + if (byte < 0) + return -1; + addr[i] = byte; + } printk(KERN_INFO "tsx09: found ethernet mac address %pM\n", addr); @@ -77,12 +118,12 @@ void __init qnap_tsx09_find_mac_addr(u32 mem_base, u32 size) unsigned long addr; for (addr = mem_base; addr < (mem_base + size); addr += 1024) { - void __iomem *nor_page; + char *nor_page; int ret = 0; nor_page = ioremap(addr, 1024); if (nor_page != NULL) { - ret = qnap_tsx09_check_mac_addr((__force const char *)nor_page); + ret = qnap_tsx09_check_mac_addr(nor_page); iounmap(nor_page); } -- cgit v1.2.3 From d269176e766c71c998cb75b4ea8cbc321cc0019d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 26 Feb 2018 22:00:47 +0100 Subject: bpf, ppc64: fix out of bounds access in tail call While working on 16338a9b3ac3 ("bpf, arm64: fix out of bounds access in tail call") I noticed that the ppc64 JIT is partially affected as well. While the bounds checking is correctly performed as an unsigned comparison, the register holding the index value, however, is never truncated into 32 bit space, so e.g. an index value of 0x100000000ULL with a map of 1 element would pass with PPC_CMPLW() whereas we later on continue with the full 64 bit register value. Therefore, as we do in the interpreter and other JITs, truncate the value to 32 bit initially in order to fix the access. Fixes: ce0761419fae ("powerpc/bpf: Implement support for tail calls") Signed-off-by: Daniel Borkmann Reviewed-by: Naveen N. Rao Tested-by: Naveen N.
Rao Signed-off-by: Alexei Starovoitov --- arch/powerpc/net/bpf_jit_comp64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0a34b0cec7b7c..0ef3d9580e98c 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -240,6 +240,7 @@ static void bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 * goto out; */ PPC_LWZ(b2p[TMP_REG_1], b2p_bpf_array, offsetof(struct bpf_array, map.max_entries)); + PPC_RLWINM(b2p_index, b2p_index, 0, 0, 31); PPC_CMPLW(b2p_index, b2p[TMP_REG_1]); PPC_BCC(COND_GE, out); -- cgit v1.2.3 From 64c3f648c25d108f346fdc96c15180c6b7d250e9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 23 Feb 2018 12:55:59 -0800 Subject: powerpc/boot: Fix random libfdt related build errors Once in a while I see build errors similar to the following when building images from a clean tree. Building powerpc:virtex-ml507:44x/virtex5_defconfig ... failed ------------ Error log: arch/powerpc/boot/treeboot-akebono.c:37:20: fatal error: libfdt.h: No such file or directory Building powerpc:bamboo:smpdev:44x/bamboo_defconfig ... failed ------------ Error log: arch/powerpc/boot/treeboot-akebono.c:37:20: fatal error: libfdt.h: No such file or directory arch/powerpc/boot/treeboot-currituck.c:35:20: fatal error: libfdt.h: No such file or directory Rebuilds will succeed. Turns out that several source files in arch/powerpc/boot/ include libfdt.h, but Makefile dependencies are incomplete. Let's fix that. Signed-off-by: Guenter Roeck Signed-off-by: Michael Ellerman --- arch/powerpc/boot/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index ef6549e571571..26d5d2a5b8e99 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -101,7 +101,8 @@ $(addprefix $(obj)/,$(zlib-y)): \ libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h -$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ +$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o \ + treeboot-akebono.o treeboot-currituck.o treeboot-iss4xx.o): \ $(addprefix $(obj)/,$(libfdtheader)) src-wlib-y := string.S crt0.S stdio.c decompress.c main.c \ -- cgit v1.2.3 From 09a0fb67536a49af19f2bfc632100e9de91fe526 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 28 Feb 2018 18:44:34 +0000 Subject: KVM: s390: provide io interrupt kvm_stat We already count io interrupts, but we forgot to print them. 
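As background, a sketch of how such a debugfs stats table pairs names with counters (hypothetical struct layout, reduced from the real kvm code, so treat the field names as assumptions): a counter that is incremented but has no table entry is maintained internally yet never shows up in debugfs, which is exactly the bug here.

/* stats_sketch.c - hypothetical reduction of the debugfs stats table:
 * each entry maps a debugfs file name to a counter's offset in the
 * stat struct; a counter missing from the table is counted but never
 * printed. */
#include <stddef.h>

struct vcpu_stat {
	unsigned long deliver_program_int;
	unsigned long deliver_io_int;	/* incremented, but was not listed */
};

struct stats_debugfs_item {
	const char *name;
	size_t offset;
};

#define VCPU_STAT(x) offsetof(struct vcpu_stat, x)

static const struct stats_debugfs_item debugfs_entries[] = {
	{ "deliver_program_interruption", VCPU_STAT(deliver_program_int) },
	{ "deliver_io_interrupt", VCPU_STAT(deliver_io_int) },	/* the fix */
	{ NULL, 0 },
};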
Signed-off-by: Christian Borntraeger Fixes: d8346b7d9b ("KVM: s390: Support for I/O interrupts.") Reviewed-by: Cornelia Huck Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 77d7818130db4..df19f158347e0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "deliver_prefix_signal", VCPU_STAT(deliver_prefix_signal) }, { "deliver_restart_signal", VCPU_STAT(deliver_restart_signal) }, { "deliver_program_interruption", VCPU_STAT(deliver_program_int) }, + { "deliver_io_interrupt", VCPU_STAT(deliver_io_int) }, { "exit_wait_state", VCPU_STAT(exit_wait_state) }, { "instruction_epsw", VCPU_STAT(instruction_epsw) }, { "instruction_gs", VCPU_STAT(instruction_gs) }, -- cgit v1.2.3 From c3856aeb29402e94ad9b3879030165cc6a4fdc56 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 23 Feb 2018 21:21:12 +1100 Subject: KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler This fixes several bugs in the radix page fault handler relating to the way large pages in the memory backing the guest were handled. First, the check for large pages only checked for explicit huge pages and missed transparent huge pages. Then the check that the addresses (host virtual vs. guest physical) had appropriate alignment was wrong, meaning that the code never put a large page in the partition scoped radix tree; it was always demoted to a small page. Fixing this exposed bugs in kvmppc_create_pte(). We were never invalidating a 2MB PTE, which meant that if a page was initially faulted in without write permission and the guest then attempted to store to it, we would never update the PTE to have write permission. If we find a valid 2MB PTE in the PMD, we need to clear it and do a TLB invalidation before installing either the new 2MB PTE or a pointer to a page table page. This also corrects an assumption that get_user_pages_fast would set the _PAGE_DIRTY bit if we are writing, which is not true. Instead we mark the page dirty explicitly with set_page_dirty_lock(). This also means we don't need the dirty bit set on the host PTE when providing write access on a read fault. 
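To see why the old alignment test could never succeed, here is a standalone demonstration with hypothetical addresses (the macros mirror the kernel's definitions; the address values are made up): masking with PMD_MASK compares the addresses themselves above 2MB, which differ for essentially any gpa/hva pair, while the fixed test only requires that gpa and hva be congruent modulo the 2MB page size.

/* align_demo.c - why "(gpa & PMD_MASK & PAGE_MASK) == (hva & ...)"
 * never allowed a 2MB mapping, while the fixed congruence test does. */
#include <stdio.h>

#define PAGE_SIZE 0x1000UL
#define PMD_SIZE  0x200000UL
#define PMD_MASK  (~(PMD_SIZE - 1))

int main(void)
{
	unsigned long gpa = 0x40000000UL;     /* hypothetical guest physical */
	unsigned long hva = 0x7f3a00000000UL; /* hypothetical host virtual  */

	/* old check: equal only if the addresses match above 2MB */
	printf("old: %d\n", (gpa & PMD_MASK) == (hva & PMD_MASK));

	/* new check: equal whenever both are 2MB-congruent (here: aligned) */
	printf("new: %d\n", (gpa & (PMD_SIZE - PAGE_SIZE)) ==
			    (hva & (PMD_SIZE - PAGE_SIZE)));
	return 0;
}

Run anywhere, this prints old: 0 but new: 1, matching the commit's observation that large pages were always demoted to small pages before the fix.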
Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 69 +++++++++++++++++++++------------- 1 file changed, 43 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 0c854816e653e..5cb4e4687107e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -195,6 +195,12 @@ static void kvmppc_pte_free(pte_t *ptep) kmem_cache_free(kvm_pte_cache, ptep); } +/* Like pmd_huge() and pmd_large(), but works regardless of config options */ +static inline int pmd_is_leaf(pmd_t pmd) +{ + return !!(pmd_val(pmd) & _PAGE_PTE); +} + static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, unsigned int level, unsigned long mmu_seq) { @@ -219,7 +225,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, else new_pmd = pmd_alloc_one(kvm->mm, gpa); - if (level == 0 && !(pmd && pmd_present(*pmd))) + if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd))) new_ptep = kvmppc_pte_alloc(); /* Check if we might have been invalidated; let the guest retry if so */ @@ -244,12 +250,30 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, new_pmd = NULL; } pmd = pmd_offset(pud, gpa); - if (pmd_large(*pmd)) { - /* Someone else has instantiated a large page here; retry */ - ret = -EAGAIN; - goto out_unlock; - } - if (level == 1 && !pmd_none(*pmd)) { + if (pmd_is_leaf(*pmd)) { + unsigned long lgpa = gpa & PMD_MASK; + + /* + * If we raced with another CPU which has just put + * a 2MB pte in after we saw a pte page, try again. + */ + if (level == 0 && !new_ptep) { + ret = -EAGAIN; + goto out_unlock; + } + /* Valid 2MB page here already, remove it */ + old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd), + ~0UL, 0, lgpa, PMD_SHIFT); + kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT); + if (old & _PAGE_DIRTY) { + unsigned long gfn = lgpa >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) + kvmppc_update_dirty_map(memslot, + gfn, PMD_SIZE); + } + } else if (level == 1 && !pmd_none(*pmd)) { /* * There's a page table page here, but we wanted * to install a large page. Tell the caller and let @@ -412,28 +436,24 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } else { page = pages[0]; pfn = page_to_pfn(page); - if (PageHuge(page)) { - page = compound_head(page); - pte_size <<= compound_order(page); + if (PageCompound(page)) { + pte_size <<= compound_order(compound_head(page)); /* See if we can insert a 2MB large-page PTE here */ if (pte_size >= PMD_SIZE && - (gpa & PMD_MASK & PAGE_MASK) == - (hva & PMD_MASK & PAGE_MASK)) { + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { level = 1; pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); } } /* See if we can provide write access */ if (writing) { - /* - * We assume gup_fast has set dirty on the host PTE. 
- */ pgflags |= _PAGE_WRITE; } else { local_irq_save(flags); ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); - if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + if (ptep && pte_write(*ptep)) pgflags |= _PAGE_WRITE; local_irq_restore(flags); } @@ -459,18 +479,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pte = pfn_pte(pfn, __pgprot(pgflags)); ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); } - if (ret == 0 || ret == -EAGAIN) - ret = RESUME_GUEST; if (page) { - /* - * We drop pages[0] here, not page because page might - * have been set to the head page of a compound, but - * we have to drop the reference on the correct tail - * page to match the get inside gup() - */ - put_page(pages[0]); + if (!ret && (pgflags & _PAGE_WRITE)) + set_page_dirty_lock(page); + put_page(page); } + + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; return ret; } @@ -644,7 +661,7 @@ void kvmppc_free_radix(struct kvm *kvm) continue; pmd = pmd_offset(pud, 0); for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { - if (pmd_huge(*pmd)) { + if (pmd_is_leaf(*pmd)) { pmd_clear(pmd); continue; } -- cgit v1.2.3 From debd574f4195e205ba505b25e19b2b797f4bcd94 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 2 Mar 2018 15:38:04 +1100 Subject: KVM: PPC: Book3S HV: Fix VRMA initialization with 2MB or 1GB memory backing The current code for initializing the VRMA (virtual real memory area) for HPT guests requires the page size of the backing memory to be one of 4kB, 64kB or 16MB. With a radix host we have the possibility that the backing memory page size can be 2MB or 1GB. In these cases, if the guest switches to HPT mode, KVM will not initialize the VRMA and the guest will fail to run. In fact it is not necessary that the VRMA page size is the same as the backing memory page size; any VRMA page size less than or equal to the backing memory page size is acceptable. Therefore we now choose the largest page size out of the set {4k, 64k, 16M} which is not larger than the backing memory page size. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 89707354c2efd..b4a538b29da55 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3656,15 +3656,17 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto up_out; psize = vma_kernel_pagesize(vma); - porder = __ilog2(psize); up_read(¤t->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ - err = -EINVAL; - if (!(psize == 0x1000 || psize == 0x10000 || - psize == 0x1000000)) - goto out_srcu; + if (psize >= 0x1000000) + psize = 0x1000000; + else if (psize >= 0x10000) + psize = 0x10000; + else + psize = 0x1000; + porder = __ilog2(psize); senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | -- cgit v1.2.3 From 61e18270f604c744ed9f8f1b740022516f9726f8 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 1 Mar 2018 14:40:52 -0800 Subject: s390: Fix runtime warning about negative pgtables_bytes When running s390 images with 'compat' processes, the following BUG is seen repeatedly. BUG: non-zero pgtables_bytes on freeing mm: -16384 Bisect points to commit b4e98d9ac775 ("mm: account pud page tables"). Analysis shows that init_new_context() is called with mm->context.asce_limit set to _REGION3_SIZE. In this situation, pgtables_bytes remains set to 0 and is not increased. 
The message is displayed when the affected process dies and mm_dec_nr_puds() is called. Cc: Kirill A. Shutemov Cc: Heiko Carstens Fixes: b4e98d9ac775 ("mm: account pud page tables") Signed-off-by: Guenter Roeck Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu_context.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 65154eaa3714a..6c8ce15cde7b3 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -63,6 +63,7 @@ static inline int init_new_context(struct task_struct *tsk, _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); + mm_inc_nr_puds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); return 0; -- cgit v1.2.3 From 61bd0f66ff92d5ce765ff9850fd3cbfec773c560 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Fri, 2 Mar 2018 11:51:56 +0100 Subject: KVM: PPC: Book3S HV: Fix guest time accounting with VIRT_CPU_ACCOUNTING_GEN Since commit 8b24e69fc47e ("KVM: PPC: Book3S HV: Close race with testing for signals on guest entry"), if CONFIG_VIRT_CPU_ACCOUNTING_GEN is set, the guest time is not accounted to guest time and user time, but instead to system time. This is because guest_enter()/guest_exit() are called while interrupts are disabled and the tick counter cannot be updated between them. To fix that, move guest_exit() after local_irq_enable(), and as guest_enter() is called with IRQ disabled, call guest_enter_irqoff() instead. Fixes: 8b24e69fc47e ("KVM: PPC: Book3S HV: Close race with testing for signals on guest entry") Signed-off-by: Laurent Vivier Reviewed-by: Paolo Bonzini Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b4a538b29da55..9cb9448163c4b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2885,7 +2885,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ trace_hardirqs_on(); - guest_enter(); + guest_enter_irqoff(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2893,8 +2893,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) srcu_read_unlock(&vc->kvm->srcu, srcu_idx); - guest_exit(); - trace_hardirqs_off(); set_irq_happened(trap); @@ -2937,6 +2935,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_set_host_core(pcpu); local_irq_enable(); + guest_exit(); /* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) { -- cgit v1.2.3 From 317660940fd9dddd3201c2f92e25c27902c753fa Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 2 Mar 2018 07:22:30 -0800 Subject: perf/x86/intel/uncore: Fix Skylake UPI event format There is no event extension (bit 21) for SKX UPI, so use 'event' instead of 'event_ext'. 
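For context, the two format attributes differ only in whether config bit 21 is exposed to userspace; a sketch of their definitions, paraphrased from uncore_snbep.c (the exact bit ranges here are an assumption, not a verified quote):

/* 'event_ext' advertises config bit 21 in addition to the event
 * selector bits; SKX UPI has no such extension bit, so exposing
 * 'event_ext' would invite perf to set a reserved bit. */
DEFINE_UNCORE_FORMAT_ATTR(event,     event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");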
Reported-by: Stephane Eranian Signed-off-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Link: http://lkml.kernel.org/r/1520004150-4855-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 6d8044ab10607..22ec65bc033a9 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3606,7 +3606,7 @@ static struct intel_uncore_type skx_uncore_imc = { }; static struct attribute *skx_upi_uncore_formats_attr[] = { - &format_attr_event_ext.attr, + &format_attr_event.attr, &format_attr_umask_ext.attr, &format_attr_edge.attr, &format_attr_inv.attr, -- cgit v1.2.3 From 6cfc70c4321bde35cb132831cba4685821e65065 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 1 Mar 2018 10:37:41 +0800 Subject: MIPS: Loongson64: Select ARCH_MIGHT_HAVE_PC_PARPORT Commit a211a0820d3c ("MIPS: Push ARCH_MIGHT_HAVE_PC_PARPORT down to platform level") moves the global MIPS ARCH_MIGHT_HAVE_PC_PARPORT select down to various platforms, but doesn't add it to Loongson64 platforms which need it, so add the selects to these platforms too. Fixes: a211a0820d3c ("MIPS: Push ARCH_MIGHT_HAVE_PC_PARPORT down to platform level") Signed-off-by: Huacai Chen Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/18703/ Signed-off-by: James Hogan --- arch/mips/loongson64/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig index bc2fdbfa8223c..12812a8b640cf 100644 --- a/arch/mips/loongson64/Kconfig +++ b/arch/mips/loongson64/Kconfig @@ -7,6 +7,7 @@ choice config LEMOTE_FULOONG2E bool "Lemote Fuloong(2e) mini-PC" select ARCH_SPARSEMEM_ENABLE + select ARCH_MIGHT_HAVE_PC_PARPORT select CEVT_R4K select CSRC_R4K select SYS_HAS_CPU_LOONGSON2E @@ -33,6 +34,7 @@ config LEMOTE_FULOONG2E config LEMOTE_MACH2F bool "Lemote Loongson 2F family machines" select ARCH_SPARSEMEM_ENABLE + select ARCH_MIGHT_HAVE_PC_PARPORT select BOARD_SCACHE select BOOT_ELF32 select CEVT_R4K if ! MIPS_EXTERNAL_TIMER @@ -62,6 +64,7 @@ config LEMOTE_MACH2F config LOONGSON_MACH3X bool "Generic Loongson 3 family machines" select ARCH_SPARSEMEM_ENABLE + select ARCH_MIGHT_HAVE_PC_PARPORT select GENERIC_ISA_DMA_SUPPORT_BROKEN select BOOT_ELF32 select BOARD_SCACHE -- cgit v1.2.3 From ee2515d95f9a12e04a3863916ae45831438210ce Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Thu, 1 Mar 2018 10:37:42 +0800 Subject: MIPS: Loongson64: Select ARCH_MIGHT_HAVE_PC_SERIO Commit 7a407aa5e0d3 ("MIPS: Push ARCH_MIGHT_HAVE_PC_SERIO down to platform level") moves the global MIPS ARCH_MIGHT_HAVE_PC_SERIO select down to various platforms, but doesn't add it to Loongson64 platforms which need it, so add the selects to these platforms too. 
Fixes: 7a407aa5e0d3 ("MIPS: Push ARCH_MIGHT_HAVE_PC_SERIO down to platform level") Signed-off-by: Huacai Chen Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/18704/ Signed-off-by: James Hogan --- arch/mips/loongson64/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig index 12812a8b640cf..72af0c1839698 100644 --- a/arch/mips/loongson64/Kconfig +++ b/arch/mips/loongson64/Kconfig @@ -8,6 +8,7 @@ config LEMOTE_FULOONG2E bool "Lemote Fuloong(2e) mini-PC" select ARCH_SPARSEMEM_ENABLE select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_MIGHT_HAVE_PC_SERIO select CEVT_R4K select CSRC_R4K select SYS_HAS_CPU_LOONGSON2E @@ -35,6 +36,7 @@ config LEMOTE_MACH2F bool "Lemote Loongson 2F family machines" select ARCH_SPARSEMEM_ENABLE select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_MIGHT_HAVE_PC_SERIO select BOARD_SCACHE select BOOT_ELF32 select CEVT_R4K if ! MIPS_EXTERNAL_TIMER @@ -65,6 +67,7 @@ config LOONGSON_MACH3X bool "Generic Loongson 3 family machines" select ARCH_SPARSEMEM_ENABLE select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_MIGHT_HAVE_PC_SERIO select GENERIC_ISA_DMA_SUPPORT_BROKEN select BOOT_ELF32 select BOARD_SCACHE -- cgit v1.2.3 From bd5edbe677948d0883f59d9625c444818d5284b1 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 14 Feb 2018 12:19:06 +0000 Subject: ia64: convert unwcheck.py to python3 Since my system uses python3 as default, arch/ia64/scripts/unwcheck.py no longer runs. This patch converts it to python3 syntax. I have run it with python2/python3 while printing the values of start/end/rlen_sum, which could be impacted by this change, and I see no difference. Fixes: 94a47083522e ("scripts: change scripts to use system python instead of env") Signed-off-by: Corentin Labbe Signed-off-by: Tony Luck --- arch/ia64/scripts/unwcheck.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/ia64/scripts/unwcheck.py b/arch/ia64/scripts/unwcheck.py index 89f3a1480a637..c55276e31b6b6 100644 --- a/arch/ia64/scripts/unwcheck.py +++ b/arch/ia64/scripts/unwcheck.py @@ -16,7 +16,7 @@ import re import sys if len(sys.argv) != 2: - print "Usage: %s FILE" % sys.argv[0] + print("Usage: %s FILE" % sys.argv[0]) sys.exit(2) readelf = os.getenv("READELF", "readelf") @@ -29,7 +29,7 @@ def check_func (func, slots, rlen_sum): global num_errors num_errors += 1 if not func: func = "[%#x-%#x]" % (start, end) - print "ERROR: %s: %lu slots, total region length = %lu" % (func, slots, rlen_sum) + print("ERROR: %s: %lu slots, total region length = %lu" % (func, slots, rlen_sum)) return num_funcs = 0 @@ -43,23 +43,23 @@ for line in os.popen("%s -u %s" % (readelf, sys.argv[1])): check_func(func, slots, rlen_sum) func = m.group(1) - start = long(m.group(2), 16) - end = long(m.group(3), 16) + start = int(m.group(2), 16) + end = int(m.group(3), 16) slots = 3 * (end - start) / 16 - rlen_sum = 0L + rlen_sum = 0 num_funcs += 1 else: m = rlen_pattern.match(line) if m: - rlen_sum += long(m.group(1)) + rlen_sum += int(m.group(1)) check_func(func, slots, rlen_sum) if num_errors == 0: - print "No errors detected in %u functions." % num_funcs + print("No errors detected in %u functions." % num_funcs) else: if num_errors > 1: err="errors" else: err="error" - print "%u %s detected in %u functions." % (num_errors, err, num_funcs) + print("%u %s detected in %u functions."
% (num_errors, err, num_funcs)) sys.exit(1) -- cgit v1.2.3 From 2879b65f9de8b0f159b87ab57fea03096902ce41 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 19 Feb 2018 09:41:26 -0800 Subject: ia64: Convert remaining atomic operations While we've only seen inlining problems with atomic_sub_return(), the other atomic operations could have the same problem. Convert all remaining operations to use the same solution as atomic_sub_return(). Signed-off-by: Matthew Wilcox Signed-off-by: Tony Luck --- arch/ia64/include/asm/atomic.h | 69 ++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h index 762eeb0fcc1dc..2524fb60fbc28 100644 --- a/arch/ia64/include/asm/atomic.h +++ b/arch/ia64/include/asm/atomic.h @@ -66,38 +66,35 @@ ATOMIC_OPS(add, +) ATOMIC_OPS(sub, -) #ifdef __OPTIMIZE__ -#define __ia64_atomic_const(i) __builtin_constant_p(i) ? \ +#define __ia64_atomic_const(i) \ + static const int __ia64_atomic_p = __builtin_constant_p(i) ? \ ((i) == 1 || (i) == 4 || (i) == 8 || (i) == 16 || \ - (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0 + (i) == -1 || (i) == -4 || (i) == -8 || (i) == -16) : 0;\ + __ia64_atomic_p +#else +#define __ia64_atomic_const(i) 0 +#endif -#define atomic_add_return(i, v) \ +#define atomic_add_return(i,v) \ ({ \ - int __i = (i); \ - static const int __ia64_atomic_p = __ia64_atomic_const(i); \ - __ia64_atomic_p ? ia64_fetch_and_add(__i, &(v)->counter) : \ - ia64_atomic_add(__i, v); \ + int __ia64_aar_i = (i); \ + __ia64_atomic_const(i) \ + ? ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \ + : ia64_atomic_add(__ia64_aar_i, v); \ }) -#define atomic_sub_return(i, v) \ +#define atomic_sub_return(i,v) \ ({ \ - int __i = (i); \ - static const int __ia64_atomic_p = __ia64_atomic_const(i); \ - __ia64_atomic_p ? ia64_fetch_and_add(-__i, &(v)->counter) : \ - ia64_atomic_sub(__i, v); \ + int __ia64_asr_i = (i); \ + __ia64_atomic_const(i) \ + ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \ + : ia64_atomic_sub(__ia64_asr_i, v); \ }) -#else -#define atomic_add_return(i, v) ia64_atomic_add(i, v) -#define atomic_sub_return(i, v) ia64_atomic_sub(i, v) -#endif #define atomic_fetch_add(i,v) \ ({ \ int __ia64_aar_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \ - || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \ - || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \ - || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \ + __ia64_atomic_const(i) \ ? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq) \ : ia64_atomic_fetch_add(__ia64_aar_i, v); \ }) @@ -105,11 +102,7 @@ ATOMIC_OPS(sub, -) #define atomic_fetch_sub(i,v) \ ({ \ int __ia64_asr_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \ - || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \ - || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \ - || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \ + __ia64_atomic_const(i) \ ? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq) \ : ia64_atomic_fetch_sub(__ia64_asr_i, v); \ }) @@ -170,11 +163,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_add_return(i,v) \ ({ \ long __ia64_aar_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \ - || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \ - || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \ - || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \ + __ia64_atomic_const(i) \ ? 
ia64_fetch_and_add(__ia64_aar_i, &(v)->counter) \ : ia64_atomic64_add(__ia64_aar_i, v); \ }) @@ -182,11 +171,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_sub_return(i,v) \ ({ \ long __ia64_asr_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \ - || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \ - || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \ - || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \ + __ia64_atomic_const(i) \ ? ia64_fetch_and_add(-__ia64_asr_i, &(v)->counter) \ : ia64_atomic64_sub(__ia64_asr_i, v); \ }) @@ -194,11 +179,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_fetch_add(i,v) \ ({ \ long __ia64_aar_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_aar_i == 1) || (__ia64_aar_i == 4) \ - || (__ia64_aar_i == 8) || (__ia64_aar_i == 16) \ - || (__ia64_aar_i == -1) || (__ia64_aar_i == -4) \ - || (__ia64_aar_i == -8) || (__ia64_aar_i == -16))) \ + __ia64_atomic_const(i) \ ? ia64_fetchadd(__ia64_aar_i, &(v)->counter, acq) \ : ia64_atomic64_fetch_add(__ia64_aar_i, v); \ }) @@ -206,11 +187,7 @@ ATOMIC64_OPS(sub, -) #define atomic64_fetch_sub(i,v) \ ({ \ long __ia64_asr_i = (i); \ - (__builtin_constant_p(i) \ - && ( (__ia64_asr_i == 1) || (__ia64_asr_i == 4) \ - || (__ia64_asr_i == 8) || (__ia64_asr_i == 16) \ - || (__ia64_asr_i == -1) || (__ia64_asr_i == -4) \ - || (__ia64_asr_i == -8) || (__ia64_asr_i == -16))) \ + __ia64_atomic_const(i) \ ? ia64_fetchadd(-__ia64_asr_i, &(v)->counter, acq) \ : ia64_atomic64_fetch_sub(__ia64_asr_i, v); \ }) -- cgit v1.2.3 From 69c907022a7d9325cdc5c9dd064571e445df9a47 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 22 Jan 2018 09:21:37 -0800 Subject: ia64/err-inject: Use get_user_pages_fast() At the point of sysfs callback, the call to gup is done without mmap_sem (or any lock for that matter). This is racy. As such, use the get_user_pages_fast() alternative and safely avoid taking the lock, if possible. Signed-off-by: Davidlohr Bueso Signed-off-by: Tony Luck --- arch/ia64/kernel/err_inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 85bba43e7d5dc..658a8e06a69bb 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, u64 virt_addr=simple_strtoull(buf, NULL, 16); int ret; - ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL); + ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG printk("Virtual address %lx is not existing.\n",virt_addr); -- cgit v1.2.3 From 48e362dd96f37d819042f848888b2c6407e01e6d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 2 Mar 2018 09:10:30 +0000 Subject: ia64/err-inject: fix spelling mistake: "capapbilities" -> "capabilities" Trivial fix to spelling mistake in debug message text. 
Signed-off-by: Colin Ian King Signed-off-by: Tony Luck --- arch/ia64/kernel/err_inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 658a8e06a69bb..8b5b8e6bc9d9a 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -117,7 +117,7 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); - printk(KERN_DEBUG "capapbilities=%lx,\n", capabilities[cpu]); + printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]); printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); #endif return size; -- cgit v1.2.3 From 06a3f0c9f2725f5d7c63c4203839373c9bd00c28 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Wed, 27 Sep 2017 17:15:15 -0700 Subject: MIPS: BMIPS: Do not mask IPIs during suspend Commit a3e6c1eff548 ("MIPS: IRQ: Fix disable_irq on CPU IRQs") fixes an issue where disable_irq did not actually disable the irq. The bug caused our IPIs to not be disabled, which actually is the correct behavior. With the addition of commit a3e6c1eff548 ("MIPS: IRQ: Fix disable_irq on CPU IRQs"), the IPIs were getting disabled going into suspend, thus schedule_ipi() was not being called. This caused deadlocks where schedulable tasks were not being scheduled and other cpus were waiting for them to do something. Add the IRQF_NO_SUSPEND flag so an irq_disable will not be called on the IPIs during suspend. Signed-off-by: Justin Chen Fixes: a3e6c1eff548 ("MIPS: IRQ: Fix disable_irq on CPU IRQs") Cc: Florian Fainelli Cc: linux-mips@linux-mips.org Cc: stable@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17385/ [jhogan@kernel.org: checkpatch: wrap long lines and fix commit refs] Signed-off-by: James Hogan --- arch/mips/kernel/smp-bmips.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index 9d41732a9146a..159e83add4bb3 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -168,11 +168,11 @@ static void bmips_prepare_cpus(unsigned int max_cpus) return; } - if (request_irq(IPI0_IRQ, bmips_ipi_interrupt, IRQF_PERCPU, - "smp_ipi0", NULL)) + if (request_irq(IPI0_IRQ, bmips_ipi_interrupt, + IRQF_PERCPU | IRQF_NO_SUSPEND, "smp_ipi0", NULL)) panic("Can't request IPI0 interrupt"); - if (request_irq(IPI1_IRQ, bmips_ipi_interrupt, IRQF_PERCPU, - "smp_ipi1", NULL)) + if (request_irq(IPI1_IRQ, bmips_ipi_interrupt, + IRQF_PERCPU | IRQF_NO_SUSPEND, "smp_ipi1", NULL)) panic("Can't request IPI1 interrupt"); } -- cgit v1.2.3 From f6a015498dcaee72f80283cb7873d88deb07129c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 6 Mar 2018 00:29:17 -0600 Subject: signal/x86: Include the field offsets in the build time checks Due to an oversight when refactoring siginfo_t, si_pkey has been in the wrong position since 4.16-rc1. Add an explicit check of the offset of every user space field in siginfo_t and compat_siginfo_t to make a mistake like this hard to make in the future. I have run this code on 4.15 and 4.16-rc1 with the position of si_pkey fixed and all of the fields show up in the same location. Signed-off-by: "Eric W.
Biederman" --- arch/x86/kernel/signal_compat.c | 65 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index ac057f9b07636..0d930d8987cc7 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -43,6 +43,13 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); #define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + BUILD_BUG_ON(offsetof(siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(siginfo_t, si_code) != 8); + + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_signo) != 0); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_errno) != 4); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_code) != 8); /* * Ensure that the size of each si_field never changes. * If it does, it is a sign that the @@ -63,36 +70,94 @@ static inline void signal_compat_build_tests(void) CHECK_CSI_SIZE (_kill, 2*sizeof(int)); CHECK_SI_SIZE (_kill, 2*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0xC); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + CHECK_CSI_OFFSET(_timer); CHECK_CSI_SIZE (_timer, 3*sizeof(int)); CHECK_SI_SIZE (_timer, 6*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_tid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_overrun) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_tid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_overrun) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + CHECK_CSI_OFFSET(_rt); CHECK_CSI_SIZE (_rt, 3*sizeof(int)); CHECK_SI_SIZE (_rt, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_value) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_value) != 0x14); + CHECK_CSI_OFFSET(_sigchld); CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_pid) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_uid) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_status) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_utime) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_stime) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pid) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_uid) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_status) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_stime) != 0x1C); + #ifdef CONFIG_X86_X32_ABI CHECK_CSI_OFFSET(_sigchld_x32); CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); /* no _sigchld_x32 in the generic siginfo_t */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._utime) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields._sigchld_x32._stime) != 0x20); #endif CHECK_CSI_OFFSET(_sigfault); CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_addr) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr) != 0x0C); + + 
BUILD_BUG_ON(offsetof(siginfo_t, si_addr_lsb) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_addr_lsb) != 0x10); + + BUILD_BUG_ON(offsetof(siginfo_t, si_lower) != 0x20); + BUILD_BUG_ON(offsetof(siginfo_t, si_upper) != 0x28); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_lower) != 0x14); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_upper) != 0x18); + + BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_band) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_fd) != 0x10); + CHECK_CSI_OFFSET(_sigsys); CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + BUILD_BUG_ON(offsetof(siginfo_t, si_call_addr) != 0x10); + BUILD_BUG_ON(offsetof(siginfo_t, si_syscall) != 0x18); + BUILD_BUG_ON(offsetof(siginfo_t, si_arch) != 0x1C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_call_addr) != 0x0C); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_syscall) != 0x10); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_arch) != 0x14); + /* any new si_fields should be added here */ } -- cgit v1.2.3 From d3f468963cd6fd6d2aa5e26aed8b24232096d0e1 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 5 Mar 2018 19:18:47 +0000 Subject: s390/entry.S: fix spurious zeroing of r0 When a system call is interrupted, we might call the critical section cleanup handler that re-does some of the operations. When we are between .Lsysc_vtime and .Lsysc_do_svc we might also redo the saving of the problem state registers r0-r7: .Lcleanup_system_call: [...] 0: # update accounting time stamp mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER # set up saved register r11 lg %r15,__LC_KERNEL_STACK la %r9,STACK_FRAME_OVERHEAD(%r15) stg %r9,24(%r11) # r11 pt_regs pointer # fill pt_regs mvc __PT_R8(64,%r9),__LC_SAVE_AREA_SYNC ---> stmg %r0,%r7,__PT_R0(%r9) The problem now is that we might have already zeroed out r0. The fix is to move the zeroing of r0 after sysc_do_svc.
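In other words, a simplified sketch of the fixed ordering (heavily reduced from the patch below, not the full entry.S): the cleanup handler may resume execution at .Lsysc_do_svc, so any instruction placed before that label can effectively be undone and redone on the replay path, and scrubbing r0 before the registers are saved destroys the user value that the replayed stmg then stores.

# sketch of the fixed ordering (reduced from the patch below)
	stmg	%r0,%r7,__PT_R0(%r11)	# save user r0-r7 into pt_regs
.Lsysc_do_svc:				# cleanup handler resumes here
	xgr	%r0,%r0			# scrub r0 only after the save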
Reported-by: Farhan Ali Fixes: 7041d28115e91 ("s390: scrub registers on kernel entry and KVM exit") Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 73492461c4549..a5621ea6d1234 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -427,13 +427,13 @@ ENTRY(system_call) UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP stmg %r0,%r7,__PT_R0(%r11) - # clear user controlled register to prevent speculative use - xgr %r0,%r0 mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC stg %r14,__PT_FLAGS(%r11) .Lsysc_do_svc: + # clear user controlled register to prevent speculative use + xgr %r0,%r0 # load address of system call table lg %r10,__THREAD_sysc_table(%r13,%r12) llgh %r8,__PT_INT_CODE+2(%r11) -- cgit v1.2.3 From b0c41b8b6e43120d7c35e4709508a3d90a09646e Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 6 Mar 2018 13:44:32 +0530 Subject: powerpc/pseries: Fix vector5 in ibm architecture vector table With ibm,dynamic-memory-v2 and ibm,drc-info coming around the same time, byte22 in vector5 of the ibm architecture vector table got set twice, in two separate places. The end result is that the guest kernel isn't advertising support for ibm,dynamic-memory-v2. Fix this by removing the duplicate assignment of byte22. Fixes: 02ef6dd8109b ("powerpc: Enable support for ibm,drc-info devtree property") Signed-off-by: Bharata B Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom_init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d22c41c26bb30..acf4b2e0530cb 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -874,7 +874,6 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .mmu = 0, .hash_ext = 0, .radix_ext = 0, - .byte22 = 0, }, /* option vector 6: IBM PAPR hints */ -- cgit v1.2.3 From f07afa0462b76a5b9c4f3a43d5ac24fdb86a90c2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 6 Mar 2018 14:27:58 +0100 Subject: KVM: s390: fix memory overwrites when not using SCA entries Even if we don't have extended SCA support, we can have more than 64 CPUs if we don't enable any HW features that might use the SCA entries. Now, this works just fine, but we missed a return, which is why we would actually store the SCA entries. If we have more than 64 CPUs, this means writing outside of the basic SCA - bad. Let's fix this. This allows > 64 CPUs when running nested (under vSIE) without random crashes.
Fixes: a6940674c384 ("KVM: s390: allow 255 VCPUs when sca entries aren't used") Reported-by: Christian Borntraeger Tested-by: Christian Borntraeger Signed-off-by: David Hildenbrand Message-Id: <20180306132758.21034-1-david@redhat.com> Cc: stable@vger.kernel.org Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index df19f158347e0..339ac0964590a 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2147,6 +2147,7 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu) /* we still need the basic sca for the ipte control */ vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); vcpu->arch.sie_block->scaol = (__u32)(__u64)sca; + return; } read_lock(&vcpu->kvm->arch.sca_lock); if (vcpu->kvm->arch.use_esca) { -- cgit v1.2.3 From b411991e0ca880d8e7f5eb117c05de6f4b47a2c7 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:03 +0100 Subject: x86/syscalls/32: Simplify $entry == $compat entries If the compat entry point is equivalent to the native entry point, it does not need to be specified explicitly. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscall_32.tbl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 448ac2161112b..ad5e95a369e49 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork sys_fork sys_fork +2 i386 fork sys_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -78,7 +78,7 @@ 69 i386 ssetmask sys_ssetmask 70 i386 setreuid sys_setreuid16 71 i386 setregid sys_setregid16 -72 i386 sigsuspend sys_sigsuspend sys_sigsuspend +72 i386 sigsuspend sys_sigsuspend 73 i386 sigpending sys_sigpending compat_sys_sigpending 74 i386 sethostname sys_sethostname 75 i386 setrlimit sys_setrlimit compat_sys_setrlimit @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork sys_vfork +190 i386 vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 -- cgit v1.2.3 From a41e2ab08ed62fffc81f71a9bc9c642495a52308 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:04 +0100 Subject: x86/entry: Remove stale syscall prototype sys32_vm86_warning() is long gone. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sys_ia32.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 82c34ee25a651..43d59cae9eb10 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -37,7 +37,6 @@ asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); -long sys32_vm86_warning(void); asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, -- cgit v1.2.3 From 7c2178c1ff482679fb0ca0b628f720a888814548 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:05 +0100 Subject: x86/syscalls: Use proper syscall definition for sys_ioperm() Using SYSCALL_DEFINEx() is recommended, so use it also here. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/ioport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 2f723301eb58f..38deafebb21b7 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -23,7 +23,7 @@ /* * this changes the io permissions bitmap in the current task. */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +SYSCALL_DEFINE3(ioperm, unsigned long, from, unsigned long, num, int, turn_on) { struct thread_struct *t = ¤t->thread; struct tss_struct *tss; -- cgit v1.2.3 From 4ddb45db30851b2269101ee8969f079b028dd257 Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:07 +0100 Subject: x86/syscalls: Use COMPAT_SYSCALL_DEFINEx() macros for x86-only compat syscalls While at it, convert declarations of type "unsigned" to "unsigned int". Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/entry/syscalls/syscall_32.tbl | 30 ++++++++-------- arch/x86/ia32/sys_ia32.c | 63 ++++++++++++++++++---------------- arch/x86/include/asm/sys_ia32.h | 45 +++++++++++++++--------- 3 files changed, 76 insertions(+), 62 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ad5e95a369e49..e7fd0a76bf994 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -13,7 +13,7 @@ 4 i386 write sys_write 5 i386 open sys_open compat_sys_open 6 i386 close sys_close -7 i386 waitpid sys_waitpid sys32_waitpid +7 i386 waitpid sys_waitpid compat_sys_x86_waitpid 8 i386 creat sys_creat 9 i386 link sys_link 10 i386 unlink sys_unlink @@ -96,7 +96,7 @@ 87 i386 swapon sys_swapon 88 i386 reboot sys_reboot 89 i386 readdir sys_old_readdir compat_sys_old_readdir -90 i386 mmap sys_old_mmap sys32_mmap +90 i386 mmap sys_old_mmap compat_sys_x86_mmap 91 i386 munmap sys_munmap 92 i386 truncate sys_truncate compat_sys_truncate 93 i386 ftruncate sys_ftruncate compat_sys_ftruncate @@ -186,8 +186,8 @@ 177 i386 rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait 178 i386 rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo 179 i386 rt_sigsuspend sys_rt_sigsuspend -180 i386 pread64 sys_pread64 sys32_pread -181 i386 pwrite64 sys_pwrite64 sys32_pwrite +180 i386 pread64 sys_pread64 compat_sys_x86_pread +181 i386 pwrite64 sys_pwrite64 compat_sys_x86_pwrite 182 i386 chown sys_chown16 183 i386 getcwd sys_getcwd 184 i386 capget sys_capget @@ -199,11 +199,11 @@ 190 i386 vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff -193 i386 truncate64 sys_truncate64 sys32_truncate64 -194 i386 ftruncate64 sys_ftruncate64 sys32_ftruncate64 -195 i386 stat64 sys_stat64 sys32_stat64 -196 i386 lstat64 sys_lstat64 sys32_lstat64 -197 i386 fstat64 sys_fstat64 sys32_fstat64 +193 i386 truncate64 sys_truncate64 compat_sys_x86_truncate64 +194 i386 ftruncate64 sys_ftruncate64 compat_sys_x86_ftruncate64 +195 i386 stat64 sys_stat64 compat_sys_x86_stat64 +196 i386 lstat64 sys_lstat64 compat_sys_x86_lstat64 +197 i386 fstat64 sys_fstat64 compat_sys_x86_fstat64 198 i386 lchown32 sys_lchown 199 i386 getuid32 sys_getuid 200 i386 getgid32 sys_getgid @@ -231,7 +231,7 @@ # 222 is unused # 223 is unused 224 i386 gettid sys_gettid -225 i386 readahead sys_readahead sys32_readahead +225 i386 readahead sys_readahead compat_sys_x86_readahead 226 i386 setxattr sys_setxattr 227 i386 lsetxattr sys_lsetxattr 228 i386 fsetxattr sys_fsetxattr @@ -256,7 +256,7 @@ 247 i386 io_getevents sys_io_getevents compat_sys_io_getevents 248 i386 io_submit sys_io_submit compat_sys_io_submit 249 i386 io_cancel sys_io_cancel -250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 +250 i386 fadvise64 sys_fadvise64 compat_sys_x86_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) 252 i386 exit_group sys_exit_group 253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie @@ -278,7 +278,7 @@ 269 i386 fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 270 i386 tgkill sys_tgkill 271 i386 utimes sys_utimes compat_sys_utimes -272 i386 fadvise64_64 sys_fadvise64_64 sys32_fadvise64_64 +272 i386 fadvise64_64 sys_fadvise64_64 compat_sys_x86_fadvise64_64 273 i386 vserver 274 i386 mbind sys_mbind 275 i386 get_mempolicy sys_get_mempolicy 
compat_sys_get_mempolicy @@ -306,7 +306,7 @@ 297 i386 mknodat sys_mknodat 298 i386 fchownat sys_fchownat 299 i386 futimesat sys_futimesat compat_sys_futimesat -300 i386 fstatat64 sys_fstatat64 sys32_fstatat +300 i386 fstatat64 sys_fstatat64 compat_sys_x86_fstatat 301 i386 unlinkat sys_unlinkat 302 i386 renameat sys_renameat 303 i386 linkat sys_linkat @@ -320,7 +320,7 @@ 311 i386 set_robust_list sys_set_robust_list compat_sys_set_robust_list 312 i386 get_robust_list sys_get_robust_list compat_sys_get_robust_list 313 i386 splice sys_splice -314 i386 sync_file_range sys_sync_file_range sys32_sync_file_range +314 i386 sync_file_range sys_sync_file_range compat_sys_x86_sync_file_range 315 i386 tee sys_tee 316 i386 vmsplice sys_vmsplice compat_sys_vmsplice 317 i386 move_pages sys_move_pages compat_sys_move_pages @@ -330,7 +330,7 @@ 321 i386 signalfd sys_signalfd compat_sys_signalfd 322 i386 timerfd_create sys_timerfd_create 323 i386 eventfd sys_eventfd -324 i386 fallocate sys_fallocate sys32_fallocate +324 i386 fallocate sys_fallocate compat_sys_x86_fallocate 325 i386 timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime 326 i386 timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime 327 i386 signalfd4 sys_signalfd4 compat_sys_signalfd4 diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 96cd33bbfc854..3bc03446ec441 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -51,15 +51,14 @@ #define AA(__x) ((unsigned long)(__x)) -asmlinkage long sys32_truncate64(const char __user *filename, - unsigned long offset_low, - unsigned long offset_high) +COMPAT_SYSCALL_DEFINE3(x86_truncate64, const char __user *, filename, + unsigned long, offset_low, unsigned long, offset_high) { return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); } -asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low, - unsigned long offset_high) +COMPAT_SYSCALL_DEFINE3(x86_ftruncate64, unsigned int, fd, + unsigned long, offset_low, unsigned long, offset_high) { return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); } @@ -96,8 +95,8 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) return 0; } -asmlinkage long sys32_stat64(const char __user *filename, - struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_stat64, const char __user *, filename, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_stat(filename, &stat); @@ -107,8 +106,8 @@ asmlinkage long sys32_stat64(const char __user *filename, return ret; } -asmlinkage long sys32_lstat64(const char __user *filename, - struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_lstat64, const char __user *, filename, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_lstat(filename, &stat); @@ -117,7 +116,8 @@ asmlinkage long sys32_lstat64(const char __user *filename, return ret; } -asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +COMPAT_SYSCALL_DEFINE2(x86_fstat64, unsigned int, fd, + struct stat64 __user *, statbuf) { struct kstat stat; int ret = vfs_fstat(fd, &stat); @@ -126,8 +126,9 @@ asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long sys32_fstatat(unsigned int dfd, const char __user *filename, - struct stat64 __user *statbuf, int flag) +COMPAT_SYSCALL_DEFINE4(x86_fstatat, unsigned int, dfd, + const char __user *, filename, + struct stat64 __user *, statbuf, int, flag) { struct kstat stat; int error; @@ -153,7 +154,7 @@ 
struct mmap_arg_struct32 { unsigned int offset; }; -asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) +COMPAT_SYSCALL_DEFINE1(x86_mmap, struct mmap_arg_struct32 __user *, arg) { struct mmap_arg_struct32 a; @@ -167,22 +168,22 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg) a.offset>>PAGE_SHIFT); } -asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr, - int options) +COMPAT_SYSCALL_DEFINE3(x86_waitpid, compat_pid_t, pid, unsigned int __user *, + stat_addr, int, options) { return compat_sys_wait4(pid, stat_addr, options, NULL); } /* warning: next two assume little endian */ -asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, - u32 poslo, u32 poshi) +COMPAT_SYSCALL_DEFINE5(x86_pread, unsigned int, fd, char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) { return sys_pread64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, - u32 count, u32 poslo, u32 poshi) +COMPAT_SYSCALL_DEFINE5(x86_pwrite, unsigned int, fd, const char __user *, ubuf, + u32, count, u32, poslo, u32, poshi) { return sys_pwrite64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); @@ -193,8 +194,9 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, * Some system calls that need sign extended arguments. This could be * done by a generic wrapper. */ -long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, - __u32 len_low, __u32 len_high, int advice) +COMPAT_SYSCALL_DEFINE6(x86_fadvise64_64, int, fd, __u32, offset_low, + __u32, offset_high, __u32, len_low, __u32, len_high, + int, advice) { return sys_fadvise64_64(fd, (((u64)offset_high)<<32) | offset_low, @@ -202,30 +204,31 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, advice); } -asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, - size_t count) +COMPAT_SYSCALL_DEFINE4(x86_readahead, int, fd, unsigned int, off_lo, + unsigned int, off_hi, size_t, count) { return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } -asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, - unsigned n_low, unsigned n_hi, int flags) +COMPAT_SYSCALL_DEFINE6(x86_sync_file_range, int, fd, unsigned int, off_low, + unsigned int, off_hi, unsigned int, n_low, + unsigned int, n_hi, int, flags) { return sys_sync_file_range(fd, ((u64)off_hi << 32) | off_low, ((u64)n_hi << 32) | n_low, flags); } -asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, - size_t len, int advice) +COMPAT_SYSCALL_DEFINE5(x86_fadvise64, int, fd, unsigned int, offset_lo, + unsigned int, offset_hi, size_t, len, int, advice) { return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, len, advice); } -asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_lo, - unsigned offset_hi, unsigned len_lo, - unsigned len_hi) +COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, + unsigned int, offset_lo, unsigned int, offset_hi, + unsigned int, len_lo, unsigned int, len_hi) { return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 43d59cae9eb10..32831905d97a9 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -20,30 +20,41 @@ #include /* ia32/sys_ia32.c */ -asmlinkage long sys32_truncate64(const char __user *, unsigned long, unsigned long); -asmlinkage 
long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); +asmlinkage long compat_sys_x86_truncate64(const char __user *, unsigned long, + unsigned long); +asmlinkage long compat_sys_x86_ftruncate64(unsigned int, unsigned long, + unsigned long); -asmlinkage long sys32_stat64(const char __user *, struct stat64 __user *); -asmlinkage long sys32_lstat64(const char __user *, struct stat64 __user *); -asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); -asmlinkage long sys32_fstatat(unsigned int, const char __user *, +asmlinkage long compat_sys_x86_stat64(const char __user *, + struct stat64 __user *); +asmlinkage long compat_sys_x86_lstat64(const char __user *, + struct stat64 __user *); +asmlinkage long compat_sys_x86_fstat64(unsigned int, struct stat64 __user *); +asmlinkage long compat_sys_x86_fstatat(unsigned int, const char __user *, struct stat64 __user *, int); struct mmap_arg_struct32; -asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *); +asmlinkage long compat_sys_x86_mmap(struct mmap_arg_struct32 __user *); -asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int); +asmlinkage long compat_sys_x86_waitpid(compat_pid_t, unsigned int __user *, + int); -asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); -asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); +asmlinkage long compat_sys_x86_pread(unsigned int, char __user *, u32, u32, + u32); +asmlinkage long compat_sys_x86_pwrite(unsigned int, const char __user *, u32, + u32, u32); -long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); +asmlinkage long compat_sys_x86_fadvise64_64(int, __u32, __u32, __u32, __u32, + int); -asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); -asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, - unsigned, unsigned, int); -asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int); -asmlinkage long sys32_fallocate(int, int, unsigned, - unsigned, unsigned, unsigned); +asmlinkage ssize_t compat_sys_x86_readahead(int, unsigned int, unsigned int, + size_t); +asmlinkage long compat_sys_x86_sync_file_range(int, unsigned int, unsigned int, + unsigned int, unsigned int, + int); +asmlinkage long compat_sys_x86_fadvise64(int, unsigned int, unsigned int, + size_t, int); +asmlinkage long compat_sys_x86_fallocate(int, int, unsigned int, unsigned int, + unsigned int, unsigned int); /* ia32/ia32_signal.c */ asmlinkage long sys32_sigreturn(void); -- cgit v1.2.3 From af52201d991624d2d5adce2c123805b3d42a8d4d Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:08 +0100 Subject: x86/entry: Do not special-case clone(2) in compat entry With the CPU renaming registers on its own, and all the overhead of the syscall entry/exit, it is doubtful whether the compiled output of mov %r8, %rax mov %rcx, %r8 mov %rax, %rcx jmpq sys_clone is measurably slower than the hand-crafted version of xchg %r8, %rcx So get rid of this special case. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 12 ------------ arch/x86/entry/syscalls/syscall_32.tbl | 2 +- arch/x86/ia32/sys_ia32.c | 11 +++++++++++ arch/x86/include/asm/sys_ia32.h | 2 ++ 4 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e811dd9c5e99e..ff61b7bb750ba 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -406,15 +406,3 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_ON jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) - -ENTRY(stub32_clone) - /* - * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). - * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). - * - * The native 64-bit kernel's sys_clone() implements the latter, - * so we need to swap arguments here before calling it: - */ - xchg %r8, %rcx - jmp sys_clone -ENDPROC(stub32_clone) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index e7fd0a76bf994..2a5e99cff8597 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -126,7 +126,7 @@ 117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync 119 i386 sigreturn sys_sigreturn sys32_sigreturn -120 i386 clone sys_clone stub32_clone +120 i386 clone sys_clone compat_sys_x86_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname 123 i386 modify_ldt sys_modify_ldt diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 3bc03446ec441..6512498bbef69 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -233,3 +233,14 @@ COMPAT_SYSCALL_DEFINE6(x86_fallocate, int, fd, int, mode, return sys_fallocate(fd, mode, ((u64)offset_hi << 32) | offset_lo, ((u64)len_hi << 32) | len_lo); } + +/* + * The 32-bit clone ABI is CONFIG_CLONE_BACKWARDS + */ +COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags, + unsigned long, newsp, int __user *, parent_tidptr, + unsigned long, tls_val, int __user *, child_tidptr) +{ + return sys_clone(clone_flags, newsp, parent_tidptr, child_tidptr, + tls_val); +} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 32831905d97a9..906794aa034e7 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -55,6 +55,8 @@ asmlinkage long compat_sys_x86_fadvise64(int, unsigned int, unsigned int, size_t, int); asmlinkage long compat_sys_x86_fallocate(int, int, unsigned int, unsigned int, unsigned int, unsigned int); +asmlinkage long compat_sys_x86_clone(unsigned long, unsigned long, int __user *, + unsigned long, int __user *); /* ia32/ia32_signal.c */ asmlinkage long sys32_sigreturn(void); -- cgit v1.2.3 From 91c5f0de64a226e0d9c558b26ef7c2655ef31cca Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Tue, 6 Mar 2018 22:18:09 +0100 Subject: x86/entry/64/compat: Save one instruction in entry_INT80_compat() As %rdi is never used except in the following push, there is no need to restore %rdi to the original value. Signed-off-by: Dominik Brodowski Acked-by: Linus Torvalds Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: luto@amacapital.net Cc: viro@zeniv.linux.org.uk Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64_compat.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index ff61b7bb750ba..08425c42f8b7c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -363,9 +363,7 @@ ENTRY(entry_INT80_compat) pushq 2*8(%rdi) /* regs->ip */ pushq 1*8(%rdi) /* regs->orig_ax */ - movq (%rdi), %rdi /* restore %rdi */ - - pushq %rdi /* pt_regs->di */ + pushq (%rdi) /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ -- cgit v1.2.3 From 6007b080d2e2adb7af22bf29165f0594ea12b34c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 7 Mar 2018 22:10:01 +0100 Subject: bpf, x64: increase number of passes In Cilium some of the main programs we run today are hitting 9 passes on x64's JIT compiler, and we've had cases already where we surpassed the limit where the JIT then punts the program to the interpreter instead, leading to insertion failures due to CONFIG_BPF_JIT_ALWAYS_ON or insertion failures due to the prog array owner being JITed but the program to insert not (both must have the same JITed/non-JITed property). In one concrete case, the program image shrunk from 12,767 bytes down to 10,288 bytes, with the image converging after 16 steps. I've measured that this took 340us in the JIT until it converges on my i7-6600U. Thus, increase the original limit, which we have had from day one when the JIT covered only cBPF, before we run into the case (similar to the complexity limit) where we trip over this and hit program rejections. Also add a cond_resched() into the compilation loop; the JIT process runs without any locks and may sleep anyway. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 45e4eb5bcbb2a..ce5b2ebd57015 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1188,7 +1188,7 @@ skip_init_addrs: * may converge on the last pass. In such case do one more * pass to emit the final image */ - for (pass = 0; pass < 10 || image; pass++) { + for (pass = 0; pass < 20 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { image = NULL; @@ -1215,6 +1215,7 @@ skip_init_addrs: } } oldproglen = proglen; + cond_resched(); } if (bpf_jit_enable > 1) -- cgit v1.2.3 From 076ca272a14cea558b1092ec85cea08510283f2a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 7 Mar 2018 11:12:27 -0800 Subject: x86/vsyscall/64: Drop "native" vsyscalls Since Linux v3.2, vsyscalls have been deprecated and slow. From v3.2 on, Linux had three vsyscall modes: "native", "emulate", and "none". "emulate" is the default. All known user programs work correctly in emulate mode, but vsyscalls turn into page faults and are emulated. This is very slow. In "native" mode, the vsyscall page is easily usable as an exploit gadget, but vsyscalls are a bit faster -- they turn into normal syscalls. (This is in contrast to vDSO functions, which can be much faster than syscalls.) In "none" mode, there are no vsyscalls.
For all practical purposes, "native" was really just a chicken bit in case something went wrong with the emulation. It's been over six years, and nothing has gone wrong. Delete it. Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Dominik Brodowski Cc: Kernel Hardening Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/519fee5268faea09ae550776ce969fa6e88668b0.1520449896.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 11 +---------- arch/x86/entry/vsyscall/vsyscall_64.c | 16 +++------------- arch/x86/include/asm/pgtable_types.h | 2 -- tools/testing/selftests/x86/test_vsyscall.c | 11 ++++++----- 4 files changed, 10 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1aed6c0e4137..09c599e0900d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2266,7 +2266,7 @@ choice it can be used to assist security vulnerability exploitation. This setting can be changed at boot time via the kernel command - line parameter vsyscall=[native|emulate|none]. + line parameter vsyscall=[emulate|none]. On a system with recent enough glibc (2.14 or newer) and no static binaries, you can say None without a performance penalty @@ -2274,15 +2274,6 @@ choice If unsure, select "Emulate". - config LEGACY_VSYSCALL_NATIVE - bool "Native" - help - Actual executable code is located in the fixed vsyscall - address mapping, implementing time() efficiently. Since - this makes the mapping executable, it can be used during - security vulnerability exploitation (traditionally as - ROP gadgets). This configuration is not recommended. - config LEGACY_VSYSCALL_EMULATE bool "Emulate" help diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 577fa8adb785b..8560ef68a9d63 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -42,10 +42,8 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = -#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE) - NATIVE; -#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) +static enum { EMULATE, NONE } vsyscall_mode = +#ifdef CONFIG_LEGACY_VSYSCALL_NONE NONE; #else EMULATE; @@ -56,8 +54,6 @@ static int __init vsyscall_setup(char *str) if (str) { if (!strcmp("emulate", str)) vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; else if (!strcmp("none", str)) vsyscall_mode = NONE; else @@ -139,10 +135,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) WARN_ON_ONCE(address != regs->ip); - /* This should be unreachable in NATIVE mode. */ - if (WARN_ON(vsyscall_mode == NATIVE)) - return false; - if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); @@ -370,9 +362,7 @@ void __init map_vsyscall(void) if (vsyscall_mode != NONE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? 
PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + PAGE_KERNEL_VVAR); set_vsyscall_pgtable_user_bits(swapper_pg_dir); } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 246f15b4e64ce..acfe755562a6a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -174,7 +174,6 @@ enum page_cache_mode { #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) -#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) @@ -206,7 +205,6 @@ enum page_cache_mode { #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) -#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c index be81621446f01..0b4f1cc2291c6 100644 --- a/tools/testing/selftests/x86/test_vsyscall.c +++ b/tools/testing/selftests/x86/test_vsyscall.c @@ -450,7 +450,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) num_vsyscall_traps++; } -static int test_native_vsyscall(void) +static int test_emulation(void) { time_t tmp; bool is_native; @@ -458,7 +458,7 @@ static int test_native_vsyscall(void) if (!vtime) return 0; - printf("[RUN]\tchecking for native vsyscall\n"); + printf("[RUN]\tchecking that vsyscalls are emulated\n"); sethandler(SIGTRAP, sigtrap, 0); set_eflags(get_eflags() | X86_EFLAGS_TF); vtime(&tmp); @@ -474,11 +474,12 @@ static int test_native_vsyscall(void) */ is_native = (num_vsyscall_traps > 1); - printf("\tvsyscalls are %s (%d instructions in vsyscall page)\n", + printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n", + (is_native ? "FAIL" : "OK"), (is_native ? "native" : "emulated"), (int)num_vsyscall_traps); - return 0; + return is_native; } #endif @@ -498,7 +499,7 @@ int main(int argc, char **argv) nerrs += test_vsys_r(); #ifdef __x86_64__ - nerrs += test_native_vsyscall(); + nerrs += test_emulation(); #endif return nerrs ? 1 : 0; -- cgit v1.2.3 From 36268223c1e9981d6cfc33aff8520b3bde4b8114 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 26 Feb 2018 09:35:01 -0500 Subject: x86/spectre_v2: Don't check microcode versions when running under hypervisors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As: 1) It's known that hypervisors lie about the environment anyhow (host mismatch) 2) Even if the hypervisor (Xen, KVM, VMWare, etc) provided a valid "correct" value, it all gets to be very murky when migration happens (do you provide the "new" microcode of the machine?). And in reality the cloud vendors are the ones that should make sure that the microcode that is running is correct and we should just sing lalalala and trust them. Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Thomas Gleixner Reviewed-by: Paolo Bonzini Cc: Wanpeng Li Cc: kvm Cc: Krčmář Cc: Borislav Petkov CC: "H. 
Peter Anvin" CC: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180226213019.GE9497@char.us.oracle.com --- arch/x86/kernel/cpu/intel.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d19e903214b40..4aa9fd3793905 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -144,6 +144,13 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) { int i; + /* + * We know that the hypervisor lie to us on the microcode version so + * we may as well hope that it is running the correct version. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) + return false; + for (i = 0; i < ARRAY_SIZE(spectre_bad_microcodes); i++) { if (c->x86_model == spectre_bad_microcodes[i].model && c->x86_stepping == spectre_bad_microcodes[i].stepping) -- cgit v1.2.3 From 854857f5944c59a881ff607b37ed9ed41d031a3b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 28 Feb 2018 11:28:40 +0100 Subject: x86/microcode: Get rid of struct apply_microcode_ctx It is a useless remnant from earlier times. Use the ucode_state enum directly. No functional change. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Cc: Arjan Van De Ven Link: https://lkml.kernel.org/r/20180228102846.13447-2-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index aa1b9a422f2be..63370651e3766 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -373,26 +373,23 @@ static int collect_cpu_info(int cpu) return ret; } -struct apply_microcode_ctx { - enum ucode_state err; -}; - static void apply_microcode_local(void *arg) { - struct apply_microcode_ctx *ctx = arg; + enum ucode_state *err = arg; - ctx->err = microcode_ops->apply_microcode(smp_processor_id()); + *err = microcode_ops->apply_microcode(smp_processor_id()); } static int apply_microcode_on_target(int cpu) { - struct apply_microcode_ctx ctx = { .err = 0 }; + enum ucode_state err; int ret; - ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); - if (!ret) - ret = ctx.err; - + ret = smp_call_function_single(cpu, apply_microcode_local, &err, 1); + if (!ret) { + if (err == UCODE_ERROR) + ret = 1; + } return ret; } -- cgit v1.2.3 From c182d2b7d0ca48e0d6ff16f7d883161238c447ed Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 28 Feb 2018 11:28:41 +0100 Subject: x86/microcode/intel: Check microcode revision before updating sibling threads After updating microcode on one of the threads of a core, the other thread sibling automatically gets the update since the microcode resources on a hyperthreaded core are shared between the two threads. Check the microcode revision on the CPU before performing a microcode update and thus save us the WRMSR 0x79 because it is a particularly expensive operation. [ Borislav: Massage changelog and coding style. 
] Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Cc: Arjan Van De Ven Link: http://lkml.kernel.org/r/1519352533-15992-2-git-send-email-ashok.raj@intel.com Link: https://lkml.kernel.org/r/20180228102846.13447-3-bp@alien8.de --- arch/x86/kernel/cpu/microcode/intel.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 923054a6b7601..87bd6dc940815 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -589,6 +589,17 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) if (!mc) return 0; + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + return UCODE_OK; + } + /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -776,7 +787,7 @@ static enum ucode_state apply_microcode_intel(int cpu) { struct microcode_intel *mc; struct ucode_cpu_info *uci; - struct cpuinfo_x86 *c; + struct cpuinfo_x86 *c = &cpu_data(cpu); static int prev_rev; u32 rev; @@ -793,6 +804,18 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_NFOUND; } + /* + * Save us the MSR write below - which is a particular expensive + * operation - when the other hyperthread has updated the microcode + * already. + */ + rev = intel_get_microcode_revision(); + if (rev >= mc->hdr.rev) { + uci->cpu_sig.rev = rev; + c->microcode = rev; + return UCODE_OK; + } + /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -813,8 +836,6 @@ static enum ucode_state apply_microcode_intel(int cpu) prev_rev = rev; } - c = &cpu_data(cpu); - uci->cpu_sig.rev = rev; c->microcode = rev; -- cgit v1.2.3 From 91df9fdf51492aec9fed6b4cbd33160886740f47 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 28 Feb 2018 11:28:42 +0100 Subject: x86/microcode/intel: Writeback and invalidate caches before updating microcode Updating microcode is less error prone when caches have been flushed and depending on what exactly the microcode is updating. For example, some of the issues around certain Broadwell parts can be addressed by doing a full cache flush. [ Borislav: Massage it and use native_wbinvd() in both cases. ] Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Cc: Arjan Van De Ven Link: http://lkml.kernel.org/r/1519352533-15992-3-git-send-email-ashok.raj@intel.com Link: https://lkml.kernel.org/r/20180228102846.13447-4-bp@alien8.de --- arch/x86/kernel/cpu/microcode/intel.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 87bd6dc940815..e2864bc2d5750 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -600,6 +600,12 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return UCODE_OK; } + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. 
+ */ + native_wbinvd(); + /* write microcode via MSR 0x79 */ native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -816,6 +822,12 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_OK; } + /* + * Writeback and invalidate caches before updating microcode to avoid + * internal issues depending on what the microcode is updating. + */ + native_wbinvd(); + /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); -- cgit v1.2.3 From 30ec26da9967d0d785abc24073129a34c3211777 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 28 Feb 2018 11:28:43 +0100 Subject: x86/microcode: Do not upload microcode if CPUs are offline Avoid loading microcode if any of the CPUs are offline, and issue a warning. Having different microcode revisions on the system at any time is outright dangerous. [ Borislav: Massage changelog. ] Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Reviewed-by: Tom Lendacky Cc: Arjan Van De Ven Link: http://lkml.kernel.org/r/1519352533-15992-4-git-send-email-ashok.raj@intel.com Link: https://lkml.kernel.org/r/20180228102846.13447-5-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 63370651e3766..fa32cb3dcca5a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -486,6 +486,16 @@ static void __exit microcode_dev_exit(void) /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +static int check_online_cpus(void) +{ + if (num_online_cpus() == num_present_cpus()) + return 0; + + pr_err("Not all CPUs online, aborting microcode update.\n"); + + return -EINVAL; +} + static enum ucode_state reload_for_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -519,7 +529,13 @@ static ssize_t reload_store(struct device *dev, return size; get_online_cpus(); + + ret = check_online_cpus(); + if (ret) + goto put; + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { tmp_ret = reload_for_cpu(cpu); if (tmp_ret > UCODE_NFOUND) { @@ -538,6 +554,8 @@ static ssize_t reload_store(struct device *dev, microcode_check(); mutex_unlock(µcode_mutex); + +put: put_online_cpus(); if (!ret) -- cgit v1.2.3 From d8c3b52c00a05036e0a6b315b4b17921a7b67997 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 28 Feb 2018 11:28:44 +0100 Subject: x86/microcode/intel: Look into the patch cache first The cache might contain a newer patch - look in there first. A follow-on change will make sure newest patches are loaded into the cache of microcode patches. 
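[ Editor's note: a minimal sketch of the lookup order after the change below; find_patch() and uci->mc are the names used in the hunk, while the wrapper, cache_lookup() stub and the trimmed-down types are illustrative stand-ins. ]

        struct patch { unsigned int rev; };
        struct ucode_cpu_info_sketch {
                struct patch *mc;       /* copy applied at early boot */
        };

        /* stand-in for find_patch(): newest matching cached patch, or NULL */
        static struct patch *cache_lookup(struct ucode_cpu_info_sketch *uci)
        {
                (void)uci;
                return 0;               /* cache empty in this sketch */
        }

        static struct patch *pick_patch(struct ucode_cpu_info_sketch *uci)
        {
                /* the cache may hold a newer patch than the early-boot copy,
                 * so consult it first -- this is what the hunk below does */
                struct patch *mc = cache_lookup(uci);

                if (!mc)
                        mc = uci->mc;   /* fall back to the early-boot copy */

                return mc;              /* NULL: nothing suitable was found */
        }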
Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Cc: Arjan Van De Ven Link: https://lkml.kernel.org/r/20180228102846.13447-6-bp@alien8.de --- arch/x86/kernel/cpu/microcode/intel.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index e2864bc2d5750..2aded9db1d425 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -791,9 +791,9 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) static enum ucode_state apply_microcode_intel(int cpu) { - struct microcode_intel *mc; - struct ucode_cpu_info *uci; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); + struct microcode_intel *mc; static int prev_rev; u32 rev; @@ -801,11 +801,10 @@ static enum ucode_state apply_microcode_intel(int cpu) if (WARN_ON(raw_smp_processor_id() != cpu)) return UCODE_ERROR; - uci = ucode_cpu_info + cpu; - mc = uci->mc; + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); if (!mc) { - /* Look for a newer patch in our cache: */ - mc = find_patch(uci); + mc = uci->mc; if (!mc) return UCODE_NFOUND; } -- cgit v1.2.3 From cfb52a5a09c8ae3a1dafb44ce549fde5b69e8117 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 28 Feb 2018 11:28:45 +0100 Subject: x86/microcode: Request microcode on the BSP ... so that any newer version can land in the cache and can later be fished out by the application functions. Do that before grabbing the hotplug lock. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Reviewed-by: Tom Lendacky Cc: Arjan Van De Ven Link: https://lkml.kernel.org/r/20180228102846.13447-7-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index fa32cb3dcca5a..5dd157d48606c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -499,15 +499,10 @@ static int check_online_cpus(void) static enum ucode_state reload_for_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - enum ucode_state ustate; if (!uci->valid) return UCODE_OK; - ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev, true); - if (ustate != UCODE_OK) - return ustate; - return apply_microcode_on_target(cpu); } @@ -515,11 +510,11 @@ static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { + int cpu, bsp = boot_cpu_data.cpu_index; enum ucode_state tmp_ret = UCODE_OK; bool do_callback = false; unsigned long val; ssize_t ret = 0; - int cpu; ret = kstrtoul(buf, 0, &val); if (ret) @@ -528,6 +523,10 @@ static ssize_t reload_store(struct device *dev, if (val != 1) return size; + tmp_ret = microcode_ops->request_microcode_fw(bsp, µcode_pdev->dev, true); + if (tmp_ret != UCODE_OK) + return size; + get_online_cpus(); ret = check_online_cpus(); -- cgit v1.2.3 From a5321aec6412b20b5ad15db2d6b916c05349dbff Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Wed, 28 Feb 2018 11:28:46 +0100 Subject: x86/microcode: Synchronize late microcode loading Original idea by Ashok, completely rewritten by Borislav. Before you read any further: the early loading method is still the preferred one and you should always do that. 
The following patch is improving the late loading mechanism for long running jobs and cloud use cases. Gather all cores and serialize the microcode update on them by doing it one-by-one to make the late update process as reliable as possible and avoid potential issues caused by the microcode update. [ Borislav: Rewrite completely. ] Co-developed-by: Borislav Petkov Signed-off-by: Ashok Raj Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Tom Lendacky Tested-by: Ashok Raj Reviewed-by: Tom Lendacky Cc: Arjan Van De Ven Link: https://lkml.kernel.org/r/20180228102846.13447-8-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 118 +++++++++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5dd157d48606c..70ecbc8099c94 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -22,13 +22,16 @@ #define pr_fmt(fmt) "microcode: " fmt #include +#include #include #include #include #include #include +#include #include #include +#include #include #include @@ -64,6 +67,11 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); +/* + * Serialize late loading so that CPUs get updated one-by-one. + */ +static DEFINE_SPINLOCK(update_lock); + struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -486,6 +494,19 @@ static void __exit microcode_dev_exit(void) /* fake device for request_firmware */ static struct platform_device *microcode_pdev; +/* + * Late loading dance. Why the heavy-handed stomp_machine effort? + * + * - HT siblings must be idle and not execute other code while the other sibling + * is loading microcode in order to avoid any negative interactions caused by + * the loading. + * + * - In addition, microcode update on the cores must be serialized until this + * requirement can be relaxed in the future. Right now, this is conservative + * and good. + */ +#define SPINUNIT 100 /* 100 nsec */ + static int check_online_cpus(void) { if (num_online_cpus() == num_present_cpus()) @@ -496,23 +517,85 @@ static int check_online_cpus(void) return -EINVAL; } -static enum ucode_state reload_for_cpu(int cpu) +static atomic_t late_cpus; + +/* + * Returns: + * < 0 - on error + * 0 - no update done + * 1 - microcode was updated + */ +static int __reload_late(void *info) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + unsigned int timeout = NSEC_PER_SEC; + int all_cpus = num_online_cpus(); + int cpu = smp_processor_id(); + enum ucode_state err; + int ret = 0; - if (!uci->valid) - return UCODE_OK; + atomic_dec(&late_cpus); + + /* + * Wait for all CPUs to arrive. A load will not be attempted unless all + * CPUs show up. + * */ + while (atomic_read(&late_cpus)) { + if (timeout < SPINUNIT) { + pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", + atomic_read(&late_cpus)); + return -1; + } + + ndelay(SPINUNIT); + timeout -= SPINUNIT; + + touch_nmi_watchdog(); + } + + spin_lock(&update_lock); + apply_microcode_local(&err); + spin_unlock(&update_lock); + + if (err > UCODE_NFOUND) { + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; + } else if (err == UCODE_UPDATED) { + ret = 1; + } - return apply_microcode_on_target(cpu); + atomic_inc(&late_cpus); + + while (atomic_read(&late_cpus) != all_cpus) + cpu_relax(); + + return ret; +} + +/* + * Reload microcode late on all CPUs. Wait for a sec until they + * all gather together. 
+ */ +static int microcode_reload_late(void) +{ + int ret; + + atomic_set(&late_cpus, num_online_cpus()); + + ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); + if (ret < 0) + return ret; + else if (ret > 0) + microcode_check(); + + return ret; } static ssize_t reload_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t size) { - int cpu, bsp = boot_cpu_data.cpu_index; enum ucode_state tmp_ret = UCODE_OK; - bool do_callback = false; + int bsp = boot_cpu_data.cpu_index; unsigned long val; ssize_t ret = 0; @@ -534,30 +617,13 @@ static ssize_t reload_store(struct device *dev, goto put; mutex_lock(µcode_mutex); - - for_each_online_cpu(cpu) { - tmp_ret = reload_for_cpu(cpu); - if (tmp_ret > UCODE_NFOUND) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); - - /* set retval for the first encountered reload error */ - if (!ret) - ret = -EINVAL; - } - - if (tmp_ret == UCODE_UPDATED) - do_callback = true; - } - - if (!ret && do_callback) - microcode_check(); - + ret = microcode_reload_late(); mutex_unlock(µcode_mutex); put: put_online_cpus(); - if (!ret) + if (ret >= 0) ret = size; return ret; -- cgit v1.2.3 From c5b679f5c9e3851ee118d95961def374bb3b4ce6 Mon Sep 17 00:00:00 2001 From: Seunghun Han Date: Wed, 7 Mar 2018 13:32:15 +0900 Subject: x86/pti: Fix a comment typo s/visinble/visible/ Signed-off-by: Seunghun Han Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1520397135-132809-1-git-send-email-kkamagui@gmail.com --- arch/x86/mm/pti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index ce38f165489b5..631507f0c1980 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -332,7 +332,7 @@ static void __init pti_clone_user_shared(void) } /* - * Clone the ESPFIX P4D into the user space visinble page table + * Clone the ESPFIX P4D into the user space visible page table */ static void __init pti_setup_espfix64(void) { -- cgit v1.2.3 From fa94d0c6e0f3431523f5701084d799c77c7d4a4f Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 6 Mar 2018 15:21:41 +0100 Subject: x86/MCE: Save microcode revision in machine check records Updating microcode used to be relatively rare. Now that it has become more common we should save the microcode version in a machine check record to make sure that those people looking at the error have this important information bundled with the rest of the logged information. [ Borislav: Simplify a bit. 
] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Yazen Ghannam Cc: linux-edac Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20180301233449.24311-1-tony.luck@intel.com --- arch/x86/include/uapi/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 91723461dc1fe..435db58a7bade 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -30,6 +30,7 @@ struct mce { __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ __u64 ppin; /* Protected Processor Inventory Number */ + __u32 microcode;/* Microcode revision */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8ff94d1e2dce5..b3323cab91398 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -130,6 +130,8 @@ void mce_setup(struct mce *m) if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) rdmsrl(MSR_PPIN, m->ppin); + + m->microcode = boot_cpu_data.microcode; } DEFINE_PER_CPU(struct mce, injectm); @@ -262,7 +264,7 @@ static void __print_mce(struct mce *m) */ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, - cpu_data(m->extcpu).microcode); + m->microcode); } static void print_mce(struct mce *m) -- cgit v1.2.3 From b3b7c4795ccab5be71f080774c45bbbcc75c2aaf Mon Sep 17 00:00:00 2001 From: Seunghun Han Date: Tue, 6 Mar 2018 15:21:43 +0100 Subject: x86/MCE: Serialize sysfs changes The check_interval file in the /sys/devices/system/machinecheck/machinecheck directory is a global timer value for MCE polling. If it is changed by one CPU, mce_restart() broadcasts the event to other CPUs to delete and restart the MCE polling timer and __mcheck_cpu_init_timer() reinitializes the mce_timer variable. If more than one CPU writes a specific value to the check_interval file concurrently, mce_timer is not protected from such concurrent accesses and all kinds of explosions happen. Since only root can write to those sysfs variables, the issue is not a big deal security-wise. However, concurrent writes to these configuration variables are void of reason so the proper thing to do is to serialize the access with a mutex.
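[ Editor's note: the race being closed, in miniature; a sketch with illustrative names, not the actual mce code. Without the mutex, two sysfs writers can interleave the delete/re-init steps on the same timer. ]

        #include <linux/timer.h>
        #include <linux/mutex.h>
        #include <linux/jiffies.h>

        static DEFINE_MUTEX(mce_sysfs_mutex);
        static struct timer_list mce_timer;

        static void mce_timer_fn(struct timer_list *t) { /* poll MCE banks */ }

        static void mce_restart_sketch(unsigned long interval_secs)
        {
                mutex_lock(&mce_sysfs_mutex);   /* the fix: one writer at a time */
                del_timer_sync(&mce_timer);
                timer_setup(&mce_timer, mce_timer_fn, 0);
                mod_timer(&mce_timer, jiffies + interval_secs * HZ);
                mutex_unlock(&mce_sysfs_mutex);
        }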
Boris: - Make store_int_with_restart() use device_store_ulong() to filter out negative intervals - Limit min interval to 1 second - Correct locking - Massage commit message Signed-off-by: Seunghun Han Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Tony Luck Cc: linux-edac Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20180302202706.9434-1-kkamagui@gmail.com --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3323cab91398..466f47301334b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -56,6 +56,9 @@ static DEFINE_MUTEX(mce_log_mutex); +/* sysfs synchronization */ +static DEFINE_MUTEX(mce_sysfs_mutex); + #define CREATE_TRACE_POINTS #include @@ -2088,6 +2091,7 @@ static ssize_t set_ignore_ce(struct device *s, if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; + mutex_lock(&mce_sysfs_mutex); if (mca_cfg.ignore_ce ^ !!new) { if (new) { /* disable ce features */ @@ -2100,6 +2104,8 @@ static ssize_t set_ignore_ce(struct device *s, on_each_cpu(mce_enable_ce, (void *)1, 1); } } + mutex_unlock(&mce_sysfs_mutex); + return size; } @@ -2112,6 +2118,7 @@ static ssize_t set_cmci_disabled(struct device *s, if (kstrtou64(buf, 0, &new) < 0) return -EINVAL; + mutex_lock(&mce_sysfs_mutex); if (mca_cfg.cmci_disabled ^ !!new) { if (new) { /* disable cmci */ @@ -2123,6 +2130,8 @@ static ssize_t set_cmci_disabled(struct device *s, on_each_cpu(mce_enable_ce, NULL, 1); } } + mutex_unlock(&mce_sysfs_mutex); + return size; } @@ -2130,8 +2139,19 @@ static ssize_t store_int_with_restart(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = device_store_int(s, attr, buf, size); + unsigned long old_check_interval = check_interval; + ssize_t ret = device_store_ulong(s, attr, buf, size); + + if (check_interval == old_check_interval) + return ret; + + if (check_interval < 1) + check_interval = 1; + + mutex_lock(&mce_sysfs_mutex); mce_restart(); + mutex_unlock(&mce_sysfs_mutex); + return ret; } -- cgit v1.2.3 From c07a8f8b08ba683ea24f3ac9159f37ae94daf47f Mon Sep 17 00:00:00 2001 From: Francis Deslauriers Date: Thu, 8 Mar 2018 22:18:12 -0500 Subject: x86/kprobes: Fix kernel crash when probing .entry_trampoline code Disable the kprobe probing of the entry trampoline: .entry_trampoline is a code area that is used to ensure page table isolation between userspace and kernelspace. At the beginning of the execution of the trampoline, we load the kernel's CR3 register. This has the effect of enabling the translation of the kernel virtual addresses to physical addresses. Before this happens most kernel addresses can not be translated because the running process' CR3 is still used. If a kprobe is placed on the trampoline code before that change of the CR3 register happens the kernel crashes because int3 handling pages are not accessible. To fix this, add the .entry_trampoline section to the kprobe blacklist to prohibit the probing of code before all the kernel pages are accessible. Signed-off-by: Francis Deslauriers Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: mathieu.desnoyers@efficios.com Cc: mhiramat@kernel.org Link: http://lkml.kernel.org/r/1520565492-4637-2-git-send-email-francis.deslauriers@efficios.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sections.h | 1 + arch/x86/kernel/kprobes/core.c | 10 +++++++++- arch/x86/kernel/vmlinux.lds.S | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index d6baf23782bcc..5c019d23d06b1 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -10,6 +10,7 @@ extern struct exception_table_entry __stop___ex_table[]; #if defined(CONFIG_X86_64) extern char __end_rodata_hpage_align[]; +extern char __entry_trampoline_start[], __entry_trampoline_end[]; #endif #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index bd36f3c33cd0f..0715f827607c4 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1168,10 +1168,18 @@ NOKPROBE_SYMBOL(longjmp_break_handler); bool arch_within_kprobe_blacklist(unsigned long addr) { + bool is_in_entry_trampoline_section = false; + +#ifdef CONFIG_X86_64 + is_in_entry_trampoline_section = + (addr >= (unsigned long)__entry_trampoline_start && + addr < (unsigned long)__entry_trampoline_end); +#endif return (addr >= (unsigned long)__kprobes_text_start && addr < (unsigned long)__kprobes_text_end) || (addr >= (unsigned long)__entry_text_start && - addr < (unsigned long)__entry_text_end); + addr < (unsigned long)__entry_text_end) || + is_in_entry_trampoline_section; } int __init arch_init_kprobes(void) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9b138a06c1a46..b854ebf5851b7 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -118,9 +118,11 @@ SECTIONS #ifdef CONFIG_X86_64 . = ALIGN(PAGE_SIZE); + VMLINUX_SYMBOL(__entry_trampoline_start) = .; _entry_trampoline = .; *(.entry_trampoline) . = ALIGN(PAGE_SIZE); + VMLINUX_SYMBOL(__entry_trampoline_end) = .; ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big"); #endif -- cgit v1.2.3 From e21da1c992007594d391e7b301779cf30f438691 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Mar 2018 15:40:50 +0000 Subject: arm64: Relax ARM_SMCCC_ARCH_WORKAROUND_1 discovery A recent update to the ARM SMCCC ARCH_WORKAROUND_1 specification allows firmware to return a non zero, positive value to describe that although the mitigation is implemented at the higher exception level, the CPU on which the call is made is not affected. Let's relax the check on the return value from ARCH_WORKAROUND_1 so that we only error out if the returned value is negative. 
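[ Editor's note: res.a0 is an unsigned long in struct arm_smccc_res, so the relaxed check needs a signed cast; a sketch of the convention described above, with the helper name and comments as an informal reading rather than quoted kernel code. ]

        #include <linux/arm-smccc.h>
        #include <linux/types.h>

        static bool wa1_supported(struct arm_smccc_res *res)
        {
                int ret = (int)res->a0; /* a0 is unsigned long: cast to test sign */

                if (ret < 0)
                        return false;   /* not implemented: error out as before */
                /* ret > 0: implemented at the higher exception level, but this
                 * CPU is unaffected; ret == 0: implemented and required. The
                 * hardening callback is installed in both cases. */
                return true;
        }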
Fixes: b092201e0020 ("arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support") Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpu_errata.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 52f15cd896e11..b5a28336c0771 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -178,7 +178,7 @@ static int enable_smccc_arch_workaround_1(void *data) case PSCI_CONDUIT_HVC: arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if (res.a0) + if ((int)res.a0 < 0) return 0; cb = call_hvc_arch_workaround_1; smccc_start = __smccc_workaround_1_hvc_start; @@ -188,7 +188,7 @@ static int enable_smccc_arch_workaround_1(void *data) case PSCI_CONDUIT_SMC: arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if (res.a0) + if ((int)res.a0 < 0) return 0; cb = call_smc_arch_workaround_1; smccc_start = __smccc_workaround_1_smc_start; -- cgit v1.2.3 From 1da961d72ab0cfbe8b7c26cba731dc2bb6b9494b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 5 Mar 2018 19:25:49 +0300 Subject: x86/cpufeatures: Add Intel Total Memory Encryption cpufeature CPUID.0x7.0x0:ECX[13] indicates whether the CPU supports Intel Total Memory Encryption. Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Cc: Kai Huang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180305162610.37510-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index f41079da38c55..16898eb813f5c 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -316,6 +316,7 @@ #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */ #define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */ #define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */ +#define X86_FEATURE_TME (16*32+13) /* Intel Total Memory Encryption */ #define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */ #define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */ #define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */ -- cgit v1.2.3 From 7958b2246fadf54b7ff820a2a5a2c5ca1554716f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 5 Mar 2018 19:25:51 +0300 Subject: x86/cpufeatures: Add Intel PCONFIG cpufeature CPUID.0x7.0x0:EDX[18] indicates whether the Intel CPU supports the PCONFIG instruction. Signed-off-by: Kirill A.
Shutemov Cc: Dave Hansen Cc: Kai Huang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20180305162610.37510-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 16898eb813f5c..d554c11e01ff4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -329,6 +329,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ +#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ -- cgit v1.2.3 From a8b48a4dccea77e29462e59f1dbf0d5aa1ff167c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 7 Mar 2018 22:17:20 +1100 Subject: KVM: PPC: Book3S HV: Fix trap number return from __kvmppc_vcore_entry This fixes a bug where the trap number that is returned by __kvmppc_vcore_entry gets corrupted. The effect of the corruption is that IPIs get ignored on POWER9 systems when the IPI is sent via a doorbell interrupt to a CPU which is executing in a KVM guest. The effect of the IPI being ignored is often that another CPU locks up inside smp_call_function_many() (and if that CPU is holding a spinlock, other CPUs then lock up inside raw_spin_lock()). The trap number is currently held in register r12 for most of the assembly-language part of the guest exit path. In that path, we call kvmppc_subcore_exit_guest(), which is a C function, without restoring r12 afterwards. Depending on the kernel config and the compiler, it may modify r12 or it may not, so some config/compiler combinations see the bug and others don't. To fix this, we arrange for the trap number to be stored on the stack from the 'guest_bypass:' label until the end of the function, then the trap number is loaded and returned in r12 as before. 
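To make the clobbering hazard concrete, here is a conceptual C analogue (hypothetical code for illustration only: guest_exit_path() is an invented name, and the real fix is in book3s_hv_rmhandlers.S). The trap number lives in a caller-saved register, so it has to be spilled to memory the caller owns before the C call and reloaded afterwards:

	unsigned int guest_exit_path(unsigned int trap)
	{
		/* stw r12, STACK_SLOT_TRAP(r1): spill before calling C */
		unsigned int stack_slot_trap = trap;

		/*
		 * A C callee is free to reuse r12; whether it actually
		 * does depends on config and compiler, which is why only
		 * some builds saw the corruption.
		 */
		kvmppc_subcore_exit_guest();

		/* lwz r12, STACK_SLOT_TRAP(r1): reload the trap number */
		return stack_slot_trap;
	}
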
Cc: stable@vger.kernel.org # v4.8+ Fixes: fd7bacbca47a ("KVM: PPC: Book3S HV: Fix TB corruption in guest exit path on HMI interrupt") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f31f357b8c5ae..d33264697a31a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -320,7 +320,6 @@ kvm_novcpu_exit: stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_commence_exit nop - lwz r12, STACK_SLOT_TRAP(r1) b kvmhv_switch_to_host /* @@ -1220,6 +1219,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) secondary_too_late: li r12, 0 + stw r12, STACK_SLOT_TRAP(r1) cmpdi r4, 0 beq 11f stw r12, VCPU_TRAP(r4) @@ -1558,12 +1558,12 @@ mc_cont: 3: stw r5,VCPU_SLB_MAX(r9) guest_bypass: + stw r12, STACK_SLOT_TRAP(r1) mr r3, r12 /* Increment exit count, poke other threads to exit */ bl kvmhv_commence_exit nop ld r9, HSTATE_KVM_VCPU(r13) - lwz r12, VCPU_TRAP(r9) /* Stop others sending VCPU interrupts to this physical CPU */ li r0, -1 @@ -1898,6 +1898,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do * have to coordinate the hardware threads. + * Here STACK_SLOT_TRAP(r1) contains the trap number. */ kvmhv_switch_to_host: /* Secondary threads wait for primary to do partition switch */ @@ -1950,12 +1951,12 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* If HMI, call kvmppc_realmode_hmi_handler() */ + lwz r12, STACK_SLOT_TRAP(r1) cmpwi r12, BOOK3S_INTERRUPT_HMI bne 27f bl kvmppc_realmode_hmi_handler nop cmpdi r3, 0 - li r12, BOOK3S_INTERRUPT_HMI /* * At this point kvmppc_realmode_hmi_handler may have resync-ed * the TB, and if it has, we must not subtract the guest timebase @@ -2008,10 +2009,8 @@ BEGIN_FTR_SECTION lwz r8, KVM_SPLIT_DO_RESTORE(r3) cmpwi r8, 0 beq 47f - stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_p9_restore_lpcr nop - lwz r12, STACK_SLOT_TRAP(r1) b 48f 47: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) @@ -2049,6 +2048,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) + lwz r12, STACK_SLOT_TRAP(r1) /* return trap # in r12 */ ld r0, SFS+PPC_LR_STKOFF(r1) addi r1, r1, SFS mtlr r0 -- cgit v1.2.3 From b5069782453459f6ec1fdeb495d9901a4545fcb5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Mar 2018 22:03:12 -0700 Subject: x86/vm86/32: Fix POPF emulation POPF would trap if VIP was set regardless of whether IF was set. Fix it. Suggested-by: Stas Sergeev Reported-by: Bart Oldeman Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 5ed92a8ab71f ("x86/vm86: Use the normal pt_regs area for vm86") Link: http://lkml.kernel.org/r/ce95f40556e7b2178b6bc06ee9557827ff94bd28.1521003603.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/vm86_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5edb27f1a2c40..9d0b5af7db915 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -727,7 +727,8 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) return; check_vip: - if (VEFLAGS & X86_EFLAGS_VIP) { + if ((VEFLAGS & (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) == + (X86_EFLAGS_VIP | X86_EFLAGS_VIF)) { save_v86_state(regs, VM86_STI); return; } -- cgit v1.2.3 From a14bff131108faf50cc0cf864589fd71ee216c96 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 14 Mar 2018 11:24:27 +0000 Subject: x86/speculation, objtool: Annotate indirect calls/jumps for objtool on 32-bit kernels In the following commit: 9e0e3c5130e9 ("x86/speculation, objtool: Annotate indirect calls/jumps for objtool") ... we added annotations for CALL_NOSPEC/JMP_NOSPEC on 64-bit x86 kernels, but we did not annotate the 32-bit path. Annotate it similarly. Signed-off-by: Andy Whitcroft Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180314112427.22351-1-apw@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/nospec-branch.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index b7063cfa19f97..b3996d60f981e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -183,7 +183,10 @@ * otherwise we'll run out of registers. We don't care about CET * here, anyway. */ -# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ " jmp 904f;\n" \ " .align 16\n" \ "901: call 903f;\n" \ -- cgit v1.2.3 From e21a4f3a930cda6e4902cb5b3213365e5ff3ce7c Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 27 Feb 2018 12:33:50 +0100 Subject: KVM: arm/arm64: Avoid vcpu_load for other vcpu ioctls than KVM_RUN Calling vcpu_load() registers preempt notifiers for this vcpu and calls kvm_arch_vcpu_load(). The latter will soon be doing a lot of heavy lifting on arm/arm64 and will try to do things such as enabling the virtual timer and setting us up to handle interrupts from the timer hardware. Loading state onto hardware registers and enabling hardware to signal interrupts can be problematic when we're not actually about to run the VCPU, because it makes it difficult to establish the right context when handling interrupts from the timer, and it makes the register access code difficult to reason about. Luckily, now when we call vcpu_load in each ioctl implementation, we can simply remove the call from the non-KVM_RUN vcpu ioctls, and our kvm_arch_vcpu_load() is only used for loading vcpu content to the physical CPU when we're actually going to run the vcpu. 
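For illustration, this is the shape a non-KVM_RUN ioctl takes after the change (lifted from the get_mpstate hunk in the patch below): it touches only in-memory vcpu state, so nothing has to be loaded onto, or torn down from, the physical CPU:

	int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
					    struct kvm_mp_state *mp_state)
	{
		/* no vcpu_load(): no preempt notifiers, no timer setup */
		if (vcpu->arch.power_off)
			mp_state->mp_state = KVM_MP_STATE_STOPPED;
		else
			mp_state->mp_state = KVM_MP_STATE_RUNNABLE;

		return 0;	/* and no matching vcpu_put() */
	}
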
Cc: stable@vger.kernel.org Fixes: 9b062471e52a ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl") Reviewed-by: Julien Grall Reviewed-by: Marc Zyngier Reviewed-by: Andrew Jones Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 3 --- virt/kvm/arm/arm.c | 9 --------- 2 files changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index d7e3299a77346..959e50d2588c0 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -363,8 +363,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { int ret = 0; - vcpu_load(vcpu); - trace_kvm_set_guest_debug(vcpu, dbg->control); if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { @@ -386,7 +384,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, } out: - vcpu_put(vcpu); return ret; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 86941f6181bb0..53572304843b2 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -384,14 +384,11 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { - vcpu_load(vcpu); - if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; - vcpu_put(vcpu); return 0; } @@ -400,8 +397,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int ret = 0; - vcpu_load(vcpu); - switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: vcpu->arch.power_off = false; @@ -413,7 +408,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, ret = -EINVAL; } - vcpu_put(vcpu); return ret; } @@ -1036,8 +1030,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_device_attr attr; long r; - vcpu_load(vcpu); - switch (ioctl) { case KVM_ARM_VCPU_INIT: { struct kvm_vcpu_init init; @@ -1114,7 +1106,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } - vcpu_put(vcpu); return r; } -- cgit v1.2.3 From 18a955219bf7d9008ce480d4451b6b8bf4483a22 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 13 Mar 2018 11:03:46 -0600 Subject: x86/mm: Fix vmalloc_fault to use pXd_large Gratian Crisan reported that vmalloc_fault() crashes when CONFIG_HUGETLBFS is not set since the function inadvertently uses pXd_huge(), which always returns 0 in this case. ioremap() does not depend on CONFIG_HUGETLBFS. Fix vmalloc_fault() to call pXd_large() instead.
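A sketch of the corrected check, following the 32-bit hunk of the patch below: pmd_large() always reports large kernel mappings, whereas pmd_huge() compiles to a constant 0 when CONFIG_HUGETLBFS=n and therefore silently skipped them:

	pmd_k = pmd_offset(pud_k, address);
	if (!pmd_k)
		return -1;

	/* a large kernel mapping: the fault is resolved at PMD level */
	if (pmd_large(*pmd_k))
		return 0;

	pte_k = pte_offset_kernel(pmd_k, address);
	if (!pte_k)
		return -1;
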
Fixes: f4eafd8bcd52 ("x86/mm: Fix vmalloc_fault() to handle large pages properly") Reported-by: Gratian Crisan Signed-off-by: Toshi Kani Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: linux-mm@kvack.org Cc: Borislav Petkov Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20180313170347.3829-2-toshi.kani@hpe.com --- arch/x86/mm/fault.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c88573d90f3e9..25a30b5d6582f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -330,7 +330,7 @@ static noinline int vmalloc_fault(unsigned long address) if (!pmd_k) return -1; - if (pmd_huge(*pmd_k)) + if (pmd_large(*pmd_k)) return 0; pte_k = pte_offset_kernel(pmd_k, address); @@ -475,7 +475,7 @@ static noinline int vmalloc_fault(unsigned long address) if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref)) BUG(); - if (pud_huge(*pud)) + if (pud_large(*pud)) return 0; pmd = pmd_offset(pud, address); @@ -486,7 +486,7 @@ static noinline int vmalloc_fault(unsigned long address) if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref)) BUG(); - if (pmd_huge(*pmd)) + if (pmd_large(*pmd)) return 0; pte_ref = pte_offset_kernel(pmd_ref, address); -- cgit v1.2.3 From a7e6c7015bf3e0cb467a2f6c0e1de985ee1a0ecb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Mar 2018 21:36:22 -0700 Subject: x86, memremap: fix altmap accounting at free Commit 24b6d4164348 "mm: pass the vmem_altmap to vmemmap_free" converted the vmemmap_free() path to pass the altmap argument all the way through the call chain rather than looking it up based on the page. Unfortunately that ends up over freeing altmap allocated pages in some cases since free_pagetable() is used to free both memmap space and pte space, where only the memmap stored in huge pages uses altmap allocations. Given that altmap allocations for memmap space are special cased in vmemmap_populate_hugepages() add a symmetric / special case free_hugepage_table() to handle altmap freeing, and cleanup the unneeded passing of altmap to leaf functions that do not require it. Without this change the sanity check accounting in devm_memremap_pages_release() will throw a warning with the following signature. nd_pmem pfn10.1: devm_memremap_pages_release: failed to free all reserved pages WARNING: CPU: 44 PID: 3539 at kernel/memremap.c:310 devm_memremap_pages_release+0x1c7/0x220 CPU: 44 PID: 3539 Comm: ndctl Tainted: G L 4.16.0-rc1-linux-stable #7 RIP: 0010:devm_memremap_pages_release+0x1c7/0x220 [..] Call Trace: release_nodes+0x225/0x270 device_release_driver_internal+0x15d/0x210 bus_remove_device+0xe2/0x160 device_del+0x130/0x310 ? klist_release+0x56/0x100 ? nd_region_notify+0xc0/0xc0 [libnvdimm] device_unregister+0x16/0x60 This was missed in testing since not all configurations will trigger this warning. 
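The heart of the fix, sketched from the patch below: only the memmap stored in huge pages can come from an altmap, so only that free path consults it, while page-table pages always take the regular route:

	static void __meminit free_hugepage_table(struct page *page,
						  struct vmem_altmap *altmap)
	{
		if (altmap)
			/* return pages to the device-backed reservation */
			vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE);
		else
			free_pagetable(page, get_order(PMD_SIZE));
	}

With this helper in place, free_pagetable() can drop its altmap argument entirely, which is what keeps the devm_memremap_pages_release() accounting balanced.
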
Fixes: 24b6d4164348 ("mm: pass the vmem_altmap to vmemmap_free") Reported-by: Jane Chu Cc: Ross Zwisler Reviewed-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/x86/mm/init_64.c | 60 ++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8b72923f1d35c..af11a28902355 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -800,17 +800,11 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, #define PAGE_INUSE 0xFD -static void __meminit free_pagetable(struct page *page, int order, - struct vmem_altmap *altmap) +static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; - if (altmap) { - vmem_altmap_free(altmap, nr_pages); - return; - } - /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); @@ -826,8 +820,16 @@ static void __meminit free_pagetable(struct page *page, int order, free_pages((unsigned long)page_address(page), order); } -static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, +static void __meminit free_hugepage_table(struct page *page, struct vmem_altmap *altmap) +{ + if (altmap) + vmem_altmap_free(altmap, PMD_SIZE / PAGE_SIZE); + else + free_pagetable(page, get_order(PMD_SIZE)); +} + +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) { pte_t *pte; int i; @@ -839,14 +841,13 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd, } /* free a pte talbe */ - free_pagetable(pmd_page(*pmd), 0, altmap); + free_pagetable(pmd_page(*pmd), 0); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); spin_unlock(&init_mm.page_table_lock); } -static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, - struct vmem_altmap *altmap) +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) { pmd_t *pmd; int i; @@ -858,14 +859,13 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, } /* free a pmd talbe */ - free_pagetable(pud_page(*pud), 0, altmap); + free_pagetable(pud_page(*pud), 0); spin_lock(&init_mm.page_table_lock); pud_clear(pud); spin_unlock(&init_mm.page_table_lock); } -static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, - struct vmem_altmap *altmap) +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) { pud_t *pud; int i; @@ -877,7 +877,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, } /* free a pud talbe */ - free_pagetable(p4d_page(*p4d), 0, altmap); + free_pagetable(p4d_page(*p4d), 0); spin_lock(&init_mm.page_table_lock); p4d_clear(p4d); spin_unlock(&init_mm.page_table_lock); @@ -885,7 +885,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d, static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, - struct vmem_altmap *altmap, bool direct) + bool direct) { unsigned long next, pages = 0; pte_t *pte; @@ -916,7 +916,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, * freed when offlining, or simplely not in use. 
*/ if (!direct) - free_pagetable(pte_page(*pte), 0, altmap); + free_pagetable(pte_page(*pte), 0); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); @@ -939,7 +939,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, page_addr = page_address(pte_page(*pte)); if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) { - free_pagetable(pte_page(*pte), 0, altmap); + free_pagetable(pte_page(*pte), 0); spin_lock(&init_mm.page_table_lock); pte_clear(&init_mm, addr, pte); @@ -974,9 +974,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) { if (!direct) - free_pagetable(pmd_page(*pmd), - get_order(PMD_SIZE), - altmap); + free_hugepage_table(pmd_page(*pmd), + altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); @@ -989,9 +988,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, page_addr = page_address(pmd_page(*pmd)); if (!memchr_inv(page_addr, PAGE_INUSE, PMD_SIZE)) { - free_pagetable(pmd_page(*pmd), - get_order(PMD_SIZE), - altmap); + free_hugepage_table(pmd_page(*pmd), + altmap); spin_lock(&init_mm.page_table_lock); pmd_clear(pmd); @@ -1003,8 +1001,8 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end, } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next, altmap, direct); - free_pte_table(pte_base, pmd, altmap); + remove_pte_table(pte_base, addr, next, direct); + free_pte_table(pte_base, pmd); } /* Call free_pmd_table() in remove_pud_table(). */ @@ -1033,8 +1031,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, IS_ALIGNED(next, PUD_SIZE)) { if (!direct) free_pagetable(pud_page(*pud), - get_order(PUD_SIZE), - altmap); + get_order(PUD_SIZE)); spin_lock(&init_mm.page_table_lock); pud_clear(pud); @@ -1048,8 +1045,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!memchr_inv(page_addr, PAGE_INUSE, PUD_SIZE)) { free_pagetable(pud_page(*pud), - get_order(PUD_SIZE), - altmap); + get_order(PUD_SIZE)); spin_lock(&init_mm.page_table_lock); pud_clear(pud); @@ -1062,7 +1058,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, pmd_base = pmd_offset(pud, 0); remove_pmd_table(pmd_base, addr, next, direct, altmap); - free_pmd_table(pmd_base, pud, altmap); + free_pmd_table(pmd_base, pud); } if (direct) @@ -1094,7 +1090,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, * to adapt for boot-time switching between 4 and 5 level page tables. */ if (CONFIG_PGTABLE_LEVELS == 5) - free_pud_table(pud_base, p4d, altmap); + free_pud_table(pud_base, p4d); } if (direct) -- cgit v1.2.3 From cfb61b5e3e09f8b49bc4d685429df75f45127adc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 15 Mar 2018 14:18:00 -0700 Subject: sparc64: Fix regression in pmdp_invalidate(). pmdp_invalidate() was changed to update the pmd atomically (to not lose dirty/access bits) and return the original pmd value. However, in doing so, we lost a lot of the essential work that set_pmd_at() does, namely to update hugepage mapping counts and queuing up the batched TLB flush entry. Thus we were not flushing entries out of the TLB when making such PMD changes. Fix this by abstracting the accounting work of set_pmd_at() out into a separate function, and call it from pmdp_establish(). Fixes: a8e654f01cb7 ("sparc64: update pmdp_invalidate() to return old pmd value") Signed-off-by: David S. 
Miller --- arch/sparc/mm/tlb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 847ddffbf38ad..b5cfab7116514 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -163,13 +163,10 @@ static void tlb_batch_pmd_scan(struct mm_struct *mm, unsigned long vaddr, pte_unmap(pte); } -void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ - pmd_t orig = *pmdp; - - *pmdp = pmd; +static void __set_pmd_acct(struct mm_struct *mm, unsigned long addr, + pmd_t orig, pmd_t pmd) +{ if (mm == &init_mm) return; @@ -219,6 +216,15 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } +void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t orig = *pmdp; + + *pmdp = pmd; + __set_pmd_acct(mm, addr, orig, pmd); +} + static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { @@ -227,6 +233,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, do { old = *pmdp; } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + __set_pmd_acct(vma->vm_mm, address, old, pmd); return old; } -- cgit v1.2.3 From e3b3121fa8da94cb20f9e0c64ab7981ae47fd085 Mon Sep 17 00:00:00 2001 From: Alexander Sergeyev Date: Tue, 13 Mar 2018 22:38:56 +0300 Subject: x86/speculation: Remove Skylake C2 from Speculation Control microcode blacklist In accordance with Intel's microcode revision guidance from March 6, MCU rev 0xc2 is cleared on both Skylake H/S and Skylake Xeon E3 processors that share CPUID 506E3. Signed-off-by: Alexander Sergeyev Signed-off-by: Thomas Gleixner Cc: Jia Zhang Cc: Greg Kroah-Hartman Cc: Kyle Huey Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180313193856.GA8580@localhost.localdomain --- arch/x86/kernel/cpu/intel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 4aa9fd3793905..c3af167d0a70c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -105,7 +105,7 @@ static void probe_xeon_phi_r3mwait(struct cpuinfo_x86 *c) /* * Early microcode releases for the Spectre v2 mitigation were broken. * Information taken from; - * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/01/microcode-update-guidance.pdf + * - https://newsroom.intel.com/wp-content/uploads/sites/11/2018/03/microcode-update-guidance.pdf * - https://kb.vmware.com/s/article/52345 * - Microcode revisions observed in the wild * - Release note from 20180108 microcode release @@ -123,7 +123,6 @@ static const struct sku_microcode spectre_bad_microcodes[] = { { INTEL_FAM6_KABYLAKE_MOBILE, 0x09, 0x80 }, { INTEL_FAM6_SKYLAKE_X, 0x03, 0x0100013e }, { INTEL_FAM6_SKYLAKE_X, 0x04, 0x0200003c }, - { INTEL_FAM6_SKYLAKE_DESKTOP, 0x03, 0xc2 }, { INTEL_FAM6_BROADWELL_CORE, 0x04, 0x28 }, { INTEL_FAM6_BROADWELL_GT3E, 0x01, 0x1b }, { INTEL_FAM6_BROADWELL_XEON_D, 0x02, 0x14 }, -- cgit v1.2.3 From 18ffc0cce4ff947a2acc9b2e06ae5309a6e6fb43 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 22 Feb 2018 15:19:37 +0100 Subject: microblaze: Setup dependencies for ASM optimized lib functions The patch: "microblaze: Setup proper dependency for optimized lib functions" (sha1: 7b6ce52be3f86520524711a6f33f3866f9339694) didn't set up all dependencies properly.
Optimized lib functions in C are also present for little endian and optimized library functions in assembler are implemented only for big endian version. Reported-by: kbuild test robot Signed-off-by: Michal Simek --- arch/microblaze/Kconfig.platform | 2 +- arch/microblaze/lib/fastcopy.S | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/microblaze/Kconfig.platform b/arch/microblaze/Kconfig.platform index 6996f397c16c1..f7f1739c11b9f 100644 --- a/arch/microblaze/Kconfig.platform +++ b/arch/microblaze/Kconfig.platform @@ -8,7 +8,6 @@ menu "Platform options" config OPT_LIB_FUNCTION bool "Optimalized lib function" - depends on CPU_LITTLE_ENDIAN default y help Allows turn on optimalized library function (memcpy and memmove). @@ -21,6 +20,7 @@ config OPT_LIB_FUNCTION config OPT_LIB_ASM bool "Optimalized lib function ASM" depends on OPT_LIB_FUNCTION && (XILINX_MICROBLAZE0_USE_BARREL = 1) + depends on CPU_BIG_ENDIAN default n help Allows turn on optimalized library function (memcpy and memmove). diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S index 62021d7e249e0..fdc48bb065d89 100644 --- a/arch/microblaze/lib/fastcopy.S +++ b/arch/microblaze/lib/fastcopy.S @@ -29,10 +29,6 @@ * between mem locations with size of xfer spec'd in bytes */ -#ifdef __MICROBLAZEEL__ -#error Microblaze LE not support ASM optimized lib func. Disable OPT_LIB_ASM. -#endif - #include .text .globl memcpy -- cgit v1.2.3 From cd4dfee6a8bfbbe404e9905aff85e267ec99f5fa Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 9 Mar 2018 09:52:55 -0600 Subject: microblaze: remove unused alloc_maybe_bootmem alloc_maybe_bootmem is unused, so remove it. Signed-off-by: Rob Herring Signed-off-by: Michal Simek --- arch/microblaze/include/asm/setup.h | 1 - arch/microblaze/mm/init.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/microblaze/include/asm/setup.h b/arch/microblaze/include/asm/setup.h index be84a4d3917fc..7c968c1d1729e 100644 --- a/arch/microblaze/include/asm/setup.h +++ b/arch/microblaze/include/asm/setup.h @@ -44,7 +44,6 @@ void machine_shutdown(void); void machine_halt(void); void machine_power_off(void); -extern void *alloc_maybe_bootmem(size_t size, gfp_t mask); extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask); # endif /* __ASSEMBLY__ */ diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 434639f9a3a6b..5bc9c7fbb541e 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -414,14 +414,6 @@ void __init *early_get_page(void) #endif /* CONFIG_MMU */ -void * __ref alloc_maybe_bootmem(size_t size, gfp_t mask) -{ - if (mem_init_done) - return kmalloc(size, mask); - else - return alloc_bootmem(size); -} - void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) { void *p; -- cgit v1.2.3 From 101646a24a2f9cdb61d7732459fbf068a7bbb542 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 9 Mar 2018 09:54:07 -0600 Subject: microblaze: switch to NO_BOOTMEM Microblaze doesn't set CONFIG_NO_BOOTMEM and so memblock_virt_alloc() doesn't work for CONFIG_HAVE_MEMBLOCK && !CONFIG_NO_BOOTMEM. 
Similar change was already done by others architectures "ARM: mm: Remove bootmem code and switch to NO_BOOTMEM" (sha1: 84f452b1e8fc73ac0e31254c66e3e2260ce5263d) or "openrisc: Consolidate setup to use memblock instead of bootmem" (sha1: 266c7fad157265bb54d17db1c9545f2aaa488643) or "parisc: Drop bootmem and switch to memblock" (sha1: 4fe9e1d957e45ad8eba9885ee860a0e93d13a7c7) or "powerpc: Remove bootmem allocator" (sha1: 10239733ee8617bac3f1c1769af43a88ed979324) or "s390/mm: Convert bootmem to memblock" (sha1: 50be634507284eea38df78154d22615d21200b42) or "sparc64: Convert over to NO_BOOTMEM." (sha1: 625d693e9784f988371e69c2b41a2172c0be6c11) or "xtensa: drop sysmem and switch to memblock" (sha1: 0e46c1115f5816949220d62dd3ff04aa68e7ac6b) Issue was introduced by: "of/fdt: use memblock_virt_alloc for early alloc" (sha1: 0fa1c579349fdd90173381712ad78aa99c09d38b) Signed-off-by: Rob Herring Tested-by: Alvaro Gamez Machado Tested-by: Michal Simek Signed-off-by: Michal Simek --- arch/microblaze/Kconfig | 1 + arch/microblaze/mm/init.c | 56 +++++------------------------------------------ 2 files changed, 7 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 4f798aa671ddd..3817a3e2146cf 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -24,6 +24,7 @@ config MICROBLAZE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER + select NO_BOOTMEM select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 5bc9c7fbb541e..df6de7ccdc2eb 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -32,9 +32,6 @@ int mem_init_done; #ifndef CONFIG_MMU unsigned int __page_offset; EXPORT_SYMBOL(__page_offset); - -#else -static int init_bootmem_done; #endif /* CONFIG_MMU */ char *klimit = _end; @@ -117,7 +114,6 @@ static void __init paging_init(void) void __init setup_memory(void) { - unsigned long map_size; struct memblock_region *reg; #ifndef CONFIG_MMU @@ -174,17 +170,6 @@ void __init setup_memory(void) pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn); pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn); - /* - * Find an area to use for the bootmem bitmap. - * We look for the first area which is at least - * 128kB in length (128kB is enough for a bitmap - * for 4GB of memory, using 4kB pages), plus 1 page - * (in case the address isn't page-aligned). 
- */ - map_size = init_bootmem_node(NODE_DATA(0), - PFN_UP(TOPHYS((u32)klimit)), min_low_pfn, max_low_pfn); - memblock_reserve(PFN_UP(TOPHYS((u32)klimit)) << PAGE_SHIFT, map_size); - /* Add active regions with valid PFNs */ for_each_memblock(memory, reg) { unsigned long start_pfn, end_pfn; @@ -196,32 +181,9 @@ void __init setup_memory(void) &memblock.memory, 0); } - /* free bootmem is whole main memory */ - free_bootmem_with_active_regions(0, max_low_pfn); - - /* reserve allocate blocks */ - for_each_memblock(reserved, reg) { - unsigned long top = reg->base + reg->size - 1; - - pr_debug("reserved - 0x%08x-0x%08x, %lx, %lx\n", - (u32) reg->base, (u32) reg->size, top, - memory_start + lowmem_size - 1); - - if (top <= (memory_start + lowmem_size - 1)) { - reserve_bootmem(reg->base, reg->size, BOOTMEM_DEFAULT); - } else if (reg->base < (memory_start + lowmem_size - 1)) { - unsigned long trunc_size = memory_start + lowmem_size - - reg->base; - reserve_bootmem(reg->base, trunc_size, BOOTMEM_DEFAULT); - } - } - /* XXX need to clip this if using highmem? */ sparse_memory_present_with_active_regions(0); -#ifdef CONFIG_MMU - init_bootmem_done = 1; -#endif paging_init(); } @@ -398,18 +360,12 @@ asmlinkage void __init mmu_init(void) /* This is only called until mem_init is done. */ void __init *early_get_page(void) { - void *p; - if (init_bootmem_done) { - p = alloc_bootmem_pages(PAGE_SIZE); - } else { - /* - * Mem start + kernel_tlb -> here is limit - * because of mem mapping from head.S - */ - p = __va(memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, - memory_start + kernel_tlb)); - } - return p; + /* + * Mem start + kernel_tlb -> here is limit + * because of mem mapping from head.S + */ + return __va(memblock_alloc_base(PAGE_SIZE, PAGE_SIZE, + memory_start + kernel_tlb)); } #endif /* CONFIG_MMU */ -- cgit v1.2.3 From daaf216c06fba4ee4dc3f62715667da929d68774 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 8 Mar 2018 17:17:31 -0600 Subject: KVM: x86: Fix device passthrough when SME is active When using device passthrough with SME active, the MMIO range that is mapped for the device should not be mapped encrypted. Add a check in set_spte() to insure that a page is not mapped encrypted if that page is a device MMIO page as indicated by kvm_is_mmio_pfn(). Cc: # 4.14.x- Signed-off-by: Tom Lendacky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f551962ac2948..763bb3bade63f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2770,8 +2770,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, else pte_access &= ~ACC_WRITE_MASK; + if (!kvm_is_mmio_pfn(pfn)) + spte |= shadow_me_mask; + spte |= (u64)pfn << PAGE_SHIFT; - spte |= shadow_me_mask; if (pte_access & ACC_WRITE_MASK) { -- cgit v1.2.3 From 2613f36ed965d0e5a595a1d931fd3b480e82d6fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 14 Mar 2018 19:36:14 +0100 Subject: x86/microcode: Attempt late loading only when new microcode is present Return UCODE_NEW from the scanning functions to denote that new microcode was found and only then attempt the expensive synchronization dance. 
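The gist, sketched from the reload_store() hunk of the patch below: the sysfs reload path now bails out before the expensive stop-machine rendezvous unless the scan actually found microcode newer than what the CPUs are running:

	tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
	if (tmp_ret != UCODE_NEW)
		return size;	/* nothing newer on disk: skip the dance */
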
Reported-by: Emanuel Czirai Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Emanuel Czirai Tested-by: Ashok Raj Tested-by: Tom Lendacky Link: https://lkml.kernel.org/r/20180314183615.17629-1-bp@alien8.de --- arch/x86/include/asm/microcode.h | 1 + arch/x86/kernel/cpu/microcode/amd.c | 34 +++++++++++++++++++++------------- arch/x86/kernel/cpu/microcode/core.c | 8 +++----- arch/x86/kernel/cpu/microcode/intel.c | 4 +++- 4 files changed, 28 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 7fb1047d61c7b..6cf0e4cb7b976 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -39,6 +39,7 @@ struct device; enum ucode_state { UCODE_OK = 0, + UCODE_NEW, UCODE_UPDATED, UCODE_NFOUND, UCODE_ERROR, diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a998e1a7d46fd..48179928ff38c 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -339,7 +339,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) return -EINVAL; ret = load_microcode_amd(true, x86_family(cpuid_1_eax), desc.data, desc.size); - if (ret != UCODE_OK) + if (ret > UCODE_UPDATED) return -EINVAL; return 0; @@ -683,27 +683,35 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, static enum ucode_state load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { + struct ucode_patch *p; enum ucode_state ret; /* free old equiv table */ free_equiv_cpu_table(); ret = __load_microcode_amd(family, data, size); - - if (ret != UCODE_OK) + if (ret != UCODE_OK) { cleanup(); + return ret; + } -#ifdef CONFIG_X86_32 - /* save BSP's matching patch for early load */ - if (save) { - struct ucode_patch *p = find_patch(0); - if (p) { - memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); - memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), - PATCH_MAX_SIZE)); - } + p = find_patch(0); + if (!p) { + return ret; + } else { + if (boot_cpu_data.microcode == p->patch_id) + return ret; + + ret = UCODE_NEW; } -#endif + + /* save BSP's matching patch for early load */ + if (!save) + return ret; + + memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); + memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE)); + return ret; } diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 70ecbc8099c94..9f0fe5bb450de 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -607,7 +607,7 @@ static ssize_t reload_store(struct device *dev, return size; tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true); - if (tmp_ret != UCODE_OK) + if (tmp_ret != UCODE_NEW) return size; get_online_cpus(); @@ -691,10 +691,8 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) if (system_state != SYSTEM_RUNNING) return UCODE_NFOUND; - ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, - refresh_fw); - - if (ustate == UCODE_OK) { + ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, refresh_fw); + if (ustate == UCODE_NEW) { pr_debug("CPU%d updated upon init\n", cpu); apply_microcode_on_target(cpu); } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2aded9db1d425..32b8e5724f966 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -862,6 +862,7 @@ static enum ucode_state
generic_load_microcode(int cpu, void *data, size_t size, unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; unsigned int csig, cpf; + enum ucode_state ret = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -903,6 +904,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, new_mc = mc; new_mc_size = mc_size; mc = NULL; /* trigger new vmalloc */ + ret = UCODE_NEW; } ucode_ptr += mc_size; @@ -932,7 +934,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - return UCODE_OK; + return ret; } static int get_ucode_fw(void *to, const void *from, size_t n) -- cgit v1.2.3 From bb8c13d61a629276a162c1d2b1a20a815cbcfbb7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 14 Mar 2018 19:36:15 +0100 Subject: x86/microcode: Fix CPU synchronization routine Emanuel reported an issue with a hang during microcode update because my dumb idea to use one atomic synchronization variable for both rendezvous - before and after update - was simply bollocks: microcode: microcode_reload_late: late_cpus: 4 microcode: __reload_late: cpu 2 entered microcode: __reload_late: cpu 1 entered microcode: __reload_late: cpu 3 entered microcode: __reload_late: cpu 0 entered microcode: __reload_late: cpu 1 left microcode: Timeout while waiting for CPUs rendezvous, remaining: 1 CPU1 above would finish, leave and the others will still spin waiting for it to join. So do two synchronization atomics instead, which makes the code a lot more straightforward. Also, since the update is serialized and it also takes quite some time per microcode engine, increase the exit timeout by the number of CPUs on the system. That's ok because the moment all CPUs are done, that timeout will be cut short. Furthermore, panic when some of the CPUs timeout when returning from a microcode update: we can't allow a system with not all cores updated. Also, as an optimization, do not do the exit sync if microcode wasn't updated. 
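Sketched from __reload_late() in the patch below, the resulting structure is two separate rendezvous points around the serialized update:

	/* entry rendezvous: load only if all CPUs show up */
	if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC))
		return -1;

	spin_lock(&update_lock);
	apply_microcode_local(&err);
	spin_unlock(&update_lock);

	/*
	 * exit rendezvous: updates are serialized, so scale the timeout
	 * by the CPU count; the last CPU to finish cuts it short anyway.
	 */
	if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus()))
		panic("Timeout during microcode update!\n");
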
Reported-by: Emanuel Czirai Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Emanuel Czirai Tested-by: Ashok Raj Tested-by: Tom Lendacky Link: https://lkml.kernel.org/r/20180314183615.17629-2-bp@alien8.de --- arch/x86/kernel/cpu/microcode/core.c | 68 ++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 9f0fe5bb450de..10c4fc2c91f8e 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -517,7 +517,29 @@ static int check_online_cpus(void) return -EINVAL; } -static atomic_t late_cpus; +static atomic_t late_cpus_in; +static atomic_t late_cpus_out; + +static int __wait_for_cpus(atomic_t *t, long long timeout) +{ + int all_cpus = num_online_cpus(); + + atomic_inc(t); + + while (atomic_read(t) < all_cpus) { + if (timeout < SPINUNIT) { + pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", + all_cpus - atomic_read(t)); + return 1; + } + + ndelay(SPINUNIT); + timeout -= SPINUNIT; + + touch_nmi_watchdog(); + } + return 0; +} /* * Returns: @@ -527,30 +549,16 @@ static atomic_t late_cpus; */ static int __reload_late(void *info) { - unsigned int timeout = NSEC_PER_SEC; - int all_cpus = num_online_cpus(); int cpu = smp_processor_id(); enum ucode_state err; int ret = 0; - atomic_dec(&late_cpus); - /* * Wait for all CPUs to arrive. A load will not be attempted unless all * CPUs show up. * */ - while (atomic_read(&late_cpus)) { - if (timeout < SPINUNIT) { - pr_err("Timeout while waiting for CPUs rendezvous, remaining: %d\n", - atomic_read(&late_cpus)); - return -1; - } - - ndelay(SPINUNIT); - timeout -= SPINUNIT; - - touch_nmi_watchdog(); - } + if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) + return -1; spin_lock(&update_lock); apply_microcode_local(&err); @@ -558,15 +566,22 @@ static int __reload_late(void *info) if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); - ret = -1; - } else if (err == UCODE_UPDATED) { + return -1; + /* siblings return UCODE_OK because their engine got updated already */ + } else if (err == UCODE_UPDATED || err == UCODE_OK) { ret = 1; + } else { + return ret; } - atomic_inc(&late_cpus); - - while (atomic_read(&late_cpus) != all_cpus) - cpu_relax(); + /* + * Increase the wait timeout to a safe value here since we're + * serializing the microcode update and that could take a while on a + * large number of CPUs. And that is fine as the *actual* timeout will + * be determined by the last CPU finished updating and thus cut short. 
+ */ + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) + panic("Timeout during microcode update!\n"); return ret; } @@ -579,12 +594,11 @@ static int microcode_reload_late(void) { int ret; - atomic_set(&late_cpus, num_online_cpus()); + atomic_set(&late_cpus_in, 0); + atomic_set(&late_cpus_out, 0); ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret < 0) - return ret; - else if (ret > 0) + if (ret > 0) microcode_check(); return ret; -- cgit v1.2.3 From 9ef0f88fe5466c2ca1d2975549ba6be502c464c1 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Wed, 7 Mar 2018 08:18:05 -0500 Subject: parisc: Handle case where flush_cache_range is called with no context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just when I had decided that flush_cache_range() was always called with a valid context, Helge reported two cases where the "BUG_ON(!vma->vm_mm->context);" was hit on the phantom buildd: kernel BUG at /mnt/sdb6/linux/linux-4.15.4/arch/parisc/kernel/cache.c:587! CPU: 1 PID: 3254 Comm: kworker/1:2 Tainted: G D 4.15.0-1-parisc64-smp #1 Debian 4.15.4-1+b1 Workqueue: events free_ioctx   IAOQ[0]: flush_cache_range+0x164/0x168   IAOQ[1]: flush_cache_page+0x0/0x1c8   RP(r2): unmap_page_range+0xae8/0xb88 Backtrace:   [<00000000404a6980>] unmap_page_range+0xae8/0xb88   [<00000000404a6ae0>] unmap_single_vma+0xc0/0x188   [<00000000404a6cdc>] zap_page_range_single+0x134/0x1f8   [<00000000404a702c>] unmap_mapping_range+0x1cc/0x208   [<0000000040461518>] truncate_pagecache+0x98/0x108   [<0000000040461624>] truncate_setsize+0x9c/0xb8   [<00000000405d7f30>] put_aio_ring_file+0x80/0x100   [<00000000405d803c>] aio_free_ring+0x8c/0x290   [<00000000405d82c0>] free_ioctx+0x80/0x180   [<0000000040284e6c>] process_one_work+0x21c/0x668   [<00000000402854c4>] worker_thread+0x20c/0x778   [<0000000040291d44>] kthread+0x2d4/0x2e0   [<0000000040204020>] end_fault_vector+0x20/0xc0 This indicates that we need to handle the no context case in flush_cache_range() as we do in flush_cache_mm(). In thinking about this, I realized that we don't need to flush the TLB when there is no context. So, I added context checks to the large flush cases in flush_cache_mm() and flush_cache_range(). The large flush case occurs frequently in flush_cache_mm() and the change should improve fork performance. The v2 version of this change removes the BUG_ON from flush_cache_page() by skipping the TLB flush when there is no context.  I also added code to flush the TLB in flush_cache_mm() and flush_cache_range() when we have a context that's not current.  Now all three routines handle TLB flushes in a similar manner. Signed-off-by: John David Anglin Cc: stable@vger.kernel.org # 4.9+ Signed-off-by: Helge Deller --- arch/parisc/kernel/cache.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 79089778725b3..e3b45546d589b 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -543,7 +543,8 @@ void flush_cache_mm(struct mm_struct *mm) rp3440, etc. So, avoid it if the mm isn't too big. 
*/ if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && mm_total_size(mm) >= parisc_cache_flush_threshold) { - flush_tlb_all(); + if (mm->context) + flush_tlb_all(); flush_cache_all(); return; } @@ -571,6 +572,8 @@ void flush_cache_mm(struct mm_struct *mm) pfn = pte_pfn(*ptep); if (!pfn_valid(pfn)) continue; + if (unlikely(mm->context)) + flush_tlb_page(vma, addr); __flush_cache_page(vma, addr, PFN_PHYS(pfn)); } } @@ -579,26 +582,46 @@ void flush_cache_mm(struct mm_struct *mm) void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + pgd_t *pgd; + unsigned long addr; + if ((!IS_ENABLED(CONFIG_SMP) || !arch_irqs_disabled()) && end - start >= parisc_cache_flush_threshold) { - flush_tlb_range(vma, start, end); + if (vma->vm_mm->context) + flush_tlb_range(vma, start, end); flush_cache_all(); return; } - flush_user_dcache_range_asm(start, end); - if (vma->vm_flags & VM_EXEC) - flush_user_icache_range_asm(start, end); - flush_tlb_range(vma, start, end); + if (vma->vm_mm->context == mfsp(3)) { + flush_user_dcache_range_asm(start, end); + if (vma->vm_flags & VM_EXEC) + flush_user_icache_range_asm(start, end); + flush_tlb_range(vma, start, end); + return; + } + + pgd = vma->vm_mm->pgd; + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + unsigned long pfn; + pte_t *ptep = get_ptep(pgd, addr); + if (!ptep) + continue; + pfn = pte_pfn(*ptep); + if (pfn_valid(pfn)) { + if (unlikely(vma->vm_mm->context)) + flush_tlb_page(vma, addr); + __flush_cache_page(vma, addr, PFN_PHYS(pfn)); + } + } } void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { - BUG_ON(!vma->vm_mm->context); - if (pfn_valid(pfn)) { - flush_tlb_page(vma, vmaddr); + if (likely(vma->vm_mm->context)) + flush_tlb_page(vma, vmaddr); __flush_cache_page(vma, vmaddr, PFN_PHYS(pfn)); } } -- cgit v1.2.3 From e3d03598e8ae7d195af5d3d049596dec336f569f Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 19 Mar 2018 13:57:46 -0700 Subject: x86/build/64: Force the linker to use 2MB page size Binutils 2.31 will enable -z separate-code by default for x86 to avoid mixing code pages with data to improve cache performance as well as security. To reduce x86-64 executable and shared object sizes, the maximum page size is reduced from 2MB to 4KB. But x86-64 kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to linker to force 2MB page size regardless of the default page size used by linker. Tested with Linux kernel 4.15.6 on x86-64. Signed-off-by: H.J. Lu Cc: Andy Shevchenko Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAMe9rOp4_%3D_8twdpTyAP2DhONOCeaTOsniJLoppzhoNptL8xzA@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 498c1b8123006..1c4d012550ec5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -223,6 +223,15 @@ KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) LDFLAGS := -m elf_$(UTS_MACHINE) +# +# The 64-bit kernel must be aligned to 2MB. Pass -z max-page-size=0x200000 to +# the linker to force 2MB page size regardless of the default page size used +# by the linker. 
+# +ifdef CONFIG_X86_64 +LDFLAGS += $(call ld-option, -z max-page-size=0x200000) +endif + # Speed up the build KBUILD_CFLAGS += -pipe # Workaround for a gcc prelease that unfortunately was shipped in a suse release -- cgit v1.2.3 From c55b8550fa57ba4f5e507be406ff9fc2845713e8 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 19 Mar 2018 14:08:11 -0700 Subject: x86/boot/64: Verify alignment of the LOAD segment Since the x86-64 kernel must be aligned to 2MB, refuse to boot the kernel if the alignment of the LOAD segment isn't a multiple of 2MB. Signed-off-by: H.J. Lu Cc: Andy Shevchenko Cc: Eric Biederman Cc: H. Peter Anvin Cc: Juergen Gross Cc: Kees Cook Cc: Kirill A. Shutemov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAMe9rOrR7xSJgUfiCoZLuqWUwymRxXPoGBW38%2BpN%3D9g%2ByKNhZw@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 98761a1576ceb..252fee3208166 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -309,6 +309,10 @@ static void parse_elf(void *output) switch (phdr->p_type) { case PT_LOAD: +#ifdef CONFIG_X86_64 + if ((phdr->p_align % 0x200000) != 0) + error("Alignment of LOAD segment isn't multiple of 2MB"); +#endif #ifdef CONFIG_RELOCATABLE dest = output; dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR); -- cgit v1.2.3 From 2c2a9bbe7fecb2ad4981b6f4a56cacbfb849f848 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 12 Feb 2018 14:20:35 -0800 Subject: perf/x86/intel: Disable userspace RDPMC usage for large PEBS Userspace RDPMC cannot possibly work for large PEBS, which was introduced in: b8241d20699e ("perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)") When the PEBS interrupt threshold is larger than one, there is no way to get exact auto-reload times and value for userspace RDPMC. Disable the userspace RDPMC usage when large PEBS is enabled. The only exception is when the PEBS interrupt threshold is 1, in which case user-space RDPMC works well even with auto-reload events. 
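The essence of the change, as it appears in the patch below: the rdpmc-allowed flag is granted only when the event is not using the large (multi-entry) PEBS buffer:

	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
	    !(event->hw.flags & PERF_X86_EVENT_FREERUNNING))
		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;

This lines up with the threshold-of-1 exception described above, since large PEBS is precisely what raises the interrupt threshold past one.
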
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Fixes: b8241d20699e ("perf/x86/intel: Implement batched PEBS interrupt handling (large PEBS interrupt threshold)") Link: http://lkml.kernel.org/r/1518474035-21006-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar (cherry picked from commit 1af22eba248efe2de25658041a80a3d40fb3e92e) --- arch/x86/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 140d33288e78e..3d24edfef3e4c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2118,7 +2118,8 @@ static int x86_pmu_event_init(struct perf_event *event) event->destroy(event); } - if (READ_ONCE(x86_pmu.attr_rdpmc)) + if (READ_ONCE(x86_pmu.attr_rdpmc) && + !(event->hw.flags & PERF_X86_EVENT_FREERUNNING)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; -- cgit v1.2.3 From e5ea9b54a055619160bbfe527ebb7d7191823d66 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 17 Mar 2018 14:52:16 +0300 Subject: perf/x86/intel: Don't accidentally clear high bits in bdw_limit_period() We intended to clear the lowest 6 bits but because of a type bug we clear the high 32 bits as well. Andi says that periods are rarely more than U32_MAX so this bug probably doesn't have a huge runtime impact. Signed-off-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 294fe0f52a44 ("perf/x86/intel: Add INST_RETIRED.ALL workarounds") Link: http://lkml.kernel.org/r/20180317115216.GB4035@mwanda Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 56457cb73448b..9b18a227fff73 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3194,7 +3194,7 @@ static unsigned bdw_limit_period(struct perf_event *event, unsigned left) X86_CONFIG(.event=0xc0, .umask=0x01)) { if (left < 128) left = 128; - left &= ~0x3fu; + left &= ~0x3fULL; } return left; } -- cgit v1.2.3 From e340895c9ed0b44548f08bbaaee4afc7bfacd354 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 12 Mar 2018 08:41:34 -0700 Subject: perf/x86/intel/uncore: Add missing filter constraint for SKX CHA event Adding a filter constraint for Intel Skylake CHA event UNC_CHA_UPI_CREDITS_ACQUIRED (0x38). The event supports core-id/thread-id and link filtering. 
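For reference, the whole change is the single table entry shown in the patch below. Reading it alongside the neighbouring entries, the first field is the event code, and the remaining two are assumed here (from context, not from documentation) to be the umask match mask and the set of filter fields, tid/link per the text above, that the event may use:

	SNBEP_CBO_EVENT_EXTRA_REG(0x38, 0xff, 0x3),	/* UNC_CHA_UPI_CREDITS_ACQUIRED */
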
Signed-off-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: http://lkml.kernel.org/r/1520869294-14176-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 22ec65bc033a9..0876798f2ac93 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3343,6 +3343,7 @@ static struct extra_reg skx_uncore_cha_extra_regs[] = { SNBEP_CBO_EVENT_EXTRA_REG(0x9134, 0xffff, 0x4), SNBEP_CBO_EVENT_EXTRA_REG(0x35, 0xff, 0x8), SNBEP_CBO_EVENT_EXTRA_REG(0x36, 0xff, 0x8), + SNBEP_CBO_EVENT_EXTRA_REG(0x38, 0xff, 0x3), EVENT_EXTRA_END }; -- cgit v1.2.3 From 174afc3e7dd7823df8218e16e7768b834097184e Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 12 Mar 2018 10:45:37 -0400 Subject: perf/x86/intel: Rename confusing 'freerunning PEBS' API and implementation to 'large PEBS' The 'freerunning PEBS' and 'large PEBS' are the same thing. Both of these names appear in the code and in the API, which causes confusion. Rename 'freerunning PEBS' to 'large PEBS' to unify the code, which eliminates the confusion. No functional change. Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1520865937-22910-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 2 +- arch/x86/events/intel/core.c | 12 ++++++------ arch/x86/events/intel/ds.c | 6 +++--- arch/x86/events/perf_event.h | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3d24edfef3e4c..88797c80b3e09 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2119,7 +2119,7 @@ static int x86_pmu_event_init(struct perf_event *event) } if (READ_ONCE(x86_pmu.attr_rdpmc) && - !(event->hw.flags & PERF_X86_EVENT_FREERUNNING)) + !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; return err; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 9b18a227fff73..1e41d7508d99c 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2952,9 +2952,9 @@ static void intel_pebs_aliases_skl(struct perf_event *event) return intel_pebs_aliases_precdist(event); } -static unsigned long intel_pmu_free_running_flags(struct perf_event *event) +static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) { - unsigned long flags = x86_pmu.free_running_flags; + unsigned long flags = x86_pmu.large_pebs_flags; if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; @@ -2976,8 +2976,8 @@ static int intel_pmu_hw_config(struct perf_event *event) if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_free_running_flags(event))) - event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + ~intel_pmu_large_pebs_flags(event))) + event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); @@ -3460,7 +3460,7 @@ static __initconst const 
struct x86_pmu core_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, + .large_pebs_flags = LARGE_PEBS_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32-bit width, @@ -3502,7 +3502,7 @@ static __initconst const struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, - .free_running_flags = PEBS_FREERUNNING_FLAGS, + .large_pebs_flags = LARGE_PEBS_FLAGS, /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 18c25ab285574..d8015235ba765 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -935,7 +935,7 @@ void intel_pmu_pebs_add(struct perf_event *event) bool needed_cb = pebs_needs_sched_cb(cpuc); cpuc->n_pebs++; - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; pebs_update_state(needed_cb, cpuc, event->ctx->pmu); @@ -975,7 +975,7 @@ void intel_pmu_pebs_del(struct perf_event *event) bool needed_cb = pebs_needs_sched_cb(cpuc); cpuc->n_pebs--; - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; pebs_update_state(needed_cb, cpuc, event->ctx->pmu); @@ -1530,7 +1530,7 @@ void __init intel_ds_init(void) x86_pmu.pebs_record_size = sizeof(struct pebs_record_skl); x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; - x86_pmu.free_running_flags |= PERF_SAMPLE_TIME; + x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; default: diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 78f91ec1056ea..39cd0615f04fb 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -69,7 +69,7 @@ struct event_constraint { #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ +#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */ struct amd_nb { @@ -88,7 +88,7 @@ struct amd_nb { * REGS_USER can be handled for events limited to ring 3. * */ -#define PEBS_FREERUNNING_FLAGS \ +#define LARGE_PEBS_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ @@ -608,7 +608,7 @@ struct x86_pmu { struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); int max_pebs_events; - unsigned long free_running_flags; + unsigned long large_pebs_flags; /* * Intel LBR -- cgit v1.2.3 From 320b0651f32b830add6497fcdcfdcb6ae8c7b8a0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 13 Mar 2018 11:51:34 -0700 Subject: perf/x86/intel/uncore: Fix multi-domain PCI CHA enumeration bug on Skylake servers The number of CHAs is miscalculated on multi-domain PCI Skylake server systems, resulting in an uncore driver initialization error. Gary Kroening explains: "For systems with a single PCI segment, it is sufficient to look for the bus number to change in order to determine that all of the CHa's have been counted for a single socket. However, for multi PCI segment systems, each socket is given a new segment and the bus number does NOT change. 
So looking only for the bus number to change ends up counting all of the CHa's on all sockets in the system. This leads to writing CPU MSRs beyond a valid range and causes an error in ivbep_uncore_msr_init_box()." To fix this bug, query the number of CHAs from the CAPID6 register: the driver reads bits 27:0 of the CAPID6 register, located at Device 30, Function 3, Offset 0x9C. These 28 bits form a bit vector of available LLC slices and the CHAs that manage those slices. Reported-by: Kroening, Gary Tested-by: Kroening, Gary Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Shevchenko Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: abanman@hpe.com Cc: dimitri.sivanich@hpe.com Cc: hpa@zytor.com Cc: mike.travis@hpe.com Cc: russ.anderson@hpe.com Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Link: http://lkml.kernel.org/r/1520967094-13219-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/uncore_snbep.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 0876798f2ac93..c98b943e58b4f 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3563,24 +3563,27 @@ static struct intel_uncore_type *skx_msr_uncores[] = { NULL, }; +/* + * To determine the number of CHAs, read bits 27:0 in the CAPID6 + * register, which is located at Device 30, Function 3, Offset 0x9C. PCI ID 0x2083. + */ +#define SKX_CAPID6 0x9c +#define SKX_CHA_BIT_MASK GENMASK(27, 0) + static int skx_count_chabox(void) { - struct pci_dev *chabox_dev = NULL; - int bus, count = 0; + struct pci_dev *dev = NULL; + u32 val = 0; - while (1) { - chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev); - if (!chabox_dev) - break; - if (count == 0) - bus = chabox_dev->bus->number; - if (bus != chabox_dev->bus->number) - break; - count++; - } + dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x2083, dev); + if (!dev) + goto out; - pci_dev_put(chabox_dev); - return count; + pci_read_config_dword(dev, SKX_CAPID6, &val); + val &= SKX_CHA_BIT_MASK; +out: + pci_dev_put(dev); + return hweight32(val); } void skx_uncore_cpu_init(void) -- cgit v1.2.3 From 5927145efd5de71976e62e2822511b13014d7e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 19 Mar 2018 11:38:13 +0100 Subject: x86/cpu: Remove the CONFIG_X86_PPRO_FENCE=y quirk There were only a few Pentium Pro multiprocessor systems where this erratum applied. They are more than 20 years old now, and we've slowly dropped the places that put the workaround in and discouraged anyone from enabling it. Get rid of it for good.
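For context, a minimal sketch of what the quirk cost, assuming GCC-style inline assembly (the helper names are illustrative, not kernel API): on TSO x86 a compiler-only barrier is enough to order two stores, while the PPro workaround had to emit a serializing locked instruction, as the removed flush_write_buffers() below still shows.

/* Illustrative only: the two fence strengths the quirk switched between. */
#define compiler_barrier()  asm volatile("" ::: "memory")  /* TSO: sufficient */
#define ppro_store_fence()  \
	asm volatile("lock; addl $0,0(%%esp)" ::: "memory") /* 32-bit serializing form */

static int data, flag;

void publish(int v)
{
	data = v;
	compiler_barrier();	/* x86 TSO never reorders these two stores... */
	flag = 1;		/* ...except on PPro parts hit by erratum #51 */
}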
Tested-by: Tom Lendacky Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Joerg Roedel Cc: Jon Mason Cc: Linus Torvalds Cc: Muli Ben-Yehuda Cc: Peter Zijlstra Cc: iommu@lists.linux-foundation.org Link: http://lkml.kernel.org/r/20180319103826.12853-2-hch@lst.de Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 13 ------------- arch/x86/entry/vdso/vdso32/vclock_gettime.c | 2 -- arch/x86/include/asm/barrier.h | 30 ----------------------------- arch/x86/include/asm/io.h | 15 --------------- arch/x86/kernel/pci-nommu.c | 19 ------------------ arch/x86/um/asm/barrier.h | 4 ---- 6 files changed, 83 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 65a9a4716e34f..f0c5ef5781537 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -315,19 +315,6 @@ config X86_L1_CACHE_SHIFT default "4" if MELAN || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -config X86_PPRO_FENCE - bool "PentiumPro memory ordering errata workaround" - depends on M686 || M586MMX || M586TSC || M586 || M486 || MGEODEGX1 - ---help--- - Old PentiumPro multiprocessor systems had errata that could cause - memory operations to violate the x86 ordering standard in rare cases. - Enabling this option will attempt to work around some (but not all) - occurrences of this problem, at the cost of much heavier spinlock and - memory barrier operations. - - If unsure, say n here. Even distro kernels should think twice before - enabling this: there are few systems, and an unlikely bug. - config X86_F00F_BUG def_bool y depends on M586MMX || M586TSC || M586 || M486 diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c index 7780bbfb06ef2..9242b28418d58 100644 --- a/arch/x86/entry/vdso/vdso32/vclock_gettime.c +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -5,8 +5,6 @@ #undef CONFIG_OPTIMIZE_INLINING #endif -#undef CONFIG_X86_PPRO_FENCE - #ifdef CONFIG_X86_64 /* diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index e1259f043ae99..042b5e892ed10 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -52,11 +52,7 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ "lfence", X86_FEATURE_LFENCE_RDTSC) -#ifdef CONFIG_X86_PPRO_FENCE -#define dma_rmb() rmb() -#else #define dma_rmb() barrier() -#endif #define dma_wmb() barrier() #ifdef CONFIG_X86_32 @@ -68,30 +64,6 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, #define __smp_wmb() barrier() #define __smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) -#if defined(CONFIG_X86_PPRO_FENCE) - -/* - * For this option x86 doesn't have a strong TSO memory - * model and we should fall back to full barriers. 
- */ - -#define __smp_store_release(p, v) \ -do { \ - compiletime_assert_atomic_type(*p); \ - __smp_mb(); \ - WRITE_ONCE(*p, v); \ -} while (0) - -#define __smp_load_acquire(p) \ -({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ - __smp_mb(); \ - ___p1; \ -}) - -#else /* regular x86 TSO memory ordering */ - #define __smp_store_release(p, v) \ do { \ compiletime_assert_atomic_type(*p); \ @@ -107,8 +79,6 @@ do { \ ___p1; \ }) -#endif - /* Atomic operations are already serializing on x86 */ #define __smp_mb__before_atomic() barrier() #define __smp_mb__after_atomic() barrier() diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 95e948627fd04..f6e5b9375d8c3 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -232,21 +232,6 @@ extern void set_iounmap_nonlazy(void); */ #define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET)) -/* - * Cache management - * - * This needed for two cases - * 1. Out of order aware processors - * 2. Accidentally out of order processors (PPro errata #51) - */ - -static inline void flush_write_buffers(void) -{ -#if defined(CONFIG_X86_PPRO_FENCE) - asm volatile("lock; addl $0,0(%%esp)": : :"memory"); -#endif -} - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 618285e475c62..ac7ea3a8242fe 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -37,7 +37,6 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, WARN_ON(size == 0); if (!check_addr("map_single", dev, bus, size)) return NOMMU_MAPPING_ERROR; - flush_write_buffers(); return bus; } @@ -72,25 +71,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return 0; s->dma_length = s->length; } - flush_write_buffers(); return nents; } -static void nommu_sync_single_for_device(struct device *dev, - dma_addr_t addr, size_t size, - enum dma_data_direction dir) -{ - flush_write_buffers(); -} - - -static void nommu_sync_sg_for_device(struct device *dev, - struct scatterlist *sg, int nelems, - enum dma_data_direction dir) -{ - flush_write_buffers(); -} - static int nommu_mapping_error(struct device *dev, dma_addr_t dma_addr) { return dma_addr == NOMMU_MAPPING_ERROR; @@ -101,8 +84,6 @@ const struct dma_map_ops nommu_dma_ops = { .free = dma_generic_free_coherent, .map_sg = nommu_map_sg, .map_page = nommu_map_page, - .sync_single_for_device = nommu_sync_single_for_device, - .sync_sg_for_device = nommu_sync_sg_for_device, .is_phys = 1, .mapping_error = nommu_mapping_error, .dma_supported = x86_dma_supported, diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index b7d73400ea29a..f31e5d9031617 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -30,11 +30,7 @@ #endif /* CONFIG_X86_32 */ -#ifdef CONFIG_X86_PPRO_FENCE -#define dma_rmb() rmb() -#else /* CONFIG_X86_PPRO_FENCE */ #define dma_rmb() barrier() -#endif /* CONFIG_X86_PPRO_FENCE */ #define dma_wmb() barrier() #include -- cgit v1.2.3 From 31ad7f8e7dc94d3b85ccf9b6141ce6dfd35a1781 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 19 Mar 2018 10:31:54 -0400 Subject: x86/vsyscall/64: Use proper accessor to update P4D entry Writing to it directly does not work for Xen PV guests. 
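The pattern at issue, sketched as a hypothetical userspace analogy (the names mirror the kernel's, but this is not kernel code): under Xen PV the hypervisor owns the page tables, so every update must go through an accessor that paravirt can route to a hypercall; a raw pointer write bypasses that mediation entirely.

#include <stdio.h>

typedef struct { unsigned long p4d; } p4d_t;

/* Stand-in for the paravirt-patched set_p4d(): a hypervisor would
 * validate and apply the write here instead of a plain assignment. */
static void set_p4d(p4d_t *e, unsigned long val)
{
	printf("mediated page-table write: %#lx\n", val);
	e->p4d = val;
}

int main(void)
{
	p4d_t entry = { 0x1000 };
	unsigned long user_bit = 0x4;	/* illustrative _PAGE_USER-like bit */

	set_p4d(&entry, entry.p4d | user_bit);	/* what the fix does */
	/* entry.p4d |= user_bit; */		/* the direct write being removed */
	return 0;
}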
Fixes: 49275fef986a ("x86/vsyscall/64: Explicitly set _PAGE_USER in the pagetable hierarchy") Signed-off-by: Boris Ostrovsky Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180319143154.3742-1-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 8560ef68a9d63..317be365bce3a 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -347,7 +347,7 @@ void __init set_vsyscall_pgtable_user_bits(pgd_t *root) set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); p4d = p4d_offset(pgd, VSYSCALL_ADDR); #if CONFIG_PGTABLE_LEVELS >= 5 - p4d->p4d |= _PAGE_USER; + set_p4d(p4d, __p4d(p4d_val(*p4d) | _PAGE_USER)); #endif pud = pud_offset(p4d, VSYSCALL_ADDR); set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); -- cgit v1.2.3 From 32d43cd391bacb5f0814c2624399a5dad3501d09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 20 Mar 2018 12:16:59 -0700 Subject: kvm/x86: fix icebp instruction handling The undocumented 'icebp' instruction (aka 'int1') works pretty much like 'int3' in the absence of in-circuit probing equipment (except, obviously, that it raises #DB instead of raising #BP), and is used by some validation test-suites as such. But Andy Lutomirski noticed that his test suite acted differently in kvm than on bare hardware. The reason is that kvm used an inexact test for the icebp instruction: it just assumed that an all-zero VM exit qualification value meant that the VM exit was due to icebp. That is not unlike the guess that do_debug() does for the actual exception handling case, but it's purely a heuristic, not an absolute rule. do_debug() does it because it wants to ascribe _some_ reason to the #DB that happened, and an empty %dr6 value means that 'icebp' is the most likely cause and we have no better information. But kvm can just do it right, because unlike the do_debug() case, kvm actually sees the real reason for the #DB in the VM-exit interruption information field. So instead of relying on an inexact heuristic, just use the actual VM exit information that says "it was 'icebp'". Right now the 'icebp' instruction isn't technically documented by Intel, but that will hopefully change. The special "privileged software exception" information _is_ actually mentioned in the Intel SDM, even though the cause of it isn't enumerated. Reported-by: Andy Lutomirski Tested-by: Paolo Bonzini Signed-off-by: Linus Torvalds --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8b67807511329..5db8b0b107664 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -352,6 +352,7 @@ enum vmcs_field { #define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ #define INTR_TYPE_HARD_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +#define INTR_TYPE_PRIV_SW_EXCEPTION (5 << 8) /* ICE breakpoint - undocumented */ #define INTR_TYPE_SOFT_EXCEPTION (6 << 8) /* software exception */ /* GUEST_INTERRUPTIBILITY_INFO flags.
*/ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 051dab74e4e92..2d87603f91795 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1045,6 +1045,13 @@ static inline bool is_machine_check(u32 intr_info) (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } +/* Undocumented: icebp/int1 */ +static inline bool is_icebp(u32 intr_info) +{ + return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) + == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK); +} + static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; @@ -6179,7 +6186,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; vcpu->arch.dr6 |= dr6 | DR6_RTM; - if (!(dr6 & ~DR6_RESERVED)) /* icebp */ + if (is_icebp(intr_info)) skip_emulated_instruction(vcpu); kvm_queue_exception(vcpu, DB_VECTOR); -- cgit v1.2.3 From 214cbc14734958fe533916fdb4194f5983ad4bc4 Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Fri, 16 Mar 2018 21:27:28 +0100 Subject: MIPS: lantiq: Fix Danube USB clock On Danube the USB0 controller registers are at 1e101000 and the USB0 PHY register is at 1f203018, similar to all other lantiq SoCs. Activate the USB controller gating clock through the USB controller driver and not the PHY. This fixes a problem introduced in a previous commit. Fixes: dea54fbad332 ("phy: Add an USB PHY driver for the Lantiq SoCs using the RCU module") Signed-off-by: Mathias Kresin Signed-off-by: Hauke Mehrtens Acked-by: Martin Blumenstingl Cc: Ralf Baechle Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: # 4.14+ Patchwork: https://patchwork.linux-mips.org/patch/18816/ Signed-off-by: James Hogan --- arch/mips/lantiq/xway/sysctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index 52500d3b7004b..f11f1dd104936 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -560,7 +560,7 @@ void __init ltq_soc_init(void) } else { clkdev_add_static(ltq_danube_cpu_hz(), ltq_danube_fpi_hz(), ltq_danube_fpi_hz(), ltq_danube_pp32_hz()); - clkdev_add_pmu("1f203018.usb2-phy", "ctrl", 1, 0, PMU_USB0); + clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0); clkdev_add_pmu("1f203018.usb2-phy", "phy", 1, 0, PMU_USB0_P); clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_SDIO); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU); -- cgit v1.2.3 From 3223a5a7d3a606dcb7d9190a788b9544a45441ee Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Fri, 16 Mar 2018 21:27:29 +0100 Subject: MIPS: lantiq: Enable AHB Bus for USB On Danube and AR9 the USB core is connected through an AHB bus to the main system cross bar, hence we need to enable the gating clock of the AHB Bus as well to make the USB controller work.
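Put differently, the clock entry for each USB controller must gate two PMU bits at once, as the sketch below shows (the bit positions are made up for illustration; the real PMU_* masks live in the lantiq headers):

/* Hypothetical bit assignments; only the OR-combination pattern matters. */
#define PMU_USB0	(1 << 6)	/* USB core clock gate */
#define PMU_AHBM	(1 << 15)	/* AHB bus master clock gate */

/* Passing both gates as one mask means enabling the controller's clock
 * also powers the bus path it sits behind: */
unsigned int usb0_gates = PMU_USB0 | PMU_AHBM;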
Fixes: dea54fbad332 ("phy: Add an USB PHY driver for the Lantiq SoCs using the RCU module") Signed-off-by: Mathias Kresin Signed-off-by: Hauke Mehrtens Acked-by: Martin Blumenstingl Cc: Ralf Baechle Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: # 4.14+ Patchwork: https://patchwork.linux-mips.org/patch/18814/ Signed-off-by: James Hogan --- arch/mips/lantiq/xway/sysctrl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index f11f1dd104936..e0af39b33e287 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -549,9 +549,9 @@ void __init ltq_soc_init(void) clkdev_add_static(ltq_ar9_cpu_hz(), ltq_ar9_fpi_hz(), ltq_ar9_fpi_hz(), CLOCK_250M); clkdev_add_pmu("1f203018.usb2-phy", "phy", 1, 0, PMU_USB0_P); - clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0); + clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0 | PMU_AHBM); clkdev_add_pmu("1f203034.usb2-phy", "phy", 1, 0, PMU_USB1_P); - clkdev_add_pmu("1e106000.usb", "otg", 1, 0, PMU_USB1); + clkdev_add_pmu("1e106000.usb", "otg", 1, 0, PMU_USB1 | PMU_AHBM); clkdev_add_pmu("1e180000.etop", "switch", 1, 0, PMU_SWITCH); clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_SDIO); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU); @@ -560,7 +560,7 @@ void __init ltq_soc_init(void) } else { clkdev_add_static(ltq_danube_cpu_hz(), ltq_danube_fpi_hz(), ltq_danube_fpi_hz(), ltq_danube_pp32_hz()); - clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0); + clkdev_add_pmu("1e101000.usb", "otg", 1, 0, PMU_USB0 | PMU_AHBM); clkdev_add_pmu("1f203018.usb2-phy", "phy", 1, 0, PMU_USB0_P); clkdev_add_pmu("1e103000.sdio", NULL, 1, 0, PMU_SDIO); clkdev_add_pmu("1e103100.deu", NULL, 1, 0, PMU_DEU); -- cgit v1.2.3 From a821328c2f3003b908880792d71b2781b44fa53c Mon Sep 17 00:00:00 2001 From: Mathias Kresin Date: Fri, 16 Mar 2018 21:27:30 +0100 Subject: MIPS: lantiq: ase: Enable MFD_SYSCON Enable syscon to use it for the RCU MFD on Amazon SE as well. The Amazon SE also has a reset controller system similar to Danube and XWAY and mostly uses their drivers. As these drivers now need syscon, also activate the syscon subsystem for Amazon SE. Fixes: 2b6639d4c794 ("MIPS: lantiq: Enable MFD_SYSCON to be able to use it for the RCU MFD") Signed-off-by: Mathias Kresin Signed-off-by: Hauke Mehrtens Acked-by: Martin Blumenstingl Cc: Ralf Baechle Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: # 4.14+ Patchwork: https://patchwork.linux-mips.org/patch/18817/ Signed-off-by: James Hogan --- arch/mips/lantiq/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/mips/lantiq/Kconfig b/arch/mips/lantiq/Kconfig index 692ae85a3e3d4..8e3a1fc2bc398 100644 --- a/arch/mips/lantiq/Kconfig +++ b/arch/mips/lantiq/Kconfig @@ -13,6 +13,8 @@ choice config SOC_AMAZON_SE bool "Amazon SE" select SOC_TYPE_XWAY + select MFD_SYSCON + select MFD_CORE config SOC_XWAY bool "XWAY" -- cgit v1.2.3 From 891731f6a5dbe508d12443175a7e166a2fba616a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 20 Mar 2018 19:29:51 +1100 Subject: MIPS: ralink: Remove ralink_halt() ralink_halt() does nothing that machine_halt() doesn't already do, so it adds no value. It actually causes incorrect behaviour due to the "unreachable()" at the end. This tells the compiler that the end of the function will never be reached, which isn't true.
The compiler responds by not adding a 'return' instruction, so control simply moves on to whatever bytes come afterwards in memory. In my testing, that was the ralink_restart() function. This means that an attempt to 'halt' the machine would actually cause a reboot. So remove ralink_halt() so that a 'halt' really does halt. Fixes: c06e836ada59 ("MIPS: ralink: adds reset code") Signed-off-by: NeilBrown Cc: John Crispin Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: # 3.9+ Patchwork: https://patchwork.linux-mips.org/patch/18851/ Signed-off-by: James Hogan --- arch/mips/ralink/reset.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch') diff --git a/arch/mips/ralink/reset.c b/arch/mips/ralink/reset.c index 64543d66e76b5..e9531fea23a29 100644 --- a/arch/mips/ralink/reset.c +++ b/arch/mips/ralink/reset.c @@ -96,16 +96,9 @@ static void ralink_restart(char *command) unreachable(); } -static void ralink_halt(void) -{ - local_irq_disable(); - unreachable(); -} - static int __init mips_reboot_setup(void) { _machine_restart = ralink_restart; - _machine_halt = ralink_halt; return 0; } -- cgit v1.2.3 From a63d706ea719190a79a6c769e898f70680044d3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 21 Mar 2018 14:02:10 +1100 Subject: MIPS: ralink: Fix booting on MT7621 Since commit 3af5a67c86a3 ("MIPS: Fix early CM probing") the MT7621 has not been able to boot. This commit caused mips_cm_probe() to be called before mt7621.c::prom_soc_init(). prom_soc_init() has a comment explaining that mips_cm_probe() "wipes out the bootloader config" and means that configuration registers are no longer available. It has some code to re-enable this config. Before this re-enable code is run, the sysc register cannot be read, so when SYSC_REG_CHIP_NAME0 is read, a garbage value is returned and panic() is called. If we move the config-repair code to the top of prom_soc_init(), the registers can be read and boot can proceed. Very occasionally, the first register read after the reconfiguration returns garbage, so add a call to __sync(). Fixes: 3af5a67c86a3 ("MIPS: Fix early CM probing") Signed-off-by: NeilBrown Reviewed-by: Matt Redfearn Cc: John Crispin Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: # 4.5+ Patchwork: https://patchwork.linux-mips.org/patch/18859/ Signed-off-by: James Hogan --- arch/mips/ralink/mt7621.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/mips/ralink/mt7621.c b/arch/mips/ralink/mt7621.c index 1b274742077dc..d2718de60b9b5 100644 --- a/arch/mips/ralink/mt7621.c +++ b/arch/mips/ralink/mt7621.c @@ -170,6 +170,28 @@ void prom_soc_init(struct ralink_soc_info *soc_info) u32 n1; u32 rev; + /* Early detection of CMP support */ + mips_cm_probe(); + mips_cpc_probe(); + + if (mips_cps_numiocu(0)) { + /* + * mips_cm_probe() wipes out bootloader + * config for CM regions and we have to configure them + * again. This SoC cannot talk to palmbus devices + * without proper iocu region set up. + * + * FIXME: it would be better to do this with values + * from DT, but we need this very early because + * without this we cannot talk to pretty much anything + * including serial.
*/ + write_gcr_reg0_base(MT7621_PALMBUS_BASE); + write_gcr_reg0_mask(~MT7621_PALMBUS_SIZE | + CM_GCR_REGn_MASK_CMTGT_IOCU0); + __sync(); + } + n0 = __raw_readl(sysc + SYSC_REG_CHIP_NAME0); n1 = __raw_readl(sysc + SYSC_REG_CHIP_NAME1); @@ -194,26 +216,6 @@ void prom_soc_init(struct ralink_soc_info *soc_info) rt2880_pinmux_data = mt7621_pinmux_data; - /* Early detection of CMP support */ - mips_cm_probe(); - mips_cpc_probe(); - - if (mips_cps_numiocu(0)) { - /* - * mips_cm_probe() wipes out bootloader - * config for CM regions and we have to configure them - * again. This SoC cannot talk to pamlbus devices - * witout proper iocu region set up. - * - * FIXME: it would be better to do this with values - * from DT, but we need this very early because - * without this we cannot talk to pretty much anything - * including serial. - */ - write_gcr_reg0_base(MT7621_PALMBUS_BASE); - write_gcr_reg0_mask(~MT7621_PALMBUS_SIZE | - CM_GCR_REGn_MASK_CMTGT_IOCU0); - } if (!register_cps_smp_ops()) return; -- cgit v1.2.3 From 1705f7c534163594f8b05e060cb49fbea86ca70b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 22 Mar 2018 16:17:17 -0700 Subject: h8300: remove extraneous __BIG_ENDIAN definition A bugfix I did earlier caused a build regression on h8300, which defines the __BIG_ENDIAN macro in a slightly different way than the generic code: arch/h8300/include/asm/byteorder.h:5:0: warning: "__BIG_ENDIAN" redefined We don't need to define it here, as the same macro is already provided by linux/byteorder/big_endian.h, and that version does not conflict. While this is a v4.16 regression, my earlier patch also got backported to the 4.14 and 4.15 stable kernels, so we need the fixup there as well. Link: http://lkml.kernel.org/r/20180313120752.2645129-1-arnd@arndb.de Fixes: 101110f6271c ("Kbuild: always define endianess in kconfig.h") Signed-off-by: Arnd Bergmann Cc: Yoshinori Sato Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/h8300/include/asm/byteorder.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/h8300/include/asm/byteorder.h b/arch/h8300/include/asm/byteorder.h index ecff2d1ca5a38..6eaa7ad5fc2c9 100644 --- a/arch/h8300/include/asm/byteorder.h +++ b/arch/h8300/include/asm/byteorder.h @@ -2,7 +2,6 @@ #ifndef __H8300_BYTEORDER_H__ #define __H8300_BYTEORDER_H__ -#define __BIG_ENDIAN __ORDER_BIG_ENDIAN__ #include #endif -- cgit v1.2.3 From b6bdb7517c3d3f41f20e5c2948d6bc3f8897394e Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 22 Mar 2018 16:17:20 -0700 Subject: mm/vmalloc: add interfaces to free unmapped page table On architectures with CONFIG_HAVE_ARCH_HUGE_VMAP set, ioremap() may create pud/pmd mappings. A kernel panic was observed on arm64 systems with Cortex-A75 in the following steps, as described by Hanjun Guo: 1. ioremap a 4K region; a valid page table is built; 2. iounmap it; pte0 is set to 0; 3. ioremap the same address with a 2M size; pgd/pmd are unchanged, then a new value is set for the pmd; 4. pte0 is leaked; 5. the CPU may take an exception because the old pmd is still in the TLB, which leads to a kernel panic. This panic is not reproducible on x86. INVLPG, called from iounmap, purges all levels of entries associated with the purged address on x86. x86 still has the memory leak, however. The patch changes the ioremap path to free unmapped page table(s) since doing so in the unmap path has the following issues: - The iounmap() path is shared with vunmap().
Since vmap() only supports pte mappings, making vunmap() free a pte page would be an overhead for regular vmap users, as they do not need a pte page freed up. - Checking if all entries in a pte page are cleared in the unmap path is racy, and serializing this check is expensive. - The unmap path calls free_vmap_area_noflush() to do lazy TLB purges. Clearing a pud/pmd entry before the lazy TLB purges needs an extra TLB purge. Add two interfaces, pud_free_pmd_page() and pmd_free_pte_page(), which clear a given pud/pmd entry and free up a page for the lower level entries. This patch implements their stub functions on x86 and arm64, which act as a workaround. [akpm@linux-foundation.org: fix typo in pmd_free_pte_page() stub] Link: http://lkml.kernel.org/r/20180314180155.19492-2-toshi.kani@hpe.com Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings") Reported-by: Lei Li Signed-off-by: Toshi Kani Cc: Catalin Marinas Cc: Wang Xuefeng Cc: Will Deacon Cc: Hanjun Guo Cc: Michal Hocko Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Matthew Wilcox Cc: Chintan Pandya Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 10 ++++++++++ arch/x86/mm/pgtable.c | 24 ++++++++++++++++++++++++ include/asm-generic/pgtable.h | 10 ++++++++++ lib/ioremap.c | 6 ++++-- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8c704f1e53c22..2dbb2c9f1ec17 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -972,3 +972,13 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } + +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 004abf9ebf122..1eed7ed518e68 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -702,4 +702,28 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } + +/** + * pud_free_pmd_page - Clear pud entry and free pmd page. + * @pud: Pointer to a PUD. + * + * Context: The pud range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise. + */ +int pud_free_pmd_page(pud_t *pud) +{ + return pud_none(*pud); +} + +/** + * pmd_free_pte_page - Clear pmd entry and free pte page. + * @pmd: Pointer to a PMD. + * + * Context: The pmd range has been unmapped and TLB purged. + * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ */ +int pmd_free_pte_page(pmd_t *pmd) +{ + return pmd_none(*pmd); +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2cfa3075d148b..bfbb44a5ad389 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -983,6 +983,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); +int pud_free_pmd_page(pud_t *pud); +int pmd_free_pte_page(pmd_t *pmd); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { @@ -1008,6 +1010,14 @@ static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } +static inline int pud_free_pmd_page(pud_t *pud) +{ + return 0; +} +static inline int pmd_free_pte_page(pmd_t *pmd) +{ + return 0; +} #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE diff --git a/lib/ioremap.c b/lib/ioremap.c index b808a390e4c3e..54e5bbaa32003 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -91,7 +91,8 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr, if (ioremap_pmd_enabled() && ((next - addr) == PMD_SIZE) && - IS_ALIGNED(phys_addr + addr, PMD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PMD_SIZE) && + pmd_free_pte_page(pmd)) { if (pmd_set_huge(pmd, phys_addr + addr, prot)) continue; } @@ -117,7 +118,8 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr, if (ioremap_pud_enabled() && ((next - addr) == PUD_SIZE) && - IS_ALIGNED(phys_addr + addr, PUD_SIZE)) { + IS_ALIGNED(phys_addr + addr, PUD_SIZE) && + pud_free_pmd_page(pud)) { if (pud_set_huge(pud, phys_addr + addr, prot)) continue; } -- cgit v1.2.3 From 28ee90fe6048fa7b7ceaeb8831c0e4e454a4cf89 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Thu, 22 Mar 2018 16:17:24 -0700 Subject: x86/mm: implement free pmd/pte page interfaces Implement pud_free_pmd_page() and pmd_free_pte_page() on x86, which clear a given pud/pmd entry and free up lower level page table(s). The address range associated with the pud/pmd entry must have been purged by INVLPG. Link: http://lkml.kernel.org/r/20180314180155.19492-3-toshi.kani@hpe.com Fixes: e61ce6ade404e ("mm: change ioremap to set up huge I/O mappings") Signed-off-by: Toshi Kani Reported-by: Lei Li Cc: Michal Hocko Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Borislav Petkov Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1eed7ed518e68..34cda7e0551b4 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -712,7 +712,22 @@ int pmd_clear_huge(pmd_t *pmd) */ int pud_free_pmd_page(pud_t *pud) { - return pud_none(*pud); + pmd_t *pmd; + int i; + + if (pud_none(*pud)) + return 1; + + pmd = (pmd_t *)pud_page_vaddr(*pud); + + for (i = 0; i < PTRS_PER_PMD; i++) + if (!pmd_free_pte_page(&pmd[i])) + return 0; + + pud_clear(pud); + free_page((unsigned long)pmd); + + return 1; } /** @@ -724,6 +739,15 @@ int pud_free_pmd_page(pud_t *pud) */ int pmd_free_pte_page(pmd_t *pmd) { - return pmd_none(*pmd); + pte_t *pte; + + if (pmd_none(*pmd)) + return 1; + + pte = (pte_t *)pmd_page_vaddr(*pmd); + pmd_clear(pmd); + free_page((unsigned long)pte); + + return 1; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -- cgit v1.2.3 From 06ace26f4e6fcf747e890a39193be811777a048a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 22 Mar 2018 15:18:53 -0400 Subject: x86/efi: Free efi_pgd with free_pages() The efi_pgd is allocated as PGD_ALLOCATION_ORDER pages and therefore must also be freed as PGD_ALLOCATION_ORDER pages with free_pages(). Fixes: d9e9a6418065 ("x86/mm/pti: Allocate a separate user PGD") Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: linux-efi@vger.kernel.org Cc: Dave Hansen Cc: Ard Biesheuvel Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1521746333-19593-1-git-send-email-longman@redhat.com --- arch/x86/platform/efi/efi_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index c310a82843589..f9cfbc0d1f337 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -227,7 +227,7 @@ int __init efi_alloc_page_tables(void) if (!pud) { if (CONFIG_PGTABLE_LEVELS > 4) free_page((unsigned long) pgd_page_vaddr(*pgd)); - free_page((unsigned long)efi_pgd); + free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); return -ENOMEM; } -- cgit v1.2.3 From d8ba61ba58c88d5207c1ba2f7d9a2280e7d03be9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 23 Jul 2015 15:37:48 -0700 Subject: x86/entry/64: Don't use IST entry for #BP stack There's nothing IST-worthy about #BP/int3. We don't allow kprobes in the small handful of places in the kernel that run at CPL0 with an invalid stack, and 32-bit kernels have used normal interrupt gates for #BP forever. Furthermore, we don't allow kprobes in places that have usergs while in kernel mode, so "paranoid" is also unnecessary. 
Signed-off-by: Andy Lutomirski Signed-off-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/entry/entry_64.S | 2 +- arch/x86/kernel/idt.c | 2 -- arch/x86/kernel/traps.c | 15 ++++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d5c7f18f79ace..9b114675fbc05 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1138,7 +1138,7 @@ apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK -idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 56d99be3706a2..50bee5fe11401 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -160,7 +160,6 @@ static const __initconst struct idt_data early_pf_idts[] = { */ static const __initconst struct idt_data dbg_idts[] = { INTG(X86_TRAP_DB, debug), - INTG(X86_TRAP_BP, int3), }; #endif @@ -183,7 +182,6 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; static const __initconst struct idt_data ist_idts[] = { ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - SISTG(X86_TRAP_BP, int3, DEBUG_STACK), ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), #ifdef CONFIG_X86_MCE ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 3d9b2308e7fad..03f3d7695dacc 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -577,7 +577,6 @@ do_general_protection(struct pt_regs *regs, long error_code) } NOKPROBE_SYMBOL(do_general_protection); -/* May run on IST stack. */ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE @@ -592,6 +591,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; + /* + * Use ist_enter despite the fact that we don't use an IST stack. + * We can be called from a kprobe in non-CONTEXT_KERNEL kernel + * mode or even during context tracking state changes. + * + * This means that we can't schedule. That's okay. + */ ist_enter(regs); RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP @@ -609,15 +615,10 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) goto exit; - /* - * Let others (NMI) know that the debug stack is in use - * as we may switch to the interrupt stack. - */ - debug_stack_usage_inc(); cond_local_irq_enable(regs); do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); cond_local_irq_disable(regs); - debug_stack_usage_dec(); + exit: ist_exit(regs); } -- cgit v1.2.3
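A quick way to exercise the #BP path the last patch touches is a hypothetical userspace probe like the one below (plain C, GCC inline assembly assumed): it raises int3 directly and checks that the kernel still delivers SIGTRAP through the now non-IST gate.

#include <signal.h>
#include <unistd.h>

static void on_trap(int sig)
{
	(void)sig;
	write(STDOUT_FILENO, "caught SIGTRAP from int3\n", 25);
	_exit(0);
}

int main(void)
{
	signal(SIGTRAP, on_trap);
	asm volatile("int3");	/* #BP: handled by do_int3() on the kernel stack */
	return 1;		/* not reached when the trap is delivered */
}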