From c55d8cbfc9447f32cb0d880e23c42d4ad185fae5 Mon Sep 17 00:00:00 2001 From: David Dgien Date: Mon, 29 Jun 2020 20:38:38 -0400 Subject: arm: module: Allow modules outside of bl range Unlike the Linux kernel, barebox does not have a dedicated heap for storing modules. Therefore, if the system memory configuration places the general heap further away than can be reached by a 'bl' instruction (24 bits of address, or 16 MiB), then the module relocations will fail due to being out of range. Allocate PLTs when loading modules so that jumps and calls whose targets are too far away for their relative offsets to be encoded in the instructions themselves can be bounced via veneers in the module's PLT. The modules will use slightly more memory, but after rounding up to page size, the actual memory footprint is usually the same. Adoption of Linux commits: 66e94ba3c8ea ARM: kernel: avoid brute force search on PLT generation 1031a7e674d1 ARM: kernel: sort relocation sections before allocating PLTs 05123fef0982 ARM: kernel: allocate PLT entries only for external symbols 35fa91eed817 ARM: kernel: merge core and init PLTs 7d485f647c1f ARM: 8220/1: allow modules outside of bl range Signed-off-by: David Dgien Signed-off-by: Sascha Hauer --- arch/arm/Kconfig | 15 +++ arch/arm/Makefile | 4 + arch/arm/cpu/Kconfig | 1 + arch/arm/include/asm/module.h | 33 ++++-- arch/arm/lib32/Makefile | 1 + arch/arm/lib32/module-plts.c | 229 ++++++++++++++++++++++++++++++++++++++++++ arch/arm/lib32/module.c | 14 +++ arch/arm/lib32/module.lds | 4 + 8 files changed, 295 insertions(+), 6 deletions(-) create mode 100644 arch/arm/lib32/module-plts.c create mode 100644 arch/arm/lib32/module.lds diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index dfb18777b2..95fd8ecfe7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -477,4 +477,19 @@ config ARM_PSCI_DEBUG putc function. Only use for debugging. +config ARM_MODULE_PLTS + bool "Use PLTs to allow loading modules placed far from barebox image" + depends on MODULES + select QSORT + help + Allocate PLTs when loading modules so that jumps and calls whose + targets are too far away for their relative offsets to be encoded + in the instructions themselves can be bounced via veneers in the + module's PLT. The modules will use slightly more memory, but after + rounding up to page size, the actual memory footprint is usually + the same. + + Say y if your memory configuration puts the heap to far away from the + barebox image, causing relocation out of range errors + endmenu diff --git a/arch/arm/Makefile b/arch/arm/Makefile index c18a1d8029..6ba0a62611 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -18,6 +18,10 @@ AS += -EL LD += -EL endif +ifeq ($(CONFIG_ARM_MODULE_PLTS),y) +LDFLAGS_MODULE += -T $(srctree)/arch/arm/lib32/module.lds +endif + # Unaligned access is not supported when MMU is disabled, so given how # at least some of the code would be executed with MMU off, lets be # conservative and instruct the compiler not to generate any unaligned diff --git a/arch/arm/cpu/Kconfig b/arch/arm/cpu/Kconfig index 6b4fed5269..f9f52a6252 100644 --- a/arch/arm/cpu/Kconfig +++ b/arch/arm/cpu/Kconfig @@ -6,6 +6,7 @@ config PHYS_ADDR_T_64BIT config CPU_32 bool select HAS_MODULES + select HAVE_MOD_ARCH_SPECIFIC select HAS_DMA select HAVE_PBL_IMAGE diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 5b4d1a3f36..3ce39bf82b 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -1,13 +1,34 @@ #ifndef _ASM_ARM_MODULE_H #define _ASM_ARM_MODULE_H -struct mod_arch_specific -{ - int foo; +#include + +struct unwind_table; + +#ifdef CONFIG_ARM_UNWIND +enum { + ARM_SEC_INIT, + ARM_SEC_DEVINIT, + ARM_SEC_CORE, + ARM_SEC_EXIT, + ARM_SEC_DEVEXIT, + ARM_SEC_HOT, + ARM_SEC_UNLIKELY, + ARM_SEC_MAX, +}; +#endif + +struct mod_arch_specific { +#ifdef CONFIG_ARM_UNWIND + struct unwind_table *unwind[ARM_SEC_MAX]; +#endif +#ifdef CONFIG_ARM_MODULE_PLTS + struct elf32_shdr *plt; + int plt_count; +#endif }; -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr +struct module; +u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val); #endif /* _ASM_ARM_MODULE_H */ diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile index 597bc07905..ec6a3aea67 100644 --- a/arch/arm/lib32/Makefile +++ b/arch/arm/lib32/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memset.o obj-$(CONFIG_ARM_UNWIND) += unwind.o obj-$(CONFIG_ARM_SEMIHOSTING) += semihosting-trap.o semihosting.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_ARM_MODULE_PLTS) += module-plts.o extra-y += barebox.lds pbl-y += lib1funcs.o diff --git a/arch/arm/lib32/module-plts.c b/arch/arm/lib32/module-plts.c new file mode 100644 index 0000000000..53cf6b11c7 --- /dev/null +++ b/arch/arm/lib32/module-plts.c @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2014-2017 Linaro Ltd. + */ + +#include +#include +#include +#include + +#include + +#define PLT_ENT_STRIDE 32 +#define PLT_ENT_COUNT (PLT_ENT_STRIDE / sizeof(u32)) +#define PLT_ENT_SIZE (sizeof(struct plt_entries) / PLT_ENT_COUNT) + +#ifdef CONFIG_THUMB2_BAREBOX +#define PLT_ENT_LDR __opcode_to_mem_thumb32(0xf8dff000 | \ + (PLT_ENT_STRIDE - 4)) +#else +#define PLT_ENT_LDR __opcode_to_mem_arm(0xe59ff000 | \ + (PLT_ENT_STRIDE - 8)) +#endif + +struct plt_entries { + u32 ldr[PLT_ENT_COUNT]; + u32 lit[PLT_ENT_COUNT]; +}; + +u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) +{ + struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr; + int idx = 0; + + /* + * Look for an existing entry pointing to 'val'. Given that the + * relocations are sorted, this will be the last entry we allocated. + * (if one exists). + */ + if (mod->arch.plt_count > 0) { + plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT; + idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT; + + if (plt->lit[idx] == val) + return (u32)&plt->ldr[idx]; + + idx = (idx + 1) % PLT_ENT_COUNT; + if (!idx) + plt++; + } + + mod->arch.plt_count++; + BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size); + + if (!idx) + /* Populate a new set of entries */ + *plt = (struct plt_entries){ + { [0 ... PLT_ENT_COUNT - 1] = PLT_ENT_LDR, }, + { val, } + }; + else + plt->lit[idx] = val; + + return (u32)&plt->ldr[idx]; +} + +#define cmp_3way(a,b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rel(const void *a, const void *b) +{ + const Elf32_Rel *x = a, *y = b; + int i; + + /* sort by type and symbol index */ + i = cmp_3way(ELF32_R_TYPE(x->r_info), ELF32_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF32_R_SYM(x->r_info), ELF32_R_SYM(y->r_info)); + return i; +} + +static bool is_zero_addend_relocation(Elf32_Addr base, const Elf32_Rel *rel) +{ + u32 *tval = (u32 *)(base + rel->r_offset); + + /* + * Do a bitwise compare on the raw addend rather than fully decoding + * the offset and doing an arithmetic comparison. + * Note that a zero-addend jump/call relocation is encoded taking the + * PC bias into account, i.e., -8 for ARM and -4 for Thumb2. + */ + switch (ELF32_R_TYPE(rel->r_info)) { + u16 upper, lower; + + case R_ARM_THM_CALL: + case R_ARM_THM_JUMP24: + upper = __mem_to_opcode_thumb16(((u16 *)tval)[0]); + lower = __mem_to_opcode_thumb16(((u16 *)tval)[1]); + + return (upper & 0x7ff) == 0x7ff && (lower & 0x2fff) == 0x2ffe; + + case R_ARM_CALL: + case R_ARM_PC24: + case R_ARM_JUMP24: + return (__mem_to_opcode_arm(*tval) & 0xffffff) == 0xfffffe; + } + BUG(); +} + +static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num) +{ + const Elf32_Rel *prev; + + /* + * Entries are sorted by type and symbol index. That means that, + * if a duplicate entry exists, it must be in the preceding + * slot. + */ + if (!num) + return false; + + prev = rel + num - 1; + return cmp_rel(rel + num, prev) == 0 && + is_zero_addend_relocation(base, prev); +} + +/* Count how many PLT entries we may need */ +static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, + const Elf32_Rel *rel, int num, Elf32_Word dstidx) +{ + unsigned int ret = 0; + const Elf32_Sym *s; + int i; + + for (i = 0; i < num; i++) { + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_ARM_CALL: + case R_ARM_PC24: + case R_ARM_JUMP24: + case R_ARM_THM_CALL: + case R_ARM_THM_JUMP24: + /* + * We only have to consider branch targets that resolve + * to symbols that are defined in a different section. + * This is not simply a heuristic, it is a fundamental + * limitation, since there is no guaranteed way to emit + * PLT entries sufficiently close to the branch if the + * section size exceeds the range of a branch + * instruction. So ignore relocations against defined + * symbols if they live in the same section as the + * relocation target. + */ + s = syms + ELF32_R_SYM(rel[i].r_info); + if (s->st_shndx == dstidx) + break; + + /* + * Jump relocations with non-zero addends against + * undefined symbols are supported by the ELF spec, but + * do not occur in practice (e.g., 'jump n bytes past + * the entry point of undefined function symbol f'). + * So we need to support them, but there is no need to + * take them into consideration when trying to optimize + * this code. So let's only check for duplicates when + * the addend is zero. + */ + if (!is_zero_addend_relocation(base, rel + i) || + !duplicate_rel(base, rel, i)) + ret++; + } + } + return ret; +} + +int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + char *secstrings, struct module *mod) +{ + unsigned long plts = 0; + Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; + Elf32_Sym *syms = NULL; + + /* + * To store the PLTs, we expand the .text section for core module code + * and for initialization code. + */ + for (s = sechdrs; s < sechdrs_end; ++s) { + if (strcmp(".plt", secstrings + s->sh_name) == 0) + mod->arch.plt = s; + else if (s->sh_type == SHT_SYMTAB) + syms = (Elf32_Sym *)s->sh_addr; + } + + if (!mod->arch.plt) { + pr_err("%s: module PLT section missing\n", mod->name); + return -ENOEXEC; + } + if (!syms) { + pr_err("%s: module symtab section missing\n", mod->name); + return -ENOEXEC; + } + + for (s = sechdrs + 1; s < sechdrs_end; ++s) { + Elf32_Rel *rels = (void *)ehdr + s->sh_offset; + int numrels = s->sh_size / sizeof(Elf32_Rel); + Elf32_Shdr *dstsec = sechdrs + s->sh_info; + + if (s->sh_type != SHT_REL) + continue; + + /* ignore relocations that operate on non-exec sections */ + if (!(dstsec->sh_flags & SHF_EXECINSTR)) + continue; + + /* sort by type and symbol index */ + /* n.b. Barebox qsort instead of Linux sort */ + qsort(rels, numrels, sizeof(Elf32_Rel), cmp_rel); + + plts += count_plts(syms, dstsec->sh_addr, rels, numrels, s->sh_info); + } + + mod->arch.plt->sh_type = SHT_NOBITS; + mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.plt->sh_addralign = PLT_ENT_STRIDE; + mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.plt_count = 0; + + pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size); + return 0; +} diff --git a/arch/arm/lib32/module.c b/arch/arm/lib32/module.c index be7965d59c..3ded9896b7 100644 --- a/arch/arm/lib32/module.c +++ b/arch/arm/lib32/module.c @@ -64,6 +64,20 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, offset -= 0x04000000; offset += sym->st_value - loc; + + /* + * Route through a PLT entry if 'offset' exceeds the + * supported range. Note that 'offset + loc + 8' + * contains the absolute jump target, i.e., + * @sym + addend, corrected for the +8 PC bias. + */ + if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS) && + (offset <= (s32)0xfe000000 || + offset >= (s32)0x02000000)) + offset = get_module_plt(module, loc, + offset + loc + 8) + - loc - 8; + if (offset & 3 || offset <= (s32)0xfe000000 || offset >= (s32)0x02000000) { diff --git a/arch/arm/lib32/module.lds b/arch/arm/lib32/module.lds new file mode 100644 index 0000000000..0dd204608c --- /dev/null +++ b/arch/arm/lib32/module.lds @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +SECTIONS { + .plt : { BYTE(0) } +} -- cgit v1.2.3