Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--  arch/powerpc/mm/fault.c                   77
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c           10
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c             40
-rw-r--r--  arch/powerpc/mm/mem.c                      4
-rw-r--r--  arch/powerpc/mm/mmu_context.c              6
-rw-r--r--  arch/powerpc/mm/mmu_context_book3s64.c    39
-rw-r--r--  arch/powerpc/mm/mmu_context_nohash.c     135
-rw-r--r--  arch/powerpc/mm/pgtable-book3s64.c       279
-rw-r--r--  arch/powerpc/mm/pgtable-hash64.c           8
-rw-r--r--  arch/powerpc/mm/pgtable-radix.c           38
-rw-r--r--  arch/powerpc/mm/pgtable.c                 49
-rw-r--r--  arch/powerpc/mm/pgtable_64.c             171
-rw-r--r--  arch/powerpc/mm/pkeys.c                    4
-rw-r--r--  arch/powerpc/mm/ppc_mmu_32.c               2
-rw-r--r--  arch/powerpc/mm/slb.c                     21
-rw-r--r--  arch/powerpc/mm/subpage-prot.c             8
-rw-r--r--  arch/powerpc/mm/tlb-radix.c              366
-rw-r--r--  arch/powerpc/mm/tlb_hash32.c              10
18 files changed, 916 insertions, 351 deletions
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index c01d627e687ae..b1ca7a0974e30 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -22,6 +22,7 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/pagemap.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
@@ -66,37 +67,33 @@ static inline bool notify_page_fault(struct pt_regs *regs)
}
/*
- * Check whether the instruction at regs->nip is a store using
+ * Check whether the instruction inst is a store using
* an update addressing form which will update r1.
*/
-static bool store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(unsigned int inst)
{
- unsigned int inst;
-
- if (get_user(inst, (unsigned int __user *)regs->nip))
- return false;
/* check for 1 in the rA field */
if (((inst >> 16) & 0x1f) != 1)
return false;
/* check major opcode */
switch (inst >> 26) {
- case 37: /* stwu */
- case 39: /* stbu */
- case 45: /* sthu */
- case 53: /* stfsu */
- case 55: /* stfdu */
+ case OP_STWU:
+ case OP_STBU:
+ case OP_STHU:
+ case OP_STFSU:
+ case OP_STFDU:
return true;
- case 62: /* std or stdu */
+ case OP_STD: /* std or stdu */
return (inst & 3) == 1;
- case 31:
+ case OP_31:
/* check minor opcode */
switch ((inst >> 1) & 0x3ff) {
- case 181: /* stdux */
- case 183: /* stwux */
- case 247: /* stbux */
- case 439: /* sthux */
- case 695: /* stfsux */
- case 759: /* stfdux */
+ case OP_31_XOP_STDUX:
+ case OP_31_XOP_STWUX:
+ case OP_31_XOP_STBUX:
+ case OP_31_XOP_STHUX:
+ case OP_31_XOP_STFSUX:
+ case OP_31_XOP_STFDUX:
return true;
}
}
@@ -168,6 +165,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
return SIGBUS;
current->thread.trap_nr = BUS_ADRERR;
+ clear_siginfo(&info);
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_ADRERR;
@@ -234,8 +232,8 @@ static bool bad_kernel_fault(bool is_exec, unsigned long error_code,
}
static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
- struct vm_area_struct *vma,
- bool store_update_sp)
+ struct vm_area_struct *vma, unsigned int flags,
+ bool *must_retry)
{
/*
* N.B. The POWER/Open ABI allows programs to access up to
@@ -247,6 +245,7 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* expand to 1MB without further checks.
*/
if (address + 0x100000 < vma->vm_end) {
+ unsigned int __user *nip = (unsigned int __user *)regs->nip;
/* get user regs even if this fault is in kernel mode */
struct pt_regs *uregs = current->thread.regs;
if (uregs == NULL)
@@ -264,8 +263,22 @@ static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
* between the last mapped region and the stack will
* expand the stack rather than segfaulting.
*/
- if (address + 2048 < uregs->gpr[1] && !store_update_sp)
- return true;
+ if (address + 2048 >= uregs->gpr[1])
+ return false;
+
+ if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
+ access_ok(VERIFY_READ, nip, sizeof(*nip))) {
+ unsigned int inst;
+ int res;
+
+ pagefault_disable();
+ res = __get_user_inatomic(inst, nip);
+ pagefault_enable();
+ if (!res)
+ return !store_updates_sp(inst);
+ *must_retry = true;
+ }
+ return true;
}
return false;
}
@@ -403,7 +416,7 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
int is_user = user_mode(regs);
int is_write = page_fault_is_write(error_code);
int fault, major = 0;
- bool store_update_sp = false;
+ bool must_retry = false;
if (notify_page_fault(regs))
return 0;
@@ -454,9 +467,6 @@ static int __do_page_fault(struct pt_regs *regs, unsigned long address,
* can result in fault, which will cause a deadlock when called with
* mmap_sem held
*/
- if (is_write && is_user)
- store_update_sp = store_updates_sp(regs);
-
if (is_user)
flags |= FAULT_FLAG_USER;
if (is_write)
@@ -503,8 +513,17 @@ retry:
return bad_area(regs, address);
/* The stack is being expanded, check if it's valid */
- if (unlikely(bad_stack_expansion(regs, address, vma, store_update_sp)))
- return bad_area(regs, address);
+ if (unlikely(bad_stack_expansion(regs, address, vma, flags,
+ &must_retry))) {
+ if (!must_retry)
+ return bad_area(regs, address);
+
+ up_read(&mm->mmap_sem);
+ if (fault_in_pages_readable((const char __user *)regs->nip,
+ sizeof(unsigned int)))
+ return bad_area_nosemaphore(regs, address);
+ goto retry;
+ }
/* Try to expand it */
if (unlikely(expand_stack(vma, address)))
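The rA/opcode checks in store_updates_sp() above are plain bit-field extraction on the 32-bit instruction word; a minimal standalone C sketch of the same decode for the D-form update stores (the opcode values are the ones visible in the removed lines above; the main() test value is illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Primary opcodes of the D-form update stores checked above. */
#define OP_STWU  37
#define OP_STBU  39
#define OP_STHU  45
#define OP_STFSU 53
#define OP_STFDU 55

/* Is this a store-with-update through r1 (the stack pointer)? */
static bool updates_r1(uint32_t inst)
{
    unsigned int ra = (inst >> 16) & 0x1f;  /* rA field */
    unsigned int op = inst >> 26;           /* primary opcode */

    if (ra != 1)
        return false;
    switch (op) {
    case OP_STWU:
    case OP_STBU:
    case OP_STHU:
    case OP_STFSU:
    case OP_STFDU:
        return true;
    }
    return false;
}

int main(void)
{
    /* stwu r3,-16(r1): opcode 37, RS=3, RA=1, D=-16 */
    uint32_t inst = (37u << 26) | (3u << 21) | (1u << 16) | 0xfff0;

    printf("updates r1: %d\n", updates_r1(inst));
    return 0;
}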
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 0bd3790d35df4..8318716e5075a 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -64,6 +64,7 @@
#include <asm/trace.h>
#include <asm/ps3.h>
#include <asm/pte-walk.h>
+#include <asm/asm-prototypes.h>
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
@@ -572,8 +573,10 @@ static void __init htab_scan_page_sizes(void)
}
#ifdef CONFIG_HUGETLB_PAGE
- /* Reserve 16G huge page memory sections for huge pages */
- of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ if (!hugetlb_disabled) {
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+ }
#endif /* CONFIG_HUGETLB_PAGE */
}
@@ -1010,13 +1013,14 @@ void __init hash__early_init_mmu(void)
*/
__pte_frag_nr = H_PTE_FRAG_NR;
__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = H_PMD_FRAG_NR;
+ __pmd_frag_size_shift = H_PMD_FRAG_SIZE_SHIFT;
__pte_index_size = H_PTE_INDEX_SIZE;
__pmd_index_size = H_PMD_INDEX_SIZE;
__pud_index_size = H_PUD_INDEX_SIZE;
__pgd_index_size = H_PGD_INDEX_SIZE;
__pud_cache_index = H_PUD_CACHE_INDEX;
- __pmd_cache_index = H_PMD_CACHE_INDEX;
__pte_table_size = H_PTE_TABLE_SIZE;
__pmd_table_size = H_PMD_TABLE_SIZE;
__pud_table_size = H_PUD_TABLE_SIZE;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index f1153f8254e3d..7c5f479c5c00f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -35,6 +35,8 @@
#define PAGE_SHIFT_16M 24
#define PAGE_SHIFT_16G 34
+bool hugetlb_disabled = false;
+
unsigned int HPAGE_SHIFT;
EXPORT_SYMBOL(HPAGE_SHIFT);
@@ -50,7 +52,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long s
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address, unsigned pdshift, unsigned pshift)
+ unsigned long address, unsigned int pdshift,
+ unsigned int pshift, spinlock_t *ptl)
{
struct kmem_cache *cachep;
pte_t *new;
@@ -80,8 +83,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
*/
smp_wmb();
- spin_lock(&mm->page_table_lock);
-
+ spin_lock(ptl);
/*
* We have multiple higher-level entries that point to the same
* actual pte location. Fill in each as we go and backtrack on error.
@@ -111,7 +113,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
*hpdp = __hugepd(0);
kmem_cache_free(cachep, new);
}
- spin_unlock(&mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -136,6 +138,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
hugepd_t *hpdp = NULL;
unsigned pshift = __ffs(sz);
unsigned pdshift = PGDIR_SHIFT;
+ spinlock_t *ptl;
addr &= ~(sz-1);
pg = pgd_offset(mm, addr);
@@ -144,39 +147,46 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
if (pshift == PGDIR_SHIFT)
/* 16GB huge page */
return (pte_t *) pg;
- else if (pshift > PUD_SHIFT)
+ else if (pshift > PUD_SHIFT) {
/*
* We need to use hugepd table
*/
+ ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
- else {
+ } else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
if (pshift == PUD_SHIFT)
return (pte_t *)pu;
- else if (pshift > PMD_SHIFT)
+ else if (pshift > PMD_SHIFT) {
+ ptl = pud_lockptr(mm, pu);
hpdp = (hugepd_t *)pu;
- else {
+ } else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
if (pshift == PMD_SHIFT)
/* 16MB hugepage */
return (pte_t *)pm;
- else
+ else {
+ ptl = pmd_lockptr(mm, pm);
hpdp = (hugepd_t *)pm;
+ }
}
}
#else
if (pshift >= HUGEPD_PGD_SHIFT) {
+ ptl = &mm->page_table_lock;
hpdp = (hugepd_t *)pg;
} else {
pdshift = PUD_SHIFT;
pu = pud_alloc(mm, pg, addr);
if (pshift >= HUGEPD_PUD_SHIFT) {
+ ptl = pud_lockptr(mm, pu);
hpdp = (hugepd_t *)pu;
} else {
pdshift = PMD_SHIFT;
pm = pmd_alloc(mm, pu, addr);
+ ptl = pmd_lockptr(mm, pm);
hpdp = (hugepd_t *)pm;
}
}
@@ -186,7 +196,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr,
+ pdshift, pshift, ptl))
return NULL;
return hugepte_offset(*hpdp, addr, pdshift);
@@ -497,6 +508,10 @@ struct page *follow_huge_pd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
retry:
+ /*
+ * hugepage directory entries are protected by mm->page_table_lock
+ * Use this instead of huge_pte_lockptr
+ */
ptl = &mm->page_table_lock;
spin_lock(ptl);
@@ -651,6 +666,11 @@ static int __init hugetlbpage_init(void)
{
int psize;
+ if (hugetlb_disabled) {
+ pr_info("HugeTLB support is disabled!\n");
+ return 0;
+ }
+
#if !defined(CONFIG_PPC_FSL_BOOK3E) && !defined(CONFIG_PPC_8xx)
if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE))
return -ENODEV;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index c3c39b02b2ba7..8cecda4bd66ae 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -509,8 +509,10 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
*/
unsigned long access, trap;
- if (radix_enabled())
+ if (radix_enabled()) {
+ prefetch((void *)address);
return;
+ }
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0ab297c4cfad1..f84e14f23e50a 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -57,8 +57,10 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* in switch_slb(), and/or the store of paca->mm_ctx_id in
* copy_mm_to_paca().
*
- * On the read side the barrier is in pte_xchg(), which orders
- * the store to the PTE vs the load of mm_cpumask.
+ * On the other side, the barrier is in mm/tlb-radix.c for
+ * radix which orders earlier stores to clear the PTEs vs
+ * the load of mm_cpumask. And pte_xchg which does the same
+ * thing for hash.
*
* This full barrier is needed by membarrier when switching
* between processes after store to rq->curr, before user-space
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index b75194dff64c2..f3d4b4a0e5616 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -159,9 +159,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
mm->context.id = index;
-#ifdef CONFIG_PPC_64K_PAGES
mm->context.pte_frag = NULL;
-#endif
+ mm->context.pmd_frag = NULL;
#ifdef CONFIG_SPAPR_TCE_IOMMU
mm_iommu_init(mm);
#endif
@@ -192,17 +191,11 @@ static void destroy_contexts(mm_context_t *ctx)
spin_unlock(&mmu_context_lock);
}
-#ifdef CONFIG_PPC_64K_PAGES
-static void destroy_pagetable_page(struct mm_struct *mm)
+static void pte_frag_destroy(void *pte_frag)
{
int count;
- void *pte_frag;
struct page *page;
- pte_frag = mm->context.pte_frag;
- if (!pte_frag)
- return;
-
page = virt_to_page(pte_frag);
/* drop all the pending references */
count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
@@ -213,12 +206,34 @@ static void destroy_pagetable_page(struct mm_struct *mm)
}
}
-#else
-static inline void destroy_pagetable_page(struct mm_struct *mm)
+static void pmd_frag_destroy(void *pmd_frag)
{
+ int count;
+ struct page *page;
+
+ page = virt_to_page(pmd_frag);
+ /* drop all the pending references */
+ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
+ /* We allow PMD_FRAG_NR fragments from a PMD page */
+ if (page_ref_sub_and_test(page, PMD_FRAG_NR - count)) {
+ pgtable_pmd_page_dtor(page);
+ free_unref_page(page);
+ }
+}
+
+static void destroy_pagetable_page(struct mm_struct *mm)
+{
+ void *frag;
+
+ frag = mm->context.pte_frag;
+ if (frag)
+ pte_frag_destroy(frag);
+
+ frag = mm->context.pmd_frag;
+ if (frag)
+ pmd_frag_destroy(frag);
return;
}
-#endif
void destroy_context(struct mm_struct *mm)
{
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index be8f5c9d4d088..4d80239ef83c3 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -54,16 +54,44 @@
#include "mmu_decl.h"
-static unsigned int first_context, last_context;
+/*
+ * The MPC8xx has only 16 contexts. We rotate through them on each task switch.
+ * A better way would be to keep track of tasks that own contexts, and implement
+ * an LRU usage. That way very active tasks don't always have to pay the TLB
+ * reload overhead. The kernel pages are mapped shared, so the kernel can run on
+ * behalf of any task that makes a kernel entry. Shared does not mean they are
+ * not protected, just that the ASID comparison is not performed. -- Dan
+ *
+ * The IBM4xx has 256 contexts, so we can just rotate through these as a way of
+ * "switching" contexts. If the TID of the TLB is zero, the PID/TID comparison
+ * is disabled, so we can use a TID of zero to represent all kernel pages as
+ * shared among all contexts. -- Dan
+ *
+ * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We should
+ * normally never have to steal though the facility is present if needed.
+ * -- BenH
+ */
+#define FIRST_CONTEXT 1
+#ifdef DEBUG_CLAMP_LAST_CONTEXT
+#define LAST_CONTEXT DEBUG_CLAMP_LAST_CONTEXT
+#elif defined(CONFIG_PPC_8xx)
+#define LAST_CONTEXT 16
+#elif defined(CONFIG_PPC_47x)
+#define LAST_CONTEXT 65535
+#else
+#define LAST_CONTEXT 255
+#endif
+
static unsigned int next_context, nr_free_contexts;
static unsigned long *context_map;
+#ifdef CONFIG_SMP
static unsigned long *stale_map[NR_CPUS];
+#endif
static struct mm_struct **context_mm;
static DEFINE_RAW_SPINLOCK(context_lock);
-static bool no_selective_tlbil;
#define CTX_MAP_SIZE \
- (sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+ (sizeof(unsigned long) * (LAST_CONTEXT / BITS_PER_LONG + 1))
/* Steal a context from a task that has one at the moment.
@@ -87,7 +115,7 @@ static unsigned int steal_context_smp(unsigned int id)
struct mm_struct *mm;
unsigned int cpu, max, i;
- max = last_context - first_context;
+ max = LAST_CONTEXT - FIRST_CONTEXT;
/* Attempt to free next_context first and then loop until we manage */
while (max--) {
@@ -99,8 +127,8 @@ static unsigned int steal_context_smp(unsigned int id)
*/
if (mm->context.active) {
id++;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
continue;
}
pr_hardcont(" | steal %d from 0x%p", id, mm);
@@ -139,10 +167,12 @@ static unsigned int steal_context_smp(unsigned int id)
static unsigned int steal_all_contexts(void)
{
struct mm_struct *mm;
+#ifdef CONFIG_SMP
int cpu = smp_processor_id();
+#endif
unsigned int id;
- for (id = first_context; id <= last_context; id++) {
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
/* Pick up the victim mm */
mm = context_mm[id];
@@ -150,22 +180,24 @@ static unsigned int steal_all_contexts(void)
/* Mark this mm as having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
- if (id != first_context) {
+ if (id != FIRST_CONTEXT) {
context_mm[id] = NULL;
__clear_bit(id, context_map);
#ifdef DEBUG_MAP_CONSISTENCY
mm->context.active = 0;
#endif
}
+#ifdef CONFIG_SMP
__clear_bit(id, stale_map[cpu]);
+#endif
}
/* Flush the TLB for all contexts (not to be used on SMP) */
_tlbil_all();
- nr_free_contexts = last_context - first_context;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT;
- return first_context;
+ return FIRST_CONTEXT;
}
/* Note that this will also be called on SMP if all other CPUs are
@@ -176,7 +208,9 @@ static unsigned int steal_all_contexts(void)
static unsigned int steal_context_up(unsigned int id)
{
struct mm_struct *mm;
+#ifdef CONFIG_SMP
int cpu = smp_processor_id();
+#endif
/* Pick up the victim mm */
mm = context_mm[id];
@@ -190,7 +224,9 @@ static unsigned int steal_context_up(unsigned int id)
mm->context.id = MMU_NO_CONTEXT;
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
+#ifdef CONFIG_SMP
__clear_bit(id, stale_map[cpu]);
+#endif
return id;
}
@@ -201,7 +237,7 @@ static void context_check_map(void)
unsigned int id, nrf, nact;
nrf = nact = 0;
- for (id = first_context; id <= last_context; id++) {
+ for (id = FIRST_CONTEXT; id <= LAST_CONTEXT; id++) {
int used = test_bit(id, context_map);
if (!used)
nrf++;
@@ -219,7 +255,7 @@ static void context_check_map(void)
if (nact > num_online_cpus())
pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
nact, num_online_cpus());
- if (first_context > 0 && !test_bit(0, context_map))
+ if (FIRST_CONTEXT > 0 && !test_bit(0, context_map))
pr_err("MMU: Context 0 has been freed !!!\n");
}
#else
@@ -229,7 +265,10 @@ static void context_check_map(void) { }
void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
- unsigned int i, id, cpu = smp_processor_id();
+ unsigned int id;
+#ifdef CONFIG_SMP
+ unsigned int i, cpu = smp_processor_id();
+#endif
unsigned long *map;
/* No lockless fast path .. yet */
@@ -263,8 +302,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* We really don't have a context, let's try to acquire one */
id = next_context;
- if (id > last_context)
- id = first_context;
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
map = context_map;
/* No more free contexts, let's try to steal one */
@@ -277,7 +316,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
goto stolen;
}
#endif /* CONFIG_SMP */
- if (no_selective_tlbil)
+ if (IS_ENABLED(CONFIG_PPC_8xx))
id = steal_all_contexts();
else
id = steal_context_up(id);
@@ -287,9 +326,9 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* We know there's at least one free context, try to find it */
while (__test_and_set_bit(id, map)) {
- id = find_next_zero_bit(map, last_context+1, id);
- if (id > last_context)
- id = first_context;
+ id = find_next_zero_bit(map, LAST_CONTEXT+1, id);
+ if (id > LAST_CONTEXT)
+ id = FIRST_CONTEXT;
}
stolen:
next_context = id + 1;
@@ -303,6 +342,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
/* If that context got marked stale on this CPU, then flush the
* local TLB for it and unmark it before we use it
*/
+#ifdef CONFIG_SMP
if (test_bit(id, stale_map[cpu])) {
pr_hardcont(" | stale flush %d [%d..%d]",
id, cpu_first_thread_sibling(cpu),
@@ -317,6 +357,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next,
__clear_bit(id, stale_map[i]);
}
}
+#endif
/* Flick the MMU and release lock */
pr_hardcont(" -> %d\n", id);
@@ -418,51 +459,11 @@ void __init mmu_context_init(void)
init_mm.context.active = NR_CPUS;
/*
- * The MPC8xx has only 16 contexts. We rotate through them on each
- * task switch. A better way would be to keep track of tasks that
- * own contexts, and implement an LRU usage. That way very active
- * tasks don't always have to pay the TLB reload overhead. The
- * kernel pages are mapped shared, so the kernel can run on behalf
- * of any task that makes a kernel entry. Shared does not mean they
- * are not protected, just that the ASID comparison is not performed.
- * -- Dan
- *
- * The IBM4xx has 256 contexts, so we can just rotate through these
- * as a way of "switching" contexts. If the TID of the TLB is zero,
- * the PID/TID comparison is disabled, so we can use a TID of zero
- * to represent all kernel pages as shared among all contexts.
- * -- Dan
- *
- * The IBM 47x core supports 16-bit PIDs, thus 65535 contexts. We
- * should normally never have to steal though the facility is
- * present if needed.
- * -- BenH
- */
- if (mmu_has_feature(MMU_FTR_TYPE_8xx)) {
- first_context = 1;
- last_context = 16;
- no_selective_tlbil = true;
- } else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
- first_context = 1;
- last_context = 65535;
- no_selective_tlbil = false;
- } else {
- first_context = 1;
- last_context = 255;
- no_selective_tlbil = false;
- }
-
-#ifdef DEBUG_CLAMP_LAST_CONTEXT
- last_context = DEBUG_CLAMP_LAST_CONTEXT;
-#endif
- /*
* Allocate the maps used by context management
*/
context_map = memblock_virt_alloc(CTX_MAP_SIZE, 0);
- context_mm = memblock_virt_alloc(sizeof(void *) * (last_context + 1), 0);
-#ifndef CONFIG_SMP
- stale_map[0] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
-#else
+ context_mm = memblock_virt_alloc(sizeof(void *) * (LAST_CONTEXT + 1), 0);
+#ifdef CONFIG_SMP
stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE,
@@ -472,17 +473,17 @@ void __init mmu_context_init(void)
printk(KERN_INFO
"MMU: Allocated %zu bytes of context maps for %d contexts\n",
- 2 * CTX_MAP_SIZE + (sizeof(void *) * (last_context + 1)),
- last_context - first_context + 1);
+ 2 * CTX_MAP_SIZE + (sizeof(void *) * (LAST_CONTEXT + 1)),
+ LAST_CONTEXT - FIRST_CONTEXT + 1);
/*
* Some processors have too few contexts to reserve one for
* init_mm, and require using context 0 for a normal task.
* Other processors reserve the use of context zero for the kernel.
- * This code assumes first_context < 32.
+ * This code assumes FIRST_CONTEXT < 32.
*/
- context_map[0] = (1 << first_context) - 1;
- next_context = first_context;
- nr_free_contexts = last_context - first_context + 1;
+ context_map[0] = (1 << FIRST_CONTEXT) - 1;
+ next_context = FIRST_CONTEXT;
+ nr_free_contexts = LAST_CONTEXT - FIRST_CONTEXT + 1;
}
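The FIRST_CONTEXT..LAST_CONTEXT handling above boils down to a circular scan of a used-ID bitmap starting at next_context; a minimal user-space sketch of that allocation step (the byte array and the 255 bound stand in for context_map and the 4xx default; context stealing is stubbed out):

#include <stdio.h>

#define FIRST_CONTEXT 1
#define LAST_CONTEXT  255   /* the non-8xx, non-47x default above */

static unsigned char context_used[LAST_CONTEXT + 1];  /* stand-in for context_map */
static unsigned int next_context = FIRST_CONTEXT;

/* Find the next free context ID, wrapping from LAST_CONTEXT back to FIRST_CONTEXT. */
static int alloc_context(void)
{
    unsigned int id = next_context;
    unsigned int tries;

    for (tries = 0; tries <= LAST_CONTEXT - FIRST_CONTEXT; tries++, id++) {
        if (id > LAST_CONTEXT)
            id = FIRST_CONTEXT;
        if (!context_used[id]) {
            context_used[id] = 1;
            next_context = id + 1;
            return id;
        }
    }
    return -1;  /* nothing free: the real code steals a context here */
}

int main(void)
{
    int a = alloc_context();
    int b = alloc_context();
    int c = alloc_context();

    printf("allocated IDs: %d %d %d\n", a, b, c);  /* 1 2 3 */
    return 0;
}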
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 518518fb7c45a..c1f4ca45c93a4 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,14 +9,22 @@
#include <linux/sched.h>
#include <linux/mm_types.h>
+#include <linux/memblock.h>
#include <misc/cxl-base.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
+#include <asm/trace.h>
+#include <asm/powernv.h>
#include "mmu_decl.h"
#include <trace/events/thp.h>
+unsigned long __pmd_frag_nr;
+EXPORT_SYMBOL(__pmd_frag_nr);
+unsigned long __pmd_frag_size_shift;
+EXPORT_SYMBOL(__pmd_frag_size_shift);
+
int (*register_process_table)(unsigned long base, unsigned long page_size,
unsigned long tbl_size);
@@ -34,13 +42,16 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
int changed;
#ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&vma->vm_mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(vma->vm_mm, pmdp));
#endif
changed = !pmd_same(*(pmdp), entry);
if (changed) {
- __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
- pmd_pte(entry), address);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ /*
+ * We can use MMU_PAGE_2M here, because only the radix
+ * path looks at the psize.
+ */
+ __ptep_set_access_flags(vma, pmdp_ptep(pmdp),
+ pmd_pte(entry), address, MMU_PAGE_2M);
}
return changed;
}
@@ -59,7 +70,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
{
#ifdef CONFIG_DEBUG_VM
WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
WARN_ON(!(pmd_trans_huge(pmd) || pmd_devmap(pmd)));
#endif
trace_hugepage_set_pmd(addr, pmd_val(pmd));
@@ -141,7 +152,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd)
{
- return;
+ if (radix_enabled())
+ prefetch((void *)addr);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -171,3 +183,258 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
return hash__remove_section_mapping(start, end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */
+
+void __init mmu_partition_table_init(void)
+{
+ unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+ unsigned long ptcr;
+
+ BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+ partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+ MEMBLOCK_ALLOC_ANYWHERE));
+
+ /* Initialize the Partition Table with no entries */
+ memset((void *)partition_tb, 0, patb_size);
+
+ /*
+ * update partition table control register,
+ * 64 K size.
+ */
+ ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
+ mtspr(SPRN_PTCR, ptcr);
+ powernv_set_nmmu_ptcr(ptcr);
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+ unsigned long dw1)
+{
+ unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
+
+ partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+ /*
+ * Global flush of TLBs and partition table caches for this lpid.
+ * The type of flush (hash or radix) depends on what the previous
+ * use of this partition ID was, not the new use.
+ */
+ asm volatile("ptesync" : : : "memory");
+ if (old & PATB_HR) {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
+ } else {
+ asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+ "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+ trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
+ }
+ /* do we need fixup here ?*/
+ asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+
+static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
+{
+ void *pmd_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pmd_frag;
+ if (ret) {
+ pmd_frag = ret + PMD_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PMD page NULL
+ */
+ if (((unsigned long)pmd_frag & ~PAGE_MASK) == 0)
+ pmd_frag = NULL;
+ mm->context.pmd_frag = pmd_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pmd_t *)ret;
+}
+
+static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm)
+{
+ void *ret = NULL;
+ struct page *page;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+
+ if (mm == &init_mm)
+ gfp &= ~__GFP_ACCOUNT;
+ page = alloc_page(gfp);
+ if (!page)
+ return NULL;
+ if (!pgtable_pmd_page_ctor(page)) {
+ __free_pages(page, 0);
+ return NULL;
+ }
+
+ ret = page_address(page);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PMD_FRAG_NR == 1)
+ return ret;
+
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragement
+ * count.
+ */
+ if (likely(!mm->context.pmd_frag)) {
+ set_page_count(page, PMD_FRAG_NR);
+ mm->context.pmd_frag = ret + PMD_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pmd_t *)ret;
+}
+
+pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr)
+{
+ pmd_t *pmd;
+
+ pmd = get_pmd_from_cache(mm);
+ if (pmd)
+ return pmd;
+
+ return __alloc_for_pmdcache(mm);
+}
+
+void pmd_fragment_free(unsigned long *pmd)
+{
+ struct page *page = virt_to_page(pmd);
+
+ if (put_page_testzero(page)) {
+ pgtable_pmd_page_dtor(page);
+ free_unref_page(page);
+ }
+}
+
+static pte_t *get_pte_from_cache(struct mm_struct *mm)
+{
+ void *pte_frag, *ret;
+
+ spin_lock(&mm->page_table_lock);
+ ret = mm->context.pte_frag;
+ if (ret) {
+ pte_frag = ret + PTE_FRAG_SIZE;
+ /*
+ * If we have taken up all the fragments mark PTE page NULL
+ */
+ if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
+ pte_frag = NULL;
+ mm->context.pte_frag = pte_frag;
+ }
+ spin_unlock(&mm->page_table_lock);
+ return (pte_t *)ret;
+}
+
+static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel)
+{
+ void *ret = NULL;
+ struct page *page;
+
+ if (!kernel) {
+ page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
+ if (!page)
+ return NULL;
+ if (!pgtable_page_ctor(page)) {
+ __free_page(page);
+ return NULL;
+ }
+ } else {
+ page = alloc_page(PGALLOC_GFP);
+ if (!page)
+ return NULL;
+ }
+
+
+ ret = page_address(page);
+ /*
+ * if we support only one fragment just return the
+ * allocated page.
+ */
+ if (PTE_FRAG_NR == 1)
+ return ret;
+ spin_lock(&mm->page_table_lock);
+ /*
+ * If we find pgtable_page set, we return
+ * the allocated page with single fragment
+ * count.
+ */
+ if (likely(!mm->context.pte_frag)) {
+ set_page_count(page, PTE_FRAG_NR);
+ mm->context.pte_frag = ret + PTE_FRAG_SIZE;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return (pte_t *)ret;
+}
+
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+{
+ pte_t *pte;
+
+ pte = get_pte_from_cache(mm);
+ if (pte)
+ return pte;
+
+ return __alloc_for_ptecache(mm, kernel);
+}
+
+void pte_fragment_free(unsigned long *table, int kernel)
+{
+ struct page *page = virt_to_page(table);
+
+ if (put_page_testzero(page)) {
+ if (!kernel)
+ pgtable_page_dtor(page);
+ free_unref_page(page);
+ }
+}
+
+static inline void pgtable_free(void *table, int index)
+{
+ switch (index) {
+ case PTE_INDEX:
+ pte_fragment_free(table, 0);
+ break;
+ case PMD_INDEX:
+ pmd_fragment_free(table);
+ break;
+ case PUD_INDEX:
+ kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+ break;
+ /* We don't free pgd table via RCU callback */
+ default:
+ BUG();
+ }
+}
+
+#ifdef CONFIG_SMP
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ unsigned long pgf = (unsigned long)table;
+
+ BUG_ON(index > MAX_PGTABLE_INDEX_SIZE);
+ pgf |= index;
+ tlb_remove_table(tlb, (void *)pgf);
+}
+
+void __tlb_remove_table(void *_table)
+{
+ void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
+ unsigned int index = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
+
+ return pgtable_free(table, index);
+}
+#else
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
+{
+ return pgtable_free(table, index);
+}
+#endif
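pgtable_free_tlb() and __tlb_remove_table() above smuggle the table kind (PTE_INDEX/PMD_INDEX/PUD_INDEX) through the low bits of the page-aligned table pointer; a standalone sketch of that round trip, using an illustrative 0xf mask in place of MAX_PGTABLE_INDEX_SIZE:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define INDEX_MASK 0xfUL   /* illustrative stand-in for MAX_PGTABLE_INDEX_SIZE */

/* Pack a small type index into the unused low bits of an aligned pointer. */
static void *pack_table(void *table, unsigned int index)
{
    uintptr_t pgf = (uintptr_t)table;

    assert((pgf & INDEX_MASK) == 0);   /* table pointers are well aligned */
    assert(index <= INDEX_MASK);
    return (void *)(pgf | index);
}

/* Recover the original pointer and the index, as __tlb_remove_table() does. */
static void unpack_table(void *packed, void **table, unsigned int *index)
{
    *table = (void *)((uintptr_t)packed & ~INDEX_MASK);
    *index = (uintptr_t)packed & INDEX_MASK;
}

int main(void)
{
    void *table = aligned_alloc(4096, 4096);   /* model of a page-sized table */
    void *out;
    unsigned int index;

    unpack_table(pack_table(table, 2 /* e.g. PMD_INDEX */), &out, &index);
    printf("pointer preserved: %d, index: %u\n", out == table, index);
    free(table);
    return 0;
}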
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 199bfda5f0d96..692bfc9e372cb 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -193,7 +193,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr
#ifdef CONFIG_DEBUG_VM
WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
__asm__ __volatile__(
@@ -265,7 +265,8 @@ void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
/*
* we store the pgtable in the second half of PMD
*/
@@ -285,7 +286,8 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pgtable_t pgtable;
pgtable_t *pgtable_slot;
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
+
pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
pgtable = *pgtable_slot;
/*
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index f1891e215e39e..96f68c5aa1f5b 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -617,7 +617,6 @@ void __init radix__early_init_mmu(void)
__pud_index_size = RADIX_PUD_INDEX_SIZE;
__pgd_index_size = RADIX_PGD_INDEX_SIZE;
__pud_cache_index = RADIX_PUD_INDEX_SIZE;
- __pmd_cache_index = RADIX_PMD_INDEX_SIZE;
__pte_table_size = RADIX_PTE_TABLE_SIZE;
__pmd_table_size = RADIX_PMD_TABLE_SIZE;
__pud_table_size = RADIX_PUD_TABLE_SIZE;
@@ -640,6 +639,8 @@ void __init radix__early_init_mmu(void)
#endif
__pte_frag_nr = RADIX_PTE_FRAG_NR;
__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
+ __pmd_frag_nr = RADIX_PMD_FRAG_NR;
+ __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
if (!firmware_has_feature(FW_FEATURE_LPAR)) {
radix_init_native();
@@ -975,7 +976,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add
#ifdef CONFIG_DEBUG_VM
WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
- assert_spin_locked(&mm->page_table_lock);
+ assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif
old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
@@ -1083,3 +1084,36 @@ int radix__has_transparent_hugepage(void)
return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
+ pte_t entry, unsigned long address, int psize)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
+ _PAGE_RW | _PAGE_EXEC);
+ /*
+ * To avoid NMMU hang while relaxing access, we need to mark
+ * the pte invalid in between.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
+ atomic_read(&mm->context.copros) > 0) {
+ unsigned long old_pte, new_pte;
+
+ old_pte = __radix_pte_update(ptep, ~0, 0);
+ /*
+ * new value of pte
+ */
+ new_pte = old_pte | set;
+ radix__flush_tlb_page_psize(mm, address, psize);
+ __radix_pte_update(ptep, 0, new_pte);
+ } else {
+ __radix_pte_update(ptep, 0, set);
+ /*
+ * Book3S does not require a TLB flush when relaxing access
+ * restrictions when the address space is not attached to a
+ * NMMU, because the core MMU will reload the pte after taking
+ * an access fault, which is defined by the architecture.
+ */
+ }
+ /* See ptesync comment in radix__set_pte_at */
+}
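The nest-MMU branch of radix__ptep_set_access_flags() above never lets a stale translation coexist with the relaxed PTE: it clears the entry, flushes, then writes the new value. A minimal model of that ordering (the pte word, bit values and flush hook are illustrative, not kernel APIs):

#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT 0x1ULL
#define PTE_RW      0x2ULL   /* illustrative "relaxed access" bit */

typedef uint64_t pte_t;

static void flush_tlb_page(unsigned long addr)
{
    /* models the radix__flush_tlb_page_psize() call between clear and re-set */
    printf("flush translation for %#lx\n", addr);
}

/* Relax access without ever exposing a stale-but-valid entry. */
static void relax_access(pte_t *ptep, pte_t set_bits, unsigned long addr)
{
    pte_t old = *ptep;

    *ptep = 0;               /* 1. invalidate the PTE */
    flush_tlb_page(addr);    /* 2. flush any cached translation */
    *ptep = old | set_bits;  /* 3. install the PTE with relaxed permissions */
}

int main(void)
{
    pte_t pte = PTE_PRESENT;

    relax_access(&pte, PTE_RW, 0x10000);
    printf("pte is now %#llx\n", (unsigned long long)pte);
    return 0;
}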
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 9f361ae571e95..d71c7777669cd 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -221,14 +221,55 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
entry = set_access_flags_filter(entry, vma, dirty);
changed = !pte_same(*(ptep), entry);
if (changed) {
- if (!is_vm_hugetlb_page(vma))
- assert_pte_locked(vma->vm_mm, address);
- __ptep_set_access_flags(vma->vm_mm, ptep, entry, address);
- flush_tlb_page(vma, address);
+ assert_pte_locked(vma->vm_mm, address);
+ __ptep_set_access_flags(vma, ptep, entry,
+ address, mmu_virtual_psize);
}
return changed;
}
+#ifdef CONFIG_HUGETLB_PAGE
+extern int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+#ifdef HUGETLB_NEED_PRELOAD
+ /*
+ * The "return 1" forces a call of update_mmu_cache, which will write a
+ * TLB entry. Without this, platforms that don't do a write of the TLB
+ * entry in the TLB miss handler asm will fault ad infinitum.
+ */
+ ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+ return 1;
+#else
+ int changed, psize;
+
+ pte = set_access_flags_filter(pte, vma, dirty);
+ changed = !pte_same(*(ptep), pte);
+ if (changed) {
+
+#ifdef CONFIG_PPC_BOOK3S_64
+ struct hstate *h = hstate_vma(vma);
+
+ psize = hstate_get_psize(h);
+#ifdef CONFIG_DEBUG_VM
+ assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+#endif
+
+#else
+ /*
+ * Not used on non book3s64 platforms. But 8xx
+ * can possibly use tsize derived from hstate.
+ */
+ psize = 0;
+#endif
+ __ptep_set_access_flags(vma, ptep, pte, addr, psize);
+ }
+ return changed;
+#endif
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
#ifdef CONFIG_DEBUG_VM
void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
{
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9bf659d5078c8..53e9eeecd5d44 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,7 +33,6 @@
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
-#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
@@ -47,13 +46,11 @@
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
-#include <asm/trace.h>
#include <asm/processor.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/firmware.h>
#include <asm/dma.h>
-#include <asm/powernv.h>
#include "mmu_decl.h"
@@ -75,8 +72,6 @@ unsigned long __pud_index_size;
EXPORT_SYMBOL(__pud_index_size);
unsigned long __pgd_index_size;
EXPORT_SYMBOL(__pgd_index_size);
-unsigned long __pmd_cache_index;
-EXPORT_SYMBOL(__pmd_cache_index);
unsigned long __pud_cache_index;
EXPORT_SYMBOL(__pud_cache_index);
unsigned long __pte_table_size;
@@ -316,172 +311,6 @@ struct page *pmd_page(pmd_t pmd)
return virt_to_page(pmd_page_vaddr(pmd));
}
-#ifdef CONFIG_PPC_64K_PAGES
-static pte_t *get_from_cache(struct mm_struct *mm)
-{
- void *pte_frag, *ret;
-
- spin_lock(&mm->page_table_lock);
- ret = mm->context.pte_frag;
- if (ret) {
- pte_frag = ret + PTE_FRAG_SIZE;
- /*
- * If we have taken up all the fragments mark PTE page NULL
- */
- if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
- pte_frag = NULL;
- mm->context.pte_frag = pte_frag;
- }
- spin_unlock(&mm->page_table_lock);
- return (pte_t *)ret;
-}
-
-static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
-{
- void *ret = NULL;
- struct page *page;
-
- if (!kernel) {
- page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT);
- if (!page)
- return NULL;
- if (!pgtable_page_ctor(page)) {
- __free_page(page);
- return NULL;
- }
- } else {
- page = alloc_page(PGALLOC_GFP);
- if (!page)
- return NULL;
- }
-
- ret = page_address(page);
- spin_lock(&mm->page_table_lock);
- /*
- * If we find pgtable_page set, we return
- * the allocated page with single fragement
- * count.
- */
- if (likely(!mm->context.pte_frag)) {
- set_page_count(page, PTE_FRAG_NR);
- mm->context.pte_frag = ret + PTE_FRAG_SIZE;
- }
- spin_unlock(&mm->page_table_lock);
-
- return (pte_t *)ret;
-}
-
-pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
-{
- pte_t *pte;
-
- pte = get_from_cache(mm);
- if (pte)
- return pte;
-
- return __alloc_for_cache(mm, kernel);
-}
-#endif /* CONFIG_PPC_64K_PAGES */
-
-void pte_fragment_free(unsigned long *table, int kernel)
-{
- struct page *page = virt_to_page(table);
- if (put_page_testzero(page)) {
- if (!kernel)
- pgtable_page_dtor(page);
- free_unref_page(page);
- }
-}
-
-#ifdef CONFIG_SMP
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- unsigned long pgf = (unsigned long)table;
-
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- pgf |= shift;
- tlb_remove_table(tlb, (void *)pgf);
-}
-
-void __tlb_remove_table(void *_table)
-{
- void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
- unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
-
- if (!shift)
- /* PTE page needs special handling */
- pte_fragment_free(table, 0);
- else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
-{
- if (!shift) {
- /* PTE page needs special handling */
- pte_fragment_free(table, 0);
- } else {
- BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
- kmem_cache_free(PGT_CACHE(shift), table);
- }
-}
-#endif
-
-#ifdef CONFIG_PPC_BOOK3S_64
-void __init mmu_partition_table_init(void)
-{
- unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
- unsigned long ptcr;
-
- BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
- partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
- MEMBLOCK_ALLOC_ANYWHERE));
-
- /* Initialize the Partition Table with no entries */
- memset((void *)partition_tb, 0, patb_size);
-
- /*
- * update partition table control register,
- * 64 K size.
- */
- ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12);
- mtspr(SPRN_PTCR, ptcr);
- powernv_set_nmmu_ptcr(ptcr);
-}
-
-void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
- unsigned long dw1)
-{
- unsigned long old = be64_to_cpu(partition_tb[lpid].patb0);
-
- partition_tb[lpid].patb0 = cpu_to_be64(dw0);
- partition_tb[lpid].patb1 = cpu_to_be64(dw1);
-
- /*
- * Global flush of TLBs and partition table caches for this lpid.
- * The type of flush (hash or radix) depends on what the previous
- * use of this partition ID was, not the new use.
- */
- asm volatile("ptesync" : : : "memory");
- if (old & PATB_HR) {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 1);
- } else {
- asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
- "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
- trace_tlbie(lpid, 0, TLBIEL_INVAL_SET_LPID, lpid, 2, 0, 0);
- }
- /* do we need fixup here ?*/
- asm volatile("eieio; tlbsync; ptesync" : : : "memory");
-}
-EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
-#endif /* CONFIG_PPC_BOOK3S_64 */
-
#ifdef CONFIG_STRICT_KERNEL_RWX
void mark_rodata_ro(void)
{
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 0eafdf01edc7d..e6f500fabf5ea 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -383,9 +383,9 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
{
/*
* If the currently associated pkey is execute-only, but the requested
- * protection requires read or write, move it back to the default pkey.
+ * protection is not execute-only, move it back to the default pkey.
*/
- if (vma_is_pkey_exec_only(vma) && (prot & (PROT_READ | PROT_WRITE)))
+ if (vma_is_pkey_exec_only(vma) && (prot != PROT_EXEC))
return 0;
/*
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index 2a049fb8523d5..bea6c544e38f9 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -167,7 +167,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
{
pmd_t *pmd;
- if (Hash == 0)
+ if (!Hash)
return;
pmd = pmd_offset(pud_offset(pgd_offset(mm, ea), ea), ea);
if (!pmd_none(*pmd))
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 66577cc66dc9f..cb796724a6fc7 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -63,14 +63,14 @@ static inline void slb_shadow_update(unsigned long ea, int ssize,
* updating it. No write barriers are needed here, provided
* we only update the current CPU's SLB shadow buffer.
*/
- p->save_area[index].esid = 0;
- p->save_area[index].vsid = cpu_to_be64(mk_vsid_data(ea, ssize, flags));
- p->save_area[index].esid = cpu_to_be64(mk_esid_data(ea, ssize, index));
+ WRITE_ONCE(p->save_area[index].esid, 0);
+ WRITE_ONCE(p->save_area[index].vsid, cpu_to_be64(mk_vsid_data(ea, ssize, flags)));
+ WRITE_ONCE(p->save_area[index].esid, cpu_to_be64(mk_esid_data(ea, ssize, index)));
}
static inline void slb_shadow_clear(enum slb_index index)
{
- get_slb_shadow()->save_area[index].esid = 0;
+ WRITE_ONCE(get_slb_shadow()->save_area[index].esid, 0);
}
static inline void create_shadowed_slbe(unsigned long ea, int ssize,
@@ -352,6 +352,14 @@ static void insert_slb_entry(unsigned long vsid, unsigned long ea,
/*
* We are irq disabled, hence should be safe to access PACA.
*/
+ VM_WARN_ON(!irqs_disabled());
+
+ /*
+ * We can't take a PMU exception in the following code, so hard
+ * disable interrupts.
+ */
+ hard_irq_disable();
+
index = get_paca()->stab_rr;
/*
@@ -369,6 +377,11 @@ static void insert_slb_entry(unsigned long vsid, unsigned long ea,
((unsigned long) ssize << SLB_VSID_SSIZE_SHIFT);
esid_data = mk_esid_data(ea, ssize, index);
+ /*
+ * No need for an isync before or after this slbmte. The exception
+ * we enter with and the rfid we exit with are context synchronizing.
+ * Also we only handle user segments here.
+ */
asm volatile("slbmte %0, %1" : : "r" (vsid_data), "r" (esid_data)
: "memory");
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index f14a07c2fb903..75cb646a79c38 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -13,6 +13,7 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
@@ -185,7 +186,11 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr,
* in a 2-bit field won't allow writes to a page that is otherwise
* write-protected.
*/
-long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpragmas"
+#pragma GCC diagnostic ignored "-Wattribute-alias"
+SYSCALL_DEFINE3(subpage_prot, unsigned long, addr,
+ unsigned long, len, u32 __user *, map)
{
struct mm_struct *mm = current->mm;
struct subpage_prot_table *spt = &mm->context.spt;
@@ -267,3 +272,4 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
up_write(&mm->mmap_sem);
return err;
}
+#pragma GCC diagnostic pop
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index a5d7309c2d055..67a6e86d3e7ef 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -12,6 +12,8 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
+#include <linux/mmu_context.h>
+#include <linux/sched/mm.h>
#include <asm/ppc-opcode.h>
#include <asm/tlb.h>
@@ -118,6 +120,53 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void __tlbiel_lpid(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
+static inline void __tlbiel_lpid_guest(unsigned long lpid, int set,
+ unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rb |= set << PPC_BITLSHIFT(51);
+ rs = 0; /* LPID comes from LPIDR */
+ prs = 1; /* process scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
+}
+
+
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
@@ -150,6 +199,22 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
+static inline void __tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long ap, unsigned long ric)
+{
+ unsigned long rb,rs,prs,r;
+
+ rb = va & ~(PPC_BITMASK(52, 63));
+ rb |= ap << PPC_BITLSHIFT(58);
+ rs = lpid;
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+ trace_tlbie(lpid, 0, rb, rs, ric, prs, r);
+}
+
static inline void fixup_tlbie(void)
{
unsigned long pid = 0;
@@ -161,6 +226,16 @@ static inline void fixup_tlbie(void)
}
}
+static inline void fixup_tlbie_lpid(unsigned long lpid)
+{
+ unsigned long va = ((1UL << 52) - 1);
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, mmu_get_ap(MMU_PAGE_64K), RIC_FLUSH_TLB);
+ }
+}
+
/*
* We use 128 set in radix mode and 256 set in hpt mode.
*/
@@ -214,6 +289,86 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbiel_lpid(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void _tlbie_lpid(unsigned long lpid, unsigned long ric)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Workaround the fact that the "ric" argument to __tlbie_pid
+ * must be a compile-time constraint to match the "i" constraint
+ * in the asm statement.
+ */
+ switch (ric) {
+ case RIC_FLUSH_TLB:
+ __tlbie_lpid(lpid, RIC_FLUSH_TLB);
+ break;
+ case RIC_FLUSH_PWC:
+ __tlbie_lpid(lpid, RIC_FLUSH_PWC);
+ break;
+ case RIC_FLUSH_ALL:
+ default:
+ __tlbie_lpid(lpid, RIC_FLUSH_ALL);
+ }
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+static inline void _tlbiel_lpid_guest(unsigned long lpid, unsigned long ric)
+{
+ int set;
+
+ VM_BUG_ON(mfspr(SPRN_LPID) != lpid);
+
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_lpid_guest(lpid, 0, ric);
+
+ /* For PWC, only one flush is needed */
+ if (ric == RIC_FLUSH_PWC) {
+ asm volatile("ptesync": : :"memory");
+ return;
+ }
+
+ /* For the remaining sets, just flush the TLB */
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
+ __tlbiel_lpid_guest(lpid, set, RIC_FLUSH_TLB);
+
+ asm volatile("ptesync": : :"memory");
+}
+
+
static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize)
@@ -268,6 +423,17 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
+static inline void _tlbie_lpid_va(unsigned long va, unsigned long lpid,
+ unsigned long psize, unsigned long ric)
+{
+ unsigned long ap = mmu_get_ap(psize);
+
+ asm volatile("ptesync": : :"memory");
+ __tlbie_lpid_va(va, lpid, ap, ric);
+ fixup_tlbie_lpid(lpid);
+ asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
static inline void _tlbie_va_range(unsigned long start, unsigned long end,
unsigned long pid, unsigned long page_size,
unsigned long psize, bool also_pwc)
@@ -340,6 +506,15 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
}
EXPORT_SYMBOL(radix__local_flush_tlb_page);
+static bool mm_is_singlethreaded(struct mm_struct *mm)
+{
+ if (atomic_read(&mm->context.copros) > 0)
+ return false;
+ if (atomic_read(&mm->mm_users) <= 1 && current->mm == mm)
+ return true;
+ return false;
+}
+
static bool mm_needs_flush_escalation(struct mm_struct *mm)
{
/*
@@ -347,10 +522,47 @@ static bool mm_needs_flush_escalation(struct mm_struct *mm)
* caching PTEs and not flushing them properly when
* RIC = 0 for a PID/LPID invalidate
*/
- return atomic_read(&mm->context.copros) != 0;
+ if (atomic_read(&mm->context.copros) > 0)
+ return true;
+ return false;
}
#ifdef CONFIG_SMP
+static void do_exit_flush_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+ unsigned long pid = mm->context.id;
+
+ if (current->mm == mm)
+ return; /* Local CPU */
+
+ if (current->active_mm == mm) {
+ /*
+ * Must be a kernel thread because sender is single-threaded.
+ */
+ BUG_ON(current->mm);
+ mmgrab(&init_mm);
+ switch_mm(mm, &init_mm, current);
+ current->active_mm = &init_mm;
+ mmdrop(mm);
+ }
+ _tlbiel_pid(pid, RIC_FLUSH_ALL);
+}
+
+static void exit_flush_lazy_tlbs(struct mm_struct *mm)
+{
+ /*
+ * Would be nice if this was async so it could be run in
+ * parallel with our local flush, but generic code does not
+ * give a good API for it. Could extend the generic code or
+ * make a special powerpc IPI for flushing TLBs.
+ * For now it's not too performance critical.
+ */
+ smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
+ (void *)mm, 1);
+ mm_reset_thread_local(mm);
+}
+
void radix__flush_tlb_mm(struct mm_struct *mm)
{
unsigned long pid;
@@ -360,18 +572,30 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
return;
preempt_disable();
+ /*
+ * Order loads of mm_cpumask vs previous stores to clear ptes before
+ * the invalidate. See barrier in switch_mm_irqs_off
+ */
+ smp_mb();
if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+
if (mm_needs_flush_escalation(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
- } else
+ } else {
+local:
_tlbiel_pid(pid, RIC_FLUSH_TLB);
+ }
preempt_enable();
}
EXPORT_SYMBOL(radix__flush_tlb_mm);
-void radix__flush_all_mm(struct mm_struct *mm)
+static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
{
unsigned long pid;
@@ -380,12 +604,25 @@ void radix__flush_all_mm(struct mm_struct *mm)
return;
preempt_disable();
- if (!mm_is_thread_local(mm))
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (!fullmm) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
+ }
_tlbie_pid(pid, RIC_FLUSH_ALL);
- else
+ } else {
+local:
_tlbiel_pid(pid, RIC_FLUSH_ALL);
+ }
preempt_enable();
}
+void radix__flush_all_mm(struct mm_struct *mm)
+{
+ __flush_all_mm(mm, false);
+}
EXPORT_SYMBOL(radix__flush_all_mm);
void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
@@ -404,10 +641,17 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
return;
preempt_disable();
- if (!mm_is_thread_local(mm))
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
- else
+ } else {
+local:
_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
+ }
preempt_enable();
}
@@ -466,14 +710,22 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
return;
preempt_disable();
- if (mm_is_thread_local(mm)) {
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
local = false;
full = (end == TLB_FLUSH_ALL ||
nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
}
if (full) {
@@ -534,6 +786,49 @@ static int radix_get_mmu_psize(int page_size)
return psize;
}
+/*
+ * Flush partition scoped LPID address translation for all CPUs.
+ */
+void radix__flush_tlb_lpid_page(unsigned int lpid,
+ unsigned long addr,
+ unsigned long page_size)
+{
+ int psize = radix_get_mmu_psize(page_size);
+
+ _tlbie_lpid_va(addr, lpid, psize, RIC_FLUSH_TLB);
+}
+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid_page);
+
+/*
+ * Flush partition scoped PWC from LPID for all CPUs.
+ */
+void radix__flush_pwc_lpid(unsigned int lpid)
+{
+ _tlbie_lpid(lpid, RIC_FLUSH_PWC);
+}
+EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
+
+/*
+ * Flush partition scoped translations from LPID (=LPIDR)
+ */
+void radix__local_flush_tlb_lpid(unsigned int lpid)
+{
+ _tlbiel_lpid(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid);
+
+/*
+ * Flush process scoped translations from LPID (=LPIDR).
+ * Important difference, the guest normally manages its own translations,
+ * but some cases e.g., vCPU CPU migration require KVM to flush.
+ */
+void radix__local_flush_tlb_lpid_guest(unsigned int lpid)
+{
+ _tlbiel_lpid_guest(lpid, RIC_FLUSH_ALL);
+}
+EXPORT_SYMBOL_GPL(radix__local_flush_tlb_lpid_guest);
+
+
static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long end, int psize);
@@ -551,7 +846,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
* See the comment for radix in arch_exit_mmap().
*/
if (tlb->fullmm) {
- radix__flush_all_mm(mm);
+ __flush_all_mm(mm, true);
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->need_flush_all)
radix__flush_tlb_mm(mm);
@@ -584,24 +879,33 @@ static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
return;
preempt_disable();
- if (mm_is_thread_local(mm)) {
- local = true;
- full = (end == TLB_FLUSH_ALL ||
- nr_pages > tlb_local_single_page_flush_ceiling);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ if (end != TLB_FLUSH_ALL) {
+ exit_flush_lazy_tlbs(mm);
+ goto is_local;
+ }
+ }
local = false;
full = (end == TLB_FLUSH_ALL ||
nr_pages > tlb_single_page_flush_ceiling);
+ } else {
+is_local:
+ local = true;
+ full = (end == TLB_FLUSH_ALL ||
+ nr_pages > tlb_local_single_page_flush_ceiling);
}
if (full) {
- if (!local && mm_needs_flush_escalation(mm))
- also_pwc = true;
-
- if (local)
+ if (local) {
_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
- else
- _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
+ } else {
+ if (mm_needs_flush_escalation(mm))
+ also_pwc = true;
+
+ _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
+ }
} else {
if (local)
_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
@@ -642,11 +946,17 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
/* Otherwise first do the PWC, then iterate the pages. */
preempt_disable();
-
- if (mm_is_thread_local(mm)) {
- _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
- } else {
+ smp_mb(); /* see radix__flush_tlb_mm */
+ if (!mm_is_thread_local(mm)) {
+ if (unlikely(mm_is_singlethreaded(mm))) {
+ exit_flush_lazy_tlbs(mm);
+ goto local;
+ }
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
+ goto local;
+ } else {
+local:
+ _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
}
preempt_enable();
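All the flush paths reworked above share one decision: a thread-local mm gets a local tlbiel, everything else gets a global tlbie, with single-threaded mms first pulled back to local by the new exit_flush_lazy_tlbs() IPI and coprocessor users escalated to a full flush. A condensed user-space model of that choice (struct fields and enum names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct mm_model {
    int  nr_users;    /* threads sharing this mm (mm_users) */
    int  nr_copros;   /* coprocessor / nest-MMU attachments (context.copros) */
    bool is_local;    /* mm has only run on this CPU (mm_is_thread_local) */
};

enum flush_kind { FLUSH_LOCAL, FLUSH_GLOBAL, FLUSH_GLOBAL_ALL };

static enum flush_kind pick_flush(const struct mm_model *mm)
{
    if (mm->is_local)
        return FLUSH_LOCAL;
    /* single-threaded, no coprocessors: kick lazy users off, then flush locally */
    if (mm->nr_copros == 0 && mm->nr_users <= 1)
        return FLUSH_LOCAL;
    /* coprocessors may not drop cached PTEs on an RIC=0 invalidate: full flush */
    if (mm->nr_copros > 0)
        return FLUSH_GLOBAL_ALL;
    return FLUSH_GLOBAL;
}

int main(void)
{
    struct mm_model single = { .nr_users = 1, .nr_copros = 0, .is_local = false };
    struct mm_model multi  = { .nr_users = 4, .nr_copros = 0, .is_local = false };

    printf("single-threaded: %d, multi-threaded: %d\n",
           pick_flush(&single), pick_flush(&multi));
    return 0;
}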
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 702d7689d714e..cf8472cf3d593 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -41,7 +41,7 @@ void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long addr)
{
unsigned long ptephys;
- if (Hash != 0) {
+ if (Hash) {
ptephys = __pa(ptep) & PAGE_MASK;
flush_hash_pages(mm->context.id, addr, ptephys, 1);
}
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(flush_hash_entry);
*/
void tlb_flush(struct mmu_gather *tlb)
{
- if (Hash == 0) {
+ if (!Hash) {
/*
* 603 needs to flush the whole TLB here since
* it doesn't use a hash table.
@@ -84,7 +84,7 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
int count;
unsigned int ctx = mm->context.id;
- if (Hash == 0) {
+ if (!Hash) {
_tlbia();
return;
}
@@ -124,7 +124,7 @@ void flush_tlb_mm(struct mm_struct *mm)
{
struct vm_area_struct *mp;
- if (Hash == 0) {
+ if (!Hash) {
_tlbia();
return;
}
@@ -145,7 +145,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
struct mm_struct *mm;
pmd_t *pmd;
- if (Hash == 0) {
+ if (!Hash) {
_tlbie(vmaddr);
return;
}