From a9af0c5dfdaf0b2e1a8bab7fbf6f29138947d534 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 7 Apr 2014 15:36:54 -0700 Subject: mm/hugetlb.c: add NULL check of return value of huge_pte_offset huge_pte_offset() could return NULL, so we need NULL check to avoid potential NULL pointer dereferences. Signed-off-by: Naoya Horiguchi Cc: Mel Gorman Cc: Sasha Levin Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c02b9dadfb05..6cddfadaba038 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2690,7 +2690,8 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -2734,7 +2735,7 @@ retry_avoidcopy: */ spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(pte_same(huge_ptep_get(ptep), pte))) { + if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); /* Break COW */ -- cgit v1.2.3 From 88a9ab6e3dfb5b10168130c255c6102c925343ab Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Apr 2014 15:36:55 -0700 Subject: mm,numa: reorganize change_pmd_range() Reorganize the order of ifs in change_pmd_range a little, in preparation for the next patch. 
[akpm@linux-foundation.org: fix indenting, per David] Signed-off-by: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Reported-by: Xing Gang Tested-by: Chegu Vinod Acked-by: David Rientjes Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 769a67a158037..79cb51866e02f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -118,6 +118,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); + if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + continue; if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -133,10 +135,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, continue; } } - /* fall through */ + /* fall through, the trans huge pmd just split */ } - if (pmd_none_or_clear_bad(pmd)) - continue; + VM_BUG_ON(pmd_trans_huge(*pmd)); this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; -- cgit v1.2.3 From 1ad9f620c3a22fa800489455ce517c29e576934e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Apr 2014 15:36:56 -0700 Subject: mm: numa: recheck for transhuge pages under lock during protection changes Sasha reported the following bug using trinity kernel BUG at mm/mprotect.c:149! 
invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 20 PID: 26219 Comm: trinity-c216 Tainted: G W 3.14.0-rc5-next-20140305-sasha-00011-ge06f5f3-dirty #105 task: ffff8800b6c80000 ti: ffff880228436000 task.ti: ffff880228436000 RIP: change_protection_range+0x3b3/0x500 Call Trace: change_protection+0x25/0x30 change_prot_numa+0x1b/0x30 task_numa_work+0x279/0x360 task_work_run+0xae/0xf0 do_notify_resume+0x8e/0xe0 retint_signal+0x4d/0x92 The VM_BUG_ON was added in -mm by the patch "mm,numa: reorganize change_pmd_range". The race existed without the patch but was just harder to hit. The problem is that a transhuge check is made without holding the PTL. It's possible at the time of the check that a parallel fault clears the pmd and inserts a new one which then triggers the VM_BUG_ON check. This patch removes the VM_BUG_ON but fixes the race by rechecking transhuge under the PTL when marking page tables for NUMA hinting and bailing if a race occurred. It is not a problem for calls to mprotect() as they hold mmap_sem for write. Signed-off-by: Mel Gorman Reported-by: Sasha Levin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 79cb51866e02f..2c51c79c8a698 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) } #endif +/* + * For a prot_numa update we only hold mmap_sem for read so there is a + * potential race with faulting where a pmd was temporarily none. This + * function checks for a transhuge pmd under the appropriate lock. It + * returns a pte if it was successfully locked or NULL if it raced with + * a transhuge insertion. 
+ */ +static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, int prot_numa, spinlock_t **ptl) +{ + pte_t *pte; + spinlock_t *pmdl; + + /* !prot_numa is protected by mmap_sem held for write */ + if (!prot_numa) + return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + + pmdl = pmd_lock(vma->vm_mm, pmd); + if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) { + spin_unlock(pmdl); + return NULL; + } + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl); + spin_unlock(pmdl); + return pte; +} + static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; unsigned long pages = 0; - pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); + if (!pte) + return 0; + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -132,12 +163,13 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += HPAGE_PMD_NR; nr_huge_updates++; } + + /* huge pmd was handled */ continue; } } /* fall through, the trans huge pmd just split */ } - VM_BUG_ON(pmd_trans_huge(*pmd)); this_pages = change_pte_range(vma, pmd, addr, next, newprot, dirty_accountable, prot_numa); pages += this_pages; -- cgit v1.2.3 From a5338093bfb462256f70f3450c08f73e59543e26 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Apr 2014 15:36:57 -0700 Subject: mm: move mmu notifier call from change_protection to change_pmd_range The NUMA scanning code can end up iterating over many gigabytes of unpopulated memory, especially in the case of a freshly started KVM guest with lots of memory. This results in the mmu notifier code being called even when there are no mapped pages in a virtual address range. 
The amount of time wasted can be enough to trigger soft lockup warnings with very large KVM guests. This patch moves the mmu notifier call to the pmd level, which represents 1GB areas of memory on x86-64. Furthermore, the mmu notifier code is only called from the address in the PMD where present mappings are first encountered. The hugetlbfs code is left alone for now; hugetlb mappings are not relocatable, and as such are left alone by the NUMA code, and should never trigger this problem to begin with. Signed-off-by: Rik van Riel Acked-by: David Rientjes Cc: Peter Zijlstra Cc: Andrea Arcangeli Reported-by: Xing Gang Tested-by: Chegu Vinod Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ mm/mprotect.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6cddfadaba038..ed5072c64daac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3186,6 +3186,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, address, end); + mmu_notifier_invalidate_range_start(mm, start, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; @@ -3215,6 +3216,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, */ flush_tlb_range(vma, start, end); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + mmu_notifier_invalidate_range_end(mm, start, end); return pages << h->order; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 2c51c79c8a698..c43d557941f80 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -140,9 +140,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pgprot_t newprot, int dirty_accountable, int prot_numa) { pmd_t *pmd; + struct mm_struct *mm = vma->vm_mm; unsigned long next; unsigned long pages = 0; unsigned long nr_huge_updates = 0; + unsigned long mni_start = 0; 
pmd = pmd_offset(pud, addr); do { @@ -151,6 +153,13 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, next = pmd_addr_end(addr, end); if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) continue; + + /* invoke the mmu notifier if the pmd is populated */ + if (!mni_start) { + mni_start = addr; + mmu_notifier_invalidate_range_start(mm, mni_start, end); + } + if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); @@ -175,6 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += this_pages; } while (pmd++, addr = next, addr != end); + if (mni_start) + mmu_notifier_invalidate_range_end(mm, mni_start, end); + if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -234,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; unsigned long pages; - mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); - mmu_notifier_invalidate_range_end(mm, start, end); return pages; } -- cgit v1.2.3 From 619d0d76c1ee943f171c7d4fc021ec7602388579 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Mon, 7 Apr 2014 15:36:59 -0700 Subject: mm/vmscan: restore sc->gfp_mask after promoting it to __GFP_HIGHMEM We promote sc->gfp_mask to __GFP_HIGHMEM to forcibly scan highmem if there are too many buffer_heads pinning highmem. See cc715d99e5 ("mm: vmscan: forcibly scan highmem if there are too many buffer_heads pinning highmem"). 
This patch restores sc->gfp_mask to its caller original value after finishing the scan job, to avoid the impact on other invocations from its upper caller, such as vmpressure_prio(), shrink_slab(). Signed-off-by: Weijie Yang Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1f56a80a7c414..1c51e4f52fd97 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2314,6 +2314,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long lru_pages = 0; bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; + gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; @@ -2323,6 +2324,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ + orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; @@ -2393,6 +2395,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) } } + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + return aborted_reclaim; } -- cgit v1.2.3 From 9bbc04eeb01fcb5c20bb10f34989665df7200163 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Mon, 7 Apr 2014 15:37:00 -0700 Subject: mm/vmscan: do not check compaction_ready on promoted zones We abort direct reclaim if we find the zone is ready for compaction. Sometimes the zone is just a promoted highmem zone to force a scan of highmem, which is not the intended zone the caller want to allocate a page from. 
In this situation, setting aborted_reclaim to indicate the caller turned back to retry the allocation is a waste of time and could cause a loop in __alloc_pages_slowpath(). This patch does not check compaction_ready() on promoted zones to avoid the above situation. Only set aborted_reclaim if the caller's intended zone is ready for compaction. Signed-off-by: Weijie Yang Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c51e4f52fd97..06879ead73800 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2318,6 +2318,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2358,7 +2359,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * noticeable problem, like transparent huge * page allocations. */ - if (compaction_ready(zone, sc)) { + if ((zonelist_zone_idx(z) <= requested_highidx) + && compaction_ready(zone, sc)) { aborted_reclaim = true; continue; } -- cgit v1.2.3 From 7aa6b4ad5a81d7761b044d38ac0120850a6396ca Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:01 -0700 Subject: mm/memory.c: update comment in unmap_single_vma() The described issue now occurs inside mmap_region(). And unfortunately it is still valid. 
Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 82c1e4cf00d16..c6ee34d10fcc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1320,9 +1320,9 @@ static void unmap_single_vma(struct mmu_gather *tlb, * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When + * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file + * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. -- cgit v1.2.3 From 70ef57e6c22c3323dce179b7d0d433c479266612 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:01 -0700 Subject: mm: exclude memoryless nodes from zone_reclaim We had a report about strange OOM killer strikes on a PPC machine although there was a lot of swap free and a tons of anonymous memory which could be swapped out. In the end it turned out that the OOM was a side effect of zone reclaim which wasn't unmapping and swapping out and so the system was pushed to the OOM. Although this sounds like a bug somewhere in the kswapd vs. zone reclaim vs. direct reclaim interaction numactl on the said hardware suggests that the zone reclaim should not have been set in the first place: node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: node 2 size: 7168 MB node 2 free: 6019 MB node distances: node 0 2 0: 10 40 2: 40 10 So all the CPUs are associated with Node0 which doesn't have any memory while Node2 contains all the available memory. Node distances cause an automatic zone_reclaim_mode enabling. 
Zone reclaim is intended to keep the allocations local but this doesn't make any sense on the memoryless nodes. So let's exclude such nodes for init_zone_allows_reclaim which evaluates zone reclaim behavior and suitable reclaim_nodes. Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Nishanth Aravamudan Tested-by: Nishanth Aravamudan Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 979378deccbfc..336ee925f7569 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1870,7 +1870,7 @@ static void __paginginit init_zone_allows_reclaim(int nid) { int i; - for_each_online_node(i) + for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); else @@ -4919,7 +4919,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - init_zone_allows_reclaim(nid); + if (node_state(nid, N_MEMORY)) + init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif -- cgit v1.2.3 From 7d348b9ea64db0a315d777ce7d4b06697f946503 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:03 -0700 Subject: mm/compaction: disallow high-order page for migration target Purpose of compaction is to get a high order page. Currently, if we find high-order page while searching migration target page, we break it to order-0 pages and use them as migration target. It is contrary to purpose of compaction, so disallow high-order page to be used for migration target. Additionally, clean-up logic in suitable_migration_target() to simplify the code. There is no functional changes from this clean-up. 
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b6ab771600680..9a03fdb1fd848 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { - int migratetype = get_pageblock_migratetype(page); - - /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ - if (migratetype == MIGRATE_RESERVE) - return false; - - if (is_migrate_isolate(migratetype)) - return false; - - /* If the page is a large free page, then allow migration */ + /* If the page is a large free page, then disallow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return true; + return false; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(migratetype)) + if (migrate_async_suitable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ -- cgit v1.2.3 From 01ead5340bcf5f3a1cd2452c75516d0ef4d908d7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:04 -0700 Subject: mm/compaction: do not call suitable_migration_target() on every page suitable_migration_target() checks that pageblock is suitable for migration target. In isolate_freepages_block(), it is called on every page and this is inefficient. So make it called once per pageblock. suitable_migration_target() also checks if page is highorder or not, but it's criteria for highorder is pageblock order. So calling it once within pageblock range has no problem. 
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 9a03fdb1fd848..3a1828541dc0b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -244,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, struct page *cursor, *valid_page = NULL; unsigned long flags; bool locked = false; + bool checked_pageblock = false; cursor = pfn_to_page(blockpfn); @@ -275,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; /* Recheck this is a suitable migration target under lock */ - if (!strict && !suitable_migration_target(page)) - break; + if (!strict && !checked_pageblock) { + /* + * We need to check suitability of pageblock only once + * and this isolate_freepages_block() is called with + * pageblock range, so just check once is sufficient. + */ + checked_pageblock = true; + if (!suitable_migration_target(page)) + break; + } /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) -- cgit v1.2.3 From be1aa03b973c7dcdc576f3503f7a60429825c35d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:05 -0700 Subject: mm/compaction: change the timing to check to drop the spinlock It is odd to drop the spinlock when we scan (SWAP_CLUSTER_MAX - 1) th pfn page. This may result in the situation below while isolating migratepage. 1. try isolate 0x0 ~ 0x200 pfn pages. 2. When low_pfn is 0x1ff, ((low_pfn+1) % SWAP_CLUSTER_MAX) == 0, so drop the spinlock. 3. Then, to complete isolating, retry to acquire the lock. I think that it is better to use every SWAP_CLUSTER_MAX th pfn as the criterion for dropping the lock. This does no harm for pfn 0x0 because, at that time, the locked variable would be false. 
Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 3a1828541dc0b..0eb9f99196ce7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -487,7 +487,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ - if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { + if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { if (should_release_lock(&zone->lru_lock)) { spin_unlock_irqrestore(&zone->lru_lock, flags); locked = false; -- cgit v1.2.3 From c122b2087ab94192f2b937e47b563a9c4e688ece Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:06 -0700 Subject: mm/compaction: check pageblock suitability once per pageblock isolation_suitable() and migrate_async_suitable() are used to be sure that this pageblock range is fine to be migrated. It isn't necessary to call them on every page. The current code does well if the pageblock is not suitable, but doesn't do well when it is suitable. 1) It re-checks isolation_suitable() on each page of a pageblock that was already established as suitable. 2) It re-checks migrate_async_suitable() on each page of a pageblock that was not entered through the next_pageblock: label, because last_pageblock_nr is not otherwise updated. This patch fixes the situation by 1) calling isolation_suitable() only once per pageblock and 2) always updating last_pageblock_nr to the pageblock that was just checked. Additionally, move the PageBuddy() check after the pageblock unit check, since the pageblock check is the first thing we should do and this makes things simpler. 
[vbabka@suse.cz: rephrase commit description] Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0eb9f99196ce7..6878c005bc8ed 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -526,8 +526,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, /* If isolation recently failed, do not retry */ pageblock_nr = low_pfn >> pageblock_order; - if (!isolation_suitable(cc, page)) - goto next_pageblock; + if (last_pageblock_nr != pageblock_nr) { + int mt; + + last_pageblock_nr = pageblock_nr; + if (!isolation_suitable(cc, page)) + goto next_pageblock; + + /* + * For async migration, also only scan in MOVABLE + * blocks. Async migration is optimistic to see if + * the minimum amount of work satisfies the allocation + */ + mt = get_pageblock_migratetype(page); + if (!cc->sync && !migrate_async_suitable(mt)) { + cc->finished_update_migrate = true; + skipped_async_unsuitable = true; + goto next_pageblock; + } + } /* * Skip if free. page_order cannot be used without zone->lock @@ -536,18 +553,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (PageBuddy(page)) continue; - /* - * For async migration, also only scan in MOVABLE blocks. Async - * migration is optimistic to see if the minimum amount of work - * satisfies the allocation - */ - if (!cc->sync && last_pageblock_nr != pageblock_nr && - !migrate_async_suitable(get_pageblock_migratetype(page))) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; - goto next_pageblock; - } - /* * Check may be lockless but that's ok as we recheck later. 
* It's possible to migrate LRU pages and balloon pages @@ -639,7 +644,6 @@ check_compact_cluster: next_pageblock: low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; - last_pageblock_nr = pageblock_nr; } acct_isolated(zone, locked, cc); -- cgit v1.2.3 From b6c750163c0d138f5041d95fcdbd1094b6928057 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 7 Apr 2014 15:37:07 -0700 Subject: mm/compaction: clean-up code on success of ballon isolation It is just for clean-up to reduce code size and improve readability. There is no functional change. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 6878c005bc8ed..054c28b51c755 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -562,11 +562,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, if (unlikely(balloon_page_movable(page))) { if (locked && balloon_page_isolate(page)) { /* Successfully isolated */ - cc->finished_update_migrate = true; - list_add(&page->lru, migratelist); - cc->nr_migratepages++; - nr_isolated++; - goto check_compact_cluster; + goto isolate_success; } } continue; @@ -627,13 +623,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, VM_BUG_ON_PAGE(PageTransCompound(page), page); /* Successfully isolated */ - cc->finished_update_migrate = true; del_page_from_lru_list(page, lruvec, page_lru(page)); + +isolate_success: + cc->finished_update_migrate = true; list_add(&page->lru, migratelist); cc->nr_migratepages++; nr_isolated++; -check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; -- cgit v1.2.3 From 1e1836e84f87d12feac6dd225fcef5eba1ca724b Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:09 -0700 Subject: mm: revert 
"thp: make MADV_HUGEPAGE check for mm->def_flags" The main motivation behind this patch is to provide a way to disable THP for jobs where the code cannot be modified, and using a malloc hook with madvise is not an option (i.e. statically allocated data). This patch allows us to do just that, without affecting other jobs running on the system. We need to do this sort of thing for jobs where THP hurts performance, due to the possibility of increased remote memory accesses that can be created by situations such as the following: When you touch 1 byte of an untouched, contiguous 2MB chunk, a THP will be handed out, and the THP will be stuck on whatever node the chunk was originally referenced from. If many remote nodes need to do work on that same chunk, they'll be making remote accesses. With THP disabled, 4K pages can be handed out to separate nodes as they're needed, greatly reducing the amount of remote accesses to memory. This patch is based on some of my work combined with some suggestions/patches given by Oleg Nesterov. The main goal here is to add a prctl switch to allow us to disable to THP on a per mm_struct basis. Here's a bit of test data with the new patch in place... First with the flag unset: # perf stat -a ./prctl_wrapper_mmv3 0 ./thp_pthread -C 0 -m 0 -c 512 -b 256g Setting thp_disabled for this task... 
thp_disable: 0 Set thp_disabled state to 0 Process pid = 18027 PF/ MAX MIN TOTCPU/ TOT_PF/ TOT_PF/ WSEC/ TYPE: CPUS WALL WALL SYS USER TOTCPU CPU WALL_SEC SYS_SEC CPU NODES 512 1.120 0.060 0.000 0.110 0.110 0.000 28571428864 -9223372036854775808 55803572 23 Performance counter stats for './prctl_wrapper_mmv3_hack 0 ./thp_pthread -C 0 -m 0 -c 512 -b 256g': 273719072.841402 task-clock # 641.026 CPUs utilized [100.00%] 1,008,986 context-switches # 0.000 M/sec [100.00%] 7,717 CPU-migrations # 0.000 M/sec [100.00%] 1,698,932 page-faults # 0.000 M/sec 355,222,544,890,379 cycles # 1.298 GHz [100.00%] 536,445,412,234,588 stalled-cycles-frontend # 151.02% frontend cycles idle [100.00%] 409,110,531,310,223 stalled-cycles-backend # 115.17% backend cycles idle [100.00%] 148,286,797,266,411 instructions # 0.42 insns per cycle # 3.62 stalled cycles per insn [100.00%] 27,061,793,159,503 branches # 98.867 M/sec [100.00%] 1,188,655,196 branch-misses # 0.00% of all branches 427.001706337 seconds time elapsed Now with the flag set: # perf stat -a ./prctl_wrapper_mmv3 1 ./thp_pthread -C 0 -m 0 -c 512 -b 256g Setting thp_disabled for this task... 
thp_disable: 1 Set thp_disabled state to 1 Process pid = 144957 PF/ MAX MIN TOTCPU/ TOT_PF/ TOT_PF/ WSEC/ TYPE: CPUS WALL WALL SYS USER TOTCPU CPU WALL_SEC SYS_SEC CPU NODES 512 0.620 0.260 0.250 0.320 0.570 0.001 51612901376 128000000000 100806448 23 Performance counter stats for './prctl_wrapper_mmv3_hack 1 ./thp_pthread -C 0 -m 0 -c 512 -b 256g': 138789390.540183 task-clock # 641.959 CPUs utilized [100.00%] 534,205 context-switches # 0.000 M/sec [100.00%] 4,595 CPU-migrations # 0.000 M/sec [100.00%] 63,133,119 page-faults # 0.000 M/sec 147,977,747,269,768 cycles # 1.066 GHz [100.00%] 200,524,196,493,108 stalled-cycles-frontend # 135.51% frontend cycles idle [100.00%] 105,175,163,716,388 stalled-cycles-backend # 71.07% backend cycles idle [100.00%] 180,916,213,503,160 instructions # 1.22 insns per cycle # 1.11 stalled cycles per insn [100.00%] 26,999,511,005,868 branches # 194.536 M/sec [100.00%] 714,066,351 branch-misses # 0.00% of all branches 216.196778807 seconds time elapsed As with previous versions of the patch, We're getting about a 2x performance increase here. Here's a link to the test case I used, along with the little wrapper to activate the flag: http://oss.sgi.com/projects/memtests/thp_pthread_mmprctlv3.tar.gz This patch (of 3): Revert commit 8e72033f2a48 and add in code to fix up any issues caused by the revert. The revert is necessary because hugepage_madvise would return -EINVAL when VM_NOHUGEPAGE is set, which will break subsequent chunks of this patch set. Here's a snip of an e-mail from Gerald detailing the original purpose of this code, and providing justification for the revert: "The intent of commit 8e72033f2a48 was to guard against any future programming errors that may result in an madvice(MADV_HUGEPAGE) on guest mappings, which would crash the kernel. 
Martin suggested adding the bit to arch/s390/mm/pgtable.c, if 8e72033f2a48 was to be reverted, because that check will also prevent a kernel crash in the case described above, it will now send a SIGSEGV instead. This would now also allow to do the madvise on other parts, if needed, so it is a more flexible approach. One could also say that it would have been better to do it this way right from the beginning..." Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Tested-by: Christian Borntraeger Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paolo Bonzini Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Johannes Weiner Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 3 +++ mm/huge_memory.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 796c9320c709f..5d8324cd866b1 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -505,6 +505,9 @@ static int gmap_connect_pgtable(unsigned long address, unsigned long segment, if (!pmd_present(*pmd) && __pte_alloc(mm, vma, pmd, vmaddr)) return -ENOMEM; + /* large pmds cannot yet be handled */ + if (pmd_large(*pmd)) + return -EFAULT; /* pmd now points to a valid segment table entry. 
*/ rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT); if (!rmap) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6ac89e9f82efe..a2f4981418fc5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1891,17 +1891,22 @@ out: int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - struct mm_struct *mm = vma->vm_mm; - switch (advice) { case MADV_HUGEPAGE: +#ifdef CONFIG_S390 + /* + * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 + * can't handle this properly after s390_enable_sie, so we simply + * ignore the madvise to prevent qemu from causing a SIGSEGV. + */ + if (mm_has_pgste(vma->vm_mm)) + return 0; +#endif /* * Be somewhat over-protective like KSM for now! */ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) return -EINVAL; - if (mm->def_flags & VM_NOHUGEPAGE) - return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; /* -- cgit v1.2.3 From 9164550ecd15253d72b5fe3b4baa9505c4b6fa1f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:14 -0700 Subject: mm: disable split page table lock for !MMU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's no reason to enable split page table lock if we don't have page tables. It also triggers a build error at least on ARM since we don't define pmd_page() for !MMU. In file included from arch/arm/kernel/asm-offsets.c:14:0: include/linux/mm.h: In function 'pte_lockptr': include/linux/mm.h:1392:2: error: implicit declaration of function 'pmd_page' [-Werror=implicit-function-declaration] include/linux/mm.h:1392:2: warning: passing argument 1 of 'ptlock_ptr' makes pointer from integer without a cast [enabled by default] include/linux/mm.h:1384:27: note: expected 'struct page *' but argument is of type 'int' Signed-off-by: Kirill A.
Shutemov Reported-by: Uwe Kleine-König Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 2888024e0b0ab..37fbe1ef52397 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED # config SPLIT_PTLOCK_CPUS int + default "999999" if !MMU default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 default "4" -- cgit v1.2.3 From 8c6e50b0290c4c708a3e6462729e1e9151a9a7df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:18 -0700 Subject: mm: introduce vm_ops->map_pages() Here's a new version of the faultaround patchset. It took a while to tune it and collect performance data. The first patch adds a new callback ->map_pages to vm_operations_struct. ->map_pages() is called when the VM asks to map easily accessible pages. The filesystem should find and map pages associated with offsets from "pgoff" till "max_pgoff". ->map_pages() is called with the page table locked and must not block. If it's not possible to reach a page without blocking, the filesystem should skip it. The filesystem should use do_set_pte() to set up the page table entry. A pointer to the entry associated with offset "pgoff" is passed in the "pte" field of the vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". Currently the VM uses ->map_pages only on the read page fault path. We try to map FAULT_AROUND_PAGES at a time. FAULT_AROUND_PAGES is 16 for now. Performance data for different FAULT_AROUND_ORDER is below. TODO: - implement ->map_pages() for shmem/tmpfs; - modify get_user_pages() to be able to use ->map_pages() and implement mmap(MAP_POPULATE|MAP_NONBLOCK) on top. ========================================================================= Tested on a 4-socket machine (120 threads) with 128GiB of RAM. A few real-world workloads. The sweet spot for FAULT_AROUND_ORDER here is somewhere between 3 and 5.
Let's say 4 :) Linux build (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 283,301,572 247,151,987 212,215,789 204,772,882 199,568,944 194,703,779 193,381,485 time, seconds 151.227629483 153.920996480 151.356125472 150.863792049 150.879207877 151.150764954 151.450962358 Linux rebuild (make -j60) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 5,396,854 4,148,444 2,855,286 2,577,282 2,361,957 2,169,573 2,112,643 time, seconds 27.404543757 27.559725591 27.030057426 26.855045126 26.678618635 26.974523490 26.761320095 Git test suite (make -j60 test) FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 minor-faults 129,591,823 99,200,751 66,106,718 57,606,410 51,510,808 45,776,813 44,085,515 time, seconds 66.087215026 64.784546905 64.401156567 65.282708668 66.034016829 66.793780811 67.237810413 Two synthetic tests: access every word in file in sequential/random order. It doesn't improve much after FAULT_AROUND_ORDER == 4. Sequential access 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 4,195,437 2,098,275 525,068 262,251 131,170 32,856 8,282 time, seconds 7.250461742 6.461711074 5.493859139 5.488488147 5.707213983 5.898510832 5.109232856 8 threads minor-faults 33,557,540 16,892,728 4,515,848 2,366,999 1,423,382 442,732 142,339 time, seconds 16.649304881 9.312555263 6.612490639 6.394316732 6.669827501 6.75078944 6.371900528 32 threads minor-faults 134,228,222 67,526,810 17,725,386 9,716,537 4,763,731 1,668,921 537,200 time, seconds 49.164430543 29.712060103 12.938649729 10.175151004 11.840094583 9.594081325 9.928461797 60 threads minor-faults 251,687,988 126,146,952 32,919,406 18,208,804 10,458,947 2,733,907 928,217 time, seconds 86.260656897 49.626551828 22.335007632 17.608243696 16.523119035 16.339489186 16.326390902 120 threads minor-faults 503,352,863 252,939,677 67,039,168 35,191,827 19,170,091 4,688,357 1,471,862 time, seconds 124.589206333 79.757867787 39.508707872 32.167281632 29.972989292 28.729834575 28.042251622 Random access 
1GiB file 1 thread minor-faults 262,636 132,743 34,369 17,299 8,527 3,451 1,222 time, seconds 15.351890914 16.613802482 16.569227308 15.179220992 16.557356122 16.578247824 15.365266994 8 threads minor-faults 2,098,948 1,061,871 273,690 154,501 87,110 25,663 7,384 time, seconds 15.040026343 15.096933500 14.474757288 14.289129964 14.411537468 14.296316837 14.395635804 32 threads minor-faults 8,390,734 4,231,023 1,054,432 528,847 269,242 97,746 26,881 time, seconds 20.430433109 21.585235358 22.115062928 14.872878951 14.880856305 14.883370649 14.821261690 60 threads minor-faults 15,733,258 7,892,809 1,973,393 988,266 594,789 164,994 51,691 time, seconds 26.577302548 25.692397770 18.728863715 20.153026398 21.619101933 17.745086260 17.613215273 120 threads minor-faults 31,471,111 15,816,616 3,959,209 1,978,685 1,008,299 264,635 96,010 time, seconds 41.835322703 40.459786095 36.085306105 35.313894834 35.814445675 36.552633793 34.289210594 Touch only one page in page table in 16GiB file FAULT_AROUND_ORDER Baseline 1 3 4 5 7 9 1 thread minor-faults 8,372 8,324 8,270 8,260 8,249 8,239 8,237 time, seconds 0.039892712 0.045369149 0.051846126 0.063681685 0.079095975 0.17652406 0.541213386 8 threads minor-faults 65,731 65,681 65,628 65,620 65,608 65,599 65,596 time, seconds 0.124159196 0.488600638 0.156854426 0.191901957 0.242631486 0.543569456 1.677303984 32 threads minor-faults 262,388 262,341 262,285 262,276 262,266 262,257 263,183 time, seconds 0.452421421 0.488600638 0.565020946 0.648229739 0.789850823 1.651584361 5.000361559 60 threads minor-faults 491,822 491,792 491,723 491,711 491,701 491,691 491,825 time, seconds 0.763288616 0.869620515 0.980727360 1.161732354 1.466915814 3.04041448 9.308612938 120 threads minor-faults 983,466 983,655 983,366 983,372 983,363 984,083 984,164 time, seconds 1.595846553 1.667902182 2.008959376 2.425380942 2.941368804 5.977807890 18.401846125 This patch (of 2): Introduce new vm_ops callback ->map_pages() and uses it for mapping easy 
accessible pages around fault address. On read page fault, if filesystem provides ->map_pages(), we try to map up to FAULT_AROUND_PAGES pages around page fault address in hope to reduce number of minor page faults. We call ->map_pages first and use ->fault() as fallback if page by the offset is not ready to be mapped (cold page cache or something). Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 10 +++++ include/linux/mm.h | 8 ++++ mm/memory.c | 81 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 96 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index f424e0e5b46bb..efca5c1bbb102 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -529,6 +529,7 @@ locking rules: open: yes close: yes fault: yes can return with page locked +map_pages: yes page_mkwrite: yes can return with page locked access: yes @@ -540,6 +541,15 @@ the page, then ensure it is not already truncated (the page lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. + ->map_pages() is called when VM asks to map easy accessible pages. +Filesystem should find and map pages associated with offsets from "pgoff" +till "max_pgoff". ->map_pages() is called with page table locked and must +not block. If it's not possible to reach a page without blocking, +filesystem should skip it. Filesystem should use do_set_pte() to setup +page table entry. Pointer to entry associated with offset "pgoff" is +passed in "pte" field in vm_fault structure. Pointers to entries for other +offsets should be calculated relative to "pte". 
+ ->page_mkwrite() is called when a previously read-only pte is about to become writeable. The filesystem again must ensure that there are no truncate/invalidate races, and then return with the page locked. If diff --git a/include/linux/mm.h b/include/linux/mm.h index c270fa68a32bf..f710d32291e82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -213,6 +213,10 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). */ + /* for ->map_pages() only */ + pgoff_t max_pgoff; /* map pages for offset from pgoff till + * max_pgoff inclusive */ + pte_t *pte; /* pte entry associated with ->pgoff */ }; /* @@ -224,6 +228,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); + void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ @@ -584,6 +589,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) pte = pte_mkwrite(pte); return pte; } + +void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon); #endif /* diff --git a/mm/memory.c b/mm/memory.c index c6ee34d10fcc0..4eefb7e315217 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3342,7 +3342,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, return ret; } -static void do_set_pte(struct vm_area_struct *vma, unsigned long address, +/** + * do_set_pte - setup new PTE entry for given page and add reverse page mapping. + * + * @vma: virtual memory area + * @address: user virtual address + * @page: page to map + * @pte: pointer to target page table entry + * @write: true, if new entry is writable + * @anon: true, if it's anonymous page + * + * Caller must hold page table lock relevant for @pte. 
+ * + * Target users are page handler itself and implementations of + * vm_ops->map_pages. + */ +void do_set_pte(struct vm_area_struct *vma, unsigned long address, struct page *page, pte_t *pte, bool write, bool anon) { pte_t entry; @@ -3366,6 +3381,52 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } +#define FAULT_AROUND_ORDER 4 +#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER) +#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1) + +static void do_fault_around(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pgoff_t pgoff, unsigned int flags) +{ + unsigned long start_addr; + pgoff_t max_pgoff; + struct vm_fault vmf; + int off; + + BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE); + + start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start); + off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + pte -= off; + pgoff -= off; + + /* + * max_pgoff is either end of page table or end of vma + * or FAULT_AROUND_PAGES from pgoff, depending what is neast. 
+ */ + max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + PTRS_PER_PTE - 1; + max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, + pgoff + FAULT_AROUND_PAGES - 1); + + /* Check if it makes any sense to call ->map_pages */ + while (!pte_none(*pte)) { + if (++pgoff > max_pgoff) + return; + start_addr += PAGE_SIZE; + if (start_addr >= vma->vm_end) + return; + pte++; + } + + vmf.virtual_address = (void __user *) start_addr; + vmf.pte = pte; + vmf.pgoff = pgoff; + vmf.max_pgoff = max_pgoff; + vmf.flags = flags; + vma->vm_ops->map_pages(vma, &vmf); +} + static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) @@ -3373,7 +3434,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *fault_page; spinlock_t *ptl; pte_t *pte; - int ret; + int ret = 0; + + /* + * Let's call ->map_pages() first and use ->fault() as fallback + * if page by the offset is not ready to be mapped (cold cache or + * something). + */ + if (vma->vm_ops->map_pages) { + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + do_fault_around(vma, address, pte, pgoff, flags); + if (!pte_same(*pte, orig_pte)) + goto unlock_out; + pte_unmap_unlock(pte, ptl); + } ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) @@ -3387,8 +3461,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } do_set_pte(vma, address, fault_page, pte, false, false); - pte_unmap_unlock(pte, ptl); unlock_page(fault_page); +unlock_out: + pte_unmap_unlock(pte, ptl); return ret; } -- cgit v1.2.3 From f1820361f83d556a7f0a9f629100f3825e594328 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Mon, 7 Apr 2014 15:37:19 -0700 Subject: mm: implement ->map_pages for page cache filemap_map_pages() is generic implementation of ->map_pages() for filesystems who uses page cache. It should be safe to use filemap_map_pages() for ->map_pages() if filesystem use filemap_fault() for ->fault(). Signed-off-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 ++ fs/btrfs/file.c | 1 + fs/cifs/file.c | 1 + fs/ext4/file.c | 1 + fs/f2fs/file.c | 1 + fs/fuse/file.c | 1 + fs/gfs2/file.c | 1 + fs/nfs/file.c | 1 + fs/nilfs2/file.c | 1 + fs/ubifs/file.c | 1 + fs/xfs/xfs_file.c | 1 + include/linux/mm.h | 1 + mm/filemap.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 6 +++++ 14 files changed, 93 insertions(+) (limited to 'mm') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a16b0ff497ca3..d8223209d4b1d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -832,6 +832,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; @@ -839,6 +840,7 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .close = v9fs_mmap_vm_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e1ffb1e228989..c660527af8388 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2025,6 +2025,7 @@ out: static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = 
filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 834fce759d807..216d7e99f9219 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3113,6 +3113,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static struct vm_operations_struct cifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6db7f7db7777d..4e508fc83dcf1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -200,6 +200,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0dfcef53a6ed8..129a3bdb05ca0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -84,6 +84,7 @@ out: static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65df7d8be4f58..48992cac714b4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2117,6 +2117,7 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct fuse_file_vm_ops = { .close = fuse_vma_close, .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6c794085abaca..80d67253623cb 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -494,6 +494,7 @@ out: static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, 
.page_mkwrite = gfs2_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5bb790a69c716..284ca901fe16c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,7 @@ out: static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 08fdb77852acd..f3a82fbcae026 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 123c79b7261ef..4f34dbae823dc 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1538,6 +1538,7 @@ out_unlock: static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7abff8c16ca7..003c0051b62fa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1483,6 +1483,7 @@ const struct file_operations xfs_dir_file_operations = { static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index f710d32291e82..9132faed1a41b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1847,6 +1847,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct 
*, struct vm_fault *); +extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); /* mm/page-writeback.c */ diff --git a/mm/filemap.c b/mm/filemap.c index 21781f1fe52b6..3f9b5fbb623fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include /* for BUG_ON(!in_atomic()) only */ #include #include +#include #include "internal.h" #define CREATE_TRACE_POINTS @@ -2064,6 +2065,78 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct radix_tree_iter iter; + void **slot; + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_mapping; + loff_t size; + struct page *page; + unsigned long address = (unsigned long) vmf->virtual_address; + unsigned long addr; + pte_t *pte; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { + if (iter.index > vmf->max_pgoff) + break; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + goto next; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + break; + else + goto next; + } + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? 
*/ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } + + if (!PageUptodate(page) || + PageReadahead(page) || + PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + + if (page->mapping != mapping || !PageUptodate(page)) + goto unlock; + + size = i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1; + if (page->index >= size >> PAGE_CACHE_SHIFT) + goto unlock; + + pte = vmf->pte + page->index - vmf->pgoff; + if (!pte_none(*pte)) + goto unlock; + + if (file->f_ra.mmap_miss > 0) + file->f_ra.mmap_miss--; + addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; + do_set_pte(vma, addr, page, pte, false, false); + unlock_page(page); + goto next; +unlock: + unlock_page(page); +skip: + page_cache_release(page); +next: + if (iter.index == vmf->max_pgoff) + break; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(filemap_map_pages); + int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; @@ -2093,6 +2166,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, + .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, .remap_pages = generic_file_remap_pages, }; diff --git a/mm/nommu.c b/mm/nommu.c index a554e5a451cdb..e19482533ce33 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1985,6 +1985,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); +void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + BUG(); +} +EXPORT_SYMBOL(filemap_map_pages); + int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff) { -- cgit v1.2.3 From 99e3e53f4e0fd807607bf381e14f6de8feedd383 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Mon, 7 Apr 2014 15:37:21 -0700 Subject: mm: cleanup size checks in filemap_fault() and filemap_map_pages() Minor cleanups: - 'size' variable is now in bytes, not pages; - use round_up(): it should be easier to read. Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 3f9b5fbb623fa..b952d99c827ca 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1953,11 +1953,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; struct page *page; - pgoff_t size; + loff_t size; int ret = 0; - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (offset >= size) + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (offset >= size >> PAGE_CACHE_SHIFT) return VM_FAULT_SIGBUS; /* @@ -2006,8 +2006,8 @@ retry_find: * Found the page and have a reference on it. * We must recheck i_size under page lock. 
*/ - size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(offset >= size)) { + size = round_up(i_size_read(inode), PAGE_CACHE_SIZE); + if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) { unlock_page(page); page_cache_release(page); return VM_FAULT_SIGBUS; @@ -2111,8 +2111,8 @@ repeat: if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1; - if (page->index >= size >> PAGE_CACHE_SHIFT) + size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE); + if (page->index >= size >> PAGE_CACHE_SHIFT) goto unlock; pte = vmf->pte + page->index - vmf->pgoff; -- cgit v1.2.3 From 1592eef01505177ed50149795a1560ec5a139df1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 7 Apr 2014 15:37:22 -0700 Subject: mm: add debugfs tunable for fault_around_order Let's allow people to tweak faultaround at runtime. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Kirill A. Shutemov Cc: Linus Torvalds Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: Matthew Wilcox Cc: Dave Hansen Cc: Alexander Viro Cc: Dave Chinner Cc: Ning Qu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4eefb7e315217..1b88da5c08b33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -3382,8 +3383,63 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, } #define FAULT_AROUND_ORDER 4 -#define FAULT_AROUND_PAGES (1UL << FAULT_AROUND_ORDER) -#define FAULT_AROUND_MASK ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1) + +#ifdef CONFIG_DEBUG_FS +static unsigned int fault_around_order = FAULT_AROUND_ORDER; + +static int fault_around_order_get(void *data, u64 *val) +{ + *val = fault_around_order; + return 0; +} + 
+static int fault_around_order_set(void *data, u64 val) +{ + BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); + if (1UL << val > PTRS_PER_PTE) + return -EINVAL; + fault_around_order = val; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, + fault_around_order_get, fault_around_order_set, "%llu\n"); + +static int __init fault_around_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, + &fault_around_order_fops); + if (!ret) + pr_warn("Failed to create fault_around_order in debugfs"); + return 0; +} +late_initcall(fault_around_debugfs); + +static inline unsigned long fault_around_pages(void) +{ + return 1UL << fault_around_order; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); +} +#else +static inline unsigned long fault_around_pages(void) +{ + unsigned long nr_pages; + + nr_pages = 1UL << FAULT_AROUND_ORDER; + BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); + return nr_pages; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); +} +#endif static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) @@ -3393,21 +3449,19 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, struct vm_fault vmf; int off; - BUILD_BUG_ON(FAULT_AROUND_PAGES > PTRS_PER_PTE); - - start_addr = max(address & FAULT_AROUND_MASK, vma->vm_start); + start_addr = max(address & fault_around_mask(), vma->vm_start); off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); pte -= off; pgoff -= off; /* * max_pgoff is either end of page table or end of vma - * or FAULT_AROUND_PAGES from pgoff, depending what is neast. + * or fault_around_pages() from pgoff, depending what is neast. 
*/ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, - pgoff + FAULT_AROUND_PAGES - 1); + pgoff + fault_around_pages() - 1); /* Check if it makes any sense to call ->map_pages */ while (!pte_none(*pte)) { -- cgit v1.2.3 From d7c1755179b82d954f593ca5285b9360f2f62e9c Mon Sep 17 00:00:00 2001 From: Ning Qu Date: Mon, 7 Apr 2014 15:37:24 -0700 Subject: mm: implement ->map_pages for shmem/tmpfs In shmem/tmpfs, we also use the generic filemap_map_pages; it seems the additional checking is not worth a separate version of map_pages for it. Signed-off-by: Ning Qu Acked-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Rik van Riel Cc: Dave Hansen Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index a3ba988ec9461..70709347a1e29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2723,6 +2723,7 @@ static const struct super_operations shmem_ops = { static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, + .map_pages = filemap_map_pages, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, -- cgit v1.2.3 From 615d6e8756c87149f2d4c1b93d471bca002bd849 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:25 -0700 Subject: mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, thus further comparison with other approaches was needed. There are two things to consider when dealing with this, the cache hit rate and the latency of find_vma().
Improving the hit-rate does not necessarily translate into finding the vma any faster, as the overhead of any fancy caching scheme can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250%, for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number. The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box: 1) System bootup: Most programs are single threaded, so the per-thread scheme does improve on the ~50% hit rate by just adding a few more slots to the cache. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 50.61% | 19.90 | | patched | 73.45% | 13.58 | +----------------+----------+------------------+ 2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality. +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 75.28% | 11.03 | | patched | 88.09% | 9.31 | +----------------+----------+------------------+ 3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 70.66% | 17.14 | | patched | 91.15% | 12.57 | +----------------+----------+------------------+ 4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while baseline is just about non-existent. The amounts of cycles can fluctuate between anywhere from ~60 to ~116 for the baseline scheme, but this approach reduces it considerably. For instance, with 80 threads: +----------------+----------+------------------+ | caching scheme | hit-rate | cycles (billion) | +----------------+----------+------------------+ | baseline | 1.06% | 91.54 | | patched | 99.97% | 14.18 | +----------------+----------+------------------+ [akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso Reviewed-by: Rik van Riel Acked-by: Linus Torvalds Reviewed-by: Michel Lespinasse Cc: Oleg Nesterov Tested-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/mmu_context.h | 4 +- fs/exec.c | 5 +- fs/proc/task_mmu.c | 3 +- include/linux/mm_types.h | 4 +- include/linux/sched.h | 7 ++ include/linux/vmacache.h | 38 +++++++++++ kernel/debug/debug_core.c | 14 +++- kernel/fork.c | 7 +- mm/Makefile | 2 +- mm/mmap.c | 55 ++++++++------- mm/nommu.c | 24 ++++--- mm/vmacache.c | 112 +++++++++++++++++++++++++++++++ 12 files changed, 231 insertions(+), 44 deletions(-) create mode 100644 include/linux/vmacache.h create mode 100644 mm/vmacache.c (limited to 'mm') diff --git 
a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index fb5e4c658f7ae..ef470a7a3d0f4 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -73,7 +75,7 @@ do { \ else \ mm->mmap = NULL; \ rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ - mm->mmap_cache = NULL; \ + vmacache_invalidate(mm); \ mm->map_count--; \ remove_vma(high_vma); \ } \ diff --git a/fs/exec.c b/fs/exec.c index 25dfeba6d55f8..b60ccf969a8b3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fb52b548080da..442177b1119a4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. 
*/ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 290901a8c1de9..2b58d192ea240 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,9 +342,9 @@ struct mm_rss_stat { struct kioctx_table; struct mm_struct { - struct vm_area_struct * mmap; /* list of VMAs */ + struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; - struct vm_area_struct * mmap_cache; /* last find_vma result */ + u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/sched.h b/include/linux/sched.h index 7cb07fd266808..642477dd814ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -132,6 +132,10 @@ struct perf_event_context; struct blk_plug; struct filename; +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. @@ -1235,6 +1239,9 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif + /* per-thread vma caching */ + u32 vmacache_seqnum; + struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h new file mode 100644 index 0000000000000..c3fa0fd439499 --- /dev/null +++ b/include/linux/vmacache.h @@ -0,0 +1,38 @@ +#ifndef __LINUX_VMACACHE_H +#define __LINUX_VMACACHE_H + +#include +#include + +/* + * Hash based on the page number. Provides a good hit rate for + * workloads with good locality and those with random accesses as well. 
+ */ +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) + +static inline void vmacache_flush(struct task_struct *tsk) +{ + memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); +} + +extern void vmacache_flush_all(struct mm_struct *mm); +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, + unsigned long addr); + +#ifndef CONFIG_MMU +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end); +#endif + +static inline void vmacache_invalidate(struct mm_struct *mm) +{ + mm->vmacache_seqnum++; + + /* deal with overflows */ + if (unlikely(mm->vmacache_seqnum == 0)) + vmacache_flush_all(mm); +} + +#endif /* __LINUX_VMACACHE_H */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a70ddade..2956c8da16055 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index e40c0a01d5a63..bc0e96b78dfd4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 
0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; diff --git a/mm/Makefile b/mm/Makefile index cdd741519ee0b..23a6f7e230194 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ + compaction.o balloon_compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/mmap.c b/mm/mmap.c index 46433e137abc2..b1202cf81f4bd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) 
*/ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. 
*/ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* diff --git a/mm/nommu.c b/mm/nommu.c index e19482533ce33..5d3f3524bbdc8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(curr->mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - 
mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 0000000000000..d4224b397c0e4 --- /dev/null +++ b/mm/vmacache.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include +#include +#include + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. 
+ */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start <= addr && vma->vm_end > addr) { + BUG_ON(vma->vm_mm != mm); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) + return vma; + } + + return NULL; +} +#endif -- cgit v1.2.3 From 3b32123d734cb414e366b35a3b2142a995f9d1a0 Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Mon, 7 Apr 2014 15:37:26 -0700 Subject: mm: use macros from compiler.h instead of __attribute__((...)) To increase compiler portability there is <linux/compiler.h> which provides convenience macros for various gcc constructs. Eg: __weak for __attribute__((weak)). I've replaced all instances of gcc attributes with the right macro in the memory management (/mm) subsystem.
[akpm@linux-foundation.org: while-we're-there consistency tweaks] Signed-off-by: Gideon Israel Dsouza Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- mm/nommu.c | 3 ++- mm/sparse.c | 4 +++- mm/util.c | 5 +++-- mm/vmalloc.c | 4 +++- 5 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed5072c64daac..c5aa439933649 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -3521,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* Can be overriden by architectures */ -__attribute__((weak)) struct page * +struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write) { diff --git a/mm/nommu.c b/mm/nommu.c index 5d3f3524bbdc8..e68deff6d4476 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -460,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. 
*/ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } diff --git a/mm/sparse.c b/mm/sparse.c index 38cad8fd73973..d1b48b691ac8c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,10 +5,12 @@ #include #include #include +#include #include #include #include #include + #include "internal.h" #include #include @@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) } #endif -void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) +void __weak __meminit vmemmap_populate_print_last(void) { } diff --git a/mm/util.c b/mm/util.c index a24aa22f24736..d7813e6d4cc7c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * If the architecture not support this function, simply return with no * page pinned */ -int __attribute__((weak)) __get_user_pages_fast(unsigned long start, +int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { return 0; @@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * callers need to carefully consider what to use. On many architectures, * get_user_pages_fast simply falls back to get_user_pages. */ -int __attribute__((weak)) get_user_pages_fast(unsigned long start, +int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf96803c5b5..a7b522f4851d3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -27,7 +27,9 @@ #include #include #include +#include #include + #include #include #include @@ -2181,7 +2183,7 @@ EXPORT_SYMBOL(remap_vmalloc_range); * Implement a stub for vmalloc_sync_all() if the architecture chose not to * have one. 
*/ -void __attribute__((weak)) vmalloc_sync_all(void) +void __weak vmalloc_sync_all(void) { } -- cgit v1.2.3 From 2a389610a7331d22344698f23ef2e8c55b2cde7b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:29 -0700 Subject: mm, mempolicy: rename slab_node for clarity slab_node() is actually a mempolicy function, so rename it to mempolicy_slab_node() to make it clearer that it used for processes with mempolicies. At the same time, cleanup its code by saving numa_mem_id() in a local variable (since we require a node with memory, not just any node) and remove an obsolete comment that assumes the mempolicy is actually passed into the function. Signed-off-by: David Rientjes Acked-by: Christoph Lameter Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 15 ++++++--------- mm/slab.c | 4 ++-- mm/slub.c | 2 +- 4 files changed, 10 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f1ea756aacee..cfe55dfca0153 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,7 +151,7 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask); -extern unsigned slab_node(void); +extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e3ab02822799d..0ad0ba31979f5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1782,21 +1782,18 @@ static unsigned interleave_nodes(struct mempolicy *policy) /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. 
- * @policy must be protected by freeing by the caller. If @policy is - * the current task's mempolicy, this protection is implicit, as only the - * task can change it's policy. The system default policy requires no - * such protection. */ -unsigned slab_node(void) +unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; + int node = numa_mem_id(); if (in_interrupt()) - return numa_node_id(); + return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) - return numa_node_id(); + return node; switch (policy->mode) { case MPOL_PREFERRED: @@ -1816,11 +1813,11 @@ unsigned slab_node(void) struct zonelist *zonelist; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); - zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0]; + zonelist = &NODE_DATA(node)->node_zonelists[0]; (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone ? zone->node : numa_node_id(); + return zone ? zone->node : node; } default: diff --git a/mm/slab.c b/mm/slab.c index 9153c802e2fee..4b17f4c2e92dd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3042,7 +3042,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) - nid_alloc = slab_node(); + nid_alloc = mempolicy_slab_node(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3074,7 +3074,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), flags); retry: /* diff --git a/mm/slub.c b/mm/slub.c index fe6d7be22ef09..5b05e4fe9a1a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1685,7 +1685,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie 
= read_mems_allowed_begin(); - zonelist = node_zonelist(slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; -- cgit v1.2.3 From f0432d159601f96839f514f286eaa5b75c4112dc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:30 -0700 Subject: mm, mempolicy: remove per-process flag PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users. There's no significant performance degradation to checking current->mempolicy rather than current->flags & PF_MEMPOLICY in the allocation path, especially since this is considered unlikely(). Running TCP_RR with netperf-2.4.5 through localhost on 16 cpu machine with 64GB of memory and without a mempolicy: threads before after 16 1249409 1244487 32 1281786 1246783 48 1239175 1239138 64 1244642 1241841 80 1244346 1248918 96 1266436 1254316 112 1307398 1312135 128 1327607 1326502 Per-process flags are a scarce resource so we should free them up whenever possible and make them available. We'll be using it shortly for memcg oom reserves. 
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 - include/linux/sched.h | 1 - kernel/fork.c | 1 - mm/mempolicy.c | 31 ------------------------------- mm/slab.c | 4 ++-- 5 files changed, 2 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index cfe55dfca0153..3c1b968da0caa 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -143,7 +143,6 @@ extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, diff --git a/include/linux/sched.h b/include/linux/sched.h index 642477dd814ac..6c70645eb3b61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1851,7 +1851,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */ diff --git a/kernel/fork.c b/kernel/fork.c index 
c777964c0662b..e905e9c6b224a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1276,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0ad0ba31979f5..78e1472933ea0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. 
- */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); diff --git a/mm/slab.c b/mm/slab.c index 4b17f4c2e92dd..3db4cb06e32ea 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3027,7 +3027,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; -- cgit v1.2.3 From da1c67a76f7cf2b3404823d24f9f10fa91aa5dc5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:34 -0700 Subject: mm, compaction: determine isolation mode only once The conditions that control the isolation mode in isolate_migratepages_range() do not change during the iteration, so extract them out and only define the value once. This actually does have an effect, gcc doesn't optimize it itself because of cc->sync. 
Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Rik van Riel Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 054c28b51c755..37f976287068c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; - isolate_mode_t mode = 0; struct lruvec *lruvec; unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; bool skipped_async_unsuitable = false; + const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + (unevictable ? ISOLATE_UNEVICTABLE : 0); /* * Ensure that there are not too many pages isolated from the LRU @@ -608,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (!cc->sync) - mode |= ISOLATE_ASYNC_MIGRATE; - - if (unevictable) - mode |= ISOLATE_UNEVICTABLE; - lruvec = mem_cgroup_page_lruvec(page, zone); /* Try isolate the page */ -- cgit v1.2.3 From eb9a3c62a0b6064c7f7e5b961ce00c646d21cb78 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 7 Apr 2014 15:37:35 -0700 Subject: mempool: add unlikely and likely hints Add unlikely and likely hints to the function mempool_free. It lays out the code in such a way that the common path is executed straightforwardly and saves a cache line.
Signed-off-by: Mikulas Patocka Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 659aa42bad162..905434f18c973 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool) * ensures that there will be frees which return elements to the * pool waking up the waiters. */ - if (pool->curr_nr < pool->min_nr) { + if (unlikely(pool->curr_nr < pool->min_nr)) { spin_lock_irqsave(&pool->lock, flags); - if (pool->curr_nr < pool->min_nr) { + if (likely(pool->curr_nr < pool->min_nr)) { add_element(pool, element); spin_unlock_irqrestore(&pool->lock, flags); wake_up(&pool->wait); -- cgit v1.2.3 From ac7149045d9fcca1063e22e4c6f607bca8fce268 Mon Sep 17 00:00:00 2001 From: Choi Gi-yong Date: Mon, 7 Apr 2014 15:37:36 -0700 Subject: mm: fix 'ERROR: do not initialise globals to 0 or NULL' and coding style Signed-off-by: Choi Gi-yong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- mm/nommu.c | 16 ++++++---------- 2 files changed, 7 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c5aa439933649..27938c441b6fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2898,8 +2898,7 @@ retry: if (anon_rmap) { ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); - } - else + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); diff --git a/mm/nommu.c b/mm/nommu.c index e68deff6d4476..85f8d6698d487 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -298,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; memcpy(addr, buf, count); - return(count); + return count; } /* @@ -1012,8 +1012,7 @@ static int validate_mmap_request(struct file *file, /* we mustn't privatise shared 
mappings */ capabilities &= ~BDI_CAP_MAP_COPY; - } - else { + } else { /* we're going to read the file into private memory we * allocate */ if (!(capabilities & BDI_CAP_MAP_COPY)) @@ -1044,23 +1043,20 @@ static int validate_mmap_request(struct file *file, if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { if (prot & PROT_EXEC) return -EPERM; - } - else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { + } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { if (capabilities & BDI_CAP_EXEC_MAP) prot |= PROT_EXEC; } - } - else if ((prot & PROT_READ) && + } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP) ) { /* backing file is not executable, try to copy */ capabilities &= ~BDI_CAP_MAP_DIRECT; } - } - else { + } else { /* anonymous mappings are always memory backed and can be * privately mapped */ @@ -1668,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* find the first potentially overlapping VMA */ vma = find_vma(mm, start); if (!vma) { - static int limit = 0; + static int limit; if (limit < 5) { printk(KERN_WARNING "munmap of memory not mmapped by process %d" -- cgit v1.2.3 From 3643763834b935208b067db9f4a239aba9dbe28d Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Mon, 7 Apr 2014 15:37:37 -0700 Subject: mm/vmalloc.c: enhance vm_map_ram() comment vm_map_ram() has a fragmentation problem when it cannot purge a chunk (i.e., 4M address space) if there is a pinning object in that address space. So it could consume all VMALLOC address space easily. We can fix the fragmentation problem by using vmap instead of vm_map_ram() but vmap() is known to be slow compared to vm_map_ram(). Minchan said vm_map_ram is 5 times faster than vmap in his tests. So I thought we should fix fragment problem of vm_map_ram because our proprietary GPU driver has used it heavily.
On second thought, it's not easy because we should reuse freed space for solving the problem and it could make more IPI and bitmap operation for searching hole. It could undermine the API's goal which is very fast mapping. And even fragmentation problem wouldn't show in 64 bit machine. Another option is that the user should separate long-life and short-life object and use vmap for long-life but vm_map_ram for short-life. If we inform the user about the characteristic of vm_map_ram the user can choose one according to the page lifetime. Let's add some notice messages to user. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Gioh Kim Reviewed-by: Zhang Yanfei Cc: Minchan Kim Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a7b522f4851d3..bf233b283319c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1085,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram); * @node: prefer to allocate data structures on this node * @prot: memory protection to use. PAGE_KERNEL for regular RAM * + * If you use this function for less than VMAP_MAX_ALLOC pages, it could be + * faster than vmap so it's good. But if you mix long-life and short-life + * objects with vm_map_ram(), it could consume lots of address space through + * fragmentation (especially on a 32bit machine). You could see failures in + * the end. Please use this function for short-lived objects. + * * Returns: a pointer to the address that has been mapped, or %NULL on failure */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) -- cgit v1.2.3 From d230dec18dc9a581f153c14cb5371640cd62543b Mon Sep 17 00:00:00 2001 From: "Kirill A.
Shutemov" Date: Mon, 7 Apr 2014 15:37:38 -0700 Subject: mm: use 'const char *' instead of 'char *' for reason in dump_page() I tried to use 'dump_page(page, __func__)' for debugging, but it triggers warning: warning: passing argument 2 of `dump_page' discards `const' qualifier from pointer target type [enabled by default] Let's convert 'reason' to 'const char *' in dump_page() and friends: we shouldn't modify it anyway. Signed-off-by: Kirill A. Shutemov Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 4 ++-- mm/page_alloc.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 5042c036dda9f..2d57efa64cc16 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -3,8 +3,8 @@ struct page; -extern void dump_page(struct page *page, char *reason); -extern void dump_page_badflags(struct page *page, char *reason, +extern void dump_page(struct page *page, const char *reason); +extern void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags); #ifdef CONFIG_DEBUG_VM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 336ee925f7569..73c25912c7c45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(struct page *page, char *reason, unsigned long bad_flags) +static void bad_page(struct page *page, const char *reason, + unsigned long bad_flags) { static unsigned long resume; static unsigned long nr_shown; @@ -623,7 +624,7 @@ out: static inline int free_pages_check(struct page *page) { - char *bad_reason = NULL; + const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - char *bad_reason = NULL;
+ const char *bad_reason = NULL; unsigned long bad_flags = 0; if (unlikely(page_mapcount(page))) @@ -6545,7 +6546,8 @@ static void dump_page_flags(unsigned long flags) printk(")\n"); } -void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) +void dump_page_badflags(struct page *page, const char *reason, + unsigned long badflags) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", @@ -6561,7 +6563,7 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags) mem_cgroup_print_bad_page(page); } -void dump_page(struct page *page, char *reason) +void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -- cgit v1.2.3 From 7af467e8e1881fb65924329f46998e6e0d801038 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:39 -0700 Subject: mm: memcg: remove unnecessary preemption disabling lock_page_cgroup() disables preemption, remove explicit preemption disabling for code paths holding this lock. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dcc8153a1681b..6e0f781412a2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, bool anon, int nr_pages) { - preempt_disable(); - /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. 
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, } __this_cpu_add(memcg->stat->nr_page_events, nr_pages); - - preempt_enable(); } unsigned long @@ -3748,17 +3744,14 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline -void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) +static void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, + struct mem_cgroup *to, + unsigned int nr_pages, + enum mem_cgroup_stat_index idx) { /* Update stat data for mem_cgroup */ - preempt_disable(); __this_cpu_sub(from->stat->count[idx], nr_pages); __this_cpu_add(to->stat->count[idx], nr_pages); - preempt_enable(); } /** -- cgit v1.2.3 From 59d1d256e156bb232700836b79d1ead5027f7b1d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:40 -0700 Subject: mm: memcg: remove mem_cgroup_move_account_page_stat() It used to disable preemption and run sanity checks but now it's only taking a number out of one percpu counter and putting it into another. Do this directly in the callsite and save the indirection. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6e0f781412a2f..b9928230a14c3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3744,16 +3744,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, - struct mem_cgroup *to, - unsigned int nr_pages, - enum mem_cgroup_stat_index idx) -{ - /* Update stat data for mem_cgroup */ - __this_cpu_sub(from->stat->count[idx], nr_pages); - __this_cpu_add(to->stat->count[idx], nr_pages); -} - /** * mem_cgroup_move_account - move account of the page * @page: the page @@ -3799,13 +3789,19 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_FILE_MAPPED); + if (!anon && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } - if (PageWriteback(page)) - mem_cgroup_move_account_page_stat(from, to, nr_pages, - MEM_CGROUP_STAT_WRITEBACK); + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } mem_cgroup_charge_statistics(from, page, anon, -nr_pages); -- cgit v1.2.3 From 1bec6b333e241a9db47d3939fb08a4e174ece02f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:41 -0700 Subject: mm: memcg: inline mem_cgroup_charge_common() mem_cgroup_charge_common() is used by both cache and anon pages, but most of its body only applies to anon pages and the remainder is not worth 
having in a separate function. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b9928230a14c3..5aab347a5b0b0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3887,20 +3887,21 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_newpage_charge(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; bool oom = true; int ret; + if (mem_cgroup_disabled()) + return 0; + + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + VM_BUG_ON(!mm); + if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -3914,22 +3915,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); + __mem_cgroup_commit_charge(memcg, page, nr_pages, + MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; } -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. 
* And when try_charge() successfully returns, one refcnt to memcg without @@ -4047,9 +4037,11 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ + if (!PageSwapCache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); + if (ret != -ENOMEM) + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + } else { /* page is swapcache/shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) -- cgit v1.2.3 From 284f39afeaa4ab1409b8f43b29cdea3007960ee3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:41 -0700 Subject: mm: memcg: push !mm handling out to page cache charge function Only page cache charges can happen without an mm context, so push this special case out of the inner core and into the cache charge function. An ancient comment explains that the mm can also be NULL in case the task is currently being migrated, but that is not actually true with the current case, so just remove it. Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5aab347a5b0b0..8d6cedd16f8df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2703,15 +2703,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; - - /* - * We always charge the cgroup the mm_struct belongs to. - * The mm_struct's mem_cgroup changes on task migration if the - * thread group leader migrates. It's possible that mm is not - * set, if so charge the root memcg (happens for pagecache usage). 
- */ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -4038,6 +4029,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (!PageSwapCache(page)) { + /* + * Page cache insertions can happen without an actual + * task context, e.g. during disk probing on boot. + */ + if (!mm) + memcg = root_mem_cgroup; ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); if (ret != -ENOMEM) __mem_cgroup_commit_charge(memcg, page, 1, type, false); -- cgit v1.2.3 From 03583f1a631c0511dfd2f16e716d5b40f675de5a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:42 -0700 Subject: memcg: remove unnecessary !mm check from try_get_mem_cgroup_from_mm() Users pass either a mm that has been established under task lock, or use a verified current->mm, which means the task can't be exiting. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8d6cedd16f8df..c3b674f9774f3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1075,13 +1075,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; - if (!mm) - return NULL; - /* - * Because we have no locks, mm->owner's may be being moved to other - * cgroup. We use css_tryget() here even if this looks - * pessimistic (rather than adding locks here). - */ rcu_read_lock(); do { memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); -- cgit v1.2.3 From df381975463996178d685f6ef7d3555c5f887201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:43 -0700 Subject: memcg: get_mem_cgroup_from_mm() Instead of returning NULL from try_get_mem_cgroup_from_mm() when the mm owner is exiting, just return root_mem_cgroup. 
This makes sense for all callsites and gets rid of some of them having to fallback manually. [fengguang.wu@intel.com: fix warnings] Signed-off-by: Johannes Weiner Signed-off-by: Fengguang Wu Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ mm/memcontrol.c | 18 ++++-------------- 2 files changed, 4 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eccfb4a4b3792..134636f835f75 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -94,7 +94,6 @@ bool task_in_mem_cgroup(struct task_struct *task, extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); @@ -294,11 +293,6 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return NULL; } -static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) -{ - return NULL; -} - static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3b674f9774f3..87c3ec37dd26c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1071,7 +1071,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; @@ -1079,7 +1079,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) do { memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) - break; + memcg = 
root_mem_cgroup; } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; @@ -1475,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task, p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1489,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task, css_get(&curr->css); rcu_read_unlock(); } - if (!curr) - return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -3617,15 +3615,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) if (!current->mm || current->memcg_kmem_skip_account) return true; - memcg = try_get_mem_cgroup_from_mm(current->mm); - - /* - * very rare case described in mem_cgroup_from_task. Unfortunately there - * isn't much we can do without complicating this too much, and it would - * be gfp-dependent anyway. Just let it go - */ - if (unlikely(!memcg)) - return true; + memcg = get_mem_cgroup_from_mm(current->mm); if (!memcg_can_account_kmem(memcg)) { css_put(&memcg->css); -- cgit v1.2.3 From b6b6cc72bc404c952968530d7df4c3a4ab82b65b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:44 -0700 Subject: memcg: do not replicate get_mem_cgroup_from_mm in __mem_cgroup_try_charge __mem_cgroup_try_charge duplicates get_mem_cgroup_from_mm for charges which came without a memcg. The only reason seems to be a tiny optimization when css_tryget is not called if the charge can be consumed from the stock. Nevertheless css_tryget is very cheap since it has been reworked to use per-cpu counting so this optimization doesn't give us anything these days. So let's drop the code duplication so that the code is more readable. 
Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 50 ++++++-------------------------------------------- 1 file changed, 6 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87c3ec37dd26c..7480022d46555 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2697,52 +2697,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - if (mem_cgroup_is_root(memcg)) - goto done; - if (consume_stock(memcg, nr_pages)) - goto done; css_get(&memcg->css); } else { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* - * Because we don't have task_lock(), "p" can exit. - * In that case, "memcg" can point to root or p can be NULL with - * race with swapoff. Then, we have small risk of mis-accouning. - * But such kind of mis-account by race always happens because - * we don't have cgroup_mutex(). It's overkill and we allo that - * small race, here. - * (*) swapoff at el will charge against mm-struct not against - * task-struct. So, mm->owner can be NULL. - */ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (consume_stock(memcg, nr_pages)) { - /* - * It seems dagerous to access memcg without css_get(). - * But considering how consume_stok works, it's not - * necessary. If consume_stock success, some charges - * from this memcg are cached on this cpu. So, we - * don't need to call css_get()/css_tryget() before - * calling consume_stock(). - */ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. 
we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); + memcg = get_mem_cgroup_from_mm(mm); } + if (mem_cgroup_is_root(memcg)) + goto done; + if (consume_stock(memcg, nr_pages)) + goto done; do { bool invoke_oom = oom && !nr_oom_retries; @@ -2778,8 +2740,8 @@ again: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - css_put(&memcg->css); done: + css_put(&memcg->css); *ptr = memcg; return 0; nomem: -- cgit v1.2.3 From 6d1fdc48938cd51a3964778d78f27cb26c8eb55d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:45 -0700 Subject: memcg: sanitize __mem_cgroup_try_charge() call protocol Some callsites pass a memcg directly, some callsites pass an mm that then has to be translated to a memcg. This makes for a terrible function interface. Just push the mm-to-memcg translation into the respective callsites and always pass a memcg to mem_cgroup_try_charge(). [mhocko@suse.cz: add charge mm helper] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 207 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 102 insertions(+), 105 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7480022d46555..038b037f8d673 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2575,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */ enum { CHARGE_OK, /* success */ CHARGE_RETRY, /* need to retry but retry is not bad */ @@ -2648,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_NOMEM; } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update res_counter - * 3. call memory reclaim if necessary. 
- * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL; int ret; + if (mem_cgroup_is_root(memcg)) + goto done; /* - * Unlike gloval-vm's OOM-kill, we're not in memory shortage - * in system level. So, allow to go ahead dying process in addition to - * MEMDIE process. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. 
*/ - if (unlikely(test_thread_flag(TIF_MEMDIE) - || fatal_signal_pending(current))) + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current))) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2695,14 +2684,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - css_get(&memcg->css); - } else { - memcg = get_mem_cgroup_from_mm(mm); - } - if (mem_cgroup_is_root(memcg)) - goto done; if (consume_stock(memcg, nr_pages)) goto done; @@ -2710,10 +2691,8 @@ again: bool invoke_oom = oom && !nr_oom_retries; /* If killed, bypass charge */ - if (fatal_signal_pending(current)) { - css_put(&memcg->css); + if (fatal_signal_pending(current)) goto bypass; - } ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, invoke_oom); @@ -2722,17 +2701,12 @@ again: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&memcg->css); - memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) { - css_put(&memcg->css); + if (!oom || invoke_oom) goto nomem; - } nr_oom_retries--; break; } @@ -2741,19 +2715,43 @@ again: if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: - css_put(&memcg->css); - *ptr = memcg; return 0; nomem: - if (!(gfp_mask & __GFP_NOFAIL)) { - *ptr = NULL; + if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; - } bypass: - *ptr = root_mem_cgroup; return -EINTR; } +/** + * mem_cgroup_try_charge_mm - try charging a mm + * @mm: mm_struct to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails + * + * Returns the charged mem_cgroup associated with the given mm_struct or + * NULL the charge failed. 
+ */ +static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, + gfp_t gfp_mask, + unsigned int nr_pages, + bool oom) + +{ + struct mem_cgroup *memcg; + int ret; + + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + css_put(&memcg->css); + if (ret == -EINTR) + memcg = root_mem_cgroup; + else if (ret) + memcg = NULL; + + return memcg; +} + /* * Somemtimes we have to undo a charge we got by try_charge(). * This function is for that and do uncharge, put css's refcnt. @@ -2949,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; - struct mem_cgroup *_memcg; int ret = 0; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - _memcg = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, oom_gfp_allowed(gfp)); - + ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, + oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* - * __mem_cgroup_try_charge() chosed to bypass to root due to + * mem_cgroup_try_charge() chosed to bypass to root due to * OOM kill or fatal signal. Since our only options are to * either fail the allocation or charge it to this cgroup, do * it as a temporary condition. But we can't fail. From a @@ -2972,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) * * This condition will only trigger if the task entered * memcg_charge_kmem in a sane state, but was OOM-killed during - * __mem_cgroup_try_charge() above. Tasks that were already + * mem_cgroup_try_charge() above. 
Tasks that were already * dying when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ @@ -3826,10 +3821,9 @@ out: int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; + struct mem_cgroup *memcg; bool oom = true; - int ret; if (mem_cgroup_disabled()) return 0; @@ -3848,9 +3842,9 @@ int mem_cgroup_newpage_charge(struct page *page, oom = false; } - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); - if (ret == -ENOMEM) - return ret; + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + if (!memcg) + return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; @@ -3867,7 +3861,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, gfp_t mask, struct mem_cgroup **memcgp) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; int ret; @@ -3880,31 +3874,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, * in turn serializes uncharging. 
*/ if (PageCgroupUsed(pc)) - return 0; - if (!do_swap_account) - goto charge_cur_mm; - memcg = try_get_mem_cgroup_from_page(page); + goto out; + if (do_swap_account) + memcg = try_get_mem_cgroup_from_page(page); if (!memcg) - goto charge_cur_mm; - *memcgp = memcg; - ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); + memcg = get_mem_cgroup_from_mm(mm); + ret = mem_cgroup_try_charge(memcg, mask, 1, true); css_put(&memcg->css); if (ret == -EINTR) - ret = 0; - return ret; -charge_cur_mm: - ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = root_mem_cgroup; + else if (ret) + return ret; +out: + *memcgp = memcg; + return 0; } int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) { - *memcgp = NULL; - if (mem_cgroup_disabled()) + if (mem_cgroup_disabled()) { + *memcgp = NULL; return 0; + } /* * A racing thread's fault, or swapoff, may have already * updated the pte, and even removed page from swap cache: in @@ -3912,12 +3904,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, * there's also a KSM case which does need to charge the page. 
*/ if (!PageSwapCache(page)) { - int ret; + struct mem_cgroup *memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); - if (ret == -EINTR) - ret = 0; - return ret; + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + *memcgp = memcg; + return 0; } return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); } @@ -3964,8 +3957,8 @@ void mem_cgroup_commit_charge_swapin(struct page *page, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *memcg = NULL; enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + struct mem_cgroup *memcg; int ret; if (mem_cgroup_disabled()) @@ -3973,23 +3966,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) { - /* - * Page cache insertions can happen without an actual - * task context, e.g. during disk probing on boot. - */ - if (!mm) - memcg = root_mem_cgroup; - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); - if (ret != -ENOMEM) - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - } else { /* page is swapcache/shmem */ + if (PageSwapCache(page)) { /* shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); + if (ret) + return ret; + __mem_cgroup_commit_charge_swapin(page, memcg, type); + return 0; } - return ret; + + /* + * Page cache insertions can happen without an actual mm + * context, e.g. during disk probing on boot. 
+ */ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); + if (!memcg) + return -ENOMEM; + } + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + return 0; } static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, @@ -6601,8 +6599,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, - GFP_KERNEL, 1, &memcg, false); + ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); if (ret) /* mem_cgroup_clear_mc() will do uncharge later */ return ret; -- cgit v1.2.3 From d715ae08f2ff87508a081c4df78061bf4f7211d6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 7 Apr 2014 15:37:46 -0700 Subject: memcg: rename high level charging functions mem_cgroup_newpage_charge is used only for charging anonymous memory so it is better to rename it to mem_cgroup_charge_anon. mem_cgroup_cache_charge is used for file backed memory so rename it to mem_cgroup_charge_file. Signed-off-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memcg_test.txt | 4 ++-- include/linux/memcontrol.h | 8 ++++---- mm/filemap.c | 2 +- mm/huge_memory.c | 8 ++++---- mm/memcontrol.c | 4 ++-- mm/memory.c | 6 +++--- mm/shmem.c | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index ce94a83a7d9ae..80ac454704b80 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,7 +24,7 @@ Please note that implementation details can be changed. a page/swp_entry may be charged (usage += PAGE_SIZE) at - mem_cgroup_newpage_charge() + mem_cgroup_charge_anon() Called at new page fault and Copy-On-Write. mem_cgroup_try_charge_swapin() @@ -32,7 +32,7 @@ Please note that implementation details can be changed. Followed by charge-commit-cancel protocol. 
(With swap accounting) At commit, a charge recorded in swap_cgroup is removed. - mem_cgroup_cache_charge() + mem_cgroup_charge_file() Called at add_to_page_cache() mem_cgroup_cache_charge_swapin() diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 134636f835f75..96f3fc87ab964 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -65,7 +65,7 @@ struct mem_cgroup_reclaim_cookie { * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) */ -extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); /* for swap handling */ extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, @@ -74,7 +74,7 @@ extern void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg); extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); -extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); @@ -233,13 +233,13 @@ void mem_cgroup_print_bad_page(struct page *page); #else /* CONFIG_MEMCG */ struct mem_cgroup; -static inline int mem_cgroup_newpage_charge(struct page *page, +static inline int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; } -static inline int mem_cgroup_cache_charge(struct page *page, +static inline int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; diff --git a/mm/filemap.c b/mm/filemap.c index b952d99c827ca..27ebc0c9571bb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -563,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_cache_charge(page, current->mm, + 
error = mem_cgroup_charge_file(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); if (error) return error; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f4981418fc5..64635f5278ff2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_newpage_charge(pages[i], mm, + mem_cgroup_charge_anon(pages[i], mm, GFP_KERNEL))) { if (pages[i]) put_page(pages[i]); @@ -1101,7 +1101,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2359,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) return; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 038b037f8d673..e33b1d09eb1f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3818,7 +3818,7 @@ out: return ret; } -int mem_cgroup_newpage_charge(struct page *page, +int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { unsigned int nr_pages = 1; @@ -3954,7 +3954,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, +int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { enum 
charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; diff --git a/mm/memory.c b/mm/memory.c index 1b88da5c08b33..854e4027719f1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2828,7 +2828,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -3281,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) + if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -3537,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { page_cache_release(new_page); return VM_FAULT_OOM; } diff --git a/mm/shmem.c b/mm/shmem.c index 70709347a1e29..70273f8df5867 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. 
*/ - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); + error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -1080,7 +1080,7 @@ repeat: goto failed; } - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1134,7 +1134,7 @@ repeat: SetPageSwapBacked(page); __set_page_locked(page); - error = mem_cgroup_cache_charge(page, current->mm, + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) goto decused; -- cgit v1.2.3 From 3a025760fc158b3726eac89ee95d7f29599e9dfa Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 7 Apr 2014 15:37:48 -0700 Subject: mm: page_alloc: spill to remote nodes before waking kswapd On NUMA systems, a node may start thrashing cache or even swap anonymous pages while there are still free pages on remote nodes. This is a result of commits 81c0a2bb515f ("mm: page_alloc: fair zone allocator policy") and fff4068cba48 ("mm: page_alloc: revert NUMA aspect of fair allocation policy"). Before those changes, the allocator would first try all allowed zones, including those on remote nodes, before waking any kswapds. But now, the allocator fastpath doubles as the fairness pass, which in turn can only consider the local node to prevent remote spilling based on exhausted fairness batches alone. Remote nodes are only considered in the slowpath, after the kswapds are woken up. But if remote nodes still have free memory, kswapd should not be woken to rebalance the local node or it may thrash cache or swap prematurely. Fix this by adding one more unfair pass over the zonelist that is allowed to spill to remote nodes after the local fairness pass fails but before entering the slowpath and waking the kswapds. 
This also gets rid of the GFP_THISNODE exemption from the fairness protocol because the unfair pass is no longer tied to kswapd, which GFP_THISNODE is not allowed to wake up. However, because remote spills can be more frequent now - we prefer them over local kswapd reclaim - the allocation batches on remote nodes could underflow more heavily. When resetting the batches, use atomic_long_read() directly instead of zone_page_state() to calculate the delta as the latter filters negative counter values. Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 + mm/page_alloc.c | 89 +++++++++++++++++++++++++++++---------------------------- 2 files changed, 46 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 29e1e761f9ebe..3e910000fda43 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_FAIR 0x100 /* fair zone allocation */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73c25912c7c45..15d140755e71f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; -} -#else -static bool gfp_thisnode_allocation(gfp_t gfp_mask) -{ - return false; -} #endif /* @@ -1584,12 +1575,7 @@ again: get_pageblock_migratetype(page)); } - /* - * NOTE: GFP_THISNODE allocations do not partake in the kswapd - * aging protocol, so they can't be fair. 
- */ - if (!gfp_thisnode_allocation(gfp_flags)) - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1955,23 +1941,12 @@ zonelist_scan: * zone size to ensure fair page aging. The zone a * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. - * - * Try to stay in local zones in the fastpath. If - * that fails, the slowpath is entered, which will do - * another pass starting with the local zones, but - * ultimately fall back to remote zones that do not - * partake in the fairness round-robin cycle of this - * zonelist. - * - * NOTE: GFP_THISNODE allocations do not partake in - * the kswapd aging protocol, so they can't be fair. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - !gfp_thisnode_allocation(gfp_mask)) { - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) continue; + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; } /* * When allocating a page cache page for writing, we @@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) +static void reset_alloc_batches(struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); /* * Only reset the batches of zones that were actually - * considered in the fast path, we don't want to - * thrash fairness information for zones that are not + * considered in the fairness pass, we don't want to 
+ * trash fairness information for zones that are not * actually part of this zonelist's round-robin cycle. */ if (!zone_local(preferred_zone, zone)) continue; mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); } } +static void wake_all_kswapds(unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) +{ + struct zoneref *z; + struct zone *zone; + + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); +} + static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { @@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (gfp_thisnode_allocation(gfp_mask)) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: - prepare_slowpath(gfp_mask, order, zonelist, - high_zoneidx, preferred_zone); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2753,11 +2737,28 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif +retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); if 
(unlikely(!page)) { + /* + * The first pass makes sure allocations are spread + * fairly within the local node. However, the local + * node might have free pages left after the fairness + * batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without + * fairness, and include remote zones now, before + * entering the slowpath and waking kswapd: prefer + * spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + reset_alloc_batches(zonelist, high_zoneidx, + preferred_zone); + alloc_flags &= ~ALLOC_FAIR; + goto retry; + } /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not -- cgit v1.2.3 From 57e68e9cd65b4b8eb4045a1e0d0746458502554c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 7 Apr 2014 15:37:50 -0700 Subject: mm: try_to_unmap_cluster() should lock_page() before mlocking A BUG_ON(!PageLocked) was triggered in mlock_vma_page() by Sasha Levin fuzzing with trinity. The call site try_to_unmap_cluster() does not lock the pages other than its check_page parameter (which is already locked). The BUG_ON in mlock_vma_page() is not documented and its purpose is somewhat unclear, but apparently it serializes against page migration, which could otherwise fail to transfer the PG_mlocked flag. This would not be fatal, as the page would be eventually encountered again, but NR_MLOCK accounting would become distorted nevertheless. This patch adds a comment to the BUG_ON in mlock_vma_page() and munlock_vma_page() to that effect. The call site try_to_unmap_cluster() is fixed so that for page != check_page, trylock_page() is attempted (to avoid possible deadlocks as we already have check_page locked) and mlock_vma_page() is performed only upon success. If the page lock cannot be obtained, the page is left without PG_mlocked, which is again not a problem in the whole unevictable memory design. 
Signed-off-by: Vlastimil Babka Signed-off-by: Bob Liu Reported-by: Sasha Levin Cc: Wanpeng Li Cc: Michel Lespinasse Cc: KOSAKI Motohiro Acked-by: Rik van Riel Cc: David Rientjes Cc: Mel Gorman Cc: Hugh Dickins Cc: Joonsoo Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 2 ++ mm/rmap.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 4e1a681622856..b1eb536340056 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page) */ void mlock_vma_page(struct page *page) { + /* Serialize with page migration */ BUG_ON(!PageLocked(page)); if (!TestSetPageMlocked(page)) { @@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages; struct zone *zone = page_zone(page); + /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); /* diff --git a/mm/rmap.c b/mm/rmap.c index 11cf322f8133d..9c3e77396d1a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, BUG_ON(!page || PageAnon(page)); if (locked_vma) { - mlock_vma_page(page); /* no-op if already mlocked */ - if (page == check_page) + if (page == check_page) { + /* we know we have check_page locked */ + mlock_vma_page(page); ret = SWAP_MLOCK; + } else if (trylock_page(page)) { + /* + * If we can lock the page, perform mlock. + * Otherwise leave the page alone, it will be + * eventually encountered again later. + */ + mlock_vma_page(page); + unlock_page(page); + } continue; /* don't unmap */ } -- cgit v1.2.3 From ed6d7c8e578331cad594ee70d60e2e146b5dce7b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 7 Apr 2014 15:37:51 -0700 Subject: mm: remove unused arg of set_page_dirty_balance() There's only one caller of set_page_dirty_balance() and that will call it with page_mkwrite == 0. 
The page_mkwrite argument was unused since commit b827e496c893 "mm: close page_mkwrite races". Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 +- mm/memory.c | 2 +- mm/page-writeback.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 021b8a319b9e2..5777c13849ba8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -178,7 +178,7 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); -void set_page_dirty_balance(struct page *page, int page_mkwrite); +void set_page_dirty_balance(struct page *page); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/memory.c b/mm/memory.c index 854e4027719f1..d0f0bef3be488 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2782,7 +2782,7 @@ reuse: */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + set_page_dirty_balance(dirty_page); /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7106cb1aca8e3..ef413492a1494 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1562,9 +1562,9 @@ pause: bdi_start_background_writeback(bdi); } -void set_page_dirty_balance(struct page *page, int page_mkwrite) +void set_page_dirty_balance(struct page *page) { - if (set_page_dirty(page) || page_mkwrite) { + if (set_page_dirty(page)) { struct address_space *mapping = page_mapping(page); if (mapping) -- cgit v1.2.3 From 136199f0a67cd6bb3f0e8de0ad50f52879f82077 Mon Sep 17 00:00:00 2001 From: Emil Medve Date: Mon, 7 Apr 2014 15:37:52 -0700 Subject: memblock: use 
for_each_memblock() This is a small cleanup. Signed-off-by: Emil Medve Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 24 +++++++++++------------- mm/page_alloc.c | 10 +++++----- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 7fe5354e7552c..c5c20c46f97e8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - unsigned long i; phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + struct memblock_region *r; if (!limit) return; /* find out max address */ - for (i = 0; i < memblock.memory.cnt; i++) { - struct memblock_region *r = &memblock.memory.regions[i]; - + for_each_memblock(memory, r) { if (limit <= r->size) { max_addr = r->base + limit; break; @@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si void __init_memblock memblock_trim_memory(phys_addr_t align) { - int i; phys_addr_t start, end, orig_start, orig_end; - struct memblock_type *mem = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < mem->cnt; i++) { - orig_start = mem->regions[i].base; - orig_end = mem->regions[i].base + mem->regions[i].size; + for_each_memblock(memory, r) { + orig_start = r->base; + orig_end = r->base + r->size; start = round_up(orig_start, align); end = round_down(orig_end, align); @@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align) continue; if (start < end) { - mem->regions[i].base = start; - mem->regions[i].size = end - start; + r->base = start; + r->size = end - start; } else { - memblock_remove_region(mem, i); - i--; + memblock_remove_region(&memblock.memory, + r - memblock.memory.regions); + r--; } } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15d140755e71f..48427a7cfb45f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5073,7 +5073,7 @@ 
static void __init find_zone_movable_pfns_for_nodes(void) nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_MEMORY]); - struct memblock_type *type = &memblock.memory; + struct memblock_region *r; /* Need to find movable_zone earlier when movable_node is specified. */ find_usable_zone_for_movable(); @@ -5083,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void) * options. */ if (movable_node_is_enabled()) { - for (i = 0; i < type->cnt; i++) { - if (!memblock_is_hotpluggable(&type->regions[i])) + for_each_memblock(memory, r) { + if (!memblock_is_hotpluggable(r)) continue; - nid = type->regions[i].nid; + nid = r->nid; - usable_startpfn = PFN_DOWN(type->regions[i].base); + usable_startpfn = PFN_DOWN(r->base); zone_movable_pfn[nid] = zone_movable_pfn[nid] ? min(usable_startpfn, zone_movable_pfn[nid]) : usable_startpfn; -- cgit v1.2.3 From 167632303005670713603452a3c9ee5de4aa5828 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:37:53 -0700 Subject: mm/memblock.c: use PFN_PHYS() Replace ((phys_addr_t)(x) << PAGE_SHIFT) by pfn macro. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index c5c20c46f97e8..e9d6ca9a01a9a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn) pages += end_pfn - start_pfn; } - return (phys_addr_t)pages << PAGE_SHIFT; + return PFN_PHYS(pages); } /* lowest address */ @@ -1324,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) { struct memblock_type *type = &memblock.memory; - int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + int mid = memblock_search(type, PFN_PHYS(pfn)); if (mid == -1) return -1; -- cgit v1.2.3 From 55f67141a8927b2be3e51840da37b8a2320143ed Mon Sep 17 00:00:00 2001 From: "Mizuma, Masayoshi" Date: Mon, 7 Apr 2014 15:37:54 -0700 Subject: mm: hugetlb: fix softlockup when a large number of hugepages are freed. When I decrease the value of nr_hugepages in procfs a lot, softlockup happens. It is because there is no chance of context switch during this process. On the other hand, when I allocate a large number of hugepages, there is some chance of context switch. Hence softlockup doesn't happen during this process. So it's necessary to add the context switch in the freeing process, just as in the allocating process, to avoid softlockup. When I freed 12 TB hugepages with kernel-2.6.32-358.el6, the freeing process occupied a CPU over 150 seconds and following softlockup message appeared twice or more. $ echo 6000000 > /proc/sys/vm/nr_hugepages $ cat /proc/sys/vm/nr_hugepages 6000000 $ grep ^Huge /proc/meminfo HugePages_Total: 6000000 HugePages_Free: 6000000 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB $ echo 0 > /proc/sys/vm/nr_hugepages BUG: soft lockup - CPU#16 stuck for 67s! [sh:12883] ... 
Pid: 12883, comm: sh Not tainted 2.6.32-358.el6.x86_64 #1 Call Trace: free_pool_huge_page+0xb8/0xd0 set_max_huge_pages+0x128/0x190 hugetlb_sysctl_handler_common+0x113/0x140 hugetlb_sysctl_handler+0x1e/0x20 proc_sys_call_handler+0x97/0xd0 proc_sys_write+0x14/0x20 vfs_write+0xb8/0x1a0 sys_write+0x51/0x90 __audit_syscall_exit+0x265/0x290 system_call_fastpath+0x16/0x1b I have not confirmed this problem with upstream kernels because I am not able to prepare the machine equipped with 12TB memory now. However I confirmed that the amount of decreasing hugepages was directly proportional to the amount of required time. I measured required times on a smaller machine. It showed 130-145 hugepages decreased in a millisecond. Amount of decreasing Required time Decreasing rate hugepages (msec) (pages/msec) ------------------------------------------------------------ 10,000 pages == 20GB 70 - 74 135-142 30,000 pages == 60GB 208 - 229 131-144 It means decrement of 6TB hugepages will trigger softlockup with the default threshold 20sec, in this decreasing rate. 
Signed-off-by: Masayoshi Mizuma Cc: Joonsoo Kim Cc: Michal Hocko Cc: Wanpeng Li Cc: Aneesh Kumar Cc: KOSAKI Motohiro Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 27938c441b6fe..dd30f22b35e0c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1536,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, while (min_count < persistent_huge_pages(h)) { if (!free_pool_huge_page(h, nodes_allowed, 0)) break; + cond_resched_lock(&hugetlb_lock); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) -- cgit v1.2.3 From 29f175d125f0f3a9503af8a5596f93d714cceb08 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:37:55 -0700 Subject: mm/readahead.c: inline ra_submit Commit f9acc8c7b35a ("readahead: sanify file_ra_state names") left ra_submit with a single function call. Move ra_submit to internal.h and inline it to save some stack. Thanks to Andrew Morton for commenting different versions. 
Signed-off-by: Fabian Frederick Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/internal.h | 15 +++++++++++++++ mm/readahead.c | 21 +++------------------ 3 files changed, 18 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9132faed1a41b..6edcea720ddd0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1875,9 +1875,6 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, - struct file *filp); /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); diff --git a/mm/internal.h b/mm/internal.h index 3e910000fda43..07b67361a40a2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -11,6 +11,7 @@ #ifndef __MM_INTERNAL_H #define __MM_INTERNAL_H +#include #include void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, @@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +extern int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size); + +/* + * Submit IO for the read-ahead request in file_ra_state. + */ +static inline unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, struct file *filp) +{ + return __do_page_cache_readahead(mapping, filp, + ra->start, ra->size, ra->async_size); +} + /* * Turn a non-refcounted page (->_count == 0) into refcounted with * a count of one. 
diff --git a/mm/readahead.c b/mm/readahead.c index 29c5e1af5a0c7..0ca36a7770b1b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -8,9 +8,7 @@ */ #include -#include #include -#include #include #include #include @@ -20,6 +18,8 @@ #include #include +#include "internal.h" + /* * Initialise a struct file's readahead state. Assumes that the caller has * memset *ra to zero. @@ -149,8 +149,7 @@ out: * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -static int -__do_page_cache_readahead(struct address_space *mapping, struct file *filp, +int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size) { @@ -243,20 +242,6 @@ unsigned long max_sane_readahead(unsigned long nr) return min(nr, MAX_READAHEAD); } -/* - * Submit IO for the read-ahead request in file_ra_state. - */ -unsigned long ra_submit(struct file_ra_state *ra, - struct address_space *mapping, struct file *filp) -{ - int actual; - - actual = __do_page_cache_readahead(mapping, filp, - ra->start, ra->size, ra->async_size); - - return actual; -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large -- cgit v1.2.3 From ed12d845b5f528cc0846023862b9c448a36122ec Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 7 Apr 2014 15:37:59 -0700 Subject: mm/page_alloc.c: change mm debug routines back to EXPORT_SYMBOL A new dump_page() routine was recently added, and marked EXPORT_SYMBOL_GPL. dump_page() was also added to the VM_BUG_ON_PAGE() macro, and so the end result is that non-GPL code can no longer call get_page() and a few other routines. This only happens if the kernel was compiled with CONFIG_DEBUG_VM. Change dump_page() to be EXPORT_SYMBOL. Longer explanation: Prior to commit 309381feaee5 ("mm: dump page when hitting a VM_BUG_ON using VM_BUG_ON_PAGE") , it was possible to build MIT-licensed (non-GPL) drivers on Fedora. 
Fedora is semi-unique, in that it sets CONFIG_DEBUG_VM. Because Fedora sets CONFIG_DEBUG_VM, they end up pulling in dump_page(), via VM_BUG_ON_PAGE, via get_page(). As one of the authors of NVIDIA's new, open source, "UVM-Lite" kernel module, I originally chose to use the kernel's get_page() routine from within nvidia_uvm_page_cache.c, because get_page() has always seemed to be very clearly intended for use by non-GPL, driver code. So I'm hoping that making get_page() widely accessible again will not be too controversial. We did check with Fedora first, and they responded (https://bugzilla.redhat.com/show_bug.cgi?id=1074710#c3) that we should try to get upstream changed, before asking Fedora to change. Their reasoning seems beneficial to Linux: leaving CONFIG_DEBUG_VM set allows Fedora to help catch mm bugs. Signed-off-by: John Hubbard Cc: Sasha Levin Cc: Josh Boyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48427a7cfb45f..5dba2933c9c01 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6568,4 +6568,4 @@ void dump_page(struct page *page, const char *reason) { dump_page_badflags(page, reason, 0); } -EXPORT_SYMBOL_GPL(dump_page); +EXPORT_SYMBOL(dump_page); -- cgit v1.2.3 From 6b4525164e247e29f48b3a69e3d35f60fab50ae5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:25 -0700 Subject: mm/zswap.c: fix trivial typo and arrange indentation Signed-off-by: SeongJae Park Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index e55bab9dc41f8..5b2245324715d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -160,14 +160,14 @@ static void zswap_comp_exit(void) * rbnode - links the entry into red-black tree for the appropriate swap 
type * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code - * concurent calls to load, invalidate, and writeback. The lock + * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. * handle - zsmalloc allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression */ struct zswap_entry { struct rb_node rbnode; -- cgit v1.2.3 From 6335b19344cc263724ae49a76ed930b21a659055 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:26 -0700 Subject: mm/zswap.c: update zsmalloc in comment to zbud zswap used zsmalloc before and now using zbud. But, some comments saying it use zsmalloc yet. Fix the trivial problems. Signed-off-by: SeongJae Park Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 5b2245324715d..25312eb373a03 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -165,7 +165,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zsmalloc allocation handle that stores the compressed page data + * handle - zbud allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. 
Needed during * decompression */ @@ -282,7 +282,7 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * Carries out the common pattern of freeing and entry's zbud allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_tree *tree, -- cgit v1.2.3 From 60105e1248f571aa3b895cd63bef072ed9d90c77 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 7 Apr 2014 15:38:27 -0700 Subject: mm/zswap: support multiple swap devices Cai Liu reported that now zbud pool pages counting has a problem when multiple swap is used because it just counts only one swap instead of all of swap so zswap cannot control writeback properly. The result is unnecessary writeback or no writeback when we should really writeback. IOW, it made zswap crazy. Another problem in zswap is: For example, let's assume we use two swap A and B with different priority and A already has charged 19% long time ago and let's assume that A swap is full now so VM start to use B so that B has charged 1% recently. It means zswap charged (19% + 1%) is full by default. Then, if VM want to swap out more pages into B, zbud_reclaim_page would evict one of pages in B's pool and it would be repeated continuously. It's totally LRU reverse problem and swap thrashing in B would happen. This patch makes zswap consider multiple swap by creating *a* zbud pool which will be shared by multiple swap so all of zswap pages in multiple swap keep order by LRU so it can prevent above two problems.
Signed-off-by: Minchan Kim Reported-by: Cai Liu Suggested-by: Weijie Yang Cc: Seth Jennings Reviewed-by: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 64 ++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 25312eb373a03..c0c9b7c80c054 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* zbud_pool is shared by all of zswap backend */ +static struct zbud_pool *zswap_pool; + /********************************* * compression functions **********************************/ @@ -189,7 +192,6 @@ struct zswap_header { struct zswap_tree { struct rb_root rbroot; spinlock_t lock; - struct zbud_pool *pool; }; static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; @@ -285,13 +287,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) * Carries out the common pattern of freeing and entry's zbud allocation, * freeing the entry itself, and decrementing the number of stored pages. 
*/ -static void zswap_free_entry(struct zswap_tree *tree, - struct zswap_entry *entry) +static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(tree->pool, entry->handle); + zbud_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); } /* caller must hold the tree lock */ @@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree, BUG_ON(refcount < 0); if (refcount == 0) { zswap_rb_erase(&tree->rbroot, entry); - zswap_free_entry(tree, entry); + zswap_free_entry(entry); } } @@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) zbud_unmap(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); - BUG_ON(pool != tree->pool); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(tree->pool, 8)) { + if (zbud_reclaim_page(zswap_pool, 8)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | 
__GFP_NOWARN, + ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(tree->pool, handle); + zhdr = zbud_map(zswap_pool, handle); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(tree->pool, handle); + zbud_unmap(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); + zswap_pool_pages = zbud_get_pool_size(zswap_pool); return 0; @@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(tree->pool, entry->handle) + + src = (u8 *)zbud_map(zswap_pool, entry->handle) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(tree->pool, entry->handle); + zbud_unmap(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) - zswap_free_entry(tree, entry); + zswap_free_entry(entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); - - zbud_destroy_pool(tree->pool); kfree(tree); zswap_trees[type] = NULL; } @@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type) struct zswap_tree *tree; tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); - if (!tree) - goto err; - tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); - if (!tree->pool) - goto freetree; + if (!tree) { + 
pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + tree->rbroot = RB_ROOT; spin_lock_init(&tree->lock); zswap_trees[type] = tree; - return; - -freetree: - kfree(tree); -err: - pr_err("alloc failed, zswap disabled for swap type %d\n", type); } static struct frontswap_ops zswap_frontswap_ops = { @@ -907,9 +899,16 @@ static int __init init_zswap(void) return 0; pr_info("loading zswap\n"); + + zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + if (!zswap_pool) { + pr_err("zbud pool creation failed\n"); + goto error; + } + if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); - goto error; + goto cachefail; } if (zswap_comp_init()) { pr_err("compressor initialization failed\n"); @@ -919,6 +918,7 @@ static int __init init_zswap(void) pr_err("per-cpu initialization failed\n"); goto pcpufail; } + frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); @@ -927,6 +927,8 @@ pcpufail: zswap_comp_exit(); compfail: zswap_entry_cache_destory(); +cachefail: + zbud_destroy_pool(zswap_pool); error: return -ENOMEM; } -- cgit v1.2.3 From 5d2d42de185b77b4bc60b5ac77386c4099d71bf3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Apr 2014 15:38:28 -0700 Subject: mm/zswap.c: remove unnecessary parentheses Fix following trivial checkpatch error: ERROR: return is not a function, parentheses are not required Signed-off-by: SeongJae Park Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index c0c9b7c80c054..34b75cc708276 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -204,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache; static int zswap_entry_cache_create(void) { zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); - return (zswap_entry_cache == NULL); + return zswap_entry_cache == NULL; } static void 
zswap_entry_cache_destory(void) @@ -408,8 +408,8 @@ cleanup: **********************************/ static bool zswap_is_full(void) { - return (totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages); + return totalram_pages * zswap_max_pool_percent / 100 < + zswap_pool_pages; } /********************************* -- cgit v1.2.3 From a44cb9449182fd7b25bf5f1cc38b7f19e0b96f6d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:23 -0700 Subject: memcg, slab: never try to merge memcg caches When a kmem cache is created (kmem_cache_create_memcg()), we first try to find a compatible cache that already exists and can handle requests from the new cache, i.e. has the same object size, alignment, ctor, etc. If there is such a cache, we do not create any new caches, instead we simply increment the refcount of the cache found and return it. Currently we do this procedure not only when creating root caches, but also for memcg caches. However, there is no point in that, because, as every memcg cache has exactly the same parameters as its parent and cache merging cannot be turned off in runtime (only on boot by passing "slub_nomerge"), the root caches of any two potentially mergeable memcg caches should be merged already, i.e. it must be the same root cache, and therefore we couldn't even get to the memcg cache creation, because it already exists. The only exception is boot caches - they are explicitly forbidden to be merged by setting their refcount to -1. There are currently only two of them - kmem_cache and kmem_cache_node, which are used in slab internals (I do not count kmalloc caches as their refcount is set to 1 immediately after creation). Since they are prevented from merging preliminary I guess we should avoid to merge their children too. So let's remove the useless code responsible for merging memcg caches. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 21 ++++----------------- mm/slab_common.c | 8 +++++--- mm/slub.c | 19 +++++++++---------- 3 files changed, 18 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 8184a7cde272b..3045316b7c9df 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, struct mem_cgroup; #ifdef CONFIG_SLUB struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)); +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { return NULL; } #endif @@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return (is_root_cache(cachep) && !memcg) || - (cachep->memcg_params->memcg == memcg); -} - static inline void memcg_bind_pages(struct kmem_cache *s, int order) { if (!is_root_cache(s)) @@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return true; } -static inline bool cache_match_memcg(struct kmem_cache *cachep, - struct mem_cgroup *memcg) -{ - return true; -} - static inline void memcg_bind_pages(struct kmem_cache *s, int order) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 1ec3c619ba04b..e77b51eb73479 100644 --- a/mm/slab_common.c +++ 
b/mm/slab_common.c @@ -200,9 +200,11 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); - if (s) - goto out_unlock; + if (!memcg) { + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) + goto out_unlock; + } err = -ENOMEM; s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); diff --git a/mm/slub.c b/mm/slub.c index 5b05e4fe9a1a1..7d81afb270482 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3685,6 +3685,9 @@ static int slab_unmergeable(struct kmem_cache *s) if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) return 1; + if (!is_root_cache(s)) + return 1; + if (s->ctor) return 1; @@ -3697,9 +3700,8 @@ static int slab_unmergeable(struct kmem_cache *s) return 0; } -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(size_t size, size_t align, + unsigned long flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3722,7 +3724,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, continue; if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; + continue; /* * Check if alignment is compatible. 
* Courtesy of Adrian Drzewiecki @@ -3733,21 +3735,18 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, if (s->size - size >= sizeof(void *)) continue; - if (!cache_match_memcg(s, memcg)) - continue; - return s; } return NULL; } struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; /* -- cgit v1.2.3 From 5722d094ad2b56fa2c1cb3adaf40071a55bbf242 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:24 -0700 Subject: memcg, slab: cleanup memcg cache creation This patch cleans up the memcg cache creation path as follows: - Move memcg cache name creation to a separate function to be called from kmem_cache_create_memcg(). This allows us to get rid of the mutex protecting the temporary buffer used for the name formatting, because the whole cache creation path is protected by the slab_mutex. - Get rid of memcg_create_kmem_cache(). This function serves as a proxy to kmem_cache_create_memcg(). After separating the cache name creation path, it would be reduced to a function call, so let's inline it. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++ mm/memcontrol.c | 89 ++++++++++++++++++++-------------------------- mm/slab_common.c | 5 ++- 3 files changed, 52 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96f3fc87ab964..ab7f02884983b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -491,6 +491,9 @@ void __memcg_kmem_commit_charge(struct page *page, void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); + +char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache); int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); @@ -635,6 +638,12 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } +static inline char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + return NULL; +} + static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e33b1d09eb1f4..32c7342df4bf9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3094,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } +char *memcg_create_cache_name(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) +{ + static char *buf = NULL; + + /* + * We need a mutex here to protect the shared buffer. Since this is + * expected to be called only on cache creation, we can employ the + * slab_mutex for that purpose. 
+ */ + lockdep_assert_held(&slab_mutex); + + if (!buf) { + buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!buf) + return NULL; + } + + cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); + return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), buf); +} + int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3298,46 +3321,6 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *s) -{ - struct kmem_cache *new = NULL; - static char *tmp_path = NULL, *tmp_name = NULL; - static DEFINE_MUTEX(mutex); /* protects tmp_name */ - - BUG_ON(!memcg_can_account_kmem(memcg)); - - mutex_lock(&mutex); - /* - * kmem_cache_create_memcg duplicates the given name and - * cgroup_name for this name requires RCU context. - * This static temporary buffer is used to prevent from - * pointless shortliving allocation. 
- */ - if (!tmp_path || !tmp_name) { - if (!tmp_path) - tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (!tmp_name) - tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmp_path || !tmp_name) - goto out; - } - - cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); - snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), tmp_name); - - new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, - (s->flags & ~SLAB_PANIC), s->ctor, s); - if (new) - new->allocflags |= __GFP_KMEMCG; - else - new = s; -out: - mutex_unlock(&mutex); - return new; -} - void kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; @@ -3384,12 +3367,6 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) mutex_unlock(&activate_kmem_mutex); } -struct create_work { - struct mem_cgroup *memcg; - struct kmem_cache *cachep; - struct work_struct work; -}; - static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; @@ -3407,13 +3384,25 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) mutex_unlock(&memcg->slab_caches_mutex); } +struct create_work { + struct mem_cgroup *memcg; + struct kmem_cache *cachep; + struct work_struct work; +}; + static void memcg_create_cache_work_func(struct work_struct *w) { - struct create_work *cw; + struct create_work *cw = container_of(w, struct create_work, work); + struct mem_cgroup *memcg = cw->memcg; + struct kmem_cache *cachep = cw->cachep; + struct kmem_cache *new; - cw = container_of(w, struct create_work, work); - memcg_create_kmem_cache(cw->memcg, cw->cachep); - css_put(&cw->memcg->css); + new = kmem_cache_create_memcg(memcg, cachep->name, + cachep->object_size, cachep->align, + cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep); + if (new) + new->allocflags |= __GFP_KMEMCG; + css_put(&memcg->css); kfree(cw); } diff --git a/mm/slab_common.c b/mm/slab_common.c index e77b51eb73479..11857abf7057c 100644 --- a/mm/slab_common.c +++ 
b/mm/slab_common.c @@ -215,7 +215,10 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, s->align = calculate_alignment(flags, align, size); s->ctor = ctor; - s->name = kstrdup(name, GFP_KERNEL); + if (memcg) + s->name = memcg_create_cache_name(memcg, parent_cache); + else + s->name = kstrdup(name, GFP_KERNEL); if (!s->name) goto out_free_cache; -- cgit v1.2.3 From 794b1248be4e7e157f5535c3ee49168aa4643349 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:26 -0700 Subject: memcg, slab: separate memcg vs root cache creation paths Memcg-awareness turned kmem_cache_create() into a dirty interweaving of memcg-only and except-for-memcg calls. To clean this up, let's move the code responsible for memcg cache creation to a separate function. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 -- include/linux/slab.h | 6 +- mm/memcontrol.c | 7 +- mm/slab_common.c | 187 ++++++++++++++++++++++++++------------------- 4 files changed, 111 insertions(+), 95 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab7f02884983b..02d3072841e96 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -638,12 +638,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return -1; } -static inline char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - return NULL; -} - static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { diff --git a/include/linux/slab.h b/include/linux/slab.h index b5b2df60299e2..3dd389aa91c7c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -115,9 +115,9 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, 
size_t, unsigned long, void (*)(void *)); -struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, - unsigned long, void (*)(void *), struct kmem_cache *); +#ifdef CONFIG_MEMCG_KMEM +void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); +#endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32c7342df4bf9..451523c3bd4eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3395,13 +3395,8 @@ static void memcg_create_cache_work_func(struct work_struct *w) struct create_work *cw = container_of(w, struct create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - struct kmem_cache *new; - new = kmem_cache_create_memcg(memcg, cachep->name, - cachep->object_size, cachep->align, - cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep); - if (new) - new->allocflags |= __GFP_KMEMCG; + kmem_cache_create_memcg(memcg, cachep); css_put(&memcg->css); kfree(cw); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 11857abf7057c..ccc012f001264 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, - size_t size) +static int kmem_cache_sanity_check(const char *name, size_t size) { struct kmem_cache *s = NULL; @@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, } #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) - /* - * For simplicity, we won't check this in the list of memcg - * caches. We have control over memcg naming, and if there - * aren't duplicates in the global list, there won't be any - * duplicates in the memcg lists as well. 
- */ - if (!memcg && !strcmp(s->name, name)) { + if (!strcmp(s->name, name)) { pr_err("%s (%s): Cache name already exists.\n", __func__, name); dump_stack(); @@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, return 0; } #else -static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, - const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, size_t size) { return 0; } @@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static struct kmem_cache * +do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) +{ + struct kmem_cache *s; + int err; + + err = -ENOMEM; + s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); + if (!s) + goto out; + + s->name = name; + s->object_size = object_size; + s->size = size; + s->align = align; + s->ctor = ctor; + + err = memcg_alloc_cache_params(memcg, s, root_cache); + if (err) + goto out_free_cache; + + err = __kmem_cache_create(s, flags); + if (err) + goto out_free_cache; + + s->refcount = 1; + list_add(&s->list, &slab_caches); + memcg_register_cache(s); +out: + if (err) + return ERR_PTR(err); + return s; + +out_free_cache: + memcg_free_cache_params(s); + kfree(s); + goto out; +} /* * kmem_cache_create - Create a cache. @@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. 
*/ - struct kmem_cache * -kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct kmem_cache *parent_cache) +kmem_cache_create(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s = NULL; + struct kmem_cache *s; + char *cache_name; int err; get_online_cpus(); mutex_lock(&slab_mutex); - err = kmem_cache_sanity_check(memcg, name, size); + err = kmem_cache_sanity_check(name, size); if (err) goto out_unlock; - if (memcg) { - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can - * try to create the same cache, but only one of them may - * succeed. Therefore if we get here and see the cache has - * already been created, we silently return NULL. - */ - if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg))) - goto out_unlock; - } - /* * Some allocators will constraint the set of valid flags to a subset * of all flags. 
We expect them to define CACHE_CREATE_MASK in this @@ -200,55 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, */ flags &= CACHE_CREATE_MASK; - if (!memcg) { - s = __kmem_cache_alias(name, size, align, flags, ctor); - if (s) - goto out_unlock; - } - - err = -ENOMEM; - s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); - if (!s) + s = __kmem_cache_alias(name, size, align, flags, ctor); + if (s) goto out_unlock; - s->object_size = s->size = size; - s->align = calculate_alignment(flags, align, size); - s->ctor = ctor; - - if (memcg) - s->name = memcg_create_cache_name(memcg, parent_cache); - else - s->name = kstrdup(name, GFP_KERNEL); - if (!s->name) - goto out_free_cache; - - err = memcg_alloc_cache_params(memcg, s, parent_cache); - if (err) - goto out_free_cache; - - err = __kmem_cache_create(s, flags); - if (err) - goto out_free_cache; + cache_name = kstrdup(name, GFP_KERNEL); + if (!cache_name) { + err = -ENOMEM; + goto out_unlock; + } - s->refcount = 1; - list_add(&s->list, &slab_caches); - memcg_register_cache(s); + s = do_kmem_cache_create(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); + if (IS_ERR(s)) { + err = PTR_ERR(s); + kfree(cache_name); + } out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); if (err) { - /* - * There is no point in flooding logs with warnings or - * especially crashing the system if we fail to create a cache - * for a memcg. In this case we will be accounting the memcg - * allocation to the root cgroup until we succeed to create its - * own cache, but it isn't that critical. - */ - if (!memcg) - return NULL; - if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n", name, err); @@ -260,21 +253,55 @@ out_unlock: return NULL; } return s; - -out_free_cache: - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - goto out_unlock; } +EXPORT_SYMBOL(kmem_cache_create); -struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) +#ifdef CONFIG_MEMCG_KMEM +/* + * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * @memcg: The memory cgroup the new cache is for. + * @root_cache: The parent of the new cache. + * + * This function attempts to create a kmem cache that will serve allocation + * requests going from @memcg to @root_cache. The new cache inherits properties + * from its parent. + */ +void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); + struct kmem_cache *s; + char *cache_name; + + get_online_cpus(); + mutex_lock(&slab_mutex); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. 
+ */ + if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) + goto out_unlock; + + cache_name = memcg_create_cache_name(memcg, root_cache); + if (!cache_name) + goto out_unlock; + + s = do_kmem_cache_create(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); + if (IS_ERR(s)) { + kfree(cache_name); + goto out_unlock; + } + + s->allocflags |= __GFP_KMEMCG; + +out_unlock: + mutex_unlock(&slab_mutex); + put_online_cpus(); } -EXPORT_SYMBOL(kmem_cache_create); +#endif /* CONFIG_MEMCG_KMEM */ void kmem_cache_destroy(struct kmem_cache *s) { -- cgit v1.2.3 From 051dd46050f2a9bdfff8cc067f8987069eae1743 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:27 -0700 Subject: memcg, slab: unregister cache from memcg before starting to destroy it Currently, memcg_unregister_cache(), which deletes the cache being destroyed from the memcg_slab_caches list, is called after __kmem_cache_shutdown() (see kmem_cache_destroy()), which starts to destroy the cache. As a result, one can access a partially destroyed cache while traversing a memcg_slab_caches list, which can have deadly consequences (for instance, cache_show() called for each cache on a memcg_slab_caches list from mem_cgroup_slabinfo_read() will dereference pointers to already freed data). To fix this, let's move memcg_unregister_cache() before the cache destruction process beginning, issuing memcg_register_cache() on failure. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++------ mm/slab_common.c | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 451523c3bd4eb..c22d8bf42d9a4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3140,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, s->memcg_params->root_cache = root_cache; INIT_WORK(&s->memcg_params->destroy, kmem_cache_destroy_work_func); + css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3148,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, void memcg_free_cache_params(struct kmem_cache *s) { + if (!s->memcg_params) + return; + if (!s->memcg_params->is_root_cache) + css_put(&s->memcg_params->memcg->css); kfree(s->memcg_params); } @@ -3170,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s) memcg = s->memcg_params->memcg; id = memcg_cache_id(memcg); - css_get(&memcg->css); - - /* * Since readers won't lock (see cache_from_memcg_idx()), we need a * barrier here to ensure nobody will see the kmem_cache partially @@ -3221,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s) * after removing it from the memcg_slab_caches list, otherwise we can * fail to convert memcg_params_to_cache() while traversing the list. 
*/ - VM_BUG_ON(!root->memcg_params->memcg_caches[id]); + VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); root->memcg_params->memcg_caches[id] = NULL; - - css_put(&memcg->css); } /* diff --git a/mm/slab_common.c b/mm/slab_common.c index ccc012f001264..0c2879ff414c4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -313,9 +313,9 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + memcg_unregister_cache(s); if (!__kmem_cache_shutdown(s)) { - memcg_unregister_cache(s); mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); @@ -325,6 +325,7 @@ void kmem_cache_destroy(struct kmem_cache *s) kmem_cache_free(kmem_cache, s); } else { list_add(&s->list, &slab_caches); + memcg_register_cache(s); mutex_unlock(&slab_mutex); printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", s->name); -- cgit v1.2.3 From b8529907ba35d625fa4b85d3e4dc8021be97c1f3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:28 -0700 Subject: memcg, slab: do not destroy children caches if parent has aliases Currently we destroy children caches at the very beginning of kmem_cache_destroy(). This is wrong, because the root cache will not necessarily be destroyed in the end - if it has aliases (refcount > 0), kmem_cache_destroy() will simply decrement its refcount and return. In this case, at best we will get a bunch of warnings in dmesg, like this one: kmem_cache_destroy kmalloc-32:0: Slab cache still has objects CPU: 1 PID: 7139 Comm: modprobe Tainted: G B W 3.13.0+ #117 Call Trace: dump_stack+0x49/0x5b kmem_cache_destroy+0xdf/0xf0 kmem_cache_destroy_memcg_children+0x97/0xc0 kmem_cache_destroy+0xf/0xf0 xfs_mru_cache_uninit+0x21/0x30 [xfs] exit_xfs_fs+0x2e/0xc44 [xfs] SyS_delete_module+0x198/0x1f0 system_call_fastpath+0x16/0x1b At worst - if kmem_cache_destroy() will race with an allocation from a memcg cache - the kernel will panic. 
This patch fixes this by moving the destruction of child caches after the check for whether the cache has aliases. In addition, it forbids destroying a root cache if it still has child caches, because each child cache keeps a reference to its parent. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +--- mm/memcontrol.c | 13 ++++---- mm/slab_common.c | 75 ++++++++++++++++++++++++++++++---------------- 3 files changed, 57 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 02d3072841e96..b569b8be5c5ac 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -507,7 +507,7 @@ struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); void mem_cgroup_destroy_cache(struct kmem_cache *cachep); -void kmem_cache_destroy_memcg_children(struct kmem_cache *s); +int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
@@ -661,10 +661,6 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { return cachep; } - -static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) -{ -} #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c22d8bf42d9a4..29501f0405688 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3321,15 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -void kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; - int i; - - if (!s->memcg_params) - return; - if (!s->memcg_params->is_root_cache) - return; + int i, failed = 0; /* * If the cache is being destroyed, we trust that there is no one else @@ -3363,8 +3358,12 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) c->memcg_params->dead = false; cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); + + if (cache_from_memcg_idx(s, i)) + failed++; } mutex_unlock(&activate_kmem_mutex); + return failed; } static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) diff --git a/mm/slab_common.c b/mm/slab_common.c index 0c2879ff414c4..f3cfccf76dda6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -301,39 +301,64 @@ out_unlock: mutex_unlock(&slab_mutex); put_online_cpus(); } + +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + int rc; + + if (!s->memcg_params || + !s->memcg_params->is_root_cache) + return 0; + + mutex_unlock(&slab_mutex); + rc = __kmem_cache_destroy_memcg_children(s); + mutex_lock(&slab_mutex); + + return rc; +} +#else +static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_MEMCG_KMEM */ void kmem_cache_destroy(struct kmem_cache *s) { - /* Destroy all the children caches if we aren't a memcg cache */ - kmem_cache_destroy_memcg_children(s); - 
get_online_cpus(); mutex_lock(&slab_mutex); + s->refcount--; - if (!s->refcount) { - list_del(&s->list); - memcg_unregister_cache(s); - - if (!__kmem_cache_shutdown(s)) { - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); - kfree(s->name); - kmem_cache_free(kmem_cache, s); - } else { - list_add(&s->list, &slab_caches); - memcg_register_cache(s); - mutex_unlock(&slab_mutex); - printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", - s->name); - dump_stack(); - } - } else { - mutex_unlock(&slab_mutex); + if (s->refcount) + goto out_unlock; + + if (kmem_cache_destroy_memcg_children(s) != 0) + goto out_unlock; + + list_del(&s->list); + memcg_unregister_cache(s); + + if (__kmem_cache_shutdown(s) != 0) { + list_add(&s->list, &slab_caches); + memcg_register_cache(s); + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + goto out_unlock; } + + mutex_unlock(&slab_mutex); + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); + + memcg_free_cache_params(s); + kfree(s->name); + kmem_cache_free(kmem_cache, s); + goto out_put_cpus; + +out_unlock: + mutex_unlock(&slab_mutex); +out_put_cpus: put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 84d0ddd6b0e3187d85e609c2d10c36089cf0be04 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:29 -0700 Subject: slub: adjust memcg caches when creating cache alias Otherwise, kzalloc() called from a memcg won't clear the whole object. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 7d81afb270482..33939e72bc372 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3748,7 +3748,11 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s = find_mergeable(size, align, flags, name, ctor); if (s) { + int i; + struct kmem_cache *c; + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. @@ -3756,6 +3760,15 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); + if (!c) + continue; + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; s = NULL; -- cgit v1.2.3 From 9a41707bd3a0811919000daf094e9d50ea65f7da Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 7 Apr 2014 15:39:31 -0700 Subject: slub: rework sysfs layout for memcg caches Currently, we try to arrange sysfs entries for memcg caches in the same manner as for global caches. Apart from turning /sys/kernel/slab into a mess when there are a lot of kmem-active memcgs created, it actually does not work properly - we won't create more than one link to a memcg cache in case its parent is merged with another cache. For instance, if A is a root cache merged with another root cache B, we will have the following sysfs setup: X A -> X B -> X where X is some unique id (see create_unique_id()). 
Now if memcgs M and N start to allocate from cache A (or B, which is the same), we will get: X X:M X:N A -> X B -> X A:M -> X:M A:N -> X:N Since B is an alias for A, we won't get entries B:M and B:N, which is confusing. It is more logical to have entries for memcg caches under the corresponding root cache's sysfs directory. This would allow us to keep the sysfs layout clean and avoid inconsistencies like the one described above. This patch does the trick. It creates a "cgroup" kset in each root cache kobject to keep its child caches there. Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Cc: David Rientjes Cc: Pekka Enberg Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 3 +++ mm/slub.c | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f56bfa9e4526f..f2f7398848cfe 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -87,6 +87,9 @@ struct kmem_cache { #ifdef CONFIG_MEMCG_KMEM struct memcg_cache_params *memcg_params; int max_attr_size; /* for propagation, maximum size of a stored attr */ +#ifdef CONFIG_SYSFS + struct kset *memcg_kset; +#endif #endif #ifdef CONFIG_NUMA diff --git a/mm/slub.c b/mm/slub.c index 33939e72bc372..3508edec19f9c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5138,6 +5138,15 @@ static const struct kset_uevent_ops slab_uevent_ops = { static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params->root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5203,7 +5212,7 @@ static int sysfs_slab_add(struct kmem_cache *s) name = create_unique_id(s); } - s->kobj.kset = slab_kset; + s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj,
&slab_ktype, NULL, "%s", name); if (err) { kobject_put(&s->kobj); @@ -5216,6 +5225,18 @@ static int sysfs_slab_add(struct kmem_cache *s) kobject_put(&s->kobj); return err; } + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); + return -ENOMEM; + } + } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -5234,6 +5255,9 @@ static void sysfs_slab_remove(struct kmem_cache *s) */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); -- cgit v1.2.3 From 54b6a731025f9528d44945a72b1f4e5946bb2d80 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 7 Apr 2014 15:39:32 -0700 Subject: slub: fix leak of 'name' in sysfs_slab_add The failure paths of sysfs_slab_add don't release the allocation of 'name' made by create_unique_id() a few lines above the context of the diff below. Create a common exit path to make it more obvious what needs freeing. 
[vdavydov@parallels.com: free the name only if !unmergeable] Signed-off-by: Dave Jones Signed-off-by: Vladimir Davydov Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 3508edec19f9c..e7451861f95d5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5214,25 +5214,19 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); - if (err) { - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_put_kobj; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; - } + if (err) + goto out_del_kobj; #ifdef CONFIG_MEMCG_KMEM if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return -ENOMEM; + err = -ENOMEM; + goto out_del_kobj; } } #endif @@ -5241,9 +5235,16 @@ static int sysfs_slab_add(struct kmem_cache *s) if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); +out_put_kobj: + kobject_put(&s->kobj); + goto out; } static void sysfs_slab_remove(struct kmem_cache *s) -- cgit v1.2.3 From 88da03a67674bcd6e9ecf18a0a182cf1303056ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:42 -0700 Subject: slub: use raw_cpu_inc for incrementing statistics Statistics are not critical to the operation of the allocation but should also not cause too much overhead. When __this_cpu_inc is altered to check if preemption is disabled this triggers. Use raw_cpu_inc to avoid the checks. 
Using this_cpu_ops may cause interrupt disable/enable sequences on various arches which may significantly impact allocator performance. [akpm@linux-foundation.org: add comment] Signed-off-by: Christoph Lameter Cc: Fengguang Wu Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e7451861f95d5..f620bbf4054aa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } -- cgit v1.2.3 From 9e5c33d7aeeef62e5fa7e74f94432685bd03026b Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Mon, 7 Apr 2014 15:39:48 -0700 Subject: mm: create generic early_ioremap() support This patch creates a generic implementation of early_ioremap() support based on the existing x86 implementation. early_ioremap() is useful for early boot code which needs to temporarily map I/O or memory regions before normal mapping functions such as ioremap() are available. Some architectures have an optional MMU. In the no-MMU case, the remap functions simply return the passed-in physical address and the unmap functions do nothing. Signed-off-by: Mark Salter Acked-by: Catalin Marinas Acked-by: H.
Peter Anvin Cc: Borislav Petkov Cc: Dave Young Cc: Will Deacon Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/early_ioremap.h | 42 +++++++ mm/Kconfig | 3 + mm/Makefile | 1 + mm/early_ioremap.c | 245 ++++++++++++++++++++++++++++++++++++ 4 files changed, 291 insertions(+) create mode 100644 include/asm-generic/early_ioremap.h create mode 100644 mm/early_ioremap.c (limited to 'mm') diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h new file mode 100644 index 0000000000000..a5de55c04fb2e --- /dev/null +++ b/include/asm-generic/early_ioremap.h @@ -0,0 +1,42 @@ +#ifndef _ASM_EARLY_IOREMAP_H_ +#define _ASM_EARLY_IOREMAP_H_ + +#include + +/* + * early_ioremap() and early_iounmap() are for temporary early boot-time + * mappings, before the real ioremap() is functional. + */ +extern void __iomem *early_ioremap(resource_size_t phys_addr, + unsigned long size); +extern void *early_memremap(resource_size_t phys_addr, + unsigned long size); +extern void early_iounmap(void __iomem *addr, unsigned long size); +extern void early_memunmap(void *addr, unsigned long size); + +/* + * Weak function called by early_ioremap_reset(). It does nothing, but + * architectures may provide their own version to do any needed cleanups. + */ +extern void early_ioremap_shutdown(void); + +#if defined(CONFIG_GENERIC_EARLY_IOREMAP) && defined(CONFIG_MMU) +/* Arch-specific initialization */ +extern void early_ioremap_init(void); + +/* Generic initialization called by architecture code */ +extern void early_ioremap_setup(void); + +/* + * Called as last step in paging_init() so library can act + * accordingly for subsequent map/unmap requests. 
+ */ +extern void early_ioremap_reset(void); + +#else +static inline void early_ioremap_init(void) { } +static inline void early_ioremap_setup(void) { } +static inline void early_ioremap_reset(void) { } +#endif + +#endif /* _ASM_EARLY_IOREMAP_H_ */ diff --git a/mm/Kconfig b/mm/Kconfig index 37fbe1ef52397..ebe5880c29d6c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -578,3 +578,6 @@ config PGTABLE_MAPPING You can check speed with zsmalloc benchmark: https://github.com/spartacus06/zsmapbench + +config GENERIC_EARLY_IOREMAP + bool diff --git a/mm/Makefile b/mm/Makefile index 23a6f7e230194..9e5aaf92197d3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -61,3 +61,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c new file mode 100644 index 0000000000000..e10ccd299d666 --- /dev/null +++ b/mm/early_ioremap.c @@ -0,0 +1,245 @@ +/* + * Provide common bits of early_ioremap() support for architectures needing + * temporary mappings during boot before ioremap() is available. + * + * This is mostly a direct copy of the x86 early_ioremap implementation. + * + * (C) Copyright 1995 1996, 2014 Linus Torvalds + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_MMU +static int early_ioremap_debug __initdata; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static int after_paging_init __initdata; + +void __init __weak early_ioremap_shutdown(void) +{ +} + +void __init early_ioremap_reset(void) +{ + early_ioremap_shutdown(); + after_paging_init = 1; +} + +/* + * Generally, ioremap() is available after paging_init() has been called. 
+ * Architectures wanting to allow early_ioremap after paging_init() can + * define __late_set_fixmap and __late_clear_fixmap to do the right thing. + */ +#ifndef __late_set_fixmap +static inline void __init __late_set_fixmap(enum fixed_addresses idx, + phys_addr_t phys, pgprot_t prot) +{ + BUG(); +} +#endif + +#ifndef __late_clear_fixmap +static inline void __init __late_clear_fixmap(enum fixed_addresses idx) +{ + BUG(); +} +#endif + +static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; + +void __init early_ioremap_setup(void) +{ + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (WARN_ON(prev_map[i])) + break; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i); +} + +static int __init check_early_ioremap_leak(void) +{ + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (WARN(count, KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n" + "please boot with early_ioremap_debug and report the dmesg.\n", + count)) + return 1; + return 0; +} +late_initcall(check_early_ioremap_leak); + +static void __init __iomem * +__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) +{ + unsigned long offset; + resource_size_t last_addr; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + WARN_ON(system_state != SYSTEM_BOOTING); + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n", + __func__, (u64)phys_addr, size)) + return NULL; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (WARN_ON(!size || last_addr < phys_addr)) + return NULL; + + prev_size[slot] = size; + /* + * Mappings have to be page-aligned + */ + offset = 
phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (WARN_ON(nrpages > NR_FIX_BTMAPS)) + return NULL; + + /* + * Ok, go for it.. + */ + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_set_fixmap(idx, phys_addr, prot); + else + __early_set_fixmap(idx, phys_addr, prot); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n", + __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]); + + prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]); + return prev_map[slot]; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } + + if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n", + addr, size)) + return; + + if (WARN(prev_size[slot] != size, + "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot])) + return; + + WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n", + addr, size, slot); + + virt_addr = (unsigned long)addr; + if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) + return; + + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; + while (nrpages > 0) { + if (after_paging_init) + __late_clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR); + --idx; + --nrpages; + } + prev_map[slot] = NULL; +} + +/* Remap an IO device */ +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO); +} + +/* 
Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, + FIXMAP_PAGE_NORMAL); +} +#else /* CONFIG_MMU */ + +void __init __iomem * +early_ioremap(resource_size_t phys_addr, unsigned long size) +{ + return (__force void __iomem *)phys_addr; +} + +/* Remap memory */ +void __init * +early_memremap(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} + +void __init early_iounmap(void __iomem *addr, unsigned long size) +{ +} + +#endif /* CONFIG_MMU */ + + +void __init early_memunmap(void *addr, unsigned long size) +{ + early_iounmap((__force void __iomem *)addr, size); +} -- cgit v1.2.3