From bc2791f857e1984b7548d2a2de2ffb1a913dee62 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 22 Feb 2017 15:41:21 -0800 Subject: slab: link memcg kmem_caches on their associated memory cgroup With kmem cgroup support enabled, kmem_caches can be created and destroyed frequently and a great number of near empty kmem_caches can accumulate if there are a lot of transient cgroups and the system is not under memory pressure. When memory reclaim starts under such conditions, it can lead to consecutive deactivation and destruction of many kmem_caches, easily hundreds of thousands on moderately large systems, exposing scalability issues in the current slab management code. This is one of the patches to address the issue. While a memcg kmem_cache is listed on its root cache's ->children list, there is no direct way to iterate all kmem_caches which are assocaited with a memory cgroup. The only way to iterate them is walking all caches while filtering out caches which don't match, which would be most of them. This makes memcg destruction operations O(N^2) where N is the total number of slab caches which can be huge. This combined with the synchronous RCU operations can tie up a CPU and affect the whole machine for many hours when memory reclaim triggers offlining and destruction of the stale memcgs. This patch adds mem_cgroup->kmem_caches list which goes through memcg_cache_params->kmem_caches_node of all kmem_caches which are associated with the memcg. All memcg specific iterations, including stat file access, are updated to use the new list instead. Link: http://lkml.kernel.org/r/20170117235411.9408-6-tj@kernel.org Signed-off-by: Tejun Heo Reported-by: Jay Vana Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 ++++--- mm/slab.h | 3 +++ mm/slab_common.c | 36 +++++++++++++++++++++++++++++------- 3 files changed, 36 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b822e158b319e..834d641dfa8c9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2837,6 +2837,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; + INIT_LIST_HEAD(&memcg->kmem_caches); return 0; } @@ -4002,9 +4003,9 @@ static struct cftype mem_cgroup_legacy_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_start = slab_start, - .seq_next = slab_next, - .seq_stop = slab_stop, + .seq_start = memcg_slab_start, + .seq_next = memcg_slab_next, + .seq_stop = memcg_slab_stop, .seq_show = memcg_slab_show, }, #endif diff --git a/mm/slab.h b/mm/slab.h index 3ed3336883ed1..a08f01016a3f8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -494,6 +494,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +void *memcg_slab_start(struct seq_file *m, loff_t *pos); +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); +void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); diff --git a/mm/slab_common.c b/mm/slab_common.c index c3885032dbce5..c3bbeddaeaaf8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -154,6 +154,7 @@ static int init_memcg_params(struct kmem_cache *s, s->memcg_params.root_cache = root_cache; s->memcg_params.memcg = memcg; INIT_LIST_HEAD(&s->memcg_params.children_node); + INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node); return 0; } @@ -224,6 +225,7 @@ int memcg_update_all_caches(int num_memcgs) static void unlink_memcg_cache(struct kmem_cache *s) { list_del(&s->memcg_params.children_node); + list_del(&s->memcg_params.kmem_caches_node); } #else static inline int init_memcg_params(struct kmem_cache *s, @@ -596,6 +598,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, list_add(&s->memcg_params.children_node, &root_cache->memcg_params.children); + list_add(&s->memcg_params.kmem_caches_node, &memcg->kmem_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -651,9 +654,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) get_online_mems(); mutex_lock(&slab_mutex); - list_for_each_entry_safe(s, s2, &slab_caches, list) { - if (is_root_cache(s) || s->memcg_params.memcg != memcg) - continue; + list_for_each_entry_safe(s, s2, &memcg->kmem_caches, + memcg_params.kmem_caches_node) { /* * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. @@ -1201,15 +1203,35 @@ static int slab_show(struct seq_file *m, void *p) } #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +void *memcg_slab_start(struct seq_file *m, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + mutex_lock(&slab_mutex); + return seq_list_start(&memcg->kmem_caches, *pos); +} + +void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + return seq_list_next(p, &memcg->kmem_caches, pos); +} + +void memcg_slab_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&slab_mutex); +} + int memcg_slab_show(struct seq_file *m, void *p) { - struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct kmem_cache *s = list_entry(p, struct kmem_cache, + memcg_params.kmem_caches_node); struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - if (p == slab_caches.next) + if (p == memcg->kmem_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s) && s->memcg_params.memcg == memcg) - cache_show(s, m); + cache_show(s, m); return 0; } #endif -- cgit v1.2.3