author    Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>  2017-11-29 12:15:12 +1100
committer Stephen Rothwell <sfr@canb.auug.org.au>            2017-11-29 12:15:12 +1100
commit    07a5f0f345ef23d5947b83e567c683a985705411 (patch)
tree      e7e82dab25679a067661341f1a08416442fff3b0
parent    16fc3a5818fd582c7aa8a300ccda90329fa191f6 (diff)
mm,oom: move last second allocation to inside the OOM killer
Since selecting an OOM victim can take quite some time, and the OOM situation might be resolved in the meantime, a last-second allocation attempt made after a victim has been selected can sometimes succeed. This patch therefore moves the last-second allocation attempt to after OOM victim selection, which is expected to considerably reduce the window for potentially premature OOM kills.

Link: http://lkml.kernel.org/r/1511607169-5084-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Suggested-by: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Manish Jaggi <mjaggi@caviumnetworks.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
 include/linux/oom.h | 13
 mm/oom_kill.c       | 14
 mm/page_alloc.c     | 44
 3 files changed, 53 insertions(+), 18 deletions(-)
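To make the change in ordering concrete, here is an illustrative before/after sketch (simplified pseudocode, not the exact kernel code; try_high_watermark_alloc() is a hypothetical stand-in for the open-coded get_page_from_freelist() retry that the patch removes):

	/* Before: the retry runs prior to victim selection, so memory freed
	 * during the (potentially long) selection is never noticed. */
	page = try_high_watermark_alloc();
	if (page)
		goto out;
	out_of_memory(&oc);		/* selects a victim and kills it */

	/* After: the retry runs inside out_of_memory(), between selecting
	 * a victim and actually killing it, shrinking the race window. */
	select_bad_process(oc);
	oc->page = alloc_pages_before_oomkill(oc);
	if (oc->page)
		return true;		/* memory appeared; no kill needed */
	oom_kill_process(oc, ...);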
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..27cd36b762b5 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -14,6 +14,8 @@ struct zonelist;
struct notifier_block;
struct mem_cgroup;
struct task_struct;
+struct alloc_context;
+struct page;
/*
* Details of the page allocation that triggered the oom killer that are used to
@@ -38,6 +40,15 @@ struct oom_control {
*/
const int order;
+ /* Context for really last second allocation attempt. */
+ const struct alloc_context *ac;
+ /*
+ * Set by the OOM killer if ac != NULL and last second allocation
+ * attempt succeeded. If ac != NULL, the caller must check for
+ * page != NULL.
+ */
+ struct page *page;
+
/* Used by oom implementation, do not set */
unsigned long totalpages;
struct task_struct *chosen;
@@ -102,6 +113,8 @@ extern void oom_killer_enable(void);
extern struct task_struct *find_lock_task_mm(struct task_struct *p);
+extern struct page *alloc_pages_before_oomkill(const struct oom_control *oc);
+
/* sysctls */
extern int sysctl_oom_dump_tasks;
extern int sysctl_oom_kill_allocating_task;
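Taken together, the contract of the two new fields looks like this from the caller's side (a minimal sketch condensed from the __alloc_pages_may_oom() hunk further down; error handling omitted):

	struct oom_control oc = {
		.zonelist = ac->zonelist,
		.nodemask = ac->nodemask,
		.gfp_mask = gfp_mask,
		.order    = order,
		.ac       = ac,	/* non-NULL opts in to the in-killer retry */
	};

	if (out_of_memory(&oc) && oc.page)	/* must check .page when .ac != NULL */
		return oc.page;			/* last-second attempt succeeded */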
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c957be32b27a..348ec5a29103 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1061,6 +1061,9 @@ bool out_of_memory(struct oom_control *oc)
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ oc->page = alloc_pages_before_oomkill(oc);
+ if (oc->page)
+ return true;
get_task_struct(current);
oc->chosen = current;
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
@@ -1068,6 +1071,17 @@ bool out_of_memory(struct oom_control *oc)
}
select_bad_process(oc);
+ /*
+ * Try really last second allocation attempt after we selected an OOM
+ * victim, for somebody might have managed to free memory while we were
+ * selecting an OOM victim which can take quite some time.
+ */
+ oc->page = alloc_pages_before_oomkill(oc);
+ if (oc->page) {
+ if (oc->chosen && oc->chosen != (void *)-1UL)
+ put_task_struct(oc->chosen);
+ return true;
+ }
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
dump_header(oc, NULL);
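One subtlety in the second hunk above: after select_bad_process(), oc->chosen is not always a task pointer. Here are the same added lines with the cleanup convention spelled out in comments (the (void *)-1UL sentinel is set by oom_evaluate_task() elsewhere in mm/oom_kill.c):

	oc->page = alloc_pages_before_oomkill(oc);
	if (oc->page) {
		/*
		 * oc->chosen is either NULL (no eligible victim found),
		 * (void *)-1UL (abort: a victim is already exiting, no
		 * reference taken), or a task pointer holding a reference
		 * from get_task_struct(). Only a real task pointer needs
		 * its reference dropped before returning with the page.
		 */
		if (oc->chosen && oc->chosen != (void *)-1UL)
			put_task_struct(oc->chosen);
		return true;
	}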
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cb5cd5ab35d9..15e681203257 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3325,8 +3325,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
.memcg = NULL,
.gfp_mask = gfp_mask,
.order = order,
+ .ac = ac,
};
- struct page *page;
+ struct page *page = NULL;
*did_some_progress = 0;
@@ -3340,19 +3341,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
return NULL;
}
- /*
- * Go through the zonelist yet one more time, keep very high watermark
- * here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure. But make sure that this reclaim
- * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
- * allocation which will never fail due to oom_lock already held.
- */
- page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
- ~__GFP_DIRECT_RECLAIM, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
- if (page)
- goto out;
-
/* Coredumps can quickly deplete all memory reserves */
if (current->flags & PF_DUMPCORE)
goto out;
@@ -3387,16 +3375,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ if (out_of_memory(&oc)) {
+ *did_some_progress = 1;
+ page = oc.page;
+ } else if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
/*
* Help non-failing allocations by giving them access to memory
* reserves
*/
- if (gfp_mask & __GFP_NOFAIL)
- page = __alloc_pages_cpuset_fallback(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
+ page = __alloc_pages_cpuset_fallback(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
}
out:
mutex_unlock(&oom_lock);
@@ -4156,6 +4146,24 @@ got_pg:
return page;
}
+struct page *alloc_pages_before_oomkill(const struct oom_control *oc)
+{
+ /*
+ * Go through the zonelist yet one more time, keep very high watermark
+ * here, this is only to catch a parallel oom killing, we must fail if
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
+ */
+ int alloc_flags = ALLOC_CPUSET | ALLOC_WMARK_HIGH;
+ gfp_t gfp_mask = oc->gfp_mask | __GFP_HARDWALL;
+
+ if (!oc->ac)
+ return NULL;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+ return get_page_from_freelist(gfp_mask, oc->order, alloc_flags, oc->ac);
+}
+
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
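As a final note on alloc_pages_before_oomkill(): the gfp_mask manipulation is what makes the retry safe to run under oom_lock. A standalone illustration (last_second_gfp() is a hypothetical helper name, not part of the patch):

	/*
	 * Hypothetical helper, for illustration only: the last-second
	 * attempt must not recurse into direct reclaim while oom_lock is
	 * held, so __GFP_DIRECT_RECLAIM is masked out and only the
	 * high-watermark freelist check in get_page_from_freelist() runs.
	 */
	static inline gfp_t last_second_gfp(gfp_t gfp_mask)
	{
		return (gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM;
	}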