From ac924c6034d9095f95ee889f7e31bbb9145da0c2 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 15 May 2006 09:43:59 -0700
Subject: [PATCH] setup_per_zone_pages_min() overflow fix

As pointed out in http://bugzilla.kernel.org/show_bug.cgi?id=6490, this
function can experience overflows on 32-bit machines, causing our response
to changed values of min_free_kbytes to go whacky.

Fixing it efficiently is all too hard, so fix it with 64-bit math instead.

Cc: Ake Sandgren
Cc: Martin Bligh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea77c999047e..813b4ec1298a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -39,6 +39,7 @@
 #include <linux/mempolicy.h>
 
 #include <asm/tlbflush.h>
+#include <asm/div64.h>
 #include "internal.h"
 
 /*
@@ -2566,9 +2567,11 @@ void setup_per_zone_pages_min(void)
        }
 
        for_each_zone(zone) {
-               unsigned long tmp;
+               u64 tmp;
+
                spin_lock_irqsave(&zone->lru_lock, flags);
-               tmp = (pages_min * zone->present_pages) / lowmem_pages;
+               tmp = (u64)pages_min * zone->present_pages;
+               do_div(tmp, lowmem_pages);
                if (is_highmem(zone)) {
                        /*
                         * __GFP_HIGH and PF_MEMALLOC allocations usually don't
@@ -2595,8 +2598,8 @@ void setup_per_zone_pages_min(void)
                        zone->pages_min = tmp;
                }
 
-               zone->pages_low = zone->pages_min + tmp / 4;
-               zone->pages_high = zone->pages_min + tmp / 2;
+               zone->pages_low = zone->pages_min + (tmp >> 2);
+               zone->pages_high = zone->pages_min + (tmp >> 1);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
 
--
cgit v1.2.3
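For readers following the arithmetic rather than the diff: the product
pages_min * zone->present_pages is computed in an unsigned long, which is
only 32 bits wide on the affected machines, so it can wrap before the
division by lowmem_pages ever happens. A minimal userspace sketch of the
same failure and of the 64-bit fix (the page counts below are made-up
illustrations, not values from the bug report):

/* Sketch of the 32-bit overflow fixed above; all values are hypothetical. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t pages_min     = 16384;     /* total pages_min target */
        uint32_t present_pages = 1 << 20;   /* a 4GB zone of 4K pages */
        uint32_t lowmem_pages  = 1 << 18;   /* 1GB of lowmem */

        /* 32-bit math: the product is 2^34 and wraps to 0 before dividing. */
        uint32_t bad = (pages_min * present_pages) / lowmem_pages;

        /* 64-bit math, mirroring the (u64) cast plus do_div() in the patch. */
        uint64_t tmp = (uint64_t)pages_min * present_pages;
        uint32_t good = (uint32_t)(tmp / lowmem_pages);

        printf("32-bit result: %u, 64-bit result: %u\n", bad, good);
        return 0;
}

The truncated result (0 here, versus 65536 with 64-bit math) is what made
the response to min_free_kbytes changes "go whacky": the per-zone watermarks
derived from it bear no relation to the intended pages_min.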
From 39d24e64263cd3211705d3b61ea4171c65030921 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Mon, 15 May 2006 09:44:13 -0700
Subject: [PATCH] add slab_is_available() routine for boot code

slab_is_available() indicates slab based allocators are available for use.
SPARSEMEM code needs to know this as it can be called at various times
during the boot process.

Signed-off-by: Mike Kravetz
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/slab.h | 1 +
 mm/slab.c            | 8 ++++++++
 mm/sparse.c          | 2 +-
 3 files changed, 10 insertions(+), 1 deletion(-)
(limited to 'mm')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 3af03b19c983..2d985d59c7b8 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -150,6 +150,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 
 extern void kfree(const void *);
 extern unsigned int ksize(const void *);
+extern int slab_is_available(void);
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node);
diff --git a/mm/slab.c b/mm/slab.c
index c32af7e7581e..b1d643b5238d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -700,6 +700,14 @@ static enum {
        FULL
 } g_cpucache_up;
 
+/*
+ * used by boot code to determine if it can use slab based allocator
+ */
+int slab_is_available(void)
+{
+       return g_cpucache_up == FULL;
+}
+
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
diff --git a/mm/sparse.c b/mm/sparse.c
index d7c32de99ee8..c5e89eb9ac8f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -32,7 +32,7 @@ static struct mem_section *sparse_index_alloc(int nid)
        unsigned long array_size = SECTIONS_PER_ROOT *
                                   sizeof(struct mem_section);
 
-       if (system_state == SYSTEM_RUNNING)
+       if (slab_is_available())
                section = kmalloc_node(array_size, GFP_KERNEL, nid);
        else
                section = alloc_bootmem_node(NODE_DATA(nid), array_size);
--
cgit v1.2.3
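The new helper is just a readiness flag for early boot code that has to pick
between the slab allocator and the bootmem allocator. A condensed userspace
model of that pattern follows; the real g_cpucache_up goes through additional
PARTIAL_* states, and the function names here are stand-ins for
kmalloc_node()/alloc_bootmem_node():

/* Condensed model of "is the slab allocator ready yet?"; names are stand-ins. */
#include <stdio.h>

static enum { NOT_READY, READY } allocator_up = NOT_READY;

static int slab_is_available_model(void)
{
        return allocator_up == READY;
}

static void alloc_section_array(void)
{
        if (slab_is_available_model())
                printf("use the slab path (kmalloc_node in the real code)\n");
        else
                printf("use the bootmem path (alloc_bootmem_node in the real code)\n");
}

int main(void)
{
        alloc_section_array();   /* early in boot: slab is not up yet */
        allocator_up = READY;    /* slab bootstrap has finished */
        alloc_section_array();   /* later callers can use slab freely */
        return 0;
}

Checking system_state == SYSTEM_RUNNING, as the old sparse.c code did, is a
poor proxy: slab becomes usable well before system_state is set to
SYSTEM_RUNNING, so asking the allocator itself is more accurate.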
From a4523a8b38089478f93bc053c31f678c63f5ee1b Mon Sep 17 00:00:00 2001
From: Roland Dreier
Date: Mon, 15 May 2006 11:41:00 -0700
Subject: [PATCH] slab: Fix kmem_cache_destroy() on NUMA

With CONFIG_NUMA set, kmem_cache_destroy() may fail and say "Can't free all
objects." The problem is caused by sequences such as the following (suppose
we are on a NUMA machine with two nodes, 0 and 1):

 * Allocate an object from cache on node 0.

 * Free the object on node 1. The object is put into node 1's alien
   array_cache for node 0.

 * Call kmem_cache_destroy(), which ultimately ends up in __cache_shrink().

 * __cache_shrink() does drain_cpu_caches(), which loops through all nodes.
   For each node it drains the shared array_cache and then handles the alien
   array_cache for the other node.

However, this means that node 0's shared array_cache will be drained, and
then node 1 will move the contents of its alien[0] array_cache into that
same shared array_cache. node 0's shared array_cache is never looked at
again, so the objects left there will appear to be in use when
__cache_shrink() calls __node_shrink() for node 0. So __node_shrink() will
return 1 and kmem_cache_destroy() will fail.

This patch fixes this by having drain_cpu_caches() do drain_alien_cache() on
every node before it does drain_array() on the nodes' shared array_caches.

The problem was originally reported by Or Gerlitz.

Signed-off-by: Roland Dreier
Acked-by: Christoph Lameter
Acked-by: Pekka Enberg
Signed-off-by: Linus Torvalds
---
 mm/slab.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index b1d643b5238d..d31a06bfbea5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2200,11 +2200,14 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
        check_irq_on();
        for_each_online_node(node) {
                l3 = cachep->nodelists[node];
-               if (l3) {
+               if (l3 && l3->alien)
+                       drain_alien_cache(cachep, l3->alien);
+       }
+
+       for_each_online_node(node) {
+               l3 = cachep->nodelists[node];
+               if (l3)
                        drain_array(cachep, l3, l3->shared, 1, node);
-                       if (l3->alien)
-                               drain_alien_cache(cachep, l3->alien);
-               }
        }
 }
 
--
cgit v1.2.3

From 12783b002db1f02c29353c8f698a85514420b9f4 Mon Sep 17 00:00:00 2001
From: Mike Kravetz
Date: Sat, 20 May 2006 15:00:05 -0700
Subject: [PATCH] SPARSEMEM incorrectly calculates section number

A bad calculation/loop in __section_nr() could result in incorrect section
information being put into sysfs memory entries. This primarily impacts
memory add operations as the sysfs information is used while onlining new
memory.

Fix suggested by Dave Hansen.

Note that the bug may not be obvious from the patch. It actually occurs in
the function's return statement:

        return (root_nr * SECTIONS_PER_ROOT) + (ms - root);

In the existing code, root_nr has already been multiplied by
SECTIONS_PER_ROOT.

Signed-off-by: Mike Kravetz
Cc: Dave Hansen
Cc: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)
(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index c5e89eb9ac8f..100040c0dfb6 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -87,11 +87,8 @@ int __section_nr(struct mem_section* ms)
        unsigned long root_nr;
        struct mem_section* root;
 
-       for (root_nr = 0;
-            root_nr < NR_MEM_SECTIONS;
-            root_nr += SECTIONS_PER_ROOT) {
-               root = __nr_to_section(root_nr);
-
+       for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
+               root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
                if (!root)
                        continue;
 
--
cgit v1.2.3
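With the return statement in mind, the invariant is that root_nr must count
roots, not sections; otherwise the multiplication by SECTIONS_PER_ROOT
happens twice. A small standalone sketch of the corrected index math (the
array sizes here are arbitrary, chosen only to make the example concrete):

/* Standalone sketch of the root/section index math; sizes are arbitrary. */
#include <stdio.h>

#define SECTIONS_PER_ROOT       4
#define NR_SECTION_ROOTS        8
#define NR_MEM_SECTIONS         (NR_SECTION_ROOTS * SECTIONS_PER_ROOT)

struct mem_section { int pad; };

static struct mem_section sections[NR_MEM_SECTIONS];

/* Mirrors the fixed loop: iterate over roots and scale, not over sections. */
static long section_nr(struct mem_section *ms)
{
        unsigned long root_nr;
        struct mem_section *root;

        for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
                root = &sections[root_nr * SECTIONS_PER_ROOT];
                if (ms >= root && ms < root + SECTIONS_PER_ROOT)
                        return root_nr * SECTIONS_PER_ROOT + (ms - root);
        }
        return -1;
}

int main(void)
{
        /* Section 13 sits in root 3 at offset 1: 3 * 4 + 1 == 13. */
        printf("%ld\n", section_nr(&sections[13]));
        return 0;
}

In the buggy loop, root_nr stepped by SECTIONS_PER_ROOT, so by the time the
return statement scaled it again the reported section number came out
roughly SECTIONS_PER_ROOT times too large.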
From bdd804f478a0cc74bf7db8e9f9d5fd379d1b31ca Mon Sep 17 00:00:00 2001
From: Paul Jackson
Date: Sat, 20 May 2006 15:00:09 -0700
Subject: [PATCH] Cpuset: might sleep checking zones allowed fix

Fix a couple of infrequently encountered 'sleeping function called from
invalid context' warnings in the cpuset hooks in __alloc_pages(), which
could sleep while interrupts were disabled.

The routine cpuset_zone_allowed() is called by code in mm/page_alloc.c,
__alloc_pages(), to determine if a zone is allowed in the current task's
cpuset. This routine can sleep, for certain GFP_KERNEL allocations, if the
zone is on a memory node not allowed in the current cpuset, but might be
allowed in a parent cpuset.

But we can't sleep in __alloc_pages() if in interrupt, nor if called for a
GFP_ATOMIC request (__GFP_WAIT not set in gfp_flags).

The rule was intended to be: don't call cpuset_zone_allowed() if you can't
sleep, unless you pass in the __GFP_HARDWALL flag set in gfp_mask, which
disables the code that might scan up ancestor cpusets and sleep.

This rule was being violated in a couple of places, due to a bogus change
made (by myself, pj) to __alloc_pages() as part of the November 2005 effort
to clean up its logic, and also due to a later fix to constrain which swap
daemons were awoken.

The bogus change can be seen at:
http://linux.derkeiler.com/Mailing-Lists/Kernel/2005-11/4691.html
[PATCH 01/05] mm fix __alloc_pages cpuset ALLOC_* flags

This was first noticed on a tight memory system, in code that was disabling
interrupts and doing allocation requests with __GFP_WAIT not set, which
resulted in __might_sleep() writing complaints to the log "Debug: sleeping
function called ...", when the code in cpuset_zone_allowed() tried to take
the callback_sem cpuset semaphore.

We haven't seen a system hang on this 'might_sleep' yet, but we are at
decent risk of seeing it fairly soon, especially since the additional
cpuset_zone_allowed() check was added, conditioning wakeup_kswapd(), in
March 2006.

Special thanks to Dave Chinner, for figuring this out, and a tip of the hat
to Nick Piggin who warned me of this back in Nov 2005, before I was ready
to listen.

Signed-off-by: Paul Jackson
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/page_alloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 813b4ec1298a..bb3416932ab0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -951,7 +951,7 @@ restart:
                goto got_pg;
 
        do {
-               if (cpuset_zone_allowed(*z, gfp_mask))
+               if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL))
                        wakeup_kswapd(*z, order);
        } while (*(++z));
 
@@ -970,7 +970,8 @@ restart:
                alloc_flags |= ALLOC_HARDER;
        if (gfp_mask & __GFP_HIGH)
                alloc_flags |= ALLOC_HIGH;
-       alloc_flags |= ALLOC_CPUSET;
+       if (wait)
+               alloc_flags |= ALLOC_CPUSET;
 
        /*
         * Go through the zonelist again. Let __GFP_HIGH and allocations
--
cgit v1.2.3
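The second hunk above is the heart of the fix and amounts to one rule: only
set ALLOC_CPUSET, which later triggers cpuset_zone_allowed() and its
potentially sleeping walk up ancestor cpusets, when the caller said it can
wait. A hedged sketch of that gating follows; the flag values are stand-ins,
not the kernel's real gfp or ALLOC_* bits:

/* Sketch of gating a may-sleep check on the caller's ability to wait.
 * The flag values are stand-ins, not the kernel's real gfp/ALLOC_* bits.
 */
#include <stdio.h>

#define MY_GFP_WAIT     0x1     /* caller may sleep */
#define MY_GFP_HIGH     0x2
#define ALLOC_HIGH      0x1
#define ALLOC_CPUSET    0x2     /* enables a check that may sleep */

static unsigned int pick_alloc_flags(unsigned int gfp_mask)
{
        unsigned int alloc_flags = 0;
        int wait = gfp_mask & MY_GFP_WAIT;

        if (gfp_mask & MY_GFP_HIGH)
                alloc_flags |= ALLOC_HIGH;
        /* Opt in to the potentially sleeping cpuset walk only if we can wait. */
        if (wait)
                alloc_flags |= ALLOC_CPUSET;
        return alloc_flags;
}

int main(void)
{
        printf("atomic caller: %#x, sleepable caller: %#x\n",
               pick_alloc_flags(MY_GFP_HIGH),
               pick_alloc_flags(MY_GFP_WAIT));
        return 0;
}

The first hunk handles the other call site from the opposite direction:
passing __GFP_HARDWALL to cpuset_zone_allowed() keeps it on the non-sleeping
path, so the kswapd wakeup loop stays safe even when the caller cannot sleep.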
From e984bb43f7450312ba66fe0e67a99efa6be3b246 Mon Sep 17 00:00:00 2001
From: Bob Picco
Date: Sat, 20 May 2006 15:00:31 -0700
Subject: [PATCH] Align the node_mem_map endpoints to a MAX_ORDER boundary

Andy added code to the buddy allocator which does not require the zone's
endpoints to be aligned to MAX_ORDER. An issue is that the buddy allocator
requires the node_mem_map's endpoints to be MAX_ORDER aligned. Otherwise
__page_find_buddy could compute a buddy not in node_mem_map for partial
MAX_ORDER regions at the zone's endpoints. page_is_buddy will detect that
these pages at the endpoints are not PG_buddy (they were zeroed out by the
bootmem allocator and are not part of the zone). Of course the negative
here is that we could waste a little memory, but the positive is
eliminating all the old checks for zone boundary conditions.

SPARSEMEM won't encounter this issue because of the MAX_ORDER size
constraint when SPARSEMEM is configured. ia64 VIRTUAL_MEM_MAP doesn't need
the logic either because the holes and endpoints are handled differently.
This leaves checking alloc_remap and other arches which privately allocate
for node_mem_map.

Signed-off-by: Bob Picco
Acked-by: Mel Gorman
Cc: Dave Hansen
Cc: Andy Whitcroft
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 14 +++++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)
(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b5c21122c299..36740354d4db 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -22,6 +22,7 @@
 #else
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
+#define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
 
 struct free_area {
        struct list_head        free_list;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb3416932ab0..253a450c400d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2125,14 +2125,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
-               unsigned long size;
+               unsigned long size, start, end;
                struct page *map;
 
-               size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
+               /*
+                * The zone's endpoints aren't required to be MAX_ORDER
+                * aligned but the node_mem_map endpoints must be in order
+                * for the buddy allocator to function correctly.
+                */
+               start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+               end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+               end = ALIGN(end, MAX_ORDER_NR_PAGES);
+               size = (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
                        map = alloc_bootmem_node(pgdat, size);
-               pgdat->node_mem_map = map;
+               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
 #ifdef CONFIG_FLATMEM
        /*
--
cgit v1.2.3
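The arithmetic in the page_alloc.c hunk is plain round-down/round-up to a
MAX_ORDER block, plus an offset so that node_mem_map[0] still lines up with
the first pfn the node actually owns. A standalone sketch with made-up pfn
values (MAX_ORDER here mirrors the common default of 11, i.e. 1024-page
blocks):

/* Round a node's pfn range out to MAX_ORDER blocks; the pfn values are made up. */
#include <stdio.h>

#define MAX_ORDER               11
#define MAX_ORDER_NR_PAGES      (1UL << (MAX_ORDER - 1))        /* 1024 pages */
#define ALIGN_UP(x, a)          (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long node_start_pfn = 0x1030;          /* not MAX_ORDER aligned */
        unsigned long node_spanned_pages = 0x2f00;

        unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        unsigned long end = ALIGN_UP(node_start_pfn + node_spanned_pages,
                                     MAX_ORDER_NR_PAGES);

        /*
         * node_mem_map covers [start, end); its element 0 corresponds to pfn
         * 'start', so the node's own first page lives at offset
         * node_start_pfn - start, which is what the patch adds to 'map'.
         */
        printf("start %#lx end %#lx map offset %#lx struct pages %#lx\n",
               start, end, node_start_pfn - start, end - start);
        return 0;
}

The extra struct pages covering the rounded-out head and tail are the
"little memory" the commit message is willing to waste in exchange for
dropping the old zone-boundary checks in the buddy code.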