From: Lee Schermerhorn
Subject: slab - handle memoryless nodes V2a
References: bnc#436025, bnc#570492
Patch-mainline: not yet

The slab cache, since [apparently] 2.6.21, does not handle memoryless
nodes well.  Specifically, the "fast path" -- ____cache_alloc() --
will never succeed on a memoryless node, yet it is called twice:  once
speculatively [expected to succeed] and once in the fallback path.
This adds significant overhead to every kmem cache allocation,
resulting in a regression relative to earlier kernels [from before
slab.c was reorganized].

This patch addresses the regression by modifying slab.c to treat the
first fallback node in a memoryless node's general zonelist as the
"slab local node" -- i.e., the local node for the purpose of slab
allocations.  This is, in fact, the node from which all "local"
allocations for cpus attached to a memoryless node will be satisfied.
The new function numa_slab_nid(cachep, flags) replaces the
numa_node_id()/numa_mem_id() calls in slab.c.  numa_slab_nid() simply
returns the local node id for nodes with memory, but for a memoryless
node it returns the first node with memory in the local node's
zonelist, as selected by the gfp flags.  [A simplified sketch of this
lookup appears just before the extended hackbench results below.]

Effects of the patch:

We first noticed the effects of the slab reorganization while running
the AIM benchmark on a distro based on 2.6.27.  The effect is even
more pronounced in the hackbench results below.  The platform is an
HP rx8640 NUMA platform, configured with "0% Cell Local Memory".  In
this configuration, all memory appears in a "pseudo-node" -- an
artifact of the firmware -- and is interleaved across all the
physical nodes' memory at cacheline granularity.  All cpus are
presented as attached to memoryless nodes.

Here are the results of running hackbench at various load levels,
with and without the patch, on the same platform configured for
0% CLM and for "100% CLM".

Command:  hackbench N process 100, for N = 10..100 by 10

                    100% CLM                      0% CLM
 Tasks      no      with                no       with
            patch   patch    %diff      patch    patch    %diff
   400      0.246   0.281    14.23%     2.962    0.410   -86.16%
   800      0.418   0.421     0.72%     6.224    0.793   -87.26%
  1200      0.548   0.532    -2.92%     9.058    1.090   -87.97%
  1600      0.654   0.716     9.48%    12.473    1.562   -87.48%
  2000      0.871   0.859    -1.38%    15.484    1.889   -87.80%
  2400      0.994   1.043     4.93%    18.689    2.309   -87.65%
  2800      1.196   1.195    -0.08%    22.069    2.606   -88.19%
  3200      1.322   1.344     1.66%    25.642    2.988   -88.35%
  3600      1.474   1.519     3.05%    28.003    3.418   -87.79%
  4000      1.682   1.750     4.04%    30.887    3.858   -87.51%

In the 100% CLM case, the regression does not appear because all
nodes have local memory.  Note that the patch shows >10% overhead on
the first run, but the difference then varies widely from run to run
[more below].  For the 0% CLM configuration, the patch reduced the
run time by 86-88%.

The following runs extend the number of hackbench tasks using:

	hackbench N process 100, for N = 100 to 400 by 20

We didn't run the 0% CLM/no-patch runs, as they were taking too long
for our liking.  We wanted to see how the patched kernel performed as
we extended the range.
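As mentioned above, here is a minimal sketch of the lookup the patch
performs for a memoryless node, assuming a CONFIG_NUMA kernel of the
same vintage as the diff below.  The helper name slab_local_node() is
purely illustrative; the real implementation is
__numa_slab_nid()/numa_slab_nid() in the diff, which additionally
caches the result, tagged in its low bit, in cachep->nodelists[node]
for the memoryless node.

	#include <linux/gfp.h>
	#include <linux/mmzone.h>
	#include <linux/nodemask.h>

	/*
	 * Illustration only:  resolve the "slab local node" for 'node'.
	 * A node with normal memory is its own slab local node; a
	 * memoryless node falls back to the first node with memory in
	 * its zonelist, as constrained by the gfp 'flags'.
	 */
	static int slab_local_node(int node, gfp_t flags)
	{
		struct zonelist *zonelist;
		struct zone *zone;
		enum zone_type highest_zoneidx = gfp_zone(flags);

		if (node_state(node, N_NORMAL_MEMORY))
			return node;	/* node has memory: it is "local" */

		/*
		 * Memoryless node:  the first zone in its zonelist
		 * identifies the fallback node, exactly as in the
		 * patch's slow path.
		 */
		zonelist = &NODE_DATA(node)->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, highest_zoneidx,
					   NULL, &zone);
		return zone->node;
	}

The extended hackbench results: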
                    100% CLM                      0% CLM
 Tasks      no      with                no        with
            patch   patch    %diff      patch     patch    %diff
  4800      1.879   2.117    12.67%     not-run   4.458
  5600      2.100   2.352    12.00%     not-run   5.207
  6400      2.532   2.447    -3.36%     not-run   5.669
  7200      2.799   2.792    -0.25%     not-run   6.651
  8000      3.244   3.030    -6.60%     not-run   7.366
  8800      3.282   3.550     8.17%     not-run   8.169
  9600      3.595   3.738     3.98%     not-run   8.582
 10400      3.811   4.045     6.14%     not-run   9.705
 11200      4.090   4.162     1.76%     not-run   9.760
 12000      4.408   4.419     0.25%     not-run  10.141
 12800      4.665   4.787     2.62%     not-run  11.628
 13600      5.095   5.069    -0.51%     not-run  11.735
 14400      5.347   5.464     2.19%     not-run  12.621
 15200      5.620   5.831     3.75%     not-run  13.452
 16000      5.870   6.161     4.96%     not-run  14.069

The 0% CLM configuration with the patch still performs worse than the
100% CLM configuration.  In the 0% CLM case we have 64 ia64 cores
beating on a single zone in the interleaved, memory-only pseudo-node.
In the 100% CLM case, we have 16 cores allocating memory locally on
each of 4 nodes, demonstrating the difference between [pseudo-]SMP
and NUMA behavior.

Note, again, that the first run[s] show a higher % difference between
the patched and unpatched kernels for the 100% CLM config, and that
the difference then varies quite a bit from run to run.  To get a
feel for the average overhead, we ran 40 runs at the 16000-task load
point with more iterations to increase the runtime per run:

	hackbench 400 process 200

These were run on the 100% CLM configuration, as this best represents
most NUMA platforms:

                   No patch    with Patch    %diff
 Average of 40:      9.796       9.857       0.623

Signed-off-by: Lee Schermerhorn
Acked-by: Nick Piggin

 mm/slab.c |   88 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 12 deletions(-)

--- a/mm/slab.c
+++ b/mm/slab.c
@@ -281,7 +281,7 @@ struct kmem_list3 {
 	struct array_cache **alien;	/* on other nodes */
 	unsigned long next_reap;	/* updated without locking */
 	int free_touched;		/* updated without locking */
-};
+} __attribute__((aligned(sizeof(long))));
 
 /*
  * Need this for bootstrapping a per node allocator.
@@ -944,6 +944,11 @@ static int transfer_objects(struct array
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
+static inline int numa_slab_nid(struct kmem_cache *cachep, gfp_t flags)
+{
+	return 0;
+}
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	return (struct array_cache **)BAD_ALIEN_MAGIC;
@@ -975,6 +980,64 @@ static inline void *____cache_alloc_node
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
+/*
+ * slow path for numa_slab_nid(), below
+ */
+static noinline int __numa_slab_nid(struct kmem_cache *cachep,
+					int node, gfp_t flags)
+{
+	struct zonelist *zonelist;
+	struct zone *zone;
+	enum zone_type highest_zoneidx = gfp_zone(flags);
+
+	if (likely(node_state(node, N_NORMAL_MEMORY)))
+		return node;
+
+	/*
+	 * memoryless node:  consult its zonelist.
+	 * Cache the fallback node, if cache pointer provided.
+	 */
+	zonelist = &NODE_DATA(node)->node_zonelists[0];
+	(void)first_zones_zonelist(zonelist, highest_zoneidx,
+					NULL,
+					&zone);
+	if (cachep)
+		cachep->nodelists[node] =
+			(struct kmem_list3 *)((unsigned long)zone->node << 1 | 1);
+	return zone->node;
+}
+
+/*
+ * "Local" node for slab is first node in zonelist with memory.
+ * For nodes with memory this will be the actual local node.
+ *
+ * Use nodelist[numa_node_id()] to cache the fallback node for
+ * memoryless nodes.  We'll be loading that member soon anyway,
+ * or already have, when called for cache refill, ...  Use low
+ * bit of "pointer" as flag for "memoryless_node", indicating
+ * that the fallback node is stored here [<<1].
+ */
+#define memoryless_node(L3L) ((L3L) & 1)
+static inline int numa_slab_nid(struct kmem_cache *cachep, gfp_t flags)
+{
+	int node = numa_mem_id();
+
+	if (likely(cachep)) {
+		unsigned long l3l = (unsigned long)cachep->nodelists[node];
+
+		if (likely(l3l)) {
+			if (unlikely(memoryless_node(l3l)))
+				node = (int)(l3l >> 1);
+			return node;
+		}
+	}
+
+	/*
+	 * !cachep || !l3l - the slow path
+	 */
+	return __numa_slab_nid(cachep, node, flags);
+}
+
 static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	struct array_cache **ac_ptr;
@@ -1074,7 +1137,7 @@ static inline int cache_free_alien(struc
 	struct array_cache *alien = NULL;
 	int node;
 
-	node = numa_mem_id();
+	node = numa_slab_nid(cachep, GFP_KERNEL);
 
 	/*
 	 * Make sure we are not freeing a object from another node to the array
@@ -1503,7 +1566,7 @@ void __init kmem_cache_init(void)
 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
-	node = numa_mem_id();
+	node = numa_slab_nid(NULL, GFP_KERNEL);
 
 	/* 1) create the cache_cache */
 	INIT_LIST_HEAD(&cache_chain);
@@ -2147,7 +2210,7 @@ static int __init_refok setup_cpu_cache(
 			}
 		}
 	}
-	cachep->nodelists[numa_mem_id()]->next_reap =
+	cachep->nodelists[numa_slab_nid(cachep, GFP_KERNEL)]->next_reap =
 			jiffies + REAPTIMEOUT_LIST3 +
 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
@@ -2479,7 +2542,7 @@ static void check_spinlock_acquired(stru
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
+	assert_spin_locked(&cachep->nodelists[numa_slab_nid(cachep, GFP_KERNEL)]->list_lock);
 #endif
 }
 
@@ -2506,7 +2569,7 @@ static void do_drain(void *arg)
 {
 	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
-	int node = numa_mem_id();
+	int node = numa_slab_nid(cachep, GFP_KERNEL);
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
@@ -3043,7 +3106,7 @@ static void *cache_alloc_refill(struct k
 
 retry:
 	check_irq_off();
-	node = numa_mem_id();
+	node = numa_slab_nid(cachep, flags);
 	if (unlikely(must_refill))
 		goto force_grow;
 	ac = cpu_cache_get(cachep);
@@ -3253,7 +3316,7 @@ static void *alternate_node_alloc(struct
 
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
-	nid_alloc = nid_here = numa_mem_id();
+	nid_alloc = nid_here = numa_slab_nid(cachep, flags);
 	get_mems_allowed();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_slab_spread_node();
@@ -3432,7 +3495,7 @@ __cache_alloc_node(struct kmem_cache *ca
 {
 	unsigned long save_flags;
 	void *ptr;
-	int slab_node = numa_mem_id();
+	int slab_node = numa_slab_nid(cachep, flags);
 
 	flags &= gfp_allowed_mask;
 
@@ -3498,7 +3561,8 @@ __do_cache_alloc(struct kmem_cache *cach
 	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
 	if (!objp)
-		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
+		objp = ____cache_alloc_node(cache, flags,
+						numa_slab_nid(cache, flags));
 
   out:
 	return objp;
@@ -3595,7 +3659,7 @@ static void cache_flusharray(struct kmem
 {
 	int batchcount;
 	struct kmem_list3 *l3;
-	int node = numa_mem_id();
+	int node = numa_slab_nid(cachep, GFP_KERNEL);
 
 	batchcount = ac->batchcount;
 #if DEBUG
@@ -4234,7 +4298,7 @@ static void cache_reap(struct work_struc
 {
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
-	int node = numa_mem_id();
+	int node = numa_slab_nid(NULL, GFP_KERNEL);
 	struct delayed_work *work = to_delayed_work(w);
 
 	if (!mutex_trylock(&cache_chain_mutex))