From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Subject: slab - handle memoryless nodes V2a
References: bnc#436025, bnc#570492
Patch-mainline: not yet

The slab cache, since [apparently] 2.6.21, does not handle memoryless
nodes well.  Specifically, the "fast path" -- ____cache_alloc() -- can
never succeed on such a node, yet it is called twice: once speculatively
[expected to succeed] and once in the fallback path.  This adds
measurable overhead to every kmem cache allocation, a significant
regression relative to earlier kernels [from before slab.c was
reorganized].

This patch addresses the regression by modifying slab.c to treat the
first fallback node in a memoryless node's general zonelist as the
"slab local node" -- i.e., the local node for the purpose of slab
allocations.  This is, in fact, the node from which all "local"
allocations for cpus attached to a memoryless node will be satisfied.

The new function numa_slab_nid(cachep, flags) replaces all calls to
numa_node_id() in slab.c.  numa_slab_nid() will simply return
numa_node_id() for nodes with memory, but for a memoryless node it
will return the first node with memory in the local node's zonelist,
as selected by the gfp flags.
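
Conceptually, the lookup reduces to the minimal sketch below
[simplified from the mm/slab.c hunks further down; the real
numa_slab_nid() also takes the kmem_cache pointer and caches the
fallback node in cachep->nodelists[], using the low bit of the
"pointer" as a memoryless-node flag, so the zonelist walk runs only
once per cache/node]:

	/* sketch only -- see __numa_slab_nid() in the patch below */
	static int slab_local_node(gfp_t flags)
	{
		int node = numa_node_id();
		struct zonelist *zonelist;
		struct zone *zone;

		/* nodes with normal memory are their own slab local node */
		if (node_state(node, N_NORMAL_MEMORY))
			return node;

		/*
		 * memoryless node: walk its general zonelist and return
		 * the first node that has memory for these gfp flags
		 */
		zonelist = &NODE_DATA(node)->node_zonelists[0];
		(void)first_zones_zonelist(zonelist, gfp_zone(flags),
						NULL, &zone);
		return zone->node;
	}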
Effects of the patch:

We first noticed the effects of the slab reorganization running the
AIM benchmark on a distro based on 2.6.27.  The effect is even more
pronounced in the hackbench results.  The platform is an HP rx8640
NUMA platform, configured with "0% Cell Local Memory".  In this
configuration, all memory appears in a "pseudo-node" -- an artifact
of the firmware -- and is interleaved across all the physical nodes'
memory at cacheline granularity.  All cpus are presented as attached
to memoryless nodes.

Here are the results of running hackbench at various load levels
with and without the patch on the same platform, configured for
0% CLM and for "100% CLM".

Command:  hackbench N process 100, for N = 10..100 by 10

[Each hackbench group comprises 40 tasks -- 20 sender/receiver pairs --
so N groups correspond to the task counts in the tables below.]

               100% CLM                      0% CLM
 Tasks     no      with              no        with
          patch    patch    %diff   patch     patch    %diff
   400    0.246    0.281   14.23%    2.962    0.410   -86.16%
   800    0.418    0.421    0.72%    6.224    0.793   -87.26%
  1200    0.548    0.532   -2.92%    9.058    1.090   -87.97%
  1600    0.654    0.716    9.48%   12.473    1.562   -87.48%
  2000    0.871    0.859   -1.38%   15.484    1.889   -87.80%
  2400    0.994    1.043    4.93%   18.689    2.309   -87.65%
  2800    1.196    1.195   -0.08%   22.069    2.606   -88.19%
  3200    1.322    1.344    1.66%   25.642    2.988   -88.35%
  3600    1.474    1.519    3.05%   28.003    3.418   -87.79%
  4000    1.682    1.750    4.04%   30.887    3.858   -87.51%

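[%diff = (with patch - no patch) / (no patch) * 100, so negative
values mean the patched kernel is faster.]
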
In the 100% CLM case, the regression does not appear, because all
nodes have local memory.  Note that the patch has >10% overhead on
the first run, but then varies widely from run to run [more below].
For the 0% CLM configuration, the patch reduced the run time by
86-88%.

The following runs extend the number of hackbench tasks using:

hackbench N process 100, for N = 100 to 400 by 20

We didn't run the 0% CLM/no-patch runs as they were taking too long
for our liking.  We wanted to see how the patched kernel performed as
we extended the range.

               100% CLM                      0% CLM
 Tasks     no      with              no        with
          patch    patch    %diff   patch     patch    %diff
  4800    1.879    2.117   12.67%   not-run   4.458
  5600    2.100    2.352   12.00%   not-run   5.207
  6400    2.532    2.447   -3.36%   not-run   5.669
  7200    2.799    2.792   -0.25%   not-run   6.651
  8000    3.244    3.030   -6.60%   not-run   7.366
  8800    3.282    3.550    8.17%   not-run   8.169
  9600    3.595    3.738    3.98%   not-run   8.582
 10400    3.811    4.045    6.14%   not-run   9.705
 11200    4.090    4.162    1.76%   not-run   9.760
 12000    4.408    4.419    0.25%   not-run  10.141
 12800    4.665    4.787    2.62%   not-run  11.628
 13600    5.095    5.069   -0.51%   not-run  11.735
 14400    5.347    5.464    2.19%   not-run  12.621
 15200    5.620    5.831    3.75%   not-run  13.452
 16000    5.870    6.161    4.96%   not-run  14.069

The 0% CLM configuration with the patch performs worse than the
100% CLM configuration.  In the 0% CLM case we had 64 ia64 cores
beating on a single zone in the interleaved, memory-only pseudo-node.
In the 100% CLM case, we have 16 cores allocating memory locally to
each of 4 nodes, demonstrating the difference between [pseudo-]SMP
and NUMA behavior.

Note, again, that the first run[s] show a higher % difference between
the patched and unpatched kernels for the 100% CLM config, and then
vary quite a bit from run to run.  To get a feel for the average
overhead, we ran 40 runs at the 16000 task load point with more
iterations to increase the runtime per run:

hackbench 400 process 200

These were run on the 100% CLM configuration, as this best represents
most NUMA platforms:

                 No patch    with Patch    %diff
Average of 40:     9.796        9.857      0.623

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Nick Piggin <npiggin@suse.de>

 mm/slab.c |   88 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 12 deletions(-)

--- a/mm/slab.c
+++ b/mm/slab.c
@@ -281,7 +281,7 @@ struct kmem_list3 {
         struct array_cache **alien;     /* on other nodes */
         unsigned long next_reap;        /* updated without locking */
         int free_touched;               /* updated without locking */
-};
+} __attribute__((aligned(sizeof(long))));
 
 /*
  * Need this for bootstrapping a per node allocator.
@@ -944,6 +944,11 @@ static int transfer_objects(struct array
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
+static inline int numa_slab_nid(struct kmem_cache *cachep, gfp_t flags)
+{
+        return 0;
+}
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
         return (struct array_cache **)BAD_ALIEN_MAGIC;
@@ -975,6 +980,64 @@ static inline void *____cache_alloc_node
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
+/*
+ * slow path for numa_slab_nid(), below
+ */
+static noinline int __numa_slab_nid(struct kmem_cache *cachep,
+                                        int node, gfp_t flags)
+{
+        struct zonelist *zonelist;
+        struct zone *zone;
+        enum zone_type highest_zoneidx = gfp_zone(flags);
+
+        if (likely(node_state(node, N_NORMAL_MEMORY)))
+                return node;
+
+        /*
+         * memoryless node:  consult its zonelist.
+         * Cache the fallback node, if cache pointer provided.
+         */
+        zonelist = &NODE_DATA(node)->node_zonelists[0];
+        (void)first_zones_zonelist(zonelist, highest_zoneidx,
+                                        NULL,
+                                        &zone);
+        if (cachep)
+                cachep->nodelists[node] =
+                        (struct kmem_list3 *)((unsigned long)zone->node << 1 | 1);
+        return zone->node;
+}
+
+/*
+ * "Local" node for slab is first node in zonelist with memory.
+ * For nodes with memory this will be the actual local node.
+ *
+ * Use nodelist[numa_node_id()] to cache the fallback node for
+ * memoryless nodes.  We'll be loading that member soon anyway,
+ * or already have, when called for cache refill, ...  Use low
+ * bit of "pointer" as flag for "memoryless_node", indicating
+ * that the fallback nodes is stored here [<<1].
+ */
+#define memoryless_node(L3L) ((L3L) & 1)
+static inline int numa_slab_nid(struct kmem_cache *cachep, gfp_t flags)
+{
+        int node = numa_mem_id();
+
+        if (likely(cachep)){
+                unsigned long l3l = (unsigned long)cachep->nodelists[node];
+
+                if (likely(l3l)) {
+                        if (unlikely(memoryless_node(l3l)))
+                                node = (int)(l3l >> 1);
+                        return node;
+                }
+        }
+
+        /*
+         * !cachep || !l3l - the slow path
+         */
+        return __numa_slab_nid(cachep, node, flags);
+}
+
 static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
         struct array_cache **ac_ptr;
@@ -1074,7 +1137,7 @@ static inline int cache_free_alien(struc
         struct array_cache *alien = NULL;
         int node;
 
-        node = numa_mem_id();
+        node = numa_slab_nid(cachep, GFP_KERNEL);
 
         /*
          * Make sure we are not freeing a object from another node to the array
@@ -1503,7 +1566,7 @@ void __init kmem_cache_init(void)
          * 6) Resize the head arrays of the kmalloc caches to their final sizes.
          */
 
-        node = numa_mem_id();
+        node = numa_slab_nid(NULL, GFP_KERNEL);
 
         /* 1) create the cache_cache */
         INIT_LIST_HEAD(&cache_chain);
@@ -2147,7 +2210,7 @@ static int __init_refok setup_cpu_cache(
                         }
                 }
         }
-        cachep->nodelists[numa_mem_id()]->next_reap =
+        cachep->nodelists[numa_slab_nid(cachep, GFP_KERNEL)]->next_reap =
                         jiffies + REAPTIMEOUT_LIST3 +
                         ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
@@ -2479,7 +2542,7 @@ static void check_spinlock_acquired(stru
 {
 #ifdef CONFIG_SMP
         check_irq_off();
-        assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
+        assert_spin_locked(&cachep->nodelists[numa_slab_nid(cachep, GFP_KERNEL)]->list_lock);
 #endif
 }
 
@@ -2506,7 +2569,7 @@ static void do_drain(void *arg)
 {
         struct kmem_cache *cachep = arg;
         struct array_cache *ac;
-        int node = numa_mem_id();
+        int node = numa_slab_nid(cachep, GFP_KERNEL);
 
         check_irq_off();
         ac = cpu_cache_get(cachep);
@@ -3043,7 +3106,7 @@ static void *cache_alloc_refill(struct k
 
 retry:
         check_irq_off();
-        node = numa_mem_id();
+        node = numa_slab_nid(cachep, flags);
         if (unlikely(must_refill))
                 goto force_grow;
         ac = cpu_cache_get(cachep);
@@ -3253,7 +3316,7 @@ static void *alternate_node_alloc(struct
 
         if (in_interrupt() || (flags & __GFP_THISNODE))
                 return NULL;
-        nid_alloc = nid_here = numa_mem_id();
+        nid_alloc = nid_here = numa_slab_nid(cachep, flags);
         get_mems_allowed();
         if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
                 nid_alloc = cpuset_slab_spread_node();
@@ -3432,7 +3495,7 @@ __cache_alloc_node(struct kmem_cache *ca
 {
         unsigned long save_flags;
         void *ptr;
-        int slab_node = numa_mem_id();
+        int slab_node = numa_slab_nid(cachep, flags);
 
         flags &= gfp_allowed_mask;
 
@@ -3498,7 +3561,8 @@ __do_cache_alloc(struct kmem_cache *cach
          * ____cache_alloc_node() knows how to locate memory on other nodes
          */
         if (!objp)
-                objp = ____cache_alloc_node(cache, flags, numa_mem_id());
+                objp = ____cache_alloc_node(cache, flags,
+                                                numa_slab_nid(cache, flags));
 
   out:
         return objp;
@@ -3595,7 +3659,7 @@ static void cache_flusharray(struct kmem
 {
         int batchcount;
         struct kmem_list3 *l3;
-        int node = numa_mem_id();
+        int node = numa_slab_nid(cachep, GFP_KERNEL);
 
         batchcount = ac->batchcount;
 #if DEBUG
@@ -4234,7 +4298,7 @@ static void cache_reap(struct work_struc
 {
         struct kmem_cache *searchp;
         struct kmem_list3 *l3;
-        int node = numa_mem_id();
+        int node = numa_slab_nid(NULL, GFP_KERNEL);
         struct delayed_work *work = to_delayed_work(w);
 
         if (!mutex_trylock(&cache_chain_mutex))