From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Subject: [PATCH 05/31] mm: sl[au]b: add knowledge of reserve pages
Patch-mainline: Not yet

Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
contexts that are entitled to it. This is done to ensure reserve pages don't
leak out and get consumed.

The basic pattern used for all allocators is the following: for each active
slab page we store whether it came from an emergency allocation. When we find
it did, make sure the current allocation context would have been able to
allocate a page from the emergency reserves as well. In that case allow the
allocation. If not, force a new slab allocation. When that succeeds, the
memory pressure has lifted enough to allow this context to get an object;
otherwise fail the allocation.

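As an illustration of that pattern (not part of the patch below), the
per-allocator check boils down to roughly the sketch that follows;
object_from_reserve_ok() is a made-up helper name, while gfp_to_alloc_flags()
and ALLOC_NO_WATERMARKS are the page allocator internals the hunks below rely
on (hence the "internal.h" includes):

	/*
	 * Illustrative only -- mirrors the checks added to the allocators.
	 * An object may come from a reserve slab only if the current
	 * context could itself have allocated while ignoring watermarks.
	 */
	static int object_from_reserve_ok(int page_was_reserve, gfp_t gfp)
	{
		if (!page_was_reserve)
			return 1;	/* ordinary slab page, no restriction */

		return !!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS);
	}

A caller that gets 0 back must try to grow a new slab instead of reusing the
cached reserve objects; if that allocation succeeds the pressure has lifted
and the object can be handed out, otherwise the allocation fails.
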
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
---
 include/linux/slub_def.h |    1 
 mm/slab.c                |   61 ++++++++++++++++++++++++++++++++++++++++-------
 mm/slob.c                |   16 +++++++++++-
 mm/slub.c                |   43 +++++++++++++++++++++++++++------
 4 files changed, 104 insertions(+), 17 deletions(-)

--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -38,6 +38,7 @@ struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to first free per cpu object */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
+	int reserve;		/* Did the current page come from the reserve */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -120,6 +120,8 @@
 #include <asm/tlbflush.h>
 #include <asm/page.h>
 
+#include "internal.h"
+
 /*
  * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
  *	   0 for faster, smaller code (especially in the critical paths).
@@ -268,7 +270,8 @@ struct array_cache {
 	unsigned int avail;
 	unsigned int limit;
 	unsigned int batchcount;
-	unsigned int touched;
+	unsigned int touched:1,
+			reserve:1;
 	spinlock_t lock;
 	void *entry[]; /*
 			 * Must have this definition in here for the proper
@@ -704,6 +707,27 @@ static inline struct array_cache *cpu_ca
 	return cachep->array[smp_processor_id()];
 }
 
+/*
+ * If the last page came from the reserves, and the current allocation context
+ * does not have access to them, force an allocation to test the watermarks.
+ */
+static inline int slab_force_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (unlikely(cpu_cache_get(cachep)->reserve) &&
+			!(gfp_to_alloc_flags(flags) & ALLOC_NO_WATERMARKS))
+		return 1;
+
+	return 0;
+}
+
+static inline void slab_set_reserve(struct kmem_cache *cachep, int reserve)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
+
+	if (unlikely(ac->reserve != reserve))
+		ac->reserve = reserve;
+}
+
 static inline struct kmem_cache *__find_general_cachep(size_t size,
 							gfp_t gfpflags)
 {
@@ -910,6 +934,7 @@ static struct array_cache *alloc_arrayca
 		nc->limit = entries;
 		nc->batchcount = batchcount;
 		nc->touched = 0;
+		nc->reserve = 0;
 		spin_lock_init(&nc->lock);
 	}
 	return nc;
@@ -1606,7 +1631,8 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+		int *reserve)
 {
 	struct page *page;
 	int nr_pages;
@@ -1628,6 +1654,7 @@ static void *kmem_getpages(struct kmem_c
 	if (!page)
 		return NULL;
 
+	*reserve = page->reserve;
 	nr_pages = (1 << cachep->gfporder);
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		add_zone_page_state(page_zone(page),
@@ -2060,6 +2087,7 @@ static int __init_refok setup_cpu_cache(
 	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
 	cpu_cache_get(cachep)->batchcount = 1;
 	cpu_cache_get(cachep)->touched = 0;
+	cpu_cache_get(cachep)->reserve = 0;
 	cachep->batchcount = 1;
 	cachep->limit = BOOT_CPUCACHE_ENTRIES;
 	return 0;
@@ -2745,6 +2773,7 @@ static int cache_grow(struct kmem_cache
 	size_t offset;
 	gfp_t local_flags;
 	struct kmem_list3 *l3;
+	int reserve;
 
 	/*
 	 * Be lazy and only check for valid flags here, keeping it out of the
@@ -2783,7 +2812,7 @@ static int cache_grow(struct kmem_cache
 	 * 'nodeid'.
 	 */
 	if (!objp)
-		objp = kmem_getpages(cachep, local_flags, nodeid);
+		objp = kmem_getpages(cachep, local_flags, nodeid, &reserve);
 	if (!objp)
 		goto failed;
 
@@ -2800,6 +2829,7 @@ static int cache_grow(struct kmem_cache
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	check_irq_off();
+	slab_set_reserve(cachep, reserve);
 	spin_lock(&l3->list_lock);
 
 	/* Make slab active. */
@@ -2934,7 +2964,8 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep,
+		gfp_t flags, int must_refill)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2944,6 +2975,8 @@ static void *cache_alloc_refill(struct k
 retry:
 	check_irq_off();
 	node = numa_node_id();
+	if (unlikely(must_refill))
+		goto force_grow;
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3013,11 +3046,14 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
+force_grow:
 		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	/* no objects in sight? abort */
+
+		/* no objects in sight? abort */
+		if (!x && (ac->avail == 0 || must_refill))
 			return NULL;
 
 		if (!ac->avail)		/* objects refilled by interrupt? */
@@ -3107,17 +3143,18 @@ static inline void *____cache_alloc(stru
 {
 	void *objp;
 	struct array_cache *ac;
+	int must_refill = slab_force_alloc(cachep, flags);
 
 	check_irq_off();
 
 	ac = cpu_cache_get(cachep);
-	if (likely(ac->avail)) {
+	if (likely(ac->avail && !must_refill)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
 		objp = ac->entry[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, must_refill);
 		/*
 		 * the 'ac' may be updated by cache_alloc_refill(),
 		 * and kmemleak_erase() requires its correct value.
@@ -3173,7 +3210,7 @@ static void *fallback_alloc(struct kmem_
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
-	int nid;
+	int nid, reserve;
 
 	if (flags & __GFP_THISNODE)
 		return NULL;
@@ -3209,10 +3246,12 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, numa_node_id());
+		obj = kmem_getpages(cache, local_flags, numa_node_id(),
+				&reserve);
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
+			slab_set_reserve(cache, reserve);
 			/*
 			 * Insert into the appropriate per node queues
 			 */
@@ -3251,6 +3290,9 @@ static void *____cache_alloc_node(struct
 	l3 = cachep->nodelists[nodeid];
 	BUG_ON(!l3);
 
+	if (unlikely(slab_force_alloc(cachep, flags)))
+		goto force_grow;
+
 retry:
 	check_irq_off();
 	spin_lock(&l3->list_lock);
@@ -3288,6 +3330,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
+force_grow:
 	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -69,6 +69,7 @@
 #include <linux/kmemtrace.h>
 #include <linux/kmemleak.h>
 #include <asm/atomic.h>
+#include "internal.h"
 
 /*
  * slob_block has a field 'units', which indicates size of block if +ve,
@@ -191,6 +192,11 @@ struct slob_rcu {
 static DEFINE_SPINLOCK(slob_lock);
 
 /*
+ * tracks the reserve state for the allocator.
+ */
+static int slob_reserve;
+
+/*
  * Encode the given size and next info into a free slob block s.
  */
 static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
@@ -240,7 +246,7 @@ static int slob_last(slob_t *s)
 
 static void *slob_new_pages(gfp_t gfp, int order, int node)
 {
-	void *page;
+	struct page *page;
 
 #ifdef CONFIG_NUMA
 	if (node != -1)
@@ -252,6 +258,8 @@ static void *slob_new_pages(gfp_t gfp, i
 	if (!page)
 		return NULL;
 
+	slob_reserve = page->reserve;
+
 	return page_address(page);
 }
 
@@ -324,6 +332,11 @@ static void *slob_alloc(size_t size, gfp
 	slob_t *b = NULL;
 	unsigned long flags;
 
+	if (unlikely(slob_reserve)) {
+		if (!(gfp_to_alloc_flags(gfp) & ALLOC_NO_WATERMARKS))
+			goto grow;
+	}
+
 	if (size < SLOB_BREAK1)
 		slob_list = &free_slob_small;
 	else if (size < SLOB_BREAK2)
@@ -362,6 +375,7 @@ static void *slob_alloc(size_t size, gfp
 	}
 	spin_unlock_irqrestore(&slob_lock, flags);
 
+grow:
 	/* Not enough space: must allocate a new page */
 	if (!b) {
 		b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
 #include <linux/memory.h>
 #include <linux/math64.h>
 #include <linux/fault-inject.h>
+#include "internal.h"
+
 
 /*
  * Lock order:
@@ -1148,7 +1150,8 @@ static void setup_object(struct kmem_cac
 		s->ctor(object);
 }
 
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static
+struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
 {
 	struct page *page;
 	void *start;
@@ -1162,6 +1165,8 @@ static struct page *new_slab(struct kmem
 	if (!page)
 		goto out;
 
+	*reserve = page->reserve;
+
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
@@ -1611,10 +1616,20 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+	int reserve;
 
 	/* We handle __GFP_ZERO in the caller */
 	gfpflags &= ~__GFP_ZERO;
 
+	if (unlikely(c->reserve)) {
+		/*
+		 * If the current slab is a reserve slab and the current
+		 * allocation context does not allow access to the reserves we
+		 * must force an allocation to test the current levels.
+		 */
+		if (!(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+			goto grow_slab;
+	}
 	if (!c->page)
 		goto new_slab;
 
@@ -1628,8 +1643,8 @@ load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
-	if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
-		goto debug;
+	if (unlikely(PageSlubDebug(c->page) || c->reserve))
+		goto slow_path;
 
 	c->freelist = get_freepointer(s, object);
 	c->page->inuse = c->page->objects;
@@ -1651,16 +1666,18 @@ new_slab:
 		goto load_freelist;
 	}
 
+grow_slab:
 	if (gfpflags & __GFP_WAIT)
 		local_irq_enable();
 
-	new = new_slab(s, gfpflags, node);
+	new = new_slab(s, gfpflags, node, &reserve);
 
 	if (gfpflags & __GFP_WAIT)
 		local_irq_disable();
 
 	if (new) {
 		c = __this_cpu_ptr(s->cpu_slab);
+		c->reserve = reserve;
 		stat(s, ALLOC_SLAB);
 		if (c->page)
 			flush_slab(s, c);
@@ -1672,10 +1689,21 @@ new_slab:
 	if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
 		slab_out_of_memory(s, gfpflags, node);
 	return NULL;
-debug:
-	if (!alloc_debug_processing(s, c->page, object, addr))
+
+slow_path:
+	if (PageSlubDebug(c->page) &&
+			!alloc_debug_processing(s, c->page, object, addr))
 		goto another_slab;
 
+	/*
+	 * Avoid the slub fast path in slab_alloc() by not setting
+	 * c->freelist and the fast path in slab_free() by making
+	 * node_match() fail by setting c->node to -1.
+	 *
+	 * We use this for for debug and reserve checks which need
+	 * to be done for each allocation.
+	 */
+
 	c->page->inuse++;
 	c->page->freelist = get_freepointer(s, object);
 	c->node = -1;
@@ -2100,10 +2128,11 @@ static void early_kmem_cache_node_alloc(
 	struct page *page;
 	struct kmem_cache_node *n;
 	unsigned long flags;
+	int reserve;
 
 	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmalloc_caches, gfpflags, node);
+	page = new_slab(kmalloc_caches, gfpflags, node, &reserve);
 
 	BUG_ON(!page);
 	if (page_to_nid(page) != node) {
|