From: Peter Zijlstra Subject: [PATCH 18/31] netvm: hook skb allocation to reserves Patch-mainline: not yet Change the skb allocation API to indicate RX usage and use this to fall back to the reserve when needed. SKBs allocated from the reserve are tagged in skb->emergency. Teach all other skb ops about emergency skbs and the reserve accounting. Use the (new) packet split API to allocate and track fragment pages from the emergency reserve. Do this using an atomic counter in page->index. This is needed because the fragments have a different sharing semantic than that indicated by skb_shinfo()->dataref. Note that the decision to distinguish between regular and emergency SKBs allows the accounting overhead to be limited to the latter kind. Signed-off-by: Peter Zijlstra Signed-off-by: Suresh Jayaraman --- include/linux/mm_types.h | 1 include/linux/skbuff.h | 35 +++++++++++-- net/core/skbuff.c | 121 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 128 insertions(+), 29 deletions(-) --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -72,6 +72,7 @@ struct page { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* SLUB: freelist req. slab lock */ int reserve; /* page_alloc: page is a reserve page */ + atomic_t frag_count; /* skb fragment use count */ }; struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,6 +29,7 @@ #include #include #include +#include /* Don't change this without changing skb_csum_unnecessary! 
*/ #define CHECKSUM_NONE 0 @@ -386,9 +386,12 @@ struct sk_buff { __u8 deliver_no_wcard:1; #endif __u8 ooo_okay:1; +#ifdef CONFIG_NETVM + __u8 emergency:1; +#endif kmemcheck_bitfield_end(flags2); - /* 0/13 bit hole */ + /* 0/12 bit hole */ #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; @@ -423,6 +426,18 @@ struct sk_buff { #include +#define SKB_ALLOC_FCLONE 0x01 +#define SKB_ALLOC_RX 0x02 + +static inline bool skb_emergency(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETVM + return unlikely(skb->emergency); +#else + return false; +#endif +} + /* * skb might have a dst pointer attached, refcounted or not. * _skb_refdst low order bit is set if refcount was _not_ taken @@ -480,7 +495,7 @@ extern void kfree_skb(struct sk_buff *sk extern void consume_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); extern struct sk_buff *__alloc_skb(unsigned int size, - gfp_t priority, int fclone, int node); + gfp_t priority, int flags, int node); static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { @@ -490,7 +505,7 @@ static inline struct sk_buff *alloc_skb( static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { - return __alloc_skb(size, priority, 1, NUMA_NO_NODE); + return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } extern bool skb_recycle_check(struct sk_buff *skb, int skb_size); @@ -1511,7 +1526,8 @@ static inline void __skb_queue_purge(str static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { - struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); + struct sk_buff *skb = + __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1); if (likely(skb)) skb_reserve(skb, NET_SKB_PAD); return skb; @@ -1551,6 +1567,8 @@ static inline struct sk_buff *netdev_all return skb; } +extern struct mem_reserve net_skb_reserve; + /** * __netdev_alloc_page - allocate a page for ps-rx on a specific device * @dev: network device to receive on @@ -1562,7 +1580,8 @@ 
static inline struct sk_buff *netdev_all */ static inline struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) { - return alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0); + return alloc_pages_reserve(NUMA_NO_NODE, gfp_mask | __GFP_MEMALLOC, 0, + &net_skb_reserve, NULL); } /** @@ -1578,9 +1597,14 @@ static inline struct page *netdev_alloc_ return __netdev_alloc_page(dev, GFP_ATOMIC); } +static inline void __netdev_free_page(struct net_device *dev, struct page *page) +{ + free_pages_reserve(page, 0, &net_skb_reserve, page->reserve); +} + static inline void netdev_free_page(struct net_device *dev, struct page *page) { - __free_page(page); + __netdev_free_page(dev, page); } /** --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -168,14 +168,21 @@ static void skb_under_panic(struct sk_bu * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int fclone, int node) + int flags, int node) { struct kmem_cache *cache; struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; + int emergency = 0; + int memalloc = sk_memalloc_socks(); - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + size = SKB_DATA_ALIGN(size); + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (memalloc && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; /* Get the HEAD */ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); @@ -183,9 +190,8 @@ struct sk_buff *__alloc_skb(unsigned int goto out; prefetchw(skb); - size = SKB_DATA_ALIGN(size); - data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), - gfp_mask, node); + data = kmalloc_reserve(size + sizeof(struct skb_shared_info), + gfp_mask, node, &net_skb_reserve, &emergency); if (!data) goto nodata; prefetchw(data + size); @@ -196,6 +202,9 @@ struct sk_buff *__alloc_skb(unsigned int * the tail pointer in struct sk_buff! 
*/ memset(skb, 0, offsetof(struct sk_buff, tail)); +#ifdef CONFIG_NETVM + skb->emergency = emergency; +#endif skb->truesize = size + sizeof(struct sk_buff); atomic_set(&skb->users, 1); skb->head = data; @@ -211,7 +220,7 @@ struct sk_buff *__alloc_skb(unsigned int atomic_set(&shinfo->dataref, 1); kmemcheck_annotate_variable(shinfo->destructor_arg); - if (fclone) { + if (flags & SKB_ALLOC_FCLONE) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -221,6 +230,9 @@ struct sk_buff *__alloc_skb(unsigned int atomic_set(fclone_ref, 1); child->fclone = SKB_FCLONE_UNAVAILABLE; +#ifdef CONFIG_NETVM + child->emergency = skb->emergency; +#endif } out: return skb; @@ -249,7 +261,7 @@ struct sk_buff *__netdev_alloc_skb(struc { struct sk_buff *skb; - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; @@ -265,6 +277,27 @@ void skb_add_rx_frag(struct sk_buff *skb skb->len += size; skb->data_len += size; skb->truesize += size; + +#ifdef CONFIG_NETVM + /* + * In the rare case that skb_emergency() != page->reserve we'll + * skew the accounting slightly, but since it's only a 'small' constant + * shift it's OK. + */ + if (skb_emergency(skb)) { + /* + * We need to track fragment pages so that we properly + * release their reserve in skb_put_page(). + */ + atomic_set(&page->frag_count, 1); + } else if (unlikely(page->reserve)) { + /* + * Release the reserve now, because normal skbs don't + * do the emergency accounting. 
+ */ + mem_reserve_pages_charge(&net_skb_reserve, -1); + } +#endif } EXPORT_SYMBOL(skb_add_rx_frag); @@ -316,21 +349,38 @@ static void skb_clone_fraglist(struct sk skb_get(list); } +static void skb_get_page(struct sk_buff *skb, struct page *page) +{ + get_page(page); + if (skb_emergency(skb)) + atomic_inc(&page->frag_count); +} + +static void skb_put_page(struct sk_buff *skb, struct page *page) +{ + if (skb_emergency(skb) && atomic_dec_and_test(&page->frag_count)) + mem_reserve_pages_charge(&net_skb_reserve, -1); + put_page(page); +} + static void skb_release_data(struct sk_buff *skb) { if (!skb->cloned || !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_put_page(skb, + skb_shinfo(skb)->frags[i].page); + } } if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); - kfree(skb->head); + kfree_reserve(skb->head, &net_skb_reserve, skb_emergency(skb)); } } @@ -524,6 +574,9 @@ static void __copy_skb_header(struct sk_ #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) new->ipvs_property = old->ipvs_property; #endif +#ifdef CONFIG_NETVM + new->emergency = old->emergency; +#endif new->protocol = old->protocol; new->mark = old->mark; new->skb_iif = old->skb_iif; @@ -618,6 +671,9 @@ struct sk_buff *skb_clone(struct sk_buff n->fclone = SKB_FCLONE_CLONE; atomic_inc(fclone_ref); } else { + if (skb_emergency(skb)) + gfp_mask |= __GFP_MEMALLOC; + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; @@ -654,6 +710,14 @@ static void copy_skb_header(struct sk_bu skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +static inline int skb_alloc_rx_flag(const struct sk_buff *skb) +{ + if (skb_emergency(skb)) + return SKB_ALLOC_RX; + + return 0; +} + /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy @@ 
-675,7 +739,8 @@ struct sk_buff *skb_copy(const struct sk { int headerlen = skb_headroom(skb); unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) return NULL; @@ -709,7 +774,8 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { unsigned int size = skb_end_pointer(skb) - skb->head; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (!n) goto out; @@ -729,8 +795,9 @@ struct sk_buff *pskb_copy(struct sk_buff int i; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; - get_page(skb_shinfo(n)->frags[i].page); + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + skb_shinfo(n)->frags[i] = *frag; + skb_get_page(n, frag->page); } skb_shinfo(n)->nr_frags = i; } @@ -778,7 +845,11 @@ int pskb_expand_head(struct sk_buff *skb goto adjust_others; } - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (skb_emergency(skb)) + gfp_mask |= __GFP_MEMALLOC; + + data = kmalloc_reserve(size + sizeof(struct skb_shared_info), + gfp_mask, -1, &net_skb_reserve, NULL); if (!data) goto nodata; @@ -806,7 +877,7 @@ int pskb_expand_head(struct sk_buff *skb kfree(skb->head); } else { for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); + skb_get_page(skb, skb_shinfo(skb)->frags[i].page); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); @@ -889,8 +960,8 @@ struct sk_buff *skb_copy_expand(const st /* * Allocate the copy buffer */ - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); + struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), -1); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; 
int off; @@ -1083,7 +1154,7 @@ drop_pages: skb_shinfo(skb)->nr_frags = i; for (; i < nfrags; i++) - put_page(skb_shinfo(skb)->frags[i].page); + skb_put_page(skb, skb_shinfo(skb)->frags[i].page); if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); @@ -1252,7 +1323,7 @@ pull_pages: k = 0; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { if (skb_shinfo(skb)->frags[i].size <= eat) { - put_page(skb_shinfo(skb)->frags[i].page); + skb_put_page(skb, skb_shinfo(skb)->frags[i].page); eat -= skb_shinfo(skb)->frags[i].size; } else { skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; @@ -2034,6 +2105,7 @@ static inline void skb_split_no_header(s skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; if (pos < len) { + struct page *page = skb_shinfo(skb)->frags[i].page; /* Split frag. * We have two variants in this case: * 1. Move all the frag to the second @@ -2042,7 +2114,7 @@ static inline void skb_split_no_header(s * where splitting is expensive. * 2. Split is accurately. We make this. */ - get_page(skb_shinfo(skb)->frags[i].page); + skb_get_page(skb1, page); skb_shinfo(skb1)->frags[0].page_offset += len - pos; skb_shinfo(skb1)->frags[0].size -= len - pos; skb_shinfo(skb)->frags[i].size = len - pos; @@ -2540,8 +2612,9 @@ struct sk_buff *skb_segment(struct sk_bu skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { - nskb = alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC); + nskb = __alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC, skb_alloc_rx_flag(skb), + -1); if (unlikely(!nskb)) goto err; @@ -2587,7 +2660,7 @@ struct sk_buff *skb_segment(struct sk_bu while (pos < offset + len && i < nfrags) { *frag = skb_shinfo(skb)->frags[i]; - get_page(frag->page); + skb_get_page(nskb, frag->page); size = frag->size; if (pos < offset) {