From: Nick Piggin Subject: SLAB_DESTROY_BY_RCU for file slab Patch-mainline: not yet Use SLAB_DESTROY_BY_RCU for file slab cache. Ensure we have the correct object by using a spinlock to protect the refcount rather than having it atomic (problem with it being atomic is having to release the last ref on a file we have incorrectly picked up a reference to). This improves single threaded repeated open/close performance by 28% by avoiding the full RCU cycle. Signed-off-by: Nick Piggin --- drivers/net/ppp_generic.c | 4 +- drivers/scsi/osst.c | 2 - drivers/scsi/st.c | 2 - fs/aio.c | 4 +- fs/file_table.c | 70 +++++++++++++++++++++++----------------------- fs/open.c | 2 - include/linux/fs.h | 25 +++++++++++++--- kernel/perf_event.c | 4 +- net/sched/sch_atm.c | 4 +- net/unix/garbage.c | 2 - 10 files changed, 69 insertions(+), 50 deletions(-) --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -590,12 +590,12 @@ static long ppp_ioctl(struct file *file, if (file == ppp->owner) ppp_shutdown_interface(ppp); } - if (atomic_long_read(&file->f_count) <= 2) { + if (file->f_count <= 2) { ppp_release(NULL, file); err = 0; } else printk(KERN_DEBUG "PPPIOCDETACH file->f_count=%ld\n", - atomic_long_read(&file->f_count)); + file->f_count); unlock_kernel(); return err; } --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -4824,7 +4824,7 @@ static int os_scsi_tape_flush(struct fil struct osst_request * SRpnt = NULL; char * name = tape_name(STp); - if (file_count(filp) > 1) + if (filp->f_count > 1) return 0; if ((STps->rw == ST_WRITING || STp->dirty) && !STp->pos_unknown) { --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -1272,7 +1272,7 @@ static int st_flush(struct file *filp, f struct st_partstat *STps = &(STp->ps[STp->partition]); char *name = tape_name(STp); - if (file_count(filp) > 1) + if (filp->f_count > 1) return 0; if (STps->rw == ST_WRITING && !STp->pos_unknown) { --- a/fs/aio.c +++ b/fs/aio.c @@ -545,7 +545,7 @@ static void aio_fput_routine(struct work static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", - req, atomic_long_read(&req->ki_filp->f_count)); + req, req->ki_filp->f_count); assert_spin_locked(&ctx->ctx_lock); @@ -563,7 +563,7 @@ static int __aio_put_req(struct kioctx * * we would not be holding the last reference to the file*, so * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(file_dec_and_test(req->ki_filp))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); --- a/fs/file_table.c +++ b/fs/file_table.c @@ -40,19 +40,12 @@ static struct kmem_cache *filp_cachep __ static struct percpu_counter nr_files __cacheline_aligned_in_smp; -static inline void file_free_rcu(struct rcu_head *head) -{ - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - - put_cred(f->f_cred); - kmem_cache_free(filp_cachep, f); -} - static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); file_check_state(f); - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + put_cred(f->f_cred); + kmem_cache_free(filp_cachep, f); } /* @@ -127,7 +120,7 @@ struct file *get_empty_filp(void) goto fail_sec; INIT_LIST_HEAD(&f->f_u.fu_list); - atomic_long_set(&f->f_count, 1); + f->f_count = 1; rwlock_init(&f->f_owner.lock); f->f_cred = get_cred(cred); spin_lock_init(&f->f_lock); @@ -196,7 +189,7 @@ EXPORT_SYMBOL(alloc_file); void fput(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) + if (unlikely(file_dec_and_test(file))) __fput(file); } @@ -267,21 +260,38 @@ void __fput(struct file *file) mntput(mnt); } -struct file *fget(unsigned int fd) +static inline struct file *get_stable_file(struct files_struct *files, unsigned int fd) { + struct fdtable *fdt; struct file *file; - struct files_struct *files = current->files; rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; + fdt = files_fdtable(files); + if (likely(fd < fdt->max_fds)) { + file = rcu_dereference(fdt->fd[fd]); + if (file) { + spin_lock(&file->f_lock); + if (unlikely(file != fdt->fd[fd] || !file->f_count)) { + spin_unlock(&file->f_lock); + file = NULL; + goto out; + } + file->f_count++; + spin_unlock(&file->f_lock); } - } + } else + file = NULL; +out: rcu_read_unlock(); + return file; +} + +struct file *fget(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + file = get_stable_file(files, fd); return file; } @@ -300,20 +310,12 @@ struct file *fget_light(unsigned int fd, struct file *file; struct files_struct *files = current->files; - *fput_needed = 0; if (likely((atomic_read(&files->count) == 1))) { + *fput_needed = 0; file = fcheck_files(files, fd); } else { - rcu_read_lock(); - file = fcheck_files(files, fd); - if (file) { - if (atomic_long_inc_not_zero(&file->f_count)) - *fput_needed = 1; - else - /* Didn't get the reference, someone's freed */ - file = NULL; - } - rcu_read_unlock(); + *fput_needed = 1; + file = get_stable_file(files, fd); } return file; @@ -322,7 +324,7 @@ struct file *fget_light(unsigned int fd, void put_filp(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { + if (unlikely(file_dec_and_test(file))) { security_file_free(file); file_kill(file); file_free(file); @@ -388,7 +390,7 @@ retry: struct vfsmount *mnt; if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) continue; - if (!file_count(f)) + if (!f->f_count) continue; if (!(f->f_mode & FMODE_WRITE)) continue; @@ -414,7 +416,7 @@ void __init files_init(unsigned long mem int n; filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU | SLAB_PANIC, NULL); /* * One file with associated inode and dcache is very roughly 1K. --- a/fs/open.c +++ b/fs/open.c @@ -1114,7 +1114,7 @@ int filp_close(struct file *filp, fl_own { int retval = 0; - if (!file_count(filp)) { + if (unlikely(!filp->f_count)) { printk(KERN_ERR "VFS: Close: file count is 0\n"); return 0; } --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -920,9 +920,10 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ - atomic_long_t f_count; + /* f_lock protects f_count, f_ep_links, f_flags, no IRQ */ + spinlock_t f_lock; unsigned int f_flags; + long f_count; fmode_t f_mode; loff_t f_pos; struct fown_struct f_owner; @@ -949,8 +950,24 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); -#define get_file(x) atomic_long_inc(&(x)->f_count) -#define file_count(x) atomic_long_read(&(x)->f_count) +static inline void get_file(struct file *f) +{ + spin_lock(&f->f_lock); + f->f_count++; + spin_unlock(&f->f_lock); +} + +static inline int file_dec_and_test(struct file *f) +{ + int ret; + + spin_lock(&f->f_lock); + f->f_count--; + ret = (f->f_count == 0); + spin_unlock(&f->f_lock); + + return ret; +} #ifdef CONFIG_DEBUG_WRITECOUNT static inline void file_take_write(struct file *f) --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4617,7 +4617,7 @@ static int perf_event_set_output(struct if (event->data) goto out; - atomic_long_inc(&output_file->f_count); + get_file(output_file); set: mutex_lock(&event->mmap_mutex); @@ -4878,7 +4878,7 @@ inherit_event(struct perf_event *parent_ * we are in the parent and we know that the filp still * exists and has a nonzero count: */ - atomic_long_inc(&parent_event->filp->f_count); + get_file(parent_event->filp); /* * Link this into the parent event's child list --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -164,7 +164,7 @@ static void atm_tc_put(struct Qdisc *sch tcf_destroy_chain(&flow->filter_list); if (flow->sock) { pr_debug("atm_tc_put: f_count %ld\n", - file_count(flow->sock->file)); + flow->sock->file->f_count); flow->vcc->pop = flow->old_pop; sockfd_put(flow->sock); } @@ -260,7 +260,7 @@ static int atm_tc_change(struct Qdisc *s sock = sockfd_lookup(fd, &error); if (!sock) return error; /* f_count++ */ - pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); + pr_debug("atm_tc_change: f_count %ld\n", sock->file->f_count); if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { error = -EPROTOTYPE; goto err_out; --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -311,7 +311,7 @@ void unix_gc(void) long total_refs; long inflight_refs; - total_refs = file_count(u->sk.sk_socket->file); + total_refs = u->sk.sk_socket->file->f_count; inflight_refs = atomic_long_read(&u->inflight); BUG_ON(inflight_refs < 1);