diff options
author | Greg Banks <gnb@melbourne.sgi.com> | 2006-10-02 02:17:54 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-10-02 07:57:19 -0700 |
commit | 36bdfc8bae51339aa27ef8e4ce148185293061ae (patch) | |
tree | deab54ff70d6991c1e5be0d9efe97d10f65375b0 | |
parent | 4a3ae42dc312dbdffee803efaf393421b79f997a (diff) | |
download | lwn-36bdfc8bae51339aa27ef8e4ce148185293061ae.tar.gz lwn-36bdfc8bae51339aa27ef8e4ce148185293061ae.zip |
[PATCH] knfsd: move tempsock aging to a timer
Following are 11 patches from Greg Banks which combine to make knfsd more
NUMA-aware. They reduce hitting on 'global' data structures, and create some
data-structures that can be node-local.
knfsd threads are bound to a particular node, and the thread to handle a new
request is chosen from the threads that are attached to the node that received
the interrupt.
The distribution of threads across nodes can be controlled by a new file in
the 'nfsd' filesystem, though the default approach of an even spread is
probably fine for most sites.
Some (old) numbers that show the efficacy of these patches: N == number of
NICs == number of CPUs == number of clients. Number of NUMA nodes == N/2
N     Throughput, MiB/s     CPU usage, % (max=N*100)
      Before   After        Before   After
---   ------   -----        ------   -----
4     312      435          350      228
6     500      656          501      418
8     562      804          690      589
This patch:
Move the aging of RPC/TCP connection sockets from the main svc_recv() loop to
a timer which uses a mark-and-sweep algorithm every 6 minutes. This reduces
the amount of work that needs to be done in the main RPC loop and the length
of time we need to hold the (effectively global) svc_serv->sv_lock.
[akpm@osdl.org: cleanup]
Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | include/linux/sunrpc/svc.h | 1 | ||||
-rw-r--r-- | include/linux/sunrpc/svcsock.h | 2 | ||||
-rw-r--r-- | net/sunrpc/svc.c | 3 | ||||
-rw-r--r-- | net/sunrpc/svcsock.c | 96 |
4 files changed, 76 insertions, 26 deletions
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index cb341f96eb8d..5eabded11061 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -40,6 +40,7 @@ struct svc_serv { struct list_head sv_permsocks; /* all permanent sockets */ struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ + struct timer_list sv_temptimer; /* timer for aging temporary sockets */ char * sv_name; /* service name */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index d5f15e8db929..846aee95eec7 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -31,6 +31,8 @@ struct svc_sock { #define SK_DEAD 6 /* socket closed */ #define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ #define SK_DEFERRED 8 /* request on sk_deferred */ +#define SK_OLD 9 /* used for temp socket aging mark+sweep */ +#define SK_DETACHED 10 /* detached from tempsocks list */ int sk_reserved; /* space on outq that is reserved */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index eee45a58f3ee..0c2c52276285 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -59,6 +59,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize, INIT_LIST_HEAD(&serv->sv_sockets); INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); + init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); /* Remove any stale portmap registrations */ @@ -87,6 +88,8 @@ svc_destroy(struct svc_serv *serv) } else printk("svc_destroy: no threads for serv=%p!\n", serv); + del_timer_sync(&serv->sv_temptimer); + while (!list_empty(&serv->sv_tempsocks)) { svsk = list_entry(serv->sv_tempsocks.next, struct svc_sock, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index bc9bd189a540..9ba1a071ff06 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -74,6 +74,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int 
svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + /* * Queue up an idle server thread. Must have serv->sv_lock held. * Note: this is really a stack rather than a queue, so that we only @@ -1220,24 +1227,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) return -EINTR; spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, sk_list); - /* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ - if (get_seconds() - svsk->sk_lastrecv < 6*60 - || test_bit(SK_BUSY, &svsk->sk_flags)) - svsk = NULL; - } - if (svsk) { - set_bit(SK_BUSY, &svsk->sk_flags); - set_bit(SK_CLOSE, &svsk->sk_flags); - rqstp->rq_sock = svsk; - svsk->sk_inuse++; - } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + if ((svsk = svc_sock_dequeue(serv)) != NULL) { rqstp->rq_sock = svsk; svsk->sk_inuse++; rqstp->rq_reserved = serv->sv_bufsz; @@ -1282,13 +1272,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - if (test_bit(SK_TEMP, &svsk->sk_flags)) { - /* push active sockets to end of list */ - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&svsk->sk_list)) - list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); - spin_unlock_bh(&serv->sv_lock); - } + clear_bit(SK_OLD, &svsk->sk_flags); rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; rqstp->rq_chandle.defer = svc_defer; @@ -1348,6 +1332,58 @@ svc_send(struct svc_rqst *rqstp) } /* + * Timer function to close old temporary sockets, using + * a mark-and-sweep algorithm. 
+ */ +static void +svc_age_temp_sockets(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_sock *svsk; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_sockets\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_sockets: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + svsk = list_entry(le, struct svc_sock, sk_list); + + if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + continue; + if (svsk->sk_inuse || test_bit(SK_BUSY, &svsk->sk_flags)) + continue; + svsk->sk_inuse++; + list_move(le, &to_be_aged); + set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(SK_DETACHED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + list_del_init(le); + svsk = list_entry(le, struct svc_sock, sk_list); + + dprintk("queuing svsk %p for closing, %lu seconds old\n", + svsk, get_seconds() - svsk->sk_lastrecv); + + /* a thread will dequeue and close it soon */ + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. 
*/ @@ -1400,6 +1436,13 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, set_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp sockets */ + setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } } else { clear_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_permsocks); @@ -1513,7 +1556,8 @@ svc_delete_socket(struct svc_sock *svsk) spin_lock_bh(&serv->sv_lock); - list_del_init(&svsk->sk_list); + if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) + list_del_init(&svsk->sk_list); list_del_init(&svsk->sk_ready); if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) if (test_bit(SK_TEMP, &svsk->sk_flags)) |