summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorSteven Whitehouse <swhiteho@redhat.com>2006-04-21 12:52:36 -0400
committerSteven Whitehouse <swhiteho@redhat.com>2006-04-21 12:52:36 -0400
commita748422ee45725e04e1d3792fa19dfa90ddfd116 (patch)
tree978e12895468baaa9f7ab2747b9f7d50beaf1717 /fs
parentc63e31c2cc1ec67372920b5e1aff8204d04dd172 (diff)
parentf4ffaa452e71495a06376f12f772342bc57051fc (diff)
downloadlwn-a748422ee45725e04e1d3792fa19dfa90ddfd116.tar.gz
lwn-a748422ee45725e04e1d3792fa19dfa90ddfd116.zip
Merge branch 'master'
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_super.c13
-rw-r--r--fs/Kconfig9
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c46
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/fifo.c65
-rw-r--r--fs/fuse/dev.c286
-rw-r--r--fs/fuse/dir.c118
-rw-r--r--fs/fuse/file.c66
-rw-r--r--fs/fuse/fuse_i.h72
-rw-r--r--fs/fuse/inode.c158
-rw-r--r--fs/inotify.c2
-rw-r--r--fs/lockd/svclock.c2
-rw-r--r--fs/locks.c9
-rw-r--r--fs/namespace.c7
-rw-r--r--fs/nfs/dir.c5
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/inode.c5
-rw-r--r--fs/nfs/nfs4proc.c10
-rw-r--r--fs/nfsd/auth.c46
-rw-r--r--fs/nfsd/export.c3
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs4acl.c8
-rw-r--r--fs/nfsd/nfs4callback.c6
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c150
-rw-r--r--fs/nfsd/nfs4xdr.c62
-rw-r--r--fs/nfsd/nfsproc.c2
-rw-r--r--fs/nfsd/vfs.c6
-rw-r--r--fs/ocfs2/cluster/heartbeat.c40
-rw-r--r--fs/ocfs2/dlm/userdlm.c74
-rw-r--r--fs/ocfs2/file.c19
-rw-r--r--fs/open.c24
-rw-r--r--fs/partitions/check.c43
-rw-r--r--fs/pipe.c317
-rw-r--r--fs/proc/base.c21
-rw-r--r--fs/proc/vmcore.c4
-rw-r--r--fs/read_write.c2
-rw-r--r--fs/select.c30
-rw-r--r--fs/splice.c866
-rw-r--r--fs/sync.c4
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/file.c76
-rw-r--r--fs/sysfs/sysfs.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c18
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c20
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c3
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c18
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h16
-rw-r--r--fs/xfs/xfs_ialloc.c15
-rw-r--r--fs/xfs/xfs_iget.c29
-rw-r--r--fs/xfs/xfs_inode.c27
-rw-r--r--fs/xfs/xfs_inode.h1
-rw-r--r--fs/xfs/xfs_mount.c2
58 files changed, 1758 insertions, 1106 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index b0a0ae509c00..61c599b4a1e3 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type
if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
dprintk(DEBUG_ERROR, "problem initiating session\n");
- kfree(v9ses);
- return ERR_PTR(newfid);
+ sb = ERR_PTR(newfid);
+ goto out_free_session;
}
sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-
+ if (IS_ERR(sb))
+ goto out_close_session;
v9fs_fill_super(sb, v9ses, flags);
inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type
return sb;
+out_close_session:
+ v9fs_session_close(v9ses);
+out_free_session:
+ kfree(v9ses);
+ return sb;
+
put_back_sb:
/* deactivate_super calls v9fs_kill_super which will frees the rest */
up_write(&sb->s_umount);
diff --git a/fs/Kconfig b/fs/Kconfig
index 62ee097776f0..563a59e5e694 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -800,6 +800,7 @@ config PROC_KCORE
config PROC_VMCORE
bool "/proc/vmcore support (EXPERIMENTAL)"
depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+ default y
help
Exports the dump image of crashed kernel in ELF format.
@@ -842,6 +843,12 @@ config TMPFS
config HUGETLBFS
bool "HugeTLB file system support"
depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
+ help
+ hugetlbfs is a filesystem backing for HugeTLB pages, based on
+ ramfs. For architectures that support it, say Y here and read
+ <file:Documentation/vm/hugetlbpage.txt> for details.
+
+ If unsure, say N.
config HUGETLB_PAGE
def_bool HUGETLBFS
@@ -862,7 +869,7 @@ config RAMFS
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+ depends on SYSFS && EXPERIMENTAL
help
configfs is a ram-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8ed9b06a9828..5638c8f9362f 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group)
int ret = 0;
int i;
- if (group && group->default_groups) {
+ if (group->default_groups) {
/* FYI, we're faking mkdir here
* I'm not sure we need this semaphore, as we're called
* from our parent's mkdir. That holds our parent's
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 242fe1a66ce5..1b4491cdd115 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
- epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+ epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
@@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
break;
case EPOLL_CTL_MOD:
if (epi) {
- epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+ epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
} else
error = -ENOENT;
diff --git a/fs/exec.c b/fs/exec.c
index 0291a68a3626..3a79d97ac234 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk)
* and to assume its PID:
*/
if (!thread_group_leader(current)) {
- struct task_struct *parent;
struct dentry *proc_dentry1, *proc_dentry2;
- unsigned long ptrace;
/*
* Wait for the thread group leader to be a zombie.
@@ -678,6 +676,18 @@ static int de_thread(struct task_struct *tsk)
while (leader->exit_state != EXIT_ZOMBIE)
yield();
+ /*
+ * The only record we have of the real-time age of a
+ * process, regardless of execs it's done, is start_time.
+ * All the past CPU time is accumulated in signal_struct
+ * from sister threads now dead. But in this non-leader
+ * exec, nothing survives from the original leader thread,
+ * whose birth marks the true age of this process now.
+ * When we take on its identity by switching to its PID, we
+ * also take its birthdate (always earlier than our own).
+ */
+ current->start_time = leader->start_time;
+
spin_lock(&leader->proc_lock);
spin_lock(&current->proc_lock);
proc_dentry1 = proc_pid_unhash(current);
@@ -692,22 +702,6 @@ static int de_thread(struct task_struct *tsk)
* two threads with a switched PID, and release
* the former thread group leader:
*/
- ptrace = leader->ptrace;
- parent = leader->parent;
- if (unlikely(ptrace) && unlikely(parent == current)) {
- /*
- * Joker was ptracing his own group leader,
- * and now he wants to be his own parent!
- * We can't have that.
- */
- ptrace = 0;
- }
-
- ptrace_unlink(current);
- ptrace_unlink(leader);
- remove_parent(current);
- remove_parent(leader);
-
/* Become a process group leader with the old leader's pid.
* Note: The old leader also uses thispid until release_task
@@ -718,19 +712,15 @@ static int de_thread(struct task_struct *tsk)
attach_pid(current, PIDTYPE_PID, current->pid);
attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
attach_pid(current, PIDTYPE_SID, current->signal->session);
- list_add_tail(&current->tasks, &init_task.tasks);
+ list_add_tail_rcu(&current->tasks, &init_task.tasks);
- current->parent = current->real_parent = leader->real_parent;
- leader->parent = leader->real_parent = child_reaper;
current->group_leader = current;
- leader->group_leader = leader;
+ leader->group_leader = current;
- add_parent(current);
- add_parent(leader);
- if (ptrace) {
- current->ptrace = ptrace;
- __ptrace_link(current, parent);
- }
+ /* Reduce leader to a thread */
+ detach_pid(leader, PIDTYPE_PGID);
+ detach_pid(leader, PIDTYPE_SID);
+ list_del_init(&leader->tasks);
current->exit_signal = SIGCHLD;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 1041dab6de2f..c5ffa8523968 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
if (input->group != sbi->s_groups_count) {
ext3_warning(sb, __FUNCTION__,
"multiple resizers run on filesystem!");
+ unlock_super(sb);
err = -EBUSY;
goto exit_journal;
}
@@ -974,6 +975,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
ext3_warning(sb, __FUNCTION__,
"multiple resizers run on filesystem!");
+ unlock_super(sb);
err = -EBUSY;
goto exit_put;
}
diff --git a/fs/fifo.c b/fs/fifo.c
index 889f722ee36d..49035b174b48 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -15,30 +15,35 @@
#include <linux/fs.h>
#include <linux/pipe_fs_i.h>
-static void wait_for_partner(struct inode* inode, unsigned int* cnt)
+static void wait_for_partner(struct inode* inode, unsigned int *cnt)
{
int cur = *cnt;
- while(cur == *cnt) {
- pipe_wait(inode);
- if(signal_pending(current))
+
+ while (cur == *cnt) {
+ pipe_wait(inode->i_pipe);
+ if (signal_pending(current))
break;
}
}
static void wake_up_partner(struct inode* inode)
{
- wake_up_interruptible(PIPE_WAIT(*inode));
+ wake_up_interruptible(&inode->i_pipe->wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
{
+ struct pipe_inode_info *pipe;
int ret;
- mutex_lock(PIPE_MUTEX(*inode));
- if (!inode->i_pipe) {
+ mutex_lock(&inode->i_mutex);
+ pipe = inode->i_pipe;
+ if (!pipe) {
ret = -ENOMEM;
- if(!pipe_new(inode))
+ pipe = alloc_pipe_info(inode);
+ if (!pipe)
goto err_nocleanup;
+ inode->i_pipe = pipe;
}
filp->f_version = 0;
@@ -53,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp)
* opened, even when there is no process writing the FIFO.
*/
filp->f_op = &read_fifo_fops;
- PIPE_RCOUNTER(*inode)++;
- if (PIPE_READERS(*inode)++ == 0)
+ pipe->r_counter++;
+ if (pipe->readers++ == 0)
wake_up_partner(inode);
- if (!PIPE_WRITERS(*inode)) {
+ if (!pipe->writers) {
if ((filp->f_flags & O_NONBLOCK)) {
/* suppress POLLHUP until we have
* seen a writer */
- filp->f_version = PIPE_WCOUNTER(*inode);
+ filp->f_version = pipe->w_counter;
} else
{
- wait_for_partner(inode, &PIPE_WCOUNTER(*inode));
+ wait_for_partner(inode, &pipe->w_counter);
if(signal_pending(current))
goto err_rd;
}
@@ -78,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp)
* errno=ENXIO when there is no process reading the FIFO.
*/
ret = -ENXIO;
- if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode))
+ if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
goto err;
filp->f_op = &write_fifo_fops;
- PIPE_WCOUNTER(*inode)++;
- if (!PIPE_WRITERS(*inode)++)
+ pipe->w_counter++;
+ if (!pipe->writers++)
wake_up_partner(inode);
- if (!PIPE_READERS(*inode)) {
- wait_for_partner(inode, &PIPE_RCOUNTER(*inode));
+ if (!pipe->readers) {
+ wait_for_partner(inode, &pipe->r_counter);
if (signal_pending(current))
goto err_wr;
}
@@ -102,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp)
*/
filp->f_op = &rdwr_fifo_fops;
- PIPE_READERS(*inode)++;
- PIPE_WRITERS(*inode)++;
- PIPE_RCOUNTER(*inode)++;
- PIPE_WCOUNTER(*inode)++;
- if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1)
+ pipe->readers++;
+ pipe->writers++;
+ pipe->r_counter++;
+ pipe->w_counter++;
+ if (pipe->readers == 1 || pipe->writers == 1)
wake_up_partner(inode);
break;
@@ -116,27 +121,27 @@ static int fifo_open(struct inode *inode, struct file *filp)
}
/* Ok! */
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
return 0;
err_rd:
- if (!--PIPE_READERS(*inode))
- wake_up_interruptible(PIPE_WAIT(*inode));
+ if (!--pipe->readers)
+ wake_up_interruptible(&pipe->wait);
ret = -ERESTARTSYS;
goto err;
err_wr:
- if (!--PIPE_WRITERS(*inode))
- wake_up_interruptible(PIPE_WAIT(*inode));
+ if (!--pipe->writers)
+ wake_up_interruptible(&pipe->wait);
ret = -ERESTARTSYS;
goto err;
err:
- if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode))
+ if (!pipe->readers && !pipe->writers)
free_pipe_info(inode);
err_nocleanup:
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 23d1f52eb1b8..cc750c68fe70 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep;
static struct fuse_conn *fuse_get_conn(struct file *file)
{
- struct fuse_conn *fc;
- spin_lock(&fuse_lock);
- fc = file->private_data;
- if (fc && !fc->connected)
- fc = NULL;
- spin_unlock(&fuse_lock);
- return fc;
+ /*
+ * Lockless access is OK, because file->private data is set
+ * once during mount and is valid until the file is released.
+ */
+ return file->private_data;
}
static void fuse_request_init(struct fuse_req *req)
@@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset)
*/
void fuse_reset_request(struct fuse_req *req)
{
- int preallocated = req->preallocated;
BUG_ON(atomic_read(&req->count) != 1);
fuse_request_init(req);
- req->preallocated = preallocated;
}
static void __fuse_get_request(struct fuse_req *req)
@@ -92,80 +88,54 @@ static void __fuse_put_request(struct fuse_req *req)
atomic_dec(&req->count);
}
-static struct fuse_req *do_get_request(struct fuse_conn *fc)
+struct fuse_req *fuse_get_req(struct fuse_conn *fc)
{
struct fuse_req *req;
-
- spin_lock(&fuse_lock);
- BUG_ON(list_empty(&fc->unused_list));
- req = list_entry(fc->unused_list.next, struct fuse_req, list);
- list_del_init(&req->list);
- spin_unlock(&fuse_lock);
- fuse_request_init(req);
- req->preallocated = 1;
- req->in.h.uid = current->fsuid;
- req->in.h.gid = current->fsgid;
- req->in.h.pid = current->pid;
- return req;
-}
-
-/* This can return NULL, but only in case it's interrupted by a SIGKILL */
-struct fuse_req *fuse_get_request(struct fuse_conn *fc)
-{
- int intr;
sigset_t oldset;
+ int intr;
+ int err;
atomic_inc(&fc->num_waiting);
block_sigs(&oldset);
- intr = down_interruptible(&fc->outstanding_sem);
+ intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
restore_sigs(&oldset);
- if (intr) {
- atomic_dec(&fc->num_waiting);
- return NULL;
- }
- return do_get_request(fc);
-}
+ err = -EINTR;
+ if (intr)
+ goto out;
-/* Must be called with fuse_lock held */
-static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
-{
- if (req->preallocated) {
- atomic_dec(&fc->num_waiting);
- list_add(&req->list, &fc->unused_list);
- } else
- fuse_request_free(req);
+ req = fuse_request_alloc();
+ err = -ENOMEM;
+ if (!req)
+ goto out;
- /* If we are in debt decrease that first */
- if (fc->outstanding_debt)
- fc->outstanding_debt--;
- else
- up(&fc->outstanding_sem);
+ req->in.h.uid = current->fsuid;
+ req->in.h.gid = current->fsgid;
+ req->in.h.pid = current->pid;
+ req->waiting = 1;
+ return req;
+
+ out:
+ atomic_dec(&fc->num_waiting);
+ return ERR_PTR(err);
}
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
if (atomic_dec_and_test(&req->count)) {
- spin_lock(&fuse_lock);
- fuse_putback_request(fc, req);
- spin_unlock(&fuse_lock);
+ if (req->waiting)
+ atomic_dec(&fc->num_waiting);
+ fuse_request_free(req);
}
}
-static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req)
-{
- if (atomic_dec_and_test(&req->count))
- fuse_putback_request(fc, req);
-}
-
-void fuse_release_background(struct fuse_req *req)
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
{
- iput(req->inode);
- iput(req->inode2);
- if (req->file)
- fput(req->file);
- spin_lock(&fuse_lock);
- list_del(&req->bg_entry);
- spin_unlock(&fuse_lock);
+ list_del_init(&req->bg_entry);
+ if (fc->num_background == FUSE_MAX_BACKGROUND) {
+ fc->blocked = 0;
+ wake_up_all(&fc->blocked_waitq);
+ }
+ fc->num_background--;
}
/*
@@ -184,28 +154,38 @@ void fuse_release_background(struct fuse_req *req)
* interrupted and put in the background, it will return with an error
* and hence never be reset and reused.
*
- * Called with fuse_lock, unlocks it
+ * Called with fc->lock, unlocks it
*/
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
{
list_del(&req->list);
req->state = FUSE_REQ_FINISHED;
if (!req->background) {
+ spin_unlock(&fc->lock);
wake_up(&req->waitq);
- fuse_put_request_locked(fc, req);
- spin_unlock(&fuse_lock);
+ fuse_put_request(fc, req);
} else {
+ struct inode *inode = req->inode;
+ struct inode *inode2 = req->inode2;
+ struct file *file = req->file;
void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
req->end = NULL;
- spin_unlock(&fuse_lock);
- down_read(&fc->sbput_sem);
- if (fc->mounted)
- fuse_release_background(req);
- up_read(&fc->sbput_sem);
+ req->inode = NULL;
+ req->inode2 = NULL;
+ req->file = NULL;
+ if (!list_empty(&req->bg_entry))
+ fuse_remove_background(fc, req);
+ spin_unlock(&fc->lock);
+
if (end)
end(fc, req);
else
fuse_put_request(fc, req);
+
+ if (file)
+ fput(file);
+ iput(inode);
+ iput(inode2);
}
}
@@ -242,6 +222,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
{
req->background = 1;
list_add(&req->bg_entry, &fc->background);
+ fc->num_background++;
+ if (fc->num_background == FUSE_MAX_BACKGROUND)
+ fc->blocked = 1;
if (req->inode)
req->inode = igrab(req->inode);
if (req->inode2)
@@ -250,16 +233,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
get_file(req->file);
}
-/* Called with fuse_lock held. Releases, and then reacquires it. */
+/* Called with fc->lock held. Releases, and then reacquires it. */
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
{
sigset_t oldset;
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
block_sigs(&oldset);
wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
restore_sigs(&oldset);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
return;
@@ -273,9 +256,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
locked state, there mustn't be any filesystem
operation (e.g. page fault), since that could lead
to deadlock */
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
wait_event(req->waitq, !req->locked);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
}
if (req->state == FUSE_REQ_PENDING) {
list_del(&req->list);
@@ -304,19 +287,14 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
req->in.h.unique = fc->reqctr;
req->in.h.len = sizeof(struct fuse_in_header) +
len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
- if (!req->preallocated) {
- /* If request is not preallocated (either FORGET or
- RELEASE), then still decrease outstanding_sem, so
- user can't open infinite number of files while not
- processing the RELEASE requests. However for
- efficiency do it without blocking, so if down()
- would block, just increase the debt instead */
- if (down_trylock(&fc->outstanding_sem))
- fc->outstanding_debt++;
- }
list_add_tail(&req->list, &fc->pending);
req->state = FUSE_REQ_PENDING;
+ if (!req->waiting) {
+ req->waiting = 1;
+ atomic_inc(&fc->num_waiting);
+ }
wake_up(&fc->waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
}
/*
@@ -325,7 +303,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
void request_send(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
if (!fc->connected)
req->out.h.error = -ENOTCONN;
else if (fc->conn_error)
@@ -338,15 +316,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
request_wait_answer(fc, req);
}
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
}
static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
+ background_request(fc, req);
if (fc->connected) {
queue_request(fc, req);
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
} else {
req->out.h.error = -ENOTCONN;
request_end(fc, req);
@@ -362,9 +341,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
req->isreply = 1;
- spin_lock(&fuse_lock);
- background_request(fc, req);
- spin_unlock(&fuse_lock);
request_send_nowait(fc, req);
}
@@ -373,16 +349,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
* anything that could cause a page-fault. If the request was already
* interrupted bail out.
*/
-static int lock_request(struct fuse_req *req)
+static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
{
int err = 0;
if (req) {
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
if (req->interrupted)
err = -ENOENT;
else
req->locked = 1;
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
}
return err;
}
@@ -392,18 +368,19 @@ static int lock_request(struct fuse_req *req)
* requester thread is currently waiting for it to be unlocked, so
* wake it up.
*/
-static void unlock_request(struct fuse_req *req)
+static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
{
if (req) {
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
req->locked = 0;
if (req->interrupted)
wake_up(&req->waitq);
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
}
}
struct fuse_copy_state {
+ struct fuse_conn *fc;
int write;
struct fuse_req *req;
const struct iovec *iov;
@@ -416,11 +393,12 @@ struct fuse_copy_state {
unsigned len;
};
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
- struct fuse_req *req, const struct iovec *iov,
- unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
+ int write, struct fuse_req *req,
+ const struct iovec *iov, unsigned long nr_segs)
{
memset(cs, 0, sizeof(*cs));
+ cs->fc = fc;
cs->write = write;
cs->req = req;
cs->iov = iov;
@@ -450,7 +428,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
unsigned long offset;
int err;
- unlock_request(cs->req);
+ unlock_request(cs->fc, cs->req);
fuse_copy_finish(cs);
if (!cs->seglen) {
BUG_ON(!cs->nr_segs);
@@ -473,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->seglen -= cs->len;
cs->addr += cs->len;
- return lock_request(cs->req);
+ return lock_request(cs->fc, cs->req);
}
/* Do as much copy to/from userspace buffer as we can */
@@ -585,9 +563,9 @@ static void request_wait(struct fuse_conn *fc)
if (signal_pending(current))
break;
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
schedule();
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
}
set_current_state(TASK_RUNNING);
remove_wait_queue(&fc->waitq, &wait);
@@ -606,18 +584,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *off)
{
int err;
- struct fuse_conn *fc;
struct fuse_req *req;
struct fuse_in *in;
struct fuse_copy_state cs;
unsigned reqsize;
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return -EPERM;
restart:
- spin_lock(&fuse_lock);
- fc = file->private_data;
- err = -EPERM;
- if (!fc)
+ spin_lock(&fc->lock);
+ err = -EAGAIN;
+ if ((file->f_flags & O_NONBLOCK) && fc->connected &&
+ list_empty(&fc->pending))
goto err_unlock;
+
request_wait(fc);
err = -ENODEV;
if (!fc->connected)
@@ -641,14 +622,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
request_end(fc, req);
goto restart;
}
- spin_unlock(&fuse_lock);
- fuse_copy_init(&cs, 1, req, iov, nr_segs);
+ spin_unlock(&fc->lock);
+ fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
if (!err)
err = fuse_copy_args(&cs, in->numargs, in->argpages,
(struct fuse_arg *) in->args, 0);
fuse_copy_finish(&cs);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
req->locked = 0;
if (!err && req->interrupted)
err = -ENOENT;
@@ -663,12 +644,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
else {
req->state = FUSE_REQ_SENT;
list_move_tail(&req->list, &fc->processing);
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
}
return reqsize;
err_unlock:
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
return err;
}
@@ -735,9 +716,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
struct fuse_copy_state cs;
struct fuse_conn *fc = fuse_get_conn(file);
if (!fc)
- return -ENODEV;
+ return -EPERM;
- fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+ fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
@@ -749,7 +730,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
oh.len != nbytes)
goto err_finish;
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
err = -ENOENT;
if (!fc->connected)
goto err_unlock;
@@ -760,9 +741,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
goto err_unlock;
if (req->interrupted) {
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
fuse_copy_finish(&cs);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
request_end(fc, req);
return -ENOENT;
}
@@ -770,12 +751,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
req->out.h = oh;
req->locked = 1;
cs.req = req;
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
err = copy_out_args(&cs, &req->out, nbytes);
fuse_copy_finish(&cs);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
req->locked = 0;
if (!err) {
if (req->interrupted)
@@ -787,7 +768,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
return err ? err : nbytes;
err_unlock:
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
err_finish:
fuse_copy_finish(&cs);
return err;
@@ -804,18 +785,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
- struct fuse_conn *fc = fuse_get_conn(file);
unsigned mask = POLLOUT | POLLWRNORM;
-
+ struct fuse_conn *fc = fuse_get_conn(file);
if (!fc)
- return -ENODEV;
+ return POLLERR;
poll_wait(file, &fc->waitq, wait);
- spin_lock(&fuse_lock);
- if (!list_empty(&fc->pending))
- mask |= POLLIN | POLLRDNORM;
- spin_unlock(&fuse_lock);
+ spin_lock(&fc->lock);
+ if (!fc->connected)
+ mask = POLLERR;
+ else if (!list_empty(&fc->pending))
+ mask |= POLLIN | POLLRDNORM;
+ spin_unlock(&fc->lock);
return mask;
}
@@ -823,7 +805,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
/*
* Abort all requests on the given list (pending or processing)
*
- * This function releases and reacquires fuse_lock
+ * This function releases and reacquires fc->lock
*/
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
@@ -832,7 +814,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
req = list_entry(head->next, struct fuse_req, list);
req->out.h.error = -ECONNABORTED;
request_end(fc, req);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
}
}
@@ -863,10 +845,10 @@ static void end_io_requests(struct fuse_conn *fc)
req->end = NULL;
/* The end function will consume this reference */
__fuse_get_request(req);
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
wait_event(req->waitq, !req->locked);
end(fc, req);
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
}
}
}
@@ -893,35 +875,44 @@ static void end_io_requests(struct fuse_conn *fc)
*/
void fuse_abort_conn(struct fuse_conn *fc)
{
- spin_lock(&fuse_lock);
+ spin_lock(&fc->lock);
if (fc->connected) {
fc->connected = 0;
end_io_requests(fc);
end_requests(fc, &fc->pending);
end_requests(fc, &fc->processing);
wake_up_all(&fc->waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
}
- spin_unlock(&fuse_lock);
+ spin_unlock(&fc->lock);
}
static int fuse_dev_release(struct inode *inode, struct file *file)
{
- struct fuse_conn *fc;
-
- spin_lock(&fuse_lock);
- fc = file->private_data;
+ struct fuse_conn *fc = fuse_get_conn(file);
if (fc) {
+ spin_lock(&fc->lock);
fc->connected = 0;
end_requests(fc, &fc->pending);
end_requests(fc, &fc->processing);
- }
- spin_unlock(&fuse_lock);
- if (fc)
+ spin_unlock(&fc->lock);
+ fasync_helper(-1, file, 0, &fc->fasync);
kobject_put(&fc->kobj);
+ }
return 0;
}
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return -EPERM;
+
+ /* No locking - fasync_helper does its own locking */
+ return fasync_helper(fd, file, on, &fc->fasync);
+}
+
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.llseek = no_llseek,
@@ -931,6 +922,7 @@ const struct file_operations fuse_dev_operations = {
.writev = fuse_dev_writev,
.poll = fuse_dev_poll,
.release = fuse_dev_release,
+ .fasync = fuse_dev_fasync,
};
static struct miscdevice fuse_miscdevice = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 256355b80256..8d7546e832e8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
return 0;
fc = get_fuse_conn(inode);
- req = fuse_get_request(fc);
- if (!req)
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
return 0;
fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
@@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (entry->d_name.len > FUSE_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- req = fuse_get_request(fc);
- if (!req)
- return ERR_PTR(-EINTR);
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
fuse_lookup_init(req, dir, entry, &outarg);
request_send(fc, req);
@@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
struct file *file;
int flags = nd->intent.open.flags - 1;
- err = -ENOSYS;
if (fc->no_create)
- goto out;
+ return -ENOSYS;
- err = -EINTR;
- req = fuse_get_request(fc);
- if (!req)
- goto out;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ err = -ENOMEM;
ff = fuse_file_alloc();
if (!ff)
goto out_put_request;
@@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
fuse_file_free(ff);
out_put_request:
fuse_put_request(fc, req);
- out:
return err;
}
@@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
{
struct fuse_mknod_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.mode = mode;
@@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
{
struct fuse_mkdir_in inarg;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.mode = mode;
@@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
{
struct fuse_conn *fc = get_fuse_conn(dir);
unsigned len = strlen(link) + 1;
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->in.h.opcode = FUSE_SYMLINK;
req->in.numargs = 2;
@@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->in.h.opcode = FUSE_UNLINK;
req->in.h.nodeid = get_node_id(dir);
@@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
{
int err;
struct fuse_conn *fc = get_fuse_conn(dir);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->in.h.opcode = FUSE_RMDIR;
req->in.h.nodeid = get_node_id(dir);
@@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
int err;
struct fuse_rename_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.newdir = get_node_id(newdir);
@@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_link_in inarg;
struct inode *inode = entry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.oldnodeid = get_node_id(inode);
@@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode)
int err;
struct fuse_attr_out arg;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ struct fuse_req *req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->in.h.opcode = FUSE_GETATTR;
req->in.h.nodeid = get_node_id(inode);
@@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask)
if (fc->no_access)
return 0;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.mask = mask;
@@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
if (is_bad_inode(inode))
return -EIO;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
page = alloc_page(GFP_KERNEL);
if (!page) {
@@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_req *req = fuse_get_request(fc);
+ struct fuse_req *req = fuse_get_req(fc);
char *link;
- if (!req)
- return ERR_PTR(-EINTR);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
link = (char *) __get_free_page(GFP_KERNEL);
if (!link) {
@@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
}
}
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
iattr_to_fattr(attr, &inarg);
@@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
if (fc->no_setxattr)
return -EOPNOTSUPP;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
@@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
if (fc->no_getxattr)
return -EOPNOTSUPP;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
@@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
if (fc->no_listxattr)
return -EOPNOTSUPP;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.size = size;
@@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
if (fc->no_removexattr)
return -EOPNOTSUPP;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->in.h.opcode = FUSE_REMOVEXATTR;
req->in.h.nodeid = get_node_id(inode);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 975f2697e866..fc342cf7c2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
struct fuse_req *req;
int err;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -184,9 +184,9 @@ static int fuse_flush(struct file *file)
if (fc->no_flush)
return 0;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
@@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
return 0;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
@@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page)
if (is_bad_inode(inode))
goto out;
- err = -EINTR;
- req = fuse_get_request(fc);
- if (!req)
+ req = fuse_get_req(fc);
+ err = PTR_ERR(req);
+ if (IS_ERR(req))
goto out;
req->out.page_zeroing = 1;
@@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page)
(req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
req->pages[req->num_pages - 1]->index + 1 != page->index)) {
fuse_send_readpages(req, data->file, inode);
- data->req = req = fuse_get_request(fc);
- if (!req) {
+ data->req = req = fuse_get_req(fc);
+ if (IS_ERR(req)) {
unlock_page(page);
- return -EINTR;
+ return PTR_ERR(req);
}
}
req->pages[req->num_pages] = page;
@@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
data.file = file;
data.inode = inode;
- data.req = fuse_get_request(fc);
- if (!data.req)
- return -EINTR;
+ data.req = fuse_get_req(fc);
+ if (IS_ERR(data.req))
+ return PTR_ERR(data.req);
err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
- if (!err)
- fuse_send_readpages(data.req, file, inode);
+ if (!err) {
+ if (data.req->num_pages)
+ fuse_send_readpages(data.req, file, inode);
+ else
+ fuse_put_request(fc, data.req);
+ }
return err;
}
@@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page,
if (is_bad_inode(inode))
return -EIO;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
req->num_pages = 1;
req->pages[0] = page;
@@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
if (is_bad_inode(inode))
return -EIO;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
while (count) {
size_t nres;
@@ -561,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
buf += nres;
if (nres != nbytes)
break;
- if (count)
- fuse_reset_request(req);
+ if (count) {
+ fuse_put_request(fc, req);
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ break;
+ }
}
fuse_put_request(fc, req);
if (res > 0) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a16a04fcf41e..59661c481d9d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -18,8 +18,8 @@
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
-/** If more requests are outstanding, then the operation will block */
-#define FUSE_MAX_OUTSTANDING 10
+/** Maximum number of outstanding background requests */
+#define FUSE_MAX_BACKGROUND 10
/** It could be as large as PATH_MAX, but would that have any uses? */
#define FUSE_NAME_MAX 1024
@@ -131,8 +131,8 @@ struct fuse_conn;
* A request to the client
*/
struct fuse_req {
- /** This can be on either unused_list, pending processing or
- io lists in fuse_conn */
+ /** This can be on either pending processing or io lists in
+ fuse_conn */
struct list_head list;
/** Entry on the background list */
@@ -144,15 +144,12 @@ struct fuse_req {
/*
* The following bitfields are either set once before the
* request is queued or setting/clearing them is protected by
- * fuse_lock
+ * fuse_conn->lock
*/
/** True if the request has reply */
unsigned isreply:1;
- /** The request is preallocated */
- unsigned preallocated:1;
-
/** The request was interrupted */
unsigned interrupted:1;
@@ -162,6 +159,9 @@ struct fuse_req {
/** Data is being copied to/from the request */
unsigned locked:1;
+ /** Request is counted as "waiting" */
+ unsigned waiting:1;
+
/** State of the request */
enum fuse_req_state state;
@@ -213,6 +213,9 @@ struct fuse_req {
* unmounted.
*/
struct fuse_conn {
+ /** Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
/** The user id for this mount */
uid_t user_id;
@@ -244,25 +247,20 @@ struct fuse_conn {
interrupted request) */
struct list_head background;
- /** Controls the maximum number of outstanding requests */
- struct semaphore outstanding_sem;
+ /** Number of requests currently in the background */
+ unsigned num_background;
- /** This counts the number of outstanding requests if
- outstanding_sem would go negative */
- unsigned outstanding_debt;
+ /** Flag indicating if connection is blocked. This will be
+ the case before the INIT reply is received, and if there
+ are too many outstading backgrounds requests */
+ int blocked;
- /** RW semaphore for exclusion with fuse_put_super() */
- struct rw_semaphore sbput_sem;
-
- /** The list of unused requests */
- struct list_head unused_list;
+ /** waitq for blocked connection */
+ wait_queue_head_t blocked_waitq;
/** The next unique request id */
u64 reqctr;
- /** Mount is active */
- unsigned mounted;
-
/** Connection established, cleared on umount, connection
abort and device release */
unsigned connected;
@@ -318,6 +316,9 @@ struct fuse_conn {
/** kobject */
struct kobject kobj;
+
+ /** O_ASYNC requests */
+ struct fasync_struct *fasync;
};
static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -349,21 +350,6 @@ static inline u64 get_node_id(struct inode *inode)
extern const struct file_operations fuse_dev_operations;
/**
- * This is the single global spinlock which protects FUSE's structures
- *
- * The following data is protected by this lock:
- *
- * - the private_data field of the device file
- * - the s_fs_info field of the super block
- * - unused_list, pending, processing lists in fuse_conn
- * - background list in fuse_conn
- * - the unique request ID counter reqctr in fuse_conn
- * - the sb (super_block) field in fuse_conn
- * - the file (device file) field in fuse_conn
- */
-extern spinlock_t fuse_lock;
-
-/**
* Get a filled in inode
*/
struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
@@ -461,11 +447,11 @@ void fuse_reset_request(struct fuse_req *req);
/**
* Reserve a preallocated request
*/
-struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc);
/**
- * Decrement reference count of a request. If count goes to zero put
- * on unused list (preallocated) or free request (not preallocated).
+ * Decrement reference count of a request. If count goes to zero free
+ * the request.
*/
void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
@@ -485,11 +471,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
/**
- * Release inodes and file associated with background request
+ * Remove request from the the background list
*/
-void fuse_release_background(struct fuse_req *req);
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
-/* Abort all requests */
+/** Abort all requests */
void fuse_abort_conn(struct fuse_conn *fc);
/**
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 879e6fba9480..43a6fc0db8a7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
/*
FUSE: Filesystem in Userspace
- Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
+ Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu>
This program can be distributed under the terms of the GNU GPL.
See the file COPYING.
@@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
MODULE_LICENSE("GPL");
-spinlock_t fuse_lock;
static kmem_cache_t *fuse_inode_cachep;
static struct subsystem connections_subsys;
@@ -205,17 +204,28 @@ static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
- down_write(&fc->sbput_sem);
- while (!list_empty(&fc->background))
- fuse_release_background(list_entry(fc->background.next,
- struct fuse_req, bg_entry));
-
- spin_lock(&fuse_lock);
- fc->mounted = 0;
+ spin_lock(&fc->lock);
fc->connected = 0;
- spin_unlock(&fuse_lock);
- up_write(&fc->sbput_sem);
+ while (!list_empty(&fc->background)) {
+ struct fuse_req *req = list_entry(fc->background.next,
+ struct fuse_req, bg_entry);
+ struct inode *inode = req->inode;
+ struct inode *inode2 = req->inode2;
+
+ /* File would hold a reference to vfsmount */
+ BUG_ON(req->file);
+ req->inode = NULL;
+ req->inode2 = NULL;
+ fuse_remove_background(fc, req);
+
+ spin_unlock(&fc->lock);
+ iput(inode);
+ iput(inode2);
+ spin_lock(&fc->lock);
+ }
+ spin_unlock(&fc->lock);
/* Flush all readers on this fs */
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
wake_up_all(&fc->waitq);
kobject_del(&fc->kobj);
kobject_put(&fc->kobj);
@@ -242,9 +252,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
struct fuse_statfs_out outarg;
int err;
- req = fuse_get_request(fc);
- if (!req)
- return -EINTR;
+ req = fuse_get_req(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
memset(&outarg, 0, sizeof(outarg));
req->in.numargs = 0;
@@ -369,15 +379,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
static void fuse_conn_release(struct kobject *kobj)
{
- struct fuse_conn *fc = get_fuse_conn_kobj(kobj);
-
- while (!list_empty(&fc->unused_list)) {
- struct fuse_req *req;
- req = list_entry(fc->unused_list.next, struct fuse_req, list);
- list_del(&req->list);
- fuse_request_free(req);
- }
- kfree(fc);
+ kfree(get_fuse_conn_kobj(kobj));
}
static struct fuse_conn *new_conn(void)
@@ -386,64 +388,24 @@ static struct fuse_conn *new_conn(void)
fc = kzalloc(sizeof(*fc), GFP_KERNEL);
if (fc) {
- int i;
+ spin_lock_init(&fc->lock);
init_waitqueue_head(&fc->waitq);
+ init_waitqueue_head(&fc->blocked_waitq);
INIT_LIST_HEAD(&fc->pending);
INIT_LIST_HEAD(&fc->processing);
INIT_LIST_HEAD(&fc->io);
- INIT_LIST_HEAD(&fc->unused_list);
INIT_LIST_HEAD(&fc->background);
- sema_init(&fc->outstanding_sem, 1); /* One for INIT */
- init_rwsem(&fc->sbput_sem);
kobj_set_kset_s(fc, connections_subsys);
kobject_init(&fc->kobj);
atomic_set(&fc->num_waiting, 0);
- for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
- struct fuse_req *req = fuse_request_alloc();
- if (!req) {
- kobject_put(&fc->kobj);
- return NULL;
- }
- list_add(&req->list, &fc->unused_list);
- }
fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
fc->bdi.unplug_io_fn = default_unplug_io_fn;
fc->reqctr = 0;
+ fc->blocked = 1;
}
return fc;
}
-static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
-{
- struct fuse_conn *fc;
- int err;
-
- err = -EINVAL;
- if (file->f_op != &fuse_dev_operations)
- goto out_err;
-
- err = -ENOMEM;
- fc = new_conn();
- if (!fc)
- goto out_err;
-
- spin_lock(&fuse_lock);
- err = -EINVAL;
- if (file->private_data)
- goto out_unlock;
-
- kobject_get(&fc->kobj);
- file->private_data = fc;
- spin_unlock(&fuse_lock);
- return fc;
-
- out_unlock:
- spin_unlock(&fuse_lock);
- kobject_put(&fc->kobj);
- out_err:
- return ERR_PTR(err);
-}
-
static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
{
struct fuse_attr attr;
@@ -467,7 +429,6 @@ static struct super_operations fuse_super_operations = {
static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
- int i;
struct fuse_init_out *arg = &req->misc.init_out;
if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -486,22 +447,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
}
-
- /* After INIT reply is received other requests can go
- out. So do (FUSE_MAX_OUTSTANDING - 1) number of
- up()s on outstanding_sem. The last up() is done in
- fuse_putback_request() */
- for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
- up(&fc->outstanding_sem);
-
fuse_put_request(fc, req);
+ fc->blocked = 0;
+ wake_up_all(&fc->blocked_waitq);
}
-static void fuse_send_init(struct fuse_conn *fc)
+static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
{
- /* This is called from fuse_read_super() so there's guaranteed
- to be exactly one request available */
- struct fuse_req *req = fuse_get_request(fc);
struct fuse_init_in *arg = &req->misc.init_in;
arg->major = FUSE_KERNEL_VERSION;
@@ -525,12 +477,9 @@ static void fuse_send_init(struct fuse_conn *fc)
static unsigned long long conn_id(void)
{
+ /* BKL is held for ->get_sb() */
static unsigned long long ctr = 1;
- unsigned long long val;
- spin_lock(&fuse_lock);
- val = ctr++;
- spin_unlock(&fuse_lock);
- return val;
+ return ctr++;
}
static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -540,6 +489,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
struct fuse_mount_data d;
struct file *file;
struct dentry *root_dentry;
+ struct fuse_req *init_req;
int err;
if (!parse_fuse_opt((char *) data, &d))
@@ -555,10 +505,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!file)
return -EINVAL;
- fc = get_conn(file, sb);
- fput(file);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
+ if (file->f_op != &fuse_dev_operations)
+ return -EINVAL;
+
+ /* Setting file->private_data can't race with other mount()
+ instances, since BKL is held for ->get_sb() */
+ if (file->private_data)
+ return -EINVAL;
+
+ fc = new_conn();
+ if (!fc)
+ return -ENOMEM;
fc->flags = d.flags;
fc->user_id = d.user_id;
@@ -579,27 +536,39 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
goto err;
}
+ init_req = fuse_request_alloc();
+ if (!init_req)
+ goto err_put_root;
+
err = kobject_set_name(&fc->kobj, "%llu", conn_id());
if (err)
- goto err_put_root;
+ goto err_free_req;
err = kobject_add(&fc->kobj);
if (err)
- goto err_put_root;
+ goto err_free_req;
sb->s_root = root_dentry;
- spin_lock(&fuse_lock);
- fc->mounted = 1;
fc->connected = 1;
- spin_unlock(&fuse_lock);
+ kobject_get(&fc->kobj);
+ file->private_data = fc;
+ /*
+ * atomic_dec_and_test() in fput() provides the necessary
+ * memory barrier for file->private_data to be visible on all
+ * CPUs after this
+ */
+ fput(file);
- fuse_send_init(fc);
+ fuse_send_init(fc, init_req);
return 0;
+ err_free_req:
+ fuse_request_free(init_req);
err_put_root:
dput(root_dentry);
err:
+ fput(file);
kobject_put(&fc->kobj);
return err;
}
@@ -753,7 +722,6 @@ static int __init fuse_init(void)
printk("fuse init (API version %i.%i)\n",
FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
- spin_lock_init(&fuse_lock);
res = fuse_fs_init();
if (res)
goto err;
diff --git a/fs/inotify.c b/fs/inotify.c
index 367c487c014b..1f50302849c5 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
spin_lock(&entry->d_lock);
parent = entry->d_parent;
- if (inotify_inode_watched(parent->d_inode))
+ if (parent->d_inode && inotify_inode_watched(parent->d_inode))
entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
spin_unlock(&entry->d_lock);
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d2b66bad7d50..3ef739120dff 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
svc_wake_up(block->b_daemon);
}
-void nlmsvc_grant_release(void *data)
+static void nlmsvc_grant_release(void *data)
{
struct nlm_rqst *call = data;
diff --git a/fs/locks.c b/fs/locks.c
index dda83d6cd48b..efad798824dc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from)
lock_kernel();
j = 0;
- rcu_read_lock();
+
+ /*
+ * We are not taking a ref to the file structures, so
+ * we need to acquire ->file_lock.
+ */
+ spin_lock(&files->file_lock);
fdt = files_fdtable(files);
for (;;) {
unsigned long set;
@@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from)
set >>= 1;
}
}
- rcu_read_unlock();
+ spin_unlock(&files->file_lock);
unlock_kernel();
}
EXPORT_SYMBOL(steal_locks);
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..2c5f1f80bdc2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag)
/*
* do loopback mount.
*/
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags)
{
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
+ int recurse = flags & MS_REC;
int err = mount_is_safe(nd);
+
if (err)
return err;
if (!old_name || !*old_name)
@@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
spin_unlock(&vfsmount_lock);
release_mounts(&umount_list);
}
+ mnt->mnt_flags = mnt_flags;
out:
up_write(&namespace_sem);
@@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
- retval = do_loopback(&nd, dev_name, flags & MS_REC);
+ retval = do_loopback(&nd, dev_name, flags, mnt_flags);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a23f34894167..cae74dd4c7f5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = {
static int
nfs_opendir(struct inode *inode, struct file *filp)
{
- int res = 0;
+ int res;
dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
inode->i_sb->s_id, inode->i_ino);
lock_kernel();
/* Call generic open code in order to cache credentials */
- if (!res)
- res = nfs_open(inode, filp);
+ res = nfs_open(inode, filp);
unlock_kernel();
return res;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0f583cb16ddb..3c72b0c07283 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
*/
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
- struct dentry *dentry = iocb->ki_filp->f_dentry;
-
dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
- dentry->d_name.name, (long long) pos, nr_segs);
+ iocb->ki_filp->f_dentry->d_name.name,
+ (long long) pos, nr_segs);
return -EINVAL;
}
@@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
struct nfs_write_data *data = dreq->commit_data;
- struct rpc_task *task = &data->task;
data->inode = dreq->inode;
data->cred = dreq->ctx->cred;
@@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
dreq->commit_data = NULL;
- dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+ dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
lock_kernel();
rpc_execute(&data->task);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f1df2c8d9259..fade02c15e6e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
*/
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode * inode = filp->f_mapping->host;
-
dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
- inode->i_sb->s_id, inode->i_ino,
+ filp->f_dentry->d_inode->i_sb->s_id,
+ filp->f_dentry->d_inode->i_ino,
fl->fl_type, fl->fl_flags);
/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2f7656b911b6..d0b991a92327 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
/*
* Display superblock I/O counters
*/
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ for_each_possible_cpu(cpu) {
struct nfs_iostats *stats;
- if (!cpu_possible(cpu))
- continue;
-
preempt_disable();
stats = per_cpu_ptr(nfss->io_stats, cpu);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47ece1dd3c67..d86c0db7b1e8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1218,7 +1218,7 @@ out:
return status;
}
-static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
{
struct file *filp;
@@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st
struct nfs_open_context *ctx;
ctx = (struct nfs_open_context *)filp->private_data;
ctx->state = state;
- } else
- nfs4_close_state(state, nd->intent.open.flags);
+ return 0;
+ }
+ nfs4_close_state(state, nd->intent.open.flags);
+ return PTR_ERR(filp);
}
struct dentry *
@@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
nfs_setattr_update_inode(state->inode, sattr);
}
if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN))
- nfs4_intent_set_file(nd, dentry, state);
+ status = nfs4_intent_set_file(nd, dentry, state);
else
nfs4_close_state(state, flags);
out:
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index cfe9ce881613..6e92b0fe5323 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -14,46 +14,46 @@
int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
{
- struct svc_cred *cred = &rqstp->rq_cred;
+ struct svc_cred cred = rqstp->rq_cred;
int i;
int ret;
if (exp->ex_flags & NFSEXP_ALLSQUASH) {
- cred->cr_uid = exp->ex_anon_uid;
- cred->cr_gid = exp->ex_anon_gid;
- put_group_info(cred->cr_group_info);
- cred->cr_group_info = groups_alloc(0);
+ cred.cr_uid = exp->ex_anon_uid;
+ cred.cr_gid = exp->ex_anon_gid;
+ cred.cr_group_info = groups_alloc(0);
} else if (exp->ex_flags & NFSEXP_ROOTSQUASH) {
struct group_info *gi;
- if (!cred->cr_uid)
- cred->cr_uid = exp->ex_anon_uid;
- if (!cred->cr_gid)
- cred->cr_gid = exp->ex_anon_gid;
- gi = groups_alloc(cred->cr_group_info->ngroups);
+ if (!cred.cr_uid)
+ cred.cr_uid = exp->ex_anon_uid;
+ if (!cred.cr_gid)
+ cred.cr_gid = exp->ex_anon_gid;
+ gi = groups_alloc(cred.cr_group_info->ngroups);
if (gi)
- for (i = 0; i < cred->cr_group_info->ngroups; i++) {
- if (!GROUP_AT(cred->cr_group_info, i))
+ for (i = 0; i < cred.cr_group_info->ngroups; i++) {
+ if (!GROUP_AT(cred.cr_group_info, i))
GROUP_AT(gi, i) = exp->ex_anon_gid;
else
- GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i);
+ GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i);
}
- put_group_info(cred->cr_group_info);
- cred->cr_group_info = gi;
- }
+ cred.cr_group_info = gi;
+ } else
+ get_group_info(cred.cr_group_info);
- if (cred->cr_uid != (uid_t) -1)
- current->fsuid = cred->cr_uid;
+ if (cred.cr_uid != (uid_t) -1)
+ current->fsuid = cred.cr_uid;
else
current->fsuid = exp->ex_anon_uid;
- if (cred->cr_gid != (gid_t) -1)
- current->fsgid = cred->cr_gid;
+ if (cred.cr_gid != (gid_t) -1)
+ current->fsgid = cred.cr_gid;
else
current->fsgid = exp->ex_anon_gid;
- if (!cred->cr_group_info)
+ if (!cred.cr_group_info)
return -ENOMEM;
- ret = set_current_groups(cred->cr_group_info);
- if ((cred->cr_uid)) {
+ ret = set_current_groups(cred.cr_group_info);
+ put_group_info(cred.cr_group_info);
+ if ((cred.cr_uid)) {
cap_t(current->cap_effective) &= ~CAP_NFSD_MASK;
} else {
cap_t(current->cap_effective) |= (CAP_NFSD_MASK &
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c340be0a3f59..4e0578121d9a 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
goto out;
err = path_lookup(buf, 0, &nd);
- if (err) goto out;
+ if (err) goto out_no_path;
exp.h.flags = 0;
exp.ex_client = dom;
@@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
out:
if (nd.dentry)
path_release(&nd);
+ out_no_path:
if (dom)
auth_domain_put(dom);
kfree(buf);
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 6d2dfed1de08..f61142afea44 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = {
PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT),
PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1),
PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4),
- PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE),
+ PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4),
PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4),
PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC),
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 7391f4aabedb..edb107e61b91 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl)
/* Also, the remaining entries are for named users and
* groups, and come in threes (mask, allow, deny): */
if (n4acl->naces < 7)
- return -1;
+ return -EINVAL;
if ((n4acl->naces - 7) % 3)
- return -1;
+ return -EINVAL;
return 4 + (n4acl->naces - 7)/3;
}
}
@@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
continue;
error = nfs4_acl_add_ace(dacl, ace->type, ace->flag,
- ace->access_mask, ace->whotype, ace->who) == -1;
+ ace->access_mask, ace->whotype, ace->who);
if (error < 0)
goto out;
@@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask,
struct nfs4_ace *ace;
if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL)
- return -1;
+ return -ENOMEM;
ace->type = type;
ace->flag = flag;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c872bd07fc10..dbaf3f93f328 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp)
goto out_clnt;
}
- /* the task holds a reference to the nfs4_client struct */
cb->cb_client = clnt;
+
+ /* the task holds a reference to the nfs4_client struct */
atomic_inc(&clp->cl_count);
msg.rpc_cred = nfsd4_lookupcred(clp,0);
@@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp)
out_rpciod:
atomic_dec(&clp->cl_count);
rpciod_down();
+ cb->cb_client = NULL;
out_clnt:
rpc_shutdown_client(clnt);
- goto out_err;
out_err:
dprintk("NFSD: warning: no callback path to client %.*s\n",
(int)clp->cl_name.len, clp->cl_name.data);
- cb->cb_client = NULL;
}
static void
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6d63f1d9e5f5..b0e095ea0c03 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh)
fh_put(current_fh);
status = exp_pseudoroot(rqstp->rq_client, current_fh,
&rqstp->rq_chandle);
- if (!status)
- status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export));
return status;
}
@@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; };
*/
static struct svc_procedure nfsd_procedures4[2] = {
PROC(null, void, void, void, RC_NOCACHE, 1),
- PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE)
+ PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4)
};
struct svc_version nfsd_version4 = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 47ec112b266c..96c7578cbe1e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi)
kref_get(&fi->fi_ref);
}
+static int num_delegations;
+
+/*
+ * Open owner state (share locks)
+ */
+
+/* hash tables for nfs4_stateowner */
+#define OWNER_HASH_BITS 8
+#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
+#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
+
+#define ownerid_hashval(id) \
+ ((id) & OWNER_HASH_MASK)
+#define ownerstr_hashval(clientid, ownername) \
+ (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
+
+static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE];
+static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
+
+/* hash table for nfs4_file */
+#define FILE_HASH_BITS 8
+#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
+#define FILE_HASH_MASK (FILE_HASH_SIZE - 1)
+/* hash table for (open)nfs4_stateid */
+#define STATEID_HASH_BITS 10
+#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
+#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
+
+#define file_hashval(x) \
+ hash_ptr(x, FILE_HASH_BITS)
+#define stateid_hashval(owner_id, file_id) \
+ (((owner_id) + (file_id)) & STATEID_HASH_MASK)
+
+static struct list_head file_hashtbl[FILE_HASH_SIZE];
+static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
+
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
{
@@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback;
dprintk("NFSD alloc_init_deleg\n");
+ if (num_delegations > STATEID_HASH_SIZE * 4)
+ return NULL;
dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
if (dp == NULL)
return dp;
+ num_delegations++;
INIT_LIST_HEAD(&dp->dl_perfile);
INIT_LIST_HEAD(&dp->dl_perclnt);
INIT_LIST_HEAD(&dp->dl_recall_lru);
@@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
dprintk("NFSD: freeing dp %p\n",dp);
put_nfs4_file(dp->dl_file);
kmem_cache_free(deleg_slab, dp);
+ num_delegations--;
}
}
@@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp)
}
static void
+shutdown_callback_client(struct nfs4_client *clp)
+{
+ struct rpc_clnt *clnt = clp->cl_callback.cb_client;
+
+ /* shutdown rpc client, ending any outstanding recall rpcs */
+ if (clnt) {
+ clp->cl_callback.cb_client = NULL;
+ rpc_shutdown_client(clnt);
+ rpciod_down();
+ }
+}
+
+static void
expire_client(struct nfs4_client *clp)
{
struct nfs4_stateowner *sop;
struct nfs4_delegation *dp;
- struct nfs4_callback *cb = &clp->cl_callback;
- struct rpc_clnt *clnt = clp->cl_callback.cb_client;
struct list_head reaplist;
dprintk("NFSD: expire_client cl_count %d\n",
atomic_read(&clp->cl_count));
- /* shutdown rpc client, ending any outstanding recall rpcs */
- if (atomic_read(&cb->cb_set) == 1 && clnt) {
- rpc_shutdown_client(clnt);
- clnt = clp->cl_callback.cb_client = NULL;
- }
+ shutdown_callback_client(clp);
INIT_LIST_HEAD(&reaplist);
spin_lock(&recall_lock);
@@ -936,40 +983,6 @@ out:
return status;
}
-/*
- * Open owner state (share locks)
- */
-
-/* hash tables for nfs4_stateowner */
-#define OWNER_HASH_BITS 8
-#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
-#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
-
-#define ownerid_hashval(id) \
- ((id) & OWNER_HASH_MASK)
-#define ownerstr_hashval(clientid, ownername) \
- (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK)
-
-static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE];
-static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
-
-/* hash table for nfs4_file */
-#define FILE_HASH_BITS 8
-#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
-#define FILE_HASH_MASK (FILE_HASH_SIZE - 1)
-/* hash table for (open)nfs4_stateid */
-#define STATEID_HASH_BITS 10
-#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
-#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1)
-
-#define file_hashval(x) \
- hash_ptr(x, FILE_HASH_BITS)
-#define stateid_hashval(owner_id, file_id) \
- (((owner_id) + (file_id)) & STATEID_HASH_MASK)
-
-static struct list_head file_hashtbl[FILE_HASH_SIZE];
-static struct list_head stateid_hashtbl[STATEID_HASH_SIZE];
-
/* OPEN Share state helper functions */
static inline struct nfs4_file *
alloc_init_file(struct inode *ino)
@@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop)
{
dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
- unhash_stateowner(sop);
- list_add_tail(&sop->so_close_lru, &close_lru);
+ list_move_tail(&sop->so_close_lru, &close_lru);
sop->so_time = get_seconds();
}
@@ -1916,8 +1928,7 @@ nfs4_laundromat(void)
}
dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
sop->so_id);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
+ release_stateowner(sop);
}
if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock)
lock->fl_end = OFFSET_MAX;
}
-static int
-nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval)
-{
- struct nfs4_stateowner *local = NULL;
- int status = 0;
-
- if (hashval >= LOCK_HASH_SIZE)
- goto out;
- list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) {
- if (local == sop) {
- status = 1;
- goto out;
- }
- }
-out:
- return status;
-}
-
+/* Hack!: For now, we're defining this just so we can use a pointer to it
+ * as a unique cookie to identify our (NFSv4's) posix locks. */
+static struct lock_manager_operations nfsd_posix_mng_ops = {
+};
static inline void
nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
{
- struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner;
- unsigned int hval = lockownerid_hashval(sop->so_id);
+ struct nfs4_stateowner *sop;
+ unsigned int hval;
- deny->ld_sop = NULL;
- if (nfs4_verify_lock_stateowner(sop, hval)) {
+ if (fl->fl_lmops == &nfsd_posix_mng_ops) {
+ sop = (struct nfs4_stateowner *) fl->fl_owner;
+ hval = lockownerid_hashval(sop->so_id);
kref_get(&sop->so_ref);
deny->ld_sop = sop;
deny->ld_clientid = sop->so_client->cl_clientid;
+ } else {
+ deny->ld_sop = NULL;
+ deny->ld_clientid.cl_boot = 0;
+ deny->ld_clientid.cl_id = 0;
}
deny->ld_start = fl->fl_start;
deny->ld_length = ~(u64)0;
@@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
file_lock.fl_pid = current->tgid;
file_lock.fl_file = filp;
file_lock.fl_flags = FL_POSIX;
+ file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = lock->lk_offset;
if ((lock->lk_length == ~(u64)0) ||
@@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner;
file_lock.fl_pid = current->tgid;
file_lock.fl_flags = FL_POSIX;
+ file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = lockt->lt_offset;
if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length))
@@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
file_lock.fl_pid = current->tgid;
file_lock.fl_file = filp;
file_lock.fl_flags = FL_POSIX;
+ file_lock.fl_lmops = &nfsd_posix_mng_ops;
file_lock.fl_start = locku->lu_offset;
if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length))
@@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void)
int i;
struct nfs4_client *clp = NULL;
struct nfs4_delegation *dp = NULL;
- struct nfs4_stateowner *sop = NULL;
struct list_head *pos, *next, reaplist;
- list_for_each_safe(pos, next, &close_lru) {
- sop = list_entry(pos, struct nfs4_stateowner, so_close_lru);
- list_del(&sop->so_close_lru);
- nfs4_put_stateowner(sop);
- }
-
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
while (!list_empty(&conf_id_hashtbl[i])) {
clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
@@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void)
}
cancel_delayed_work(&laundromat_work);
- flush_workqueue(laundry_wq);
- destroy_workqueue(laundry_wq);
nfsd4_shutdown_recdir();
nfs4_init = 0;
}
@@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void)
void
nfs4_state_shutdown(void)
{
+ cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+ destroy_workqueue(laundry_wq);
nfs4_lock_state();
nfs4_release_reclaim();
__nfs4_state_shutdown();
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 03857fd81126..de3998f15f10 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
buf, dummy32, &ace.who);
if (status)
goto out_nfserr;
- if (nfs4_acl_add_ace(*acl, ace.type, ace.flag,
- ace.access_mask, ace.whotype, ace.who) != 0) {
- status = -ENOMEM;
+ status = nfs4_acl_add_ace(*acl, ace.type, ace.flag,
+ ace.access_mask, ace.whotype, ace.who);
+ if (status)
goto out_nfserr;
- }
}
} else
*acl = NULL;
@@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read
WRITE32(eof);
WRITE32(maxcount);
ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
-
+ resp->xbuf->head[0].iov_len = (char*)p
+ - (char*)resp->xbuf->head[0].iov_base;
resp->xbuf->page_len = maxcount;
- /* read zero bytes -> don't set up tail */
- if(!maxcount)
- return 0;
-
- /* set up page for remaining responses */
- svc_take_page(resp->rqstp);
- resp->xbuf->tail[0].iov_base =
- page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
- resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+ /* Use rest of head for padding and remaining ops: */
+ resp->rqstp->rq_restailpage = 0;
+ resp->xbuf->tail[0].iov_base = p;
resp->xbuf->tail[0].iov_len = 0;
- resp->p = resp->xbuf->tail[0].iov_base;
- resp->end = resp->p + PAGE_SIZE/4;
-
if (maxcount&3) {
- *(resp->p)++ = 0;
+ RESERVE_SPACE(4);
+ WRITE32(0);
resp->xbuf->tail[0].iov_base += maxcount&3;
resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
+ ADJUST_ARGS();
}
return 0;
}
@@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r
WRITE32(maxcount);
ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
+ resp->xbuf->head[0].iov_len = (char*)p
+ - (char*)resp->xbuf->head[0].iov_base;
+ resp->xbuf->page_len = maxcount;
- svc_take_page(resp->rqstp);
- resp->xbuf->tail[0].iov_base =
- page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
- resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+ /* Use rest of head for padding and remaining ops: */
+ resp->rqstp->rq_restailpage = 0;
+ resp->xbuf->tail[0].iov_base = p;
resp->xbuf->tail[0].iov_len = 0;
- resp->p = resp->xbuf->tail[0].iov_base;
- resp->end = resp->p + PAGE_SIZE/4;
-
- resp->xbuf->page_len = maxcount;
if (maxcount&3) {
- *(resp->p)++ = 0;
+ RESERVE_SPACE(4);
+ WRITE32(0);
resp->xbuf->tail[0].iov_base += maxcount&3;
resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
+ ADJUST_ARGS();
}
return 0;
}
@@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
{
int maxcount;
loff_t offset;
- u32 *page, *savep;
+ u32 *page, *savep, *tailbase;
ENCODE_HEAD;
if (nfserr)
@@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
WRITE32(0);
ADJUST_ARGS();
resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
+ tailbase = p;
maxcount = PAGE_SIZE;
if (maxcount > readdir->rd_maxcount)
@@ -2226,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re
*p++ = htonl(readdir->common.err == nfserr_eof);
resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
- /* allocate a page for the tail */
- svc_take_page(resp->rqstp);
- resp->xbuf->tail[0].iov_base =
- page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]);
- resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1;
+ /* Use rest of head for padding and remaining ops: */
+ resp->rqstp->rq_restailpage = 0;
+ resp->xbuf->tail[0].iov_base = tailbase;
resp->xbuf->tail[0].iov_len = 0;
resp->p = resp->xbuf->tail[0].iov_base;
- resp->end = resp->p + PAGE_SIZE/4;
+ resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4;
return 0;
err_no_verf:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 3e6b75cd90fd..06cd0db0f32b 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = {
PROC(none, void, void, none, RC_NOCACHE, ST),
PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT),
PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4),
- PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE),
+ PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4),
PROC(none, void, void, none, RC_NOCACHE, ST),
PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT),
PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 31018333dc38..6aa92d0e6876 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -371,7 +371,6 @@ out_nfserr:
static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
{
ssize_t buflen;
- int error;
buflen = vfs_getxattr(dentry, key, NULL, 0);
if (buflen <= 0)
@@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
if (!*buf)
return -ENOMEM;
- error = vfs_getxattr(dentry, key, *buf, buflen);
- if (error < 0)
- return error;
- return buflen;
+ return vfs_getxattr(dentry, key, *buf, buflen);
}
#endif
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bff0f0d06867..21f38accd039 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -153,6 +153,7 @@ struct o2hb_region {
struct o2hb_bio_wait_ctxt {
atomic_t wc_num_reqs;
struct completion wc_io_complete;
+ int wc_error;
};
static void o2hb_write_timeout(void *arg)
@@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc,
{
atomic_set(&wc->wc_num_reqs, num_ios);
init_completion(&wc->wc_io_complete);
+ wc->wc_error = 0;
}
/* Used in error paths too */
@@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio,
{
struct o2hb_bio_wait_ctxt *wc = bio->bi_private;
- if (error)
+ if (error) {
mlog(ML_ERROR, "IO Error %d\n", error);
+ wc->wc_error = error;
+ }
if (bio->bi_size)
return 1;
@@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg,
bail_and_wait:
o2hb_wait_on_io(reg, &wc);
+ if (wc.wc_error && !status)
+ status = wc.wc_error;
if (bios) {
for(i = 0; i < num_bios; i++)
@@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes,
return highest;
}
-static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
+static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
{
int i, ret, highest_node, change = 0;
unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
struct bio *write_bio;
struct o2hb_bio_wait_ctxt write_wc;
- if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes)))
- return;
+ ret = o2nm_configured_node_map(configured_nodes,
+ sizeof(configured_nodes));
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
if (highest_node >= O2NM_MAX_NODES) {
mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
- return;
+ return -EINVAL;
}
/* No sense in reading the slots of nodes that don't exist
@@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_read_slots(reg, highest_node + 1);
if (ret < 0) {
mlog_errno(ret);
- return;
+ return ret;
}
/* With an up to date view of the slots, we can check that no
@@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
ret = o2hb_issue_node_write(reg, &write_bio, &write_wc);
if (ret < 0) {
mlog_errno(ret);
- return;
+ return ret;
}
i = -1;
@@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
*/
o2hb_wait_on_io(reg, &write_wc);
bio_put(write_bio);
+ if (write_wc.wc_error) {
+ /* Do not re-arm the write timeout on I/O error - we
+ * can't be sure that the new block ever made it to
+ * disk */
+ mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
+ write_wc.wc_error, reg->hr_dev_name);
+ return write_wc.wc_error;
+ }
+
o2hb_arm_write_timeout(reg);
/* let the person who launched us know when things are steady */
@@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg)
if (atomic_dec_and_test(&reg->hr_steady_iterations))
wake_up(&o2hb_steady_queue);
}
+
+ return 0;
}
/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -913,7 +934,10 @@ static int o2hb_thread(void *data)
* likely to time itself out. */
do_gettimeofday(&before_hb);
- o2hb_do_disk_heartbeat(reg);
+ i = 0;
+ do {
+ ret = o2hb_do_disk_heartbeat(reg);
+ } while (ret && ++i < 2);
do_gettimeofday(&after_hb);
elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index c3764f4744ee..74ca4e5f9765 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -139,6 +139,10 @@ static void user_ast(void *opaque)
return;
}
+ mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE,
+ "Lockres %s, requested ivmode. flags 0x%x\n",
+ lockres->l_name, lockres->l_flags);
+
/* we're downconverting. */
if (lockres->l_requested < lockres->l_level) {
if (lockres->l_requested <=
@@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status)
mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name);
- if (status != DLM_NORMAL)
+ if (status != DLM_NORMAL && status != DLM_CANCELGRANT)
mlog(ML_ERROR, "Dlm returns status %d\n", status);
spin_lock(&lockres->l_lock);
- if (lockres->l_flags & USER_LOCK_IN_TEARDOWN)
+ /* The teardown flag gets set early during the unlock process,
+ * so test the cancel flag to make sure that this ast isn't
+ * for a concurrent cancel. */
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN
+ && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) {
lockres->l_level = LKM_IVMODE;
- else {
+ } else if (status == DLM_CANCELGRANT) {
+ mlog(0, "Lock %s, cancel fails, flags 0x%x\n",
+ lockres->l_name, lockres->l_flags);
+ /* We tried to cancel a convert request, but it was
+ * already granted. Don't clear the busy flag - the
+ * ast should've done this already. */
+ BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+ lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
+ goto out_noclear;
+ } else {
+ BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL));
+ /* Cancel succeeded, we want to re-queue */
+ mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n",
+ lockres->l_name, lockres->l_flags);
lockres->l_requested = LKM_IVMODE; /* cancel an
* upconvert
* request. */
lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
/* we want the unblock thread to look at it again
* now. */
- __user_dlm_queue_lockres(lockres);
+ if (lockres->l_flags & USER_LOCK_BLOCKED)
+ __user_dlm_queue_lockres(lockres);
}
lockres->l_flags &= ~USER_LOCK_BUSY;
+out_noclear:
spin_unlock(&lockres->l_lock);
wake_up(&lockres->l_event);
@@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque)
spin_lock(&lockres->l_lock);
- BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED));
- BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED));
+ mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED),
+ "Lockres %s, flags 0x%x\n",
+ lockres->l_name, lockres->l_flags);
- /* notice that we don't clear USER_LOCK_BLOCKED here. That's
- * for user_ast to do. */
+ /* notice that we don't clear USER_LOCK_BLOCKED here. If it's
+ * set, we want user_ast clear it. */
lockres->l_flags &= ~USER_LOCK_QUEUED;
+ /* It's valid to get here and no longer be blocked - if we get
+ * several basts in a row, we might be queued by the first
+ * one, the unblock thread might run and clear the queued
+ * flag, and finally we might get another bast which re-queues
+ * us before our ast for the downconvert is called. */
+ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) {
+ mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n",
+ lockres->l_name, lockres->l_flags);
+ spin_unlock(&lockres->l_lock);
+ goto drop_ref;
+ }
+
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
mlog(0, "lock is in teardown so we do nothing\n");
spin_unlock(&lockres->l_lock);
@@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque)
}
if (lockres->l_flags & USER_LOCK_BUSY) {
- mlog(0, "BUSY flag detected...\n");
+ mlog(0, "Cancel lock %s, flags 0x%x\n",
+ lockres->l_name, lockres->l_flags);
+
if (lockres->l_flags & USER_LOCK_IN_CANCEL) {
spin_unlock(&lockres->l_lock);
goto drop_ref;
@@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque)
LKM_CANCEL,
user_unlock_ast,
lockres);
- if (status == DLM_CANCELGRANT) {
- /* If we got this, then the ast was fired
- * before we could cancel. We cleanup our
- * state, and restart the function. */
- spin_lock(&lockres->l_lock);
- lockres->l_flags &= ~USER_LOCK_IN_CANCEL;
- spin_unlock(&lockres->l_lock);
- } else if (status != DLM_NORMAL)
+ if (status != DLM_NORMAL)
user_log_dlm_error("dlmunlock", status, lockres);
goto drop_ref;
}
@@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
mlog(0, "asked to destroy %s\n", lockres->l_name);
spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ mlog(0, "Lock is already torn down\n");
+ spin_unlock(&lockres->l_lock);
+ return 0;
+ }
+
+ lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
+
while (lockres->l_flags & USER_LOCK_BUSY) {
spin_unlock(&lockres->l_lock);
@@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres)
lockres->l_flags &= ~USER_LOCK_ATTACHED;
lockres->l_flags |= USER_LOCK_BUSY;
- lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
spin_unlock(&lockres->l_lock);
mlog(0, "unlocking lockres %s\n", lockres->l_name);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 34e903a6a46b..581eb451a41a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode,
if (new_i_size == le64_to_cpu(fe->i_size))
goto bail;
+ /* This forces other nodes to sync and drop their pages. Do
+ * this even if we have a truncate without allocation change -
+ * ocfs2 cluster sizes can be much greater than page size, so
+ * we have to truncate them anyway. */
+ status = ocfs2_data_lock(inode, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_data_unlock(inode, 1);
+
if (le32_to_cpu(fe->i_clusters) ==
ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
@@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail;
}
- /* This forces other nodes to sync and drop their pages */
- status = ocfs2_data_lock(inode, 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- ocfs2_data_unlock(inode, 1);
-
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
* truncate if necessary. This does the task of marking
diff --git a/fs/open.c b/fs/open.c
index c32c89d6d8db..53ec28c36777 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -331,7 +331,10 @@ out:
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length)
{
- return do_sys_ftruncate(fd, length, 1);
+ long ret = do_sys_ftruncate(fd, length, 1);
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
/* LFS versions of truncate are only needed on 32 bit machines */
@@ -343,7 +346,10 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length)
asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length)
{
- return do_sys_ftruncate(fd, length, 0);
+ long ret = do_sys_ftruncate(fd, length, 0);
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
#endif
@@ -1093,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
asmlinkage long sys_open(const char __user *filename, int flags, int mode)
{
+ long ret;
+
if (force_o_largefile())
flags |= O_LARGEFILE;
- return do_sys_open(AT_FDCWD, filename, flags, mode);
+ ret = do_sys_open(AT_FDCWD, filename, flags, mode);
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
EXPORT_SYMBOL_GPL(sys_open);
asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
int mode)
{
+ long ret;
+
if (force_o_largefile())
flags |= O_LARGEFILE;
- return do_sys_open(dfd, filename, flags, mode);
+ ret = do_sys_open(dfd, filename, flags, mode);
+ /* avoid REGPARM breakage on x86: */
+ prevent_tail_call(ret);
+ return ret;
}
EXPORT_SYMBOL_GPL(sys_openat);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index af0cb4b9e784..45ae7dd3c650 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part)
devfs_remove("%s/part%d", disk->devfs_name, part);
if (p->holder_dir)
kobject_unregister(p->holder_dir);
- kobject_unregister(&p->kobj);
+ kobject_uevent(&p->kobj, KOBJ_REMOVE);
+ kobject_del(&p->kobj);
+ kobject_put(&p->kobj);
}
void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
@@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len)
snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part);
p->kobj.parent = &disk->kobj;
p->kobj.ktype = &ktype_part;
- kobject_register(&p->kobj);
+ kobject_init(&p->kobj);
+ kobject_add(&p->kobj);
+ if (!disk->part_uevent_suppress)
+ kobject_uevent(&p->kobj, KOBJ_ADD);
partition_sysfs_add_subdir(p);
disk->part[part-1] = p;
}
@@ -367,6 +372,7 @@ static char *make_block_name(struct gendisk *disk)
char *name;
static char *block_str = "block:";
int size;
+ char *s;
size = strlen(block_str) + strlen(disk->disk_name) + 1;
name = kmalloc(size, GFP_KERNEL);
@@ -374,6 +380,10 @@ static char *make_block_name(struct gendisk *disk)
return NULL;
strcpy(name, block_str);
strcat(name, disk->disk_name);
+ /* ewww... some of these buggers have / in name... */
+ s = strchr(name, '/');
+ if (s)
+ *s = '!';
return name;
}
@@ -395,6 +405,8 @@ void register_disk(struct gendisk *disk)
{
struct block_device *bdev;
char *s;
+ int i;
+ struct hd_struct *p;
int err;
strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN);
@@ -406,13 +418,12 @@ void register_disk(struct gendisk *disk)
return;
disk_sysfs_symlinks(disk);
disk_sysfs_add_subdirs(disk);
- kobject_uevent(&disk->kobj, KOBJ_ADD);
/* No minors to use for partitions */
if (disk->minors == 1) {
if (disk->devfs_name[0] != '\0')
devfs_add_disk(disk);
- return;
+ goto exit;
}
/* always add handle for the whole disk */
@@ -420,16 +431,32 @@ void register_disk(struct gendisk *disk)
/* No such device (e.g., media were just removed) */
if (!get_capacity(disk))
- return;
+ goto exit;
bdev = bdget_disk(disk, 0);
if (!bdev)
- return;
+ goto exit;
+ /* scan partition table, but suppress uevents */
bdev->bd_invalidated = 1;
- if (blkdev_get(bdev, FMODE_READ, 0) < 0)
- return;
+ disk->part_uevent_suppress = 1;
+ err = blkdev_get(bdev, FMODE_READ, 0);
+ disk->part_uevent_suppress = 0;
+ if (err < 0)
+ goto exit;
blkdev_put(bdev);
+
+exit:
+ /* announce disk after possible partitions are already created */
+ kobject_uevent(&disk->kobj, KOBJ_ADD);
+
+ /* announce possible partitions */
+ for (i = 1; i < disk->minors; i++) {
+ p = disk->part[i-1];
+ if (!p || !p->nr_sects)
+ continue;
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+ }
}
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
diff --git a/fs/pipe.c b/fs/pipe.c
index 795df987cd38..7fefb10db8d9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -36,7 +36,7 @@
*/
/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct inode * inode)
+void pipe_wait(struct pipe_inode_info *pipe)
{
DEFINE_WAIT(wait);
@@ -44,11 +44,14 @@ void pipe_wait(struct inode * inode)
* Pipes are system-local resources, so sleeping on them
* is considered a noninteractive wait:
*/
- prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE);
- mutex_unlock(PIPE_MUTEX(*inode));
+ prepare_to_wait(&pipe->wait, &wait,
+ TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
+ if (pipe->inode)
+ mutex_unlock(&pipe->inode->i_mutex);
schedule();
- finish_wait(PIPE_WAIT(*inode), &wait);
- mutex_lock(PIPE_MUTEX(*inode));
+ finish_wait(&pipe->wait, &wait);
+ if (pipe->inode)
+ mutex_lock(&pipe->inode->i_mutex);
}
static int
@@ -91,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len)
return 0;
}
-static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf)
+static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
struct page *page = buf->page;
@@ -100,42 +104,46 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff
/*
* If nobody else uses this page, and we don't already have a
* temporary page, let's keep track of it as a one-deep
- * allocation cache
+ * allocation cache. (Otherwise just release our reference to it)
*/
- if (page_count(page) == 1 && !info->tmp_page) {
- info->tmp_page = page;
- return;
- }
-
- /*
- * Otherwise just release our reference to it
- */
- page_cache_release(page);
+ if (page_count(page) == 1 && !pipe->tmp_page)
+ pipe->tmp_page = page;
+ else
+ page_cache_release(page);
}
-static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
+static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
return kmap(buf->page);
}
-static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf)
+static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
kunmap(buf->page);
}
-static int anon_pipe_buf_steal(struct pipe_inode_info *info,
+static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
buf->flags |= PIPE_BUF_FLAG_STOLEN;
return 0;
}
+static void anon_pipe_buf_get(struct pipe_inode_info *info,
+ struct pipe_buffer *buf)
+{
+ page_cache_get(buf->page);
+}
+
static struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
.map = anon_pipe_buf_map,
.unmap = anon_pipe_buf_unmap,
.release = anon_pipe_buf_release,
.steal = anon_pipe_buf_steal,
+ .get = anon_pipe_buf_get,
};
static ssize_t
@@ -143,7 +151,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
unsigned long nr_segs, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
- struct pipe_inode_info *info;
+ struct pipe_inode_info *pipe;
int do_wakeup;
ssize_t ret;
struct iovec *iov = (struct iovec *)_iov;
@@ -156,13 +164,13 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
do_wakeup = 0;
ret = 0;
- mutex_lock(PIPE_MUTEX(*inode));
- info = inode->i_pipe;
+ mutex_lock(&inode->i_mutex);
+ pipe = inode->i_pipe;
for (;;) {
- int bufs = info->nrbufs;
+ int bufs = pipe->nrbufs;
if (bufs) {
- int curbuf = info->curbuf;
- struct pipe_buffer *buf = info->bufs + curbuf;
+ int curbuf = pipe->curbuf;
+ struct pipe_buffer *buf = pipe->bufs + curbuf;
struct pipe_buf_operations *ops = buf->ops;
void *addr;
size_t chars = buf->len;
@@ -171,16 +179,17 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
if (chars > total_len)
chars = total_len;
- addr = ops->map(filp, info, buf);
+ addr = ops->map(filp, pipe, buf);
if (IS_ERR(addr)) {
if (!ret)
ret = PTR_ERR(addr);
break;
}
error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
- ops->unmap(info, buf);
+ ops->unmap(pipe, buf);
if (unlikely(error)) {
- if (!ret) ret = -EFAULT;
+ if (!ret)
+ ret = -EFAULT;
break;
}
ret += chars;
@@ -188,10 +197,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
buf->len -= chars;
if (!buf->len) {
buf->ops = NULL;
- ops->release(info, buf);
+ ops->release(pipe, buf);
curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
- info->curbuf = curbuf;
- info->nrbufs = --bufs;
+ pipe->curbuf = curbuf;
+ pipe->nrbufs = --bufs;
do_wakeup = 1;
}
total_len -= chars;
@@ -200,9 +209,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
}
if (bufs) /* More to do? */
continue;
- if (!PIPE_WRITERS(*inode))
+ if (!pipe->writers)
break;
- if (!PIPE_WAITING_WRITERS(*inode)) {
+ if (!pipe->waiting_writers) {
/* syscall merging: Usually we must not sleep
* if O_NONBLOCK is set, or if we got some data.
* But if a writer sleeps in kernel space, then
@@ -216,20 +225,22 @@ pipe_readv(struct file *filp, const struct iovec *_iov,
}
}
if (signal_pending(current)) {
- if (!ret) ret = -ERESTARTSYS;
+ if (!ret)
+ ret = -ERESTARTSYS;
break;
}
if (do_wakeup) {
- wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- pipe_wait(inode);
+ pipe_wait(pipe);
}
- mutex_unlock(PIPE_MUTEX(*inode));
- /* Signal writers asynchronously that there is more room. */
+ mutex_unlock(&inode->i_mutex);
+
+ /* Signal writers asynchronously that there is more room. */
if (do_wakeup) {
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
if (ret > 0)
file_accessed(filp);
@@ -240,6 +251,7 @@ static ssize_t
pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = count };
+
return pipe_readv(filp, &iov, 1, ppos);
}
@@ -248,7 +260,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
unsigned long nr_segs, loff_t *ppos)
{
struct inode *inode = filp->f_dentry->d_inode;
- struct pipe_inode_info *info;
+ struct pipe_inode_info *pipe;
ssize_t ret;
int do_wakeup;
struct iovec *iov = (struct iovec *)_iov;
@@ -262,10 +274,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
do_wakeup = 0;
ret = 0;
- mutex_lock(PIPE_MUTEX(*inode));
- info = inode->i_pipe;
+ mutex_lock(&inode->i_mutex);
+ pipe = inode->i_pipe;
- if (!PIPE_READERS(*inode)) {
+ if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
@@ -273,23 +285,25 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
/* We try to merge small writes */
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
- if (info->nrbufs && chars != 0) {
- int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1);
- struct pipe_buffer *buf = info->bufs + lastbuf;
+ if (pipe->nrbufs && chars != 0) {
+ int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
+ (PIPE_BUFFERS-1);
+ struct pipe_buffer *buf = pipe->bufs + lastbuf;
struct pipe_buf_operations *ops = buf->ops;
int offset = buf->offset + buf->len;
+
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
void *addr;
int error;
- addr = ops->map(filp, info, buf);
+ addr = ops->map(filp, pipe, buf);
if (IS_ERR(addr)) {
error = PTR_ERR(addr);
goto out;
}
error = pipe_iov_copy_from_user(offset + addr, iov,
chars);
- ops->unmap(info, buf);
+ ops->unmap(pipe, buf);
ret = error;
do_wakeup = 1;
if (error)
@@ -304,16 +318,18 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
for (;;) {
int bufs;
- if (!PIPE_READERS(*inode)) {
+
+ if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
- if (!ret) ret = -EPIPE;
+ if (!ret)
+ ret = -EPIPE;
break;
}
- bufs = info->nrbufs;
+ bufs = pipe->nrbufs;
if (bufs < PIPE_BUFFERS) {
- int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1);
- struct pipe_buffer *buf = info->bufs + newbuf;
- struct page *page = info->tmp_page;
+ int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+ struct pipe_buffer *buf = pipe->bufs + newbuf;
+ struct page *page = pipe->tmp_page;
int error;
if (!page) {
@@ -322,9 +338,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
ret = ret ? : -ENOMEM;
break;
}
- info->tmp_page = page;
+ pipe->tmp_page = page;
}
- /* Always wakeup, even if the copy fails. Otherwise
+ /* Always wake up, even if the copy fails. Otherwise
* we lock up (O_NONBLOCK-)readers that sleep due to
* syscall merging.
* FIXME! Is this really true?
@@ -337,7 +353,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
error = pipe_iov_copy_from_user(kmap(page), iov, chars);
kunmap(page);
if (unlikely(error)) {
- if (!ret) ret = -EFAULT;
+ if (!ret)
+ ret = -EFAULT;
break;
}
ret += chars;
@@ -347,8 +364,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
buf->len = chars;
- info->nrbufs = ++bufs;
- info->tmp_page = NULL;
+ pipe->nrbufs = ++bufs;
+ pipe->tmp_page = NULL;
total_len -= chars;
if (!total_len)
@@ -357,27 +374,29 @@ pipe_writev(struct file *filp, const struct iovec *_iov,
if (bufs < PIPE_BUFFERS)
continue;
if (filp->f_flags & O_NONBLOCK) {
- if (!ret) ret = -EAGAIN;
+ if (!ret)
+ ret = -EAGAIN;
break;
}
if (signal_pending(current)) {
- if (!ret) ret = -ERESTARTSYS;
+ if (!ret)
+ ret = -ERESTARTSYS;
break;
}
if (do_wakeup) {
- wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
do_wakeup = 0;
}
- PIPE_WAITING_WRITERS(*inode)++;
- pipe_wait(inode);
- PIPE_WAITING_WRITERS(*inode)--;
+ pipe->waiting_writers++;
+ pipe_wait(pipe);
+ pipe->waiting_writers--;
}
out:
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
if (ret > 0)
file_update_time(filp);
@@ -389,6 +408,7 @@ pipe_write(struct file *filp, const char __user *buf,
size_t count, loff_t *ppos)
{
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+
return pipe_writev(filp, &iov, 1, ppos);
}
@@ -399,7 +419,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
}
static ssize_t
-bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos)
+bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
+ loff_t *ppos)
{
return -EBADF;
}
@@ -409,21 +430,22 @@ pipe_ioctl(struct inode *pino, struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = filp->f_dentry->d_inode;
- struct pipe_inode_info *info;
+ struct pipe_inode_info *pipe;
int count, buf, nrbufs;
switch (cmd) {
case FIONREAD:
- mutex_lock(PIPE_MUTEX(*inode));
- info = inode->i_pipe;
+ mutex_lock(&inode->i_mutex);
+ pipe = inode->i_pipe;
count = 0;
- buf = info->curbuf;
- nrbufs = info->nrbufs;
+ buf = pipe->curbuf;
+ nrbufs = pipe->nrbufs;
while (--nrbufs >= 0) {
- count += info->bufs[buf].len;
+ count += pipe->bufs[buf].len;
buf = (buf+1) & (PIPE_BUFFERS-1);
}
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
+
return put_user(count, (int __user *)arg);
default:
return -EINVAL;
@@ -436,17 +458,17 @@ pipe_poll(struct file *filp, poll_table *wait)
{
unsigned int mask;
struct inode *inode = filp->f_dentry->d_inode;
- struct pipe_inode_info *info = inode->i_pipe;
+ struct pipe_inode_info *pipe = inode->i_pipe;
int nrbufs;
- poll_wait(filp, PIPE_WAIT(*inode), wait);
+ poll_wait(filp, &pipe->wait, wait);
/* Reading only -- no need for acquiring the semaphore. */
- nrbufs = info->nrbufs;
+ nrbufs = pipe->nrbufs;
mask = 0;
if (filp->f_mode & FMODE_READ) {
mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
- if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode))
+ if (!pipe->writers && filp->f_version != pipe->w_counter)
mask |= POLLHUP;
}
@@ -456,7 +478,7 @@ pipe_poll(struct file *filp, poll_table *wait)
* Most Unices do not set POLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
*/
- if (!PIPE_READERS(*inode))
+ if (!pipe->readers)
mask |= POLLERR;
}
@@ -466,17 +488,21 @@ pipe_poll(struct file *filp, poll_table *wait)
static int
pipe_release(struct inode *inode, int decr, int decw)
{
- mutex_lock(PIPE_MUTEX(*inode));
- PIPE_READERS(*inode) -= decr;
- PIPE_WRITERS(*inode) -= decw;
- if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) {
+ struct pipe_inode_info *pipe;
+
+ mutex_lock(&inode->i_mutex);
+ pipe = inode->i_pipe;
+ pipe->readers -= decr;
+ pipe->writers -= decw;
+
+ if (!pipe->readers && !pipe->writers) {
free_pipe_info(inode);
} else {
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
return 0;
}
@@ -487,9 +513,9 @@ pipe_read_fasync(int fd, struct file *filp, int on)
struct inode *inode = filp->f_dentry->d_inode;
int retval;
- mutex_lock(PIPE_MUTEX(*inode));
- retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
+ mutex_unlock(&inode->i_mutex);
if (retval < 0)
return retval;
@@ -504,9 +530,9 @@ pipe_write_fasync(int fd, struct file *filp, int on)
struct inode *inode = filp->f_dentry->d_inode;
int retval;
- mutex_lock(PIPE_MUTEX(*inode));
- retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
+ retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
+ mutex_unlock(&inode->i_mutex);
if (retval < 0)
return retval;
@@ -519,16 +545,17 @@ static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
struct inode *inode = filp->f_dentry->d_inode;
+ struct pipe_inode_info *pipe = inode->i_pipe;
int retval;
- mutex_lock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
- retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode));
+ retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
if (retval >= 0)
- retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode));
+ retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_unlock(&inode->i_mutex);
if (retval < 0)
return retval;
@@ -567,9 +594,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
{
/* We could have perhaps used atomic_t, but this and friends
below are the only places. So it doesn't seem worthwhile. */
- mutex_lock(PIPE_MUTEX(*inode));
- PIPE_READERS(*inode)++;
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
+ inode->i_pipe->readers++;
+ mutex_unlock(&inode->i_mutex);
return 0;
}
@@ -577,9 +604,9 @@ pipe_read_open(struct inode *inode, struct file *filp)
static int
pipe_write_open(struct inode *inode, struct file *filp)
{
- mutex_lock(PIPE_MUTEX(*inode));
- PIPE_WRITERS(*inode)++;
- mutex_unlock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
+ inode->i_pipe->writers++;
+ mutex_unlock(&inode->i_mutex);
return 0;
}
@@ -587,12 +614,12 @@ pipe_write_open(struct inode *inode, struct file *filp)
static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
- mutex_lock(PIPE_MUTEX(*inode));
+ mutex_lock(&inode->i_mutex);
if (filp->f_mode & FMODE_READ)
- PIPE_READERS(*inode)++;
+ inode->i_pipe->readers++;
if (filp->f_mode & FMODE_WRITE)
- PIPE_WRITERS(*inode)++;
- mutex_unlock(PIPE_MUTEX(*inode));
+ inode->i_pipe->writers++;
+ mutex_unlock(&inode->i_mutex);
return 0;
}
@@ -675,37 +702,38 @@ static struct file_operations rdwr_pipe_fops = {
.fasync = pipe_rdwr_fasync,
};
-void free_pipe_info(struct inode *inode)
+struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
+{
+ struct pipe_inode_info *pipe;
+
+ pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+ if (pipe) {
+ init_waitqueue_head(&pipe->wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->inode = inode;
+ }
+
+ return pipe;
+}
+
+void __free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- struct pipe_inode_info *info = inode->i_pipe;
- inode->i_pipe = NULL;
for (i = 0; i < PIPE_BUFFERS; i++) {
- struct pipe_buffer *buf = info->bufs + i;
+ struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
- buf->ops->release(info, buf);
+ buf->ops->release(pipe, buf);
}
- if (info->tmp_page)
- __free_page(info->tmp_page);
- kfree(info);
+ if (pipe->tmp_page)
+ __free_page(pipe->tmp_page);
+ kfree(pipe);
}
-struct inode* pipe_new(struct inode* inode)
+void free_pipe_info(struct inode *inode)
{
- struct pipe_inode_info *info;
-
- info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
- if (!info)
- goto fail_page;
- inode->i_pipe = info;
-
- init_waitqueue_head(PIPE_WAIT(*inode));
- PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1;
-
- return inode;
-fail_page:
- return NULL;
+ __free_pipe_info(inode->i_pipe);
+ inode->i_pipe = NULL;
}
static struct vfsmount *pipe_mnt __read_mostly;
@@ -713,6 +741,7 @@ static int pipefs_delete_dentry(struct dentry *dentry)
{
return 1;
}
+
static struct dentry_operations pipefs_dentry_operations = {
.d_delete = pipefs_delete_dentry,
};
@@ -720,13 +749,17 @@ static struct dentry_operations pipefs_dentry_operations = {
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode(pipe_mnt->mnt_sb);
+ struct pipe_inode_info *pipe;
if (!inode)
goto fail_inode;
- if(!pipe_new(inode))
+ pipe = alloc_pipe_info(inode);
+ if (!pipe)
goto fail_iput;
- PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1;
+ inode->i_pipe = pipe;
+
+ pipe->readers = pipe->writers = 1;
inode->i_fop = &rdwr_pipe_fops;
/*
@@ -741,10 +774,12 @@ static struct inode * get_pipe_inode(void)
inode->i_gid = current->fsgid;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_blksize = PAGE_SIZE;
+
return inode;
fail_iput:
iput(inode);
+
fail_inode:
return NULL;
}
@@ -757,7 +792,7 @@ int do_pipe(int *fd)
struct inode * inode;
struct file *f1, *f2;
int error;
- int i,j;
+ int i, j;
error = -ENFILE;
f1 = get_empty_filp();
@@ -790,6 +825,7 @@ int do_pipe(int *fd)
dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this);
if (!dentry)
goto close_f12_inode_i_j;
+
dentry->d_op = &pipefs_dentry_operations;
d_add(dentry, inode);
f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt));
@@ -813,6 +849,7 @@ int do_pipe(int *fd)
fd_install(j, f2);
fd[0] = i;
fd[1] = j;
+
return 0;
close_f12_inode_i_j:
@@ -837,8 +874,9 @@ no_files:
* d_name - pipe: will go nicely and kill the special-casing in procfs.
*/
-static struct super_block *pipefs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static struct super_block *
+pipefs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
{
return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC);
}
@@ -852,6 +890,7 @@ static struct file_system_type pipe_fs_type = {
static int __init init_pipe_fs(void)
{
int err = register_filesystem(&pipe_fs_type);
+
if (!err) {
pipe_mnt = kern_mount(&pipe_fs_type);
if (IS_ERR(pipe_mnt)) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a3a3eecef689..6cc77dc3f3ff 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
files = get_files_struct(task);
if (files) {
- rcu_read_lock();
+ /*
+ * We are not taking a ref to the file structure, so we must
+ * hold ->file_lock.
+ */
+ spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
if (file) {
*mnt = mntget(file->f_vfsmnt);
*dentry = dget(file->f_dentry);
- rcu_read_unlock();
+ spin_unlock(&files->file_lock);
put_files_struct(files);
return 0;
}
- rcu_read_unlock();
+ spin_unlock(&files->file_lock);
put_files_struct(files);
}
return -ENOENT;
@@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
if (!files)
goto out_unlock;
inode->i_mode = S_IFLNK;
- rcu_read_lock();
+
+ /*
+ * We are not taking a ref to the file structure, so we must
+ * hold ->file_lock.
+ */
+ spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
if (!file)
goto out_unlock2;
@@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
inode->i_mode |= S_IRUSR | S_IXUSR;
if (file->f_mode & 2)
inode->i_mode |= S_IWUSR | S_IXUSR;
- rcu_read_unlock();
+ spin_unlock(&files->file_lock);
put_files_struct(files);
inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
@@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
return NULL;
out_unlock2:
- rcu_read_unlock();
+ spin_unlock(&files->file_lock);
put_files_struct(files);
out_unlock:
iput(inode);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7efa73d44c9a..20d4b2237fce 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
size_t buflen, loff_t *fpos)
{
ssize_t acc = 0, tmp;
- size_t tsz, nr_bytes;
- u64 start;
+ size_t tsz;
+ u64 start, nr_bytes;
struct vmcore *curr_m = NULL;
if (buflen == 0 || *fpos >= vmcore_size)
diff --git a/fs/read_write.c b/fs/read_write.c
index 6256ca81a718..5bc0e9234f9d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
goto Einval;
inode = file->f_dentry->d_inode;
- if (inode->i_flock && MANDATORY_LOCK(inode)) {
+ if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) {
int retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
diff --git a/fs/select.c b/fs/select.c
index 071660fa7b01..a8109baa5e46 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -310,8 +310,9 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
fd_set __user *exp, s64 *timeout)
{
fd_set_bits fds;
- char *bits;
- int ret, size, max_fdset;
+ void *bits;
+ int ret, max_fdset;
+ unsigned int size;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
@@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
* since we used fdset we need to allocate memory in units of
* long-words.
*/
- ret = -ENOMEM;
size = FDS_BYTES(n);
- if (6*size < SELECT_STACK_ALLOC)
- bits = stack_fds;
- else
+ bits = stack_fds;
+ if (size > sizeof(stack_fds) / 6) {
+ /* Not enough space in on-stack array; must use kmalloc */
+ ret = -ENOMEM;
bits = kmalloc(6 * size, GFP_KERNEL);
- if (!bits)
- goto out_nofds;
- fds.in = (unsigned long *) bits;
- fds.out = (unsigned long *) (bits + size);
- fds.ex = (unsigned long *) (bits + 2*size);
- fds.res_in = (unsigned long *) (bits + 3*size);
- fds.res_out = (unsigned long *) (bits + 4*size);
- fds.res_ex = (unsigned long *) (bits + 5*size);
+ if (!bits)
+ goto out_nofds;
+ }
+ fds.in = bits;
+ fds.out = bits + size;
+ fds.ex = bits + 2*size;
+ fds.res_in = bits + 3*size;
+ fds.res_out = bits + 4*size;
+ fds.res_ex = bits + 5*size;
if ((ret = get_fd_set(n, inp, fds.in)) ||
(ret = get_fd_set(n, outp, fds.out)) ||
diff --git a/fs/splice.c b/fs/splice.c
index bfa42a277bb8..0559e7577a04 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -9,11 +9,12 @@
* that transfers data buffers to or from a pipe buffer.
*
* Named by Larry McVoy, original implementation from Linus, extended by
- * Jens to support splicing to files and fixing the initial implementation
- * bugs.
+ * Jens to support splicing to files, network, direct splicing, etc and
+ * fixing lots of bugs.
*
- * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
+ * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
+ * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
*
*/
#include <linux/fs.h>
@@ -49,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
struct page *page = buf->page;
struct address_space *mapping = page_mapping(page);
- WARN_ON(!PageLocked(page));
+ lock_page(page);
+
WARN_ON(!PageUptodate(page));
/*
@@ -64,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info,
if (PagePrivate(page))
try_to_release_page(page, mapping_gfp_mask(mapping));
- if (!remove_mapping(mapping, page))
+ if (!remove_mapping(mapping, page)) {
+ unlock_page(page);
return 1;
+ }
buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU;
return 0;
@@ -84,69 +88,89 @@ static void *page_cache_pipe_buf_map(struct file *file,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
-
- lock_page(page);
+ int err;
if (!PageUptodate(page)) {
- unlock_page(page);
- return ERR_PTR(-EIO);
- }
+ lock_page(page);
- if (!page->mapping) {
+ /*
+ * Page got truncated/unhashed. This will cause a 0-byte
+ * splice, if this is the first page.
+ */
+ if (!page->mapping) {
+ err = -ENODATA;
+ goto error;
+ }
+
+ /*
+ * Uh oh, read-error from disk.
+ */
+ if (!PageUptodate(page)) {
+ err = -EIO;
+ goto error;
+ }
+
+ /*
+ * Page is ok afterall, fall through to mapping.
+ */
unlock_page(page);
- return ERR_PTR(-ENODATA);
}
- return kmap(buf->page);
+ return kmap(page);
+error:
+ unlock_page(page);
+ return ERR_PTR(err);
}
static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
struct pipe_buffer *buf)
{
- unlock_page(buf->page);
kunmap(buf->page);
}
+static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
+ struct pipe_buffer *buf)
+{
+ page_cache_get(buf->page);
+}
+
static struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
.map = page_cache_pipe_buf_map,
.unmap = page_cache_pipe_buf_unmap,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
+ .get = page_cache_pipe_buf_get,
};
/*
* Pipe output worker. This sets up our pipe format with the page cache
* pipe buffer operations. Otherwise very similar to the regular pipe_writev().
*/
-static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
- int nr_pages, unsigned long offset,
- unsigned long len, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
+ int nr_pages, unsigned long len,
+ unsigned int offset, unsigned int flags)
{
- struct pipe_inode_info *info;
int ret, do_wakeup, i;
ret = 0;
do_wakeup = 0;
i = 0;
- mutex_lock(PIPE_MUTEX(*inode));
+ if (pipe->inode)
+ mutex_lock(&pipe->inode->i_mutex);
- info = inode->i_pipe;
for (;;) {
- int bufs;
-
- if (!PIPE_READERS(*inode)) {
+ if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
}
- bufs = info->nrbufs;
- if (bufs < PIPE_BUFFERS) {
- int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
- struct pipe_buffer *buf = info->bufs + newbuf;
+ if (pipe->nrbufs < PIPE_BUFFERS) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+ struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pages[i++];
unsigned long this_len;
@@ -158,8 +182,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
buf->offset = offset;
buf->len = this_len;
buf->ops = &page_cache_pipe_buf_ops;
- info->nrbufs = ++bufs;
- do_wakeup = 1;
+ pipe->nrbufs++;
+ if (pipe->inode)
+ do_wakeup = 1;
ret += this_len;
len -= this_len;
@@ -168,7 +193,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
break;
if (!len)
break;
- if (bufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < PIPE_BUFFERS)
continue;
break;
@@ -187,22 +212,26 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
}
if (do_wakeup) {
- wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
- POLL_IN);
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
do_wakeup = 0;
}
- PIPE_WAITING_WRITERS(*inode)++;
- pipe_wait(inode);
- PIPE_WAITING_WRITERS(*inode)--;
+ pipe->waiting_writers++;
+ pipe_wait(pipe);
+ pipe->waiting_writers--;
}
- mutex_unlock(PIPE_MUTEX(*inode));
+ if (pipe->inode)
+ mutex_unlock(&pipe->inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
while (i < nr_pages)
@@ -211,96 +240,155 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
return ret;
}
-static int __generic_file_splice_read(struct file *in, struct inode *pipe,
- size_t len, unsigned int flags)
+static int
+__generic_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
struct address_space *mapping = in->f_mapping;
- unsigned int offset, nr_pages;
- struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
+ unsigned int loff, offset, nr_pages;
+ struct page *pages[PIPE_BUFFERS];
struct page *page;
- pgoff_t index, pidx;
- int i, j;
+ pgoff_t index, end_index;
+ loff_t isize;
+ size_t bytes;
+ int i, error;
- index = in->f_pos >> PAGE_CACHE_SHIFT;
- offset = in->f_pos & ~PAGE_CACHE_MASK;
+ index = *ppos >> PAGE_CACHE_SHIFT;
+ loff = offset = *ppos & ~PAGE_CACHE_MASK;
nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (nr_pages > PIPE_BUFFERS)
nr_pages = PIPE_BUFFERS;
/*
- * initiate read-ahead on this page range
+ * Initiate read-ahead on this page range. however, don't call into
+ * read-ahead if this is a non-zero offset (we are likely doing small
+ * chunk splice and the page is already there) for a single page.
*/
- do_page_cache_readahead(mapping, in, index, nr_pages);
+ if (!offset || nr_pages > 1)
+ do_page_cache_readahead(mapping, in, index, nr_pages);
/*
- * Get as many pages from the page cache as possible..
- * Start IO on the page cache entries we create (we
- * can assume that any pre-existing ones we find have
- * already had IO started on them).
+ * Now fill in the holes:
*/
- i = find_get_pages(mapping, index, nr_pages, pages);
+ error = 0;
+ bytes = 0;
+ for (i = 0; i < nr_pages; i++, index++) {
+ unsigned int this_len;
- /*
- * common case - we found all pages and they are contiguous,
- * kick them off
- */
- if (i && (pages[i - 1]->index == index + i - 1))
- goto splice_them;
+ if (!len)
+ break;
- /*
- * fill shadow[] with pages at the right locations, so we only
- * have to fill holes
- */
- memset(shadow, 0, nr_pages * sizeof(struct page *));
- for (j = 0; j < i; j++)
- shadow[pages[j]->index - index] = pages[j];
+ /*
+ * this_len is the max we'll use from this page
+ */
+ this_len = min(len, PAGE_CACHE_SIZE - loff);
+find_page:
+ /*
+ * lookup the page for this index
+ */
+ page = find_get_page(mapping, index);
+ if (!page) {
+ /*
+ * page didn't exist, allocate one
+ */
+ page = page_cache_alloc_cold(mapping);
+ if (!page)
+ break;
- /*
- * now fill in the holes
- */
- for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
- int error;
+ error = add_to_page_cache_lru(page, mapping, index,
+ mapping_gfp_mask(mapping));
+ if (unlikely(error)) {
+ page_cache_release(page);
+ break;
+ }
- if (shadow[i])
- continue;
+ goto readpage;
+ }
/*
- * no page there, look one up / create it
+ * If the page isn't uptodate, we may need to start io on it
*/
- page = find_or_create_page(mapping, pidx,
- mapping_gfp_mask(mapping));
- if (!page)
- break;
+ if (!PageUptodate(page)) {
+ /*
+ * If in nonblock mode then dont block on waiting
+ * for an in-flight io page
+ */
+ if (flags & SPLICE_F_NONBLOCK)
+ break;
+
+ lock_page(page);
+
+ /*
+ * page was truncated, stop here. if this isn't the
+ * first page, we'll just complete what we already
+ * added
+ */
+ if (!page->mapping) {
+ unlock_page(page);
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * page was already under io and is now done, great
+ */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ goto fill_it;
+ }
- if (PageUptodate(page))
- unlock_page(page);
- else {
+readpage:
+ /*
+ * need to read in the page
+ */
error = mapping->a_ops->readpage(in, page);
if (unlikely(error)) {
page_cache_release(page);
+ if (error == AOP_TRUNCATED_PAGE)
+ goto find_page;
break;
}
- }
- shadow[i] = page;
- }
- if (!i) {
- for (i = 0; i < nr_pages; i++) {
- if (shadow[i])
- page_cache_release(shadow[i]);
+ /*
+ * i_size must be checked after ->readpage().
+ */
+ isize = i_size_read(mapping->host);
+ end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+ if (unlikely(!isize || index > end_index)) {
+ page_cache_release(page);
+ break;
+ }
+
+ /*
+ * if this is the last page, see if we need to shrink
+ * the length and stop
+ */
+ if (end_index == index) {
+ loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
+ if (bytes + loff > isize) {
+ page_cache_release(page);
+ break;
+ }
+ /*
+ * force quit after adding this page
+ */
+ nr_pages = i;
+ this_len = min(this_len, loff);
+ }
}
- return 0;
+fill_it:
+ pages[i] = page;
+ bytes += this_len;
+ len -= this_len;
+ loff = 0;
}
- memcpy(pages, shadow, i * sizeof(struct page *));
+ if (i)
+ return move_to_pipe(pipe, pages, i, bytes, offset, flags);
- /*
- * Now we splice them into the pipe..
- */
-splice_them:
- return move_to_pipe(pipe, pages, i, offset, len, flags);
+ return error;
}
/**
@@ -311,30 +399,34 @@ splice_them:
* @flags: splice modifier flags
*
* Will read pages from given file and fill them into a pipe.
- *
*/
-ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
- size_t len, unsigned int flags)
+ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
ssize_t spliced;
int ret;
ret = 0;
spliced = 0;
+
while (len) {
- ret = __generic_file_splice_read(in, pipe, len, flags);
+ ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
- if (ret <= 0)
+ if (ret < 0)
break;
+ else if (!ret) {
+ if (spliced)
+ break;
+ if (flags & SPLICE_F_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+ }
- in->f_pos += ret;
+ *ppos += ret;
len -= ret;
spliced += ret;
-
- if (!(flags & SPLICE_F_NONBLOCK))
- continue;
- ret = -EAGAIN;
- break;
}
if (spliced)
@@ -360,10 +452,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info,
int more;
/*
- * sub-optimal, but we are limited by the pipe ->map. we don't
+ * Sub-optimal, but we are limited by the pipe ->map. We don't
* need a kmap'ed buffer here, we just want to make sure we
* have the page pinned if the pipe page originates from the
- * page cache
+ * page cache.
*/
ptr = buf->ops->map(file, info, buf);
if (IS_ERR(ptr))
@@ -414,7 +506,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
int ret;
/*
- * after this, page will be locked and unmapped
+ * make sure the data in this buffer is uptodate
*/
src = buf->ops->map(file, info, buf);
if (IS_ERR(src))
@@ -424,12 +516,13 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
offset = sd->pos & ~PAGE_CACHE_MASK;
/*
- * reuse buf page, if SPLICE_F_MOVE is set
+ * Reuse buf page, if SPLICE_F_MOVE is set.
*/
if (sd->flags & SPLICE_F_MOVE) {
/*
* If steal succeeds, buf->page is now pruned from the vm
- * side (LRU and page cache) and we can reuse it.
+ * side (LRU and page cache) and we can reuse it. The page
+ * will also be looked on successful return.
*/
if (buf->ops->steal(info, buf))
goto find_page;
@@ -442,15 +535,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
lru_cache_add(page);
} else {
find_page:
- ret = -ENOMEM;
- page = find_or_create_page(mapping, index, gfp_mask);
- if (!page)
- goto out;
+ page = find_lock_page(mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ page = page_cache_alloc_cold(mapping);
+ if (unlikely(!page))
+ goto out_nomem;
+
+ /*
+ * This will also lock the page
+ */
+ ret = add_to_page_cache_lru(page, mapping, index,
+ gfp_mask);
+ if (unlikely(ret))
+ goto out;
+ }
/*
- * If the page is uptodate, it is also locked. If it isn't
- * uptodate, we can mark it uptodate if we are filling the
- * full page. Otherwise we need to read it in first...
+ * We get here with the page locked. If the page is also
+ * uptodate, we don't need to do more. If it isn't, we
+ * may need to bring it in if we are not going to overwrite
+ * the full page.
*/
if (!PageUptodate(page)) {
if (sd->len < PAGE_CACHE_SIZE) {
@@ -462,7 +567,7 @@ find_page:
if (!PageUptodate(page)) {
/*
- * page got invalidated, repeat
+ * Page got invalidated, repeat.
*/
if (!page->mapping) {
unlock_page(page);
@@ -472,10 +577,8 @@ find_page:
ret = -EIO;
goto out;
}
- } else {
- WARN_ON(!PageLocked(page));
+ } else
SetPageUptodate(page);
- }
}
}
@@ -501,12 +604,14 @@ find_page:
} else if (ret)
goto out;
+ mark_page_accessed(page);
balance_dirty_pages_ratelimited(mapping);
out:
- if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) {
+ if (!(buf->flags & PIPE_BUF_FLAG_STOLEN))
page_cache_release(page);
- unlock_page(page);
- }
+
+ unlock_page(page);
+out_nomem:
buf->ops->unmap(info, buf);
return ret;
}
@@ -519,11 +624,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
* key here is the 'actor' worker passed in that actually moves the data
* to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
*/
-static ssize_t move_from_pipe(struct inode *inode, struct file *out,
- size_t len, unsigned int flags,
+static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags,
splice_actor *actor)
{
- struct pipe_inode_info *info;
int ret, do_wakeup, err;
struct splice_desc sd;
@@ -533,24 +637,21 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
sd.total_len = len;
sd.flags = flags;
sd.file = out;
- sd.pos = out->f_pos;
+ sd.pos = *ppos;
- mutex_lock(PIPE_MUTEX(*inode));
+ if (pipe->inode)
+ mutex_lock(&pipe->inode->i_mutex);
- info = inode->i_pipe;
for (;;) {
- int bufs = info->nrbufs;
-
- if (bufs) {
- int curbuf = info->curbuf;
- struct pipe_buffer *buf = info->bufs + curbuf;
+ if (pipe->nrbufs) {
+ struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
struct pipe_buf_operations *ops = buf->ops;
sd.len = buf->len;
if (sd.len > sd.total_len)
sd.len = sd.total_len;
- err = actor(info, buf, &sd);
+ err = actor(pipe, buf, &sd);
if (err) {
if (!ret && err != -ENODATA)
ret = err;
@@ -561,13 +662,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
ret += sd.len;
buf->offset += sd.len;
buf->len -= sd.len;
+
if (!buf->len) {
buf->ops = NULL;
- ops->release(info, buf);
- curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
- info->curbuf = curbuf;
- info->nrbufs = --bufs;
- do_wakeup = 1;
+ ops->release(pipe, buf);
+ pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+ pipe->nrbufs--;
+ if (pipe->inode)
+ do_wakeup = 1;
}
sd.pos += sd.len;
@@ -576,11 +678,11 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
break;
}
- if (bufs)
+ if (pipe->nrbufs)
continue;
- if (!PIPE_WRITERS(*inode))
+ if (!pipe->writers)
break;
- if (!PIPE_WAITING_WRITERS(*inode)) {
+ if (!pipe->waiting_writers) {
if (ret)
break;
}
@@ -598,31 +700,32 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
}
if (do_wakeup) {
- wake_up_interruptible_sync(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible_sync(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
do_wakeup = 0;
}
- pipe_wait(inode);
+ pipe_wait(pipe);
}
- mutex_unlock(PIPE_MUTEX(*inode));
+ if (pipe->inode)
+ mutex_unlock(&pipe->inode->i_mutex);
if (do_wakeup) {
- wake_up_interruptible(PIPE_WAIT(*inode));
- kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- mutex_lock(&out->f_mapping->host->i_mutex);
- out->f_pos = sd.pos;
- mutex_unlock(&out->f_mapping->host->i_mutex);
return ret;
-
}
/**
* generic_file_splice_write - splice data from a pipe to a file
- * @inode: pipe inode
+ * @pipe: pipe info
* @out: file to write to
* @len: number of bytes to splice
* @flags: splice modifier flags
@@ -631,27 +734,34 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out,
* the given pipe inode to the given file.
*
*/
-ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
- size_t len, unsigned int flags)
+ssize_t
+generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
struct address_space *mapping = out->f_mapping;
- ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file);
+ ssize_t ret;
- /*
- * if file or inode is SYNC and we actually wrote some data, sync it
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host))
- && ret > 0) {
+ ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
+ if (ret > 0) {
struct inode *inode = mapping->host;
- int err;
- mutex_lock(&inode->i_mutex);
- err = generic_osync_inode(mapping->host, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- mutex_unlock(&inode->i_mutex);
+ *ppos += ret;
+
+ /*
+ * If file or inode is SYNC and we actually wrote some data,
+ * sync it.
+ */
+ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+ int err;
+
+ mutex_lock(&inode->i_mutex);
+ err = generic_osync_inode(inode, mapping,
+ OSYNC_METADATA|OSYNC_DATA);
+ mutex_unlock(&inode->i_mutex);
- if (err)
- ret = err;
+ if (err)
+ ret = err;
+ }
}
return ret;
@@ -670,10 +780,10 @@ EXPORT_SYMBOL(generic_file_splice_write);
* is involved.
*
*/
-ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
- size_t len, unsigned int flags)
+ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
- return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
+ return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}
EXPORT_SYMBOL(generic_splice_sendpage);
@@ -681,77 +791,228 @@ EXPORT_SYMBOL(generic_splice_sendpage);
/*
* Attempt to initiate a splice from pipe to file.
*/
-static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
- unsigned int flags)
+static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
- loff_t pos;
int ret;
- if (!out->f_op || !out->f_op->splice_write)
+ if (unlikely(!out->f_op || !out->f_op->splice_write))
return -EINVAL;
- if (!(out->f_mode & FMODE_WRITE))
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
- pos = out->f_pos;
- ret = rw_verify_area(WRITE, out, &pos, len);
+ ret = rw_verify_area(WRITE, out, ppos, len);
if (unlikely(ret < 0))
return ret;
- return out->f_op->splice_write(pipe, out, len, flags);
+ return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
/*
* Attempt to initiate a splice from a file to a pipe.
*/
-static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
+static long do_splice_to(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- loff_t pos, isize, left;
+ loff_t isize, left;
int ret;
- if (!in->f_op || !in->f_op->splice_read)
+ if (unlikely(!in->f_op || !in->f_op->splice_read))
return -EINVAL;
- if (!(in->f_mode & FMODE_READ))
+ if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
- pos = in->f_pos;
- ret = rw_verify_area(READ, in, &pos, len);
+ ret = rw_verify_area(READ, in, ppos, len);
if (unlikely(ret < 0))
return ret;
isize = i_size_read(in->f_mapping->host);
- if (unlikely(in->f_pos >= isize))
+ if (unlikely(*ppos >= isize))
return 0;
- left = isize - in->f_pos;
- if (left < len)
+ left = isize - *ppos;
+ if (unlikely(left < len))
len = left;
- return in->f_op->splice_read(in, pipe, len, flags);
+ return in->f_op->splice_read(in, ppos, pipe, len, flags);
+}
+
+long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ size_t len, unsigned int flags)
+{
+ struct pipe_inode_info *pipe;
+ long ret, bytes;
+ loff_t out_off;
+ umode_t i_mode;
+ int i;
+
+ /*
+ * We require the input being a regular file, as we don't want to
+ * randomly drop data for eg socket -> socket splicing. Use the
+ * piped splicing for that!
+ */
+ i_mode = in->f_dentry->d_inode->i_mode;
+ if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
+ return -EINVAL;
+
+ /*
+ * neither in nor out is a pipe, setup an internal pipe attached to
+ * 'out' and transfer the wanted data from 'in' to 'out' through that
+ */
+ pipe = current->splice_pipe;
+ if (unlikely(!pipe)) {
+ pipe = alloc_pipe_info(NULL);
+ if (!pipe)
+ return -ENOMEM;
+
+ /*
+ * We don't have an immediate reader, but we'll read the stuff
+ * out of the pipe right after the move_to_pipe(). So set
+ * PIPE_READERS appropriately.
+ */
+ pipe->readers = 1;
+
+ current->splice_pipe = pipe;
+ }
+
+ /*
+ * Do the splice.
+ */
+ ret = 0;
+ bytes = 0;
+ out_off = 0;
+
+ while (len) {
+ size_t read_len, max_read_len;
+
+ /*
+ * Do at most PIPE_BUFFERS pages worth of transfer:
+ */
+ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
+
+ ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
+ if (unlikely(ret < 0))
+ goto out_release;
+
+ read_len = ret;
+
+ /*
+ * NOTE: nonblocking mode only applies to the input. We
+ * must not do the output in nonblocking mode as then we
+ * could get stuck data in the internal pipe:
+ */
+ ret = do_splice_from(pipe, out, &out_off, read_len,
+ flags & ~SPLICE_F_NONBLOCK);
+ if (unlikely(ret < 0))
+ goto out_release;
+
+ bytes += ret;
+ len -= ret;
+
+ /*
+ * In nonblocking mode, if we got back a short read then
+ * that was due to either an IO error or due to the
+ * pagecache entry not being there. In the IO error case
+ * the _next_ splice attempt will produce a clean IO error
+ * return value (not a short read), so in both cases it's
+ * correct to break out of the loop here:
+ */
+ if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
+ break;
+ }
+
+ pipe->nrbufs = pipe->curbuf = 0;
+
+ return bytes;
+
+out_release:
+ /*
+ * If we did an incomplete transfer we must release
+ * the pipe buffers in question:
+ */
+ for (i = 0; i < PIPE_BUFFERS; i++) {
+ struct pipe_buffer *buf = pipe->bufs + i;
+
+ if (buf->ops) {
+ buf->ops->release(pipe, buf);
+ buf->ops = NULL;
+ }
+ }
+ pipe->nrbufs = pipe->curbuf = 0;
+
+ /*
+ * If we transferred some data, return the number of bytes:
+ */
+ if (bytes > 0)
+ return bytes;
+
+ return ret;
}
+EXPORT_SYMBOL(do_splice_direct);
+
/*
* Determine where to splice to/from.
*/
-static long do_splice(struct file *in, struct file *out, size_t len,
- unsigned int flags)
+static long do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
- struct inode *pipe;
+ struct pipe_inode_info *pipe;
+ loff_t offset, *off;
+ long ret;
+
+ pipe = in->f_dentry->d_inode->i_pipe;
+ if (pipe) {
+ if (off_in)
+ return -ESPIPE;
+ if (off_out) {
+ if (out->f_op->llseek == no_llseek)
+ return -EINVAL;
+ if (copy_from_user(&offset, off_out, sizeof(loff_t)))
+ return -EFAULT;
+ off = &offset;
+ } else
+ off = &out->f_pos;
+
+ ret = do_splice_from(pipe, out, off, len, flags);
+
+ if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+ ret = -EFAULT;
- pipe = in->f_dentry->d_inode;
- if (pipe->i_pipe)
- return do_splice_from(pipe, out, len, flags);
+ return ret;
+ }
+
+ pipe = out->f_dentry->d_inode->i_pipe;
+ if (pipe) {
+ if (off_out)
+ return -ESPIPE;
+ if (off_in) {
+ if (in->f_op->llseek == no_llseek)
+ return -EINVAL;
+ if (copy_from_user(&offset, off_in, sizeof(loff_t)))
+ return -EFAULT;
+ off = &offset;
+ } else
+ off = &in->f_pos;
- pipe = out->f_dentry->d_inode;
- if (pipe->i_pipe)
- return do_splice_to(in, pipe, len, flags);
+ ret = do_splice_to(in, off, pipe, len, flags);
+
+ if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+ ret = -EFAULT;
+
+ return ret;
+ }
return -EINVAL;
}
-asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
+asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
+ int fd_out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
long error;
struct file *in, *out;
@@ -761,13 +1022,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
return 0;
error = -EBADF;
- in = fget_light(fdin, &fput_in);
+ in = fget_light(fd_in, &fput_in);
if (in) {
if (in->f_mode & FMODE_READ) {
- out = fget_light(fdout, &fput_out);
+ out = fget_light(fd_out, &fput_out);
if (out) {
if (out->f_mode & FMODE_WRITE)
- error = do_splice(in, out, len, flags);
+ error = do_splice(in, off_in,
+ out, off_out,
+ len, flags);
fput_light(out, fput_out);
}
}
@@ -777,3 +1040,192 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
return error;
}
+
+/*
+ * Link contents of ipipe to opipe.
+ */
+static int link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
+{
+ struct pipe_buffer *ibuf, *obuf;
+ int ret, do_wakeup, i, ipipe_first;
+
+ ret = do_wakeup = ipipe_first = 0;
+
+ /*
+ * Potential ABBA deadlock, work around it by ordering lock
+ * grabbing by inode address. Otherwise two different processes
+ * could deadlock (one doing tee from A -> B, the other from B -> A).
+ */
+ if (ipipe->inode < opipe->inode) {
+ ipipe_first = 1;
+ mutex_lock(&ipipe->inode->i_mutex);
+ mutex_lock(&opipe->inode->i_mutex);
+ } else {
+ mutex_lock(&opipe->inode->i_mutex);
+ mutex_lock(&ipipe->inode->i_mutex);
+ }
+
+ for (i = 0;; i++) {
+ if (!opipe->readers) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ break;
+ }
+ if (ipipe->nrbufs - i) {
+ ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
+
+ /*
+ * If we have room, fill this buffer
+ */
+ if (opipe->nrbufs < PIPE_BUFFERS) {
+ int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+
+ /*
+ * Get a reference to this pipe buffer,
+ * so we can copy the contents over.
+ */
+ ibuf->ops->get(ipipe, ibuf);
+
+ obuf = opipe->bufs + nbuf;
+ *obuf = *ibuf;
+
+ if (obuf->len > len)
+ obuf->len = len;
+
+ opipe->nrbufs++;
+ do_wakeup = 1;
+ ret += obuf->len;
+ len -= obuf->len;
+
+ if (!len)
+ break;
+ if (opipe->nrbufs < PIPE_BUFFERS)
+ continue;
+ }
+
+ /*
+ * We have input available, but no output room.
+ * If we already copied data, return that. If we
+ * need to drop the opipe lock, it must be ordered
+ * last to avoid deadlocks.
+ */
+ if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) {
+ if (!ret)
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ if (!ret)
+ ret = -ERESTARTSYS;
+ break;
+ }
+ if (do_wakeup) {
+ smp_mb();
+ if (waitqueue_active(&opipe->wait))
+ wake_up_interruptible(&opipe->wait);
+ kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+ do_wakeup = 0;
+ }
+
+ opipe->waiting_writers++;
+ pipe_wait(opipe);
+ opipe->waiting_writers--;
+ continue;
+ }
+
+ /*
+ * No input buffers, do the usual checks for available
+ * writers and blocking and wait if necessary
+ */
+ if (!ipipe->writers)
+ break;
+ if (!ipipe->waiting_writers) {
+ if (ret)
+ break;
+ }
+ /*
+ * pipe_wait() drops the ipipe mutex. To avoid deadlocks
+ * with another process, we can only safely do that if
+ * the ipipe lock is ordered last.
+ */
+ if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) {
+ if (!ret)
+ ret = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ if (!ret)
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ if (waitqueue_active(&ipipe->wait))
+ wake_up_interruptible_sync(&ipipe->wait);
+ kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
+
+ pipe_wait(ipipe);
+ }
+
+ mutex_unlock(&ipipe->inode->i_mutex);
+ mutex_unlock(&opipe->inode->i_mutex);
+
+ if (do_wakeup) {
+ smp_mb();
+ if (waitqueue_active(&opipe->wait))
+ wake_up_interruptible(&opipe->wait);
+ kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+ }
+
+ return ret;
+}
+
+/*
+ * This is a tee(1) implementation that works on pipes. It doesn't copy
+ * any data, it simply references the 'in' pages on the 'out' pipe.
+ * The 'flags' used are the SPLICE_F_* variants, currently the only
+ * applicable one is SPLICE_F_NONBLOCK.
+ */
+static long do_tee(struct file *in, struct file *out, size_t len,
+ unsigned int flags)
+{
+ struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
+ struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
+
+ /*
+ * Link ipipe to the two output pipes, consuming as we go along.
+ */
+ if (ipipe && opipe)
+ return link_pipe(ipipe, opipe, len, flags);
+
+ return -EINVAL;
+}
+
+asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+{
+ struct file *in;
+ int error, fput_in;
+
+ if (unlikely(!len))
+ return 0;
+
+ error = -EBADF;
+ in = fget_light(fdin, &fput_in);
+ if (in) {
+ if (in->f_mode & FMODE_READ) {
+ int fput_out;
+ struct file *out = fget_light(fdout, &fput_out);
+
+ if (out) {
+ if (out->f_mode & FMODE_WRITE)
+ error = do_tee(in, out, len, flags);
+ fput_light(out, fput_out);
+ }
+ }
+ fput_light(in, fput_in);
+ }
+
+ return error;
+}
diff --git a/fs/sync.c b/fs/sync.c
index 8616006d2094..aab5ffe77e9f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -61,7 +61,7 @@
* will be available after a crash.
*/
asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
- int flags)
+ unsigned int flags)
{
int ret;
struct file *file;
@@ -126,7 +126,7 @@ out:
* `endbyte' is inclusive
*/
int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
- int flags)
+ unsigned int flags)
{
int ret;
struct address_space *mapping;
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 6cfdc9a87772..610b5bdbe75b 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd,
memset(sd, 0, sizeof(*sd));
atomic_set(&sd->s_count, 1);
+ atomic_set(&sd->s_event, 0);
INIT_LIST_HEAD(&sd->s_children);
list_add(&sd->s_sibling, &parent_sd->s_children);
sd->s_element = element;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f1cb1ddde511..cf3786625bfa 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -6,6 +6,7 @@
#include <linux/fsnotify.h>
#include <linux/kobject.h>
#include <linux/namei.h>
+#include <linux/poll.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
@@ -57,6 +58,7 @@ struct sysfs_buffer {
struct sysfs_ops * ops;
struct semaphore sem;
int needs_read_fill;
+ int event;
};
@@ -72,6 +74,7 @@ struct sysfs_buffer {
*/
static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer)
{
+ struct sysfs_dirent * sd = dentry->d_fsdata;
struct attribute * attr = to_attr(dentry);
struct kobject * kobj = to_kobj(dentry->d_parent);
struct sysfs_ops * ops = buffer->ops;
@@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer
if (!buffer->page)
return -ENOMEM;
+ buffer->event = atomic_read(&sd->s_event);
count = ops->show(kobj,attr,buffer->page);
buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)PAGE_SIZE);
@@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp)
return 0;
}
+/* Sysfs attribute files are pollable. The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change. When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, as simply seeking and reading
+ * again will not get new data, or reset the state of 'poll'.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Nether 'poll' or 'select' return
+ * an appropriate error code). When in doubt, set a suitable timeout value.
+ */
+static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
+{
+ struct sysfs_buffer * buffer = filp->private_data;
+ struct kobject * kobj = to_kobj(filp->f_dentry->d_parent);
+ struct sysfs_dirent * sd = filp->f_dentry->d_fsdata;
+ int res = 0;
+
+ poll_wait(filp, &kobj->poll, wait);
+
+ if (buffer->event != atomic_read(&sd->s_event)) {
+ res = POLLERR|POLLPRI;
+ buffer->needs_read_fill = 1;
+ }
+
+ return res;
+}
+
+
+static struct dentry *step_down(struct dentry *dir, const char * name)
+{
+ struct dentry * de;
+
+ if (dir == NULL || dir->d_inode == NULL)
+ return NULL;
+
+ mutex_lock(&dir->d_inode->i_mutex);
+ de = lookup_one_len(name, dir, strlen(name));
+ mutex_unlock(&dir->d_inode->i_mutex);
+ dput(dir);
+ if (IS_ERR(de))
+ return NULL;
+ if (de->d_inode == NULL) {
+ dput(de);
+ return NULL;
+ }
+ return de;
+}
+
+void sysfs_notify(struct kobject * k, char *dir, char *attr)
+{
+ struct dentry *de = k->dentry;
+ if (de)
+ dget(de);
+ if (de && dir)
+ de = step_down(de, dir);
+ if (de && attr)
+ de = step_down(de, attr);
+ if (de) {
+ struct sysfs_dirent * sd = de->d_fsdata;
+ if (sd)
+ atomic_inc(&sd->s_event);
+ wake_up_interruptible(&k->poll);
+ dput(de);
+ }
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
+
const struct file_operations sysfs_file_operations = {
.read = sysfs_read_file,
.write = sysfs_write_file,
.llseek = generic_file_llseek,
.open = sysfs_open_file,
.release = sysfs_release,
+ .poll = sysfs_poll,
};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 32958a7c50e9..3651ffb5ec09 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *,
extern int sysfs_add_file(struct dentry *, const struct attribute *, int);
extern void sysfs_hash_and_remove(struct dentry * dir, const char * name);
+extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name);
extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **);
extern void sysfs_remove_subdir(struct dentry *);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 6cbbd165c60d..4d191ef39b67 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -870,12 +870,14 @@ xfs_page_state_convert(
pgoff_t end_index, last_index, tlast;
ssize_t size, len;
int flags, err, iomap_valid = 0, uptodate = 1;
- int page_dirty, count = 0, trylock_flag = 0;
+ int page_dirty, count = 0;
+ int trylock = 0;
int all_bh = unmapped;
- /* wait for other IO threads? */
- if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking))
- trylock_flag |= BMAPI_TRYLOCK;
+ if (startio) {
+ if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+ trylock |= BMAPI_TRYLOCK;
+ }
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
@@ -956,15 +958,13 @@ xfs_page_state_convert(
if (buffer_unwritten(bh)) {
type = IOMAP_UNWRITTEN;
- flags = BMAPI_WRITE|BMAPI_IGNSTATE;
+ flags = BMAPI_WRITE | BMAPI_IGNSTATE;
} else if (buffer_delay(bh)) {
type = IOMAP_DELAY;
- flags = BMAPI_ALLOCATE;
- if (!startio)
- flags |= trylock_flag;
+ flags = BMAPI_ALLOCATE | trylock;
} else {
type = IOMAP_NEW;
- flags = BMAPI_WRITE|BMAPI_MMAP;
+ flags = BMAPI_WRITE | BMAPI_MMAP;
}
if (!iomap_valid) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 9fb0312665ca..26fed0756f01 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -182,7 +182,7 @@ free_address(
{
a_list_t *aentry;
- aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
+ aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
if (likely(aentry)) {
spin_lock(&as_lock);
aentry->next = as_free_head;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ae4c4754ed31..c847416f6d10 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -252,56 +252,60 @@ xfs_file_sendfile_invis(
STATIC ssize_t
xfs_file_splice_read(
struct file *infilp,
- struct inode *pipe,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
size_t len,
unsigned int flags)
{
vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode);
ssize_t rval;
- VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval);
+ VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval);
return rval;
}
STATIC ssize_t
xfs_file_splice_read_invis(
struct file *infilp,
- struct inode *pipe,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
size_t len,
unsigned int flags)
{
vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode);
ssize_t rval;
- VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval);
+ VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval);
return rval;
}
STATIC ssize_t
xfs_file_splice_write(
- struct inode *pipe,
+ struct pipe_inode_info *pipe,
struct file *outfilp,
+ loff_t *ppos,
size_t len,
unsigned int flags)
{
vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode);
ssize_t rval;
- VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval);
+ VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval);
return rval;
}
STATIC ssize_t
xfs_file_splice_write_invis(
- struct inode *pipe,
+ struct pipe_inode_info *pipe,
struct file *outfilp,
+ loff_t *ppos,
size_t len,
unsigned int flags)
{
vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode);
ssize_t rval;
- VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval);
+ VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval);
return rval;
}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 149237304fb6..2e2e275c786f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -673,8 +673,7 @@ xfs_vn_setattr(
if (ia_valid & ATTR_ATIME) {
vattr.va_mask |= XFS_AT_ATIME;
vattr.va_atime = attr->ia_atime;
- if (ia_valid & ATTR_ATIME_SET)
- inode->i_atime = attr->ia_atime;
+ inode->i_atime = attr->ia_atime;
}
if (ia_valid & ATTR_MTIME) {
vattr.va_mask |= XFS_AT_MTIME;
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 90cd314acbaa..67efe3308980 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -338,7 +338,8 @@ ssize_t
xfs_splice_read(
bhv_desc_t *bdp,
struct file *infilp,
- struct inode *pipe,
+ loff_t *ppos,
+ struct pipe_inode_info *pipe,
size_t count,
int flags,
int ioflags,
@@ -360,7 +361,7 @@ xfs_splice_read(
int error;
error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp),
- infilp->f_pos, count,
+ *ppos, count,
FILP_DELAY_FLAG(infilp), &locktype);
if (error) {
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -368,8 +369,8 @@ xfs_splice_read(
}
}
xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore,
- pipe, count, infilp->f_pos, ioflags);
- ret = generic_file_splice_read(infilp, pipe, count, flags);
+ pipe, count, *ppos, ioflags);
+ ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -380,8 +381,9 @@ xfs_splice_read(
ssize_t
xfs_splice_write(
bhv_desc_t *bdp,
- struct inode *pipe,
+ struct pipe_inode_info *pipe,
struct file *outfilp,
+ loff_t *ppos,
size_t count,
int flags,
int ioflags,
@@ -403,7 +405,7 @@ xfs_splice_write(
int error;
error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp),
- outfilp->f_pos, count,
+ *ppos, count,
FILP_DELAY_FLAG(outfilp), &locktype);
if (error) {
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -411,8 +413,8 @@ xfs_splice_write(
}
}
xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore,
- pipe, count, outfilp->f_pos, ioflags);
- ret = generic_file_splice_write(pipe, outfilp, count, flags);
+ pipe, count, *ppos, ioflags);
+ ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
if (ret > 0)
XFS_STATS_ADD(xs_write_bytes, ret);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index eaa5659713fb..8f4539952350 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *,
extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *,
loff_t *, int, size_t, read_actor_t,
void *, struct cred *);
-extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *,
- struct inode *, size_t, int, int,
+extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *,
+ struct pipe_inode_info *, size_t, int, int,
struct cred *);
-extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *,
- struct file *, size_t, int, int,
+extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *,
+ struct file *, loff_t *, size_t, int, int,
struct cred *);
#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 6f1c79a28f8b..2a8e16c22353 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *,
typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *,
loff_t *, int, size_t, read_actor_t,
void *, struct cred *);
-typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *,
- struct inode *, size_t, int, int,
+typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *,
+ struct pipe_inode_info *, size_t, int, int,
struct cred *);
-typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *,
- struct file *, size_t, int, int,
+typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *,
+ struct file *, loff_t *, size_t, int, int,
struct cred *);
typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *,
int, unsigned int, void __user *);
@@ -284,10 +284,10 @@ typedef struct vnodeops {
rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr)
#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \
rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr)
-#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \
- rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr)
-#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \
- rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr)
+#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \
+ rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
+#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \
+ rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr)
#define VOP_BMAP(vp,of,sz,rw,b,n,rv) \
rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n)
#define VOP_OPEN(vp, cr, rv) \
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 4eeb856183b1..deddbd03c166 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc(
*/
agi = XFS_BUF_TO_AGI(agbp);
newino = be32_to_cpu(agi->agi_newino);
- if(likely(newino != NULLAGINO)) {
- args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- XFS_IALLOC_BLOCKS(args.mp);
+ args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
+ XFS_IALLOC_BLOCKS(args.mp);
+ if (likely(newino != NULLAGINO &&
+ (args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp,
be32_to_cpu(agi->agi_seqno), args.agbno);
args.type = XFS_ALLOCTYPE_THIS_BNO;
@@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc(
* Set the alignment for the allocation.
* If stripe alignment is turned on then align at stripe unit
* boundary.
- * If the cluster size is smaller than a filesystem block
- * then we're doing I/O for inodes in filesystem block size
+ * If the cluster size is smaller than a filesystem block
+ * then we're doing I/O for inodes in filesystem block size
* pieces, so don't need alignment anyway.
*/
isaligned = 0;
@@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc(
args.alignment = args.mp->m_dalign;
isaligned = 1;
} else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) &&
- args.mp->m_sb.sb_inoalignmt >=
+ args.mp->m_sb.sb_inoalignmt >=
XFS_B_TO_FSBT(args.mp,
XFS_INODE_CLUSTER_SIZE(args.mp)))
args.alignment = args.mp->m_sb.sb_inoalignmt;
@@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc(
if ((error = xfs_alloc_vextent(&args)))
return error;
}
-
+
/*
* If stripe alignment is turned on, then try again with cluster
* alignment.
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index bb33113eef9f..b53854325266 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -421,7 +421,10 @@ finish_inode:
ip->i_chash = chlnew;
chlnew->chl_ip = ip;
chlnew->chl_blkno = ip->i_blkno;
+ if (ch->ch_list)
+ ch->ch_list->chl_prev = chlnew;
chlnew->chl_next = ch->ch_list;
+ chlnew->chl_prev = NULL;
ch->ch_list = chlnew;
chlnew = NULL;
}
@@ -723,23 +726,15 @@ xfs_iextract(
ASSERT(ip->i_cnext == ip && ip->i_cprev == ip);
ASSERT(ip->i_chash != NULL);
chm=NULL;
- for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
- if (chl->chl_blkno == ip->i_blkno) {
- if (chm == NULL) {
- /* first item on the list */
- ch->ch_list = chl->chl_next;
- } else {
- chm->chl_next = chl->chl_next;
- }
- kmem_zone_free(xfs_chashlist_zone, chl);
- break;
- } else {
- ASSERT(chl->chl_ip != ip);
- chm = chl;
- }
- }
- ASSERT_ALWAYS(chl != NULL);
- } else {
+ chl = ip->i_chash;
+ if (chl->chl_prev)
+ chl->chl_prev->chl_next = chl->chl_next;
+ else
+ ch->ch_list = chl->chl_next;
+ if (chl->chl_next)
+ chl->chl_next->chl_prev = chl->chl_prev;
+ kmem_zone_free(xfs_chashlist_zone, chl);
+ } else {
/* delete one inode from a non-empty list */
iq = ip->i_cnext;
iq->i_cprev = ip->i_cprev;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 48146bdc6bdd..94b60dd03801 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2732,16 +2732,29 @@ xfs_iunpin(
ASSERT(atomic_read(&ip->i_pincount) > 0);
if (atomic_dec_and_test(&ip->i_pincount)) {
- vnode_t *vp = XFS_ITOV_NULL(ip);
+ /*
+ * If the inode is currently being reclaimed, the
+ * linux inode _and_ the xfs vnode may have been
+ * freed so we cannot reference either of them safely.
+ * Hence we should not try to do anything to them
+ * if the xfs inode is currently in the reclaim
+ * path.
+ *
+ * However, we still need to issue the unpin wakeup
+ * call as the inode reclaim may be blocked waiting for
+ * the inode to become unpinned.
+ */
+ if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) {
+ vnode_t *vp = XFS_ITOV_NULL(ip);
- /* make sync come back and flush this inode */
- if (vp) {
- struct inode *inode = vn_to_inode(vp);
+ /* make sync come back and flush this inode */
+ if (vp) {
+ struct inode *inode = vn_to_inode(vp);
- if (!(inode->i_state & I_NEW))
- mark_inode_dirty_sync(inode);
+ if (!(inode->i_state & I_NEW))
+ mark_inode_dirty_sync(inode);
+ }
}
-
wake_up(&ip->i_ipin_wait);
}
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 39ef9c36ea55..3b544db1790b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -189,6 +189,7 @@ typedef struct xfs_ihash {
*/
typedef struct xfs_chashlist {
struct xfs_chashlist *chl_next;
+ struct xfs_chashlist *chl_prev;
struct xfs_inode *chl_ip;
xfs_daddr_t chl_blkno; /* starting block number of
* the cluster */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 049fabb7f7e0..c0b1c2906880 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -270,7 +270,7 @@ xfs_mount_validate_sb(
(sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
(sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
(sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
- (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) {
+ (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed");
return XFS_ERROR(EFSCORRUPTED);
}