summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- fs/orangefs/dir.c 273
1 files changed, 185 insertions, 88 deletions
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 9744fb3ad144..7e9814fc6cc3 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -6,6 +6,22 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
+struct orangefs_dir_part {
+ struct orangefs_dir_part *next; /* next part in the list; NULL at the tail */
+ size_t len; /* trailer size minus the readdir response header */
+};
+
+struct orangefs_dir {
+ __u64 token; /* token copied from the most recent readdir response */
+ struct orangefs_dir_part *part; /* head of the list of received parts */
+ loff_t end; /* ctx->pos-style position marking the end of what has been read */
+ int error; /* last error recorded, or 0 */
+};
+
+#define PART_SHIFT (24) /* bits of ctx->pos below this are the offset within a part */
+#define PART_SIZE (1 << PART_SHIFT) /* derived so the size and the shift cannot diverge */
+#define PART_MASK (~(PART_SIZE - 1)) /* selects the part-number bits of ctx->pos */
+
/*
* There can be up to 512 directory entries. Each entry is encoded as
* follows:
@@ -15,42 +31,39 @@
* padding to 8 bytes
* 16 bytes: khandle
* padding to 8 bytes
- */
-#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512)
-
-struct orangefs_dir {
- __u64 token;
- void *directory;
- size_t len;
- int error;
-};
-
-/*
- * The userspace component sends several directory entries of the
- * following format. The first four bytes are the string length not
- * including a trailing zero byte. This is followed by the string and a
- * trailing zero padded to the next four byte boundry. This is followed
- * by the sixteen byte khandle padded to the next eight byte boundry.
*
* The trailer_buf starts with a struct orangefs_readdir_response_s
* which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset at which
+ * to begin reading. Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset. Parts are stored in a linked
+ * list which grows as data is received from the server. The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list. Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location. Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contain `.' and `..'. Part one is the
+ * first part of the part list.
*/
-static int orangefs_dir_more(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry)
+static int do_readdir(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct orangefs_kernel_op_s *op)
{
- const size_t offset =
- sizeof(struct orangefs_readdir_response_s);
struct orangefs_readdir_response_s *resp;
- struct orangefs_kernel_op_s *op;
int bufi, r;
- op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!op) {
- od->error = -ENOMEM;
- return -ENOMEM;
- }
-
/*
* Despite the badly named field, readdir does not use shared
* memory. However, there are a limited number of readdir
@@ -66,7 +79,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
again:
bufi = orangefs_readdir_index_get();
if (bufi < 0) {
- op_release(op);
od->error = bufi;
return bufi;
}
@@ -84,7 +96,6 @@ again:
goto again;
} else if (r == -EIO) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = r;
return r;
}
@@ -92,82 +103,166 @@ again:
if (r < 0) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = r;
return r;
} else if (op->downcall.status) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = op->downcall.status;
return op->downcall.status;
}
+ /*
+ * The maximum size is size per entry times the 512 entries plus
+ * the header. This is well under the limit.
+ */
+ if (op->downcall.trailer_size > PART_SIZE) {
+ vfree(op->downcall.trailer_buf);
+ od->error = -EIO;
+ return -EIO;
+ }
+
resp = (struct orangefs_readdir_response_s *)
op->downcall.trailer_buf;
od->token = resp->token;
+ return 0;
+}
- if (od->len + op->downcall.trailer_size - offset <=
- MAX_DIRECTORY) {
- memcpy(od->directory + od->len,
- op->downcall.trailer_buf + offset,
- op->downcall.trailer_size - offset);
- od->len += op->downcall.trailer_size - offset;
- } else {
- /* This limit was chosen based on protocol limits. */
- gossip_err("orangefs_dir_more: userspace sent too much data\n");
- vfree(op->downcall.trailer_buf);
- op_release(op);
- od->error = -EIO;
- return -EIO;
+static int parse_readdir(struct orangefs_dir *od,
+ struct orangefs_kernel_op_s *op)
+{
+ struct orangefs_dir_part *part, *new;
+ size_t count;
+
+ count = od->part ? 2 : 1; /* the walk below never counts the list head */
+ part = od->part;
+ while (part && part->next) {
+ part = part->next;
+ count++;
}
- vfree(op->downcall.trailer_buf);
- op_release(op);
+ new = (void *)op->downcall.trailer_buf; /* part header overlays the response header */
+ new->next = NULL;
+ new->len = op->downcall.trailer_size -
+ sizeof(struct orangefs_readdir_response_s);
+ if (!od->part)
+ od->part = new;
+ else
+ part->next = new;
+ count++;
+ od->end = count << PART_SHIFT; /* one past the part just appended */
+
return 0;
}
-static int orangefs_dir_fill(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry,
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
+{
+ struct orangefs_kernel_op_s *op;
+ int r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
+ return -ENOMEM;
+ }
+ r = do_readdir(oi, od, dentry, op); /* its visible error paths vfree the trailer buffer themselves */
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+ r = parse_readdir(od, op); /* links the trailer buffer into od->part */
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+
+ od->error = 0;
+out:
+ op_release(op); /* the op is released on success and on failure */
+ return od->error;
+}
+
+static int fill_from_part(struct orangefs_dir_part *part,
struct dir_context *ctx)
{
+ const int offset = sizeof(struct orangefs_readdir_response_s); /* entries start after the response header */
struct orangefs_khandle *khandle;
__u32 *len, padlen;
loff_t i;
char *s;
- i = ctx->pos - 2;
- while (i < od->len) {
- if (od->len < i + sizeof *len)
- goto eio;
- len = od->directory + i;
+ i = ctx->pos & ~PART_MASK; /* offset bits of the position */
+
+ /* The file offset from userspace is too large. */
+ if (i > part->len)
+ return -EIO;
+
+ while (i < part->len) {
+ if (part->len < i + sizeof *len)
+ return -EIO;
+ len = (void *)part + offset + i;
/*
* len is the size of the string itself. padlen is the
* total size of the encoded string.
*/
padlen = (sizeof *len + *len + 1) +
- (4 - (sizeof *len + *len + 1)%8)%8;
- if (od->len < i + padlen + sizeof *khandle)
- goto eio;
- s = od->directory + i + sizeof *len;
+ (8 - (sizeof *len + *len + 1)%8)%8;
+ if (part->len < i + padlen + sizeof *khandle) /* NOTE(review): padlen is __u32; a very large *len could wrap it - confirm */
+ return -EIO;
+ s = (void *)part + offset + i + sizeof *len;
if (s[*len] != 0)
- goto eio;
- khandle = od->directory + i + padlen;
-
+ return -EIO;
+ khandle = (void *)part + offset + i + padlen;
if (!dir_emit(ctx, s, *len,
- orangefs_khandle_to_ino(khandle), DT_UNKNOWN))
+ orangefs_khandle_to_ino(khandle),
+ DT_UNKNOWN))
return 0;
i += padlen + sizeof *khandle;
i = i + (8 - i%8)%8;
- ctx->pos = i + 2;
+ BUG_ON(i > part->len);
+ ctx->pos = (ctx->pos & PART_MASK) | i; /* keep the part bits, store the new offset */
+ }
+ return 1; /* part exhausted; the caller advances to the next part */
+}
+
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry,
+ struct dir_context *ctx)
+{
+ struct orangefs_dir_part *part;
+ size_t count;
+
+ count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; /* hops from the list head; part one is the head */
+
+ part = od->part; /* NOTE(review): dereferenced below without a NULL check - presumably a part always exists here; confirm */
+ while (part->next && count) {
+ count--;
+ part = part->next;
+ }
+ /* This means the userspace file offset is invalid. */
+ if (count) {
+ od->error = -EIO;
+ return -EIO;
+ }
+
+ while (part && part->len) {
+ int r;
+ r = fill_from_part(part, ctx); /* <0 error, 0 emit stopped, 1 part exhausted */
+ if (r < 0) {
+ od->error = r;
+ return r;
+ } else if (r == 0) {
+ /* Userspace buffer is full. */
+ break;
+ } else {
+ /*
+ * The part ran out of data. Move to the next
+ * part. */
+ ctx->pos = (ctx->pos & PART_MASK) +
+ (1 << PART_SHIFT);
+ part = part->next;
+ }
}
- BUG_ON(i > od->len);
return 0;
-eio:
- /*
- * Here either data from userspace is corrupt or the application
- * has sought to an invalid location.
- */
- od->error = -EIO;
- return -EIO;
}
static int orangefs_dir_iterate(struct file *file,
@@ -193,28 +288,33 @@ static int orangefs_dir_iterate(struct file *file,
if (ctx->pos == 1) {
if (!dir_emit_dotdot(file, ctx))
return 0;
- ctx->pos++;
+ ctx->pos = 1 << PART_SHIFT;
}
+ /*
+ * The seek position is in the first synthesized part but is not
+ * valid.
+ */
+ if ((ctx->pos & PART_MASK) == 0)
+ return -EIO;
+
r = 0;
/*
* Must read more if the user has sought past what has been read
* so far. Stop a user who has sought past the end.
*/
- while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 >
- od->len) {
+ while (od->token != ORANGEFS_READDIR_END &&
+ ctx->pos > od->end) {
r = orangefs_dir_more(oi, od, dentry);
if (r)
return r;
}
- if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 >
- od->len) {
+ if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end)
return -EIO;
- }
/* Then try to fill if there's any left in the buffer. */
- if (ctx->pos - 2 < od->len) {
+ if (ctx->pos < od->end) {
r = orangefs_dir_fill(oi, od, dentry, ctx);
if (r)
return r;
@@ -240,16 +340,8 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
return -ENOMEM;
od = file->private_data;
od->token = ORANGEFS_READDIR_START;
- /*
- * XXX: It seems wasteful to allocate such a large buffer for
- * each request. Most will be much smaller.
- */
- od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL);
- if (!od->directory) {
- kfree(file->private_data);
- return -ENOMEM;
- }
- od->len = 0;
+ od->part = NULL;
+ od->end = 1 << PART_SHIFT;
od->error = 0;
return 0;
}
@@ -257,8 +349,13 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
struct orangefs_dir *od = file->private_data;
+ struct orangefs_dir_part *part = od->part;
orangefs_flush_inode(inode);
- free_pages_exact(od->directory, MAX_DIRECTORY);
+ while (part) {
+ struct orangefs_dir_part *next = part->next; /* read the link before the part is freed */
+ vfree(part); /* each part is a readdir trailer buffer, freed with vfree as in do_readdir */
+ part = next;
+ }
kfree(od);
return 0;
}