From 115ff50bade0f93a288677745a5884def6cbf9b1 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 26 Jul 2006 14:52:13 -0500 Subject: JFS: Quota support broken, no quota_read and quota_write jfs_quota_read/write are very near duplicates of ext2_quota_read/write. Cleaned up jfs_get_block as long as I had to change it to be non-static. Signed-off-by: Dave Kleikamp --- fs/jfs/inode.c | 16 ++------ fs/jfs/jfs_inode.h | 1 + fs/jfs/super.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 119 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 43e3f566aad6..a223cf4faa9b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -168,16 +168,15 @@ void jfs_dirty_inode(struct inode *inode) set_cflag(COMMIT_Dirty, inode); } -static int -jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, - struct buffer_head *bh_result, int create) +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) { s64 lblock64 = lblock; int rc = 0; xad_t xad; s64 xaddr; int xflag; - s32 xlen = max_blocks; + s32 xlen = bh_result->b_size >> ip->i_blkbits; /* * Take appropriate lock on inode @@ -188,7 +187,7 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, IREAD_LOCK(ip); if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && - (!xtLookup(ip, lblock64, max_blocks, &xflag, &xaddr, &xlen, 0)) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && xaddr) { if (xflag & XAD_NOTRECORDED) { if (!create) @@ -255,13 +254,6 @@ jfs_get_blocks(struct inode *ip, sector_t lblock, unsigned long max_blocks, return rc; } -static int jfs_get_block(struct inode *ip, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return jfs_get_blocks(ip, lblock, bh_result->b_size >> ip->i_blkbits, - bh_result, create); -} - static int jfs_writepage(struct page *page, struct writeback_control *wbc) { return nobh_writepage(page, jfs_get_block, wbc); diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index b5c7da6190dc..1fc48df670c8 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -32,6 +32,7 @@ extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); extern void jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); extern const struct address_space_operations jfs_aops; extern struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4f6cfebc82db..90ee0de829c1 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -298,7 +299,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, break; } -#if defined(CONFIG_QUOTA) +#ifdef CONFIG_QUOTA case Opt_quota: case Opt_usrquota: *flag |= JFS_USRQUOTA; @@ -597,7 +598,7 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (sbi->flag & JFS_NOINTEGRITY) seq_puts(seq, ",nointegrity"); -#if defined(CONFIG_QUOTA) +#ifdef CONFIG_QUOTA if (sbi->flag & JFS_USRQUOTA) seq_puts(seq, ",usrquota"); @@ -608,6 +609,112 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and noone else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + mutex_lock(&inode->i_mutex); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) + return err; + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_version++; + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + mutex_unlock(&inode->i_mutex); + return len - towrite; +} + +#endif + static struct super_operations jfs_super_operations = { .alloc_inode = jfs_alloc_inode, .destroy_inode = jfs_destroy_inode, @@ -621,7 +728,11 @@ static struct super_operations jfs_super_operations = { .unlockfs = jfs_unlockfs, .statfs = jfs_statfs, .remount_fs = jfs_remount, - .show_options = jfs_show_options + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, +#endif }; static struct export_operations jfs_export_operations = { -- cgit v1.2.3 From 8bcb2839b74d605f5549962a6e69dc07768e95b6 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 28 Jul 2006 08:46:05 -0500 Subject: JFS: Fix bug in quota code. tmp_bh.b_size must be initialized Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 90ee0de829c1..143bcd1d5eaa 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -678,6 +678,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, sb->s_blocksize - offset : towrite; tmp_bh.b_state = 0; + tmp_bh.b_size = 1 << inode->i_blkbits; err = jfs_get_block(inode, blk, &tmp_bh, 1); if (err) goto out; -- cgit v1.2.3 From 4b1af774451bbc8440719e3fe441934a337c3b63 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Mon, 26 Jun 2006 15:17:47 -0700 Subject: ocfs2: Fix lvb corruption Properly ignore LVB flags during a PR downconvert. This avoids an illegal lvb update. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index b0c3134f4f70..4ce3f82fb736 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -483,6 +483,10 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data) /* lock was found on queue */ lksb = lock->lksb; + if (flags & (LKM_VALBLK|LKM_PUT_LVB) && + lock->ml.type != LKM_EXMODE) + flags &= ~(LKM_VALBLK|LKM_PUT_LVB); + /* unlockast only called on originating node */ if (flags & LKM_PUT_LVB) { lksb->flags |= DLM_LKSB_PUT_LVB; @@ -632,6 +636,8 @@ retry: spin_lock(&res->spinlock); is_master = (res->owner == dlm->node_num); + if (flags & LKM_VALBLK && lock->ml.type != LKM_EXMODE) + flags &= ~LKM_VALBLK; spin_unlock(&res->spinlock); if (is_master) { -- cgit v1.2.3 From a23eac99d4392b8b779305498d7614e41a0e16e9 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Sun, 18 Jun 2006 21:28:01 -0700 Subject: ocfs2: do not modify lksb->status in the unlock ast This can race with other ast notification, which can cause bad status values to propagate into the unlock ast. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 4ce3f82fb736..59866e471d96 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -155,7 +155,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) goto leave; /* By now this has been masked out of cancel requests. */ @@ -183,8 +183,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&lock->spinlock); /* if the master told us the lock was already granted, * let the ast handle all of these actions */ - if (status == DLM_NORMAL && - lksb->status == DLM_CANCELGRANT) { + if (status == DLM_CANCELGRANT) { actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); @@ -349,14 +348,9 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, vec, veclen, owner, &status); if (tmpret >= 0) { // successfully sent and received - if (status == DLM_CANCELGRANT) - ret = DLM_NORMAL; - else if (status == DLM_FORWARD) { + if (status == DLM_FORWARD) mlog(0, "master was in-progress. retry\n"); - ret = DLM_FORWARD; - } else - ret = status; - lksb->status = status; + ret = status; } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -372,7 +366,6 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, /* something bad. this will BUG in ocfs2 */ ret = dlm_err_to_dlm_status(tmpret); } - lksb->status = ret; } return ret; @@ -511,11 +504,8 @@ not_found: "cookie=%u:%llu\n", dlm_get_lock_cookie_node(unlock->cookie), dlm_get_lock_cookie_seq(unlock->cookie)); - else { - /* send the lksb->status back to the other node */ - status = lksb->status; + else dlm_lock_put(lock); - } leave: if (res) @@ -537,26 +527,22 @@ static enum dlm_status dlm_get_cancel_actions(struct dlm_ctxt *dlm, if (dlm_lock_on_list(&res->blocked, lock)) { /* cancel this outright */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_CALL_AST | DLM_UNLOCK_REMOVE_LOCK); } else if (dlm_lock_on_list(&res->converting, lock)) { /* cancel the request, put back on granted */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_CALL_AST | DLM_UNLOCK_REMOVE_LOCK | DLM_UNLOCK_REGRANT_LOCK | DLM_UNLOCK_CLEAR_CONVERT_TYPE); } else if (dlm_lock_on_list(&res->granted, lock)) { - /* too late, already granted. DLM_CANCELGRANT */ - lksb->status = DLM_CANCELGRANT; - status = DLM_NORMAL; + /* too late, already granted. */ + status = DLM_CANCELGRANT; *actions = DLM_UNLOCK_CALL_AST; } else { mlog(ML_ERROR, "lock to cancel is not on any list!\n"); - lksb->status = DLM_IVLOCKID; status = DLM_IVLOCKID; *actions = 0; } @@ -573,13 +559,11 @@ static enum dlm_status dlm_get_unlock_actions(struct dlm_ctxt *dlm, /* unlock request */ if (!dlm_lock_on_list(&res->granted, lock)) { - lksb->status = DLM_DENIED; status = DLM_DENIED; dlm_error(status); *actions = 0; } else { /* unlock granted lock */ - lksb->status = DLM_NORMAL; status = DLM_NORMAL; *actions = (DLM_UNLOCK_FREE_LOCK | DLM_UNLOCK_CALL_AST | @@ -671,7 +655,7 @@ retry: } if (call_ast) { - mlog(0, "calling unlockast(%p, %d)\n", data, lksb->status); + mlog(0, "calling unlockast(%p, %d)\n", data, status); if (is_master) { /* it is possible that there is one last bast * pending. make sure it is flushed, then @@ -683,9 +667,12 @@ retry: wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock)); } - (*unlockast)(data, lksb->status); + (*unlockast)(data, status); } + if (status == DLM_CANCELGRANT) + status = DLM_NORMAL; + if (status == DLM_NORMAL) { mlog(0, "kicking the thread\n"); dlm_kick_thread(dlm, res); -- cgit v1.2.3 From 34e3d180370c44ad3ecd3a1f9099e150d3bb103f Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Sat, 15 Jul 2006 10:22:39 -0700 Subject: ocfs2: fix check for locally granted state during dlmunlock() If a process requests a lock cancel but the lock has been remotely granted already then there is no need to send the cancel message. Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 59866e471d96..37be4b2e0d4a 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -155,7 +155,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else status = dlm_get_unlock_actions(dlm, res, lock, lksb, &actions); - if (status != DLM_NORMAL && status != DLM_CANCELGRANT) + if (status != DLM_NORMAL && (status != DLM_CANCELGRANT || !master_node)) goto leave; /* By now this has been masked out of cancel requests. */ -- cgit v1.2.3 From 9acd72f4240429dfd762c9a2c7eb5c18b5d32529 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 15 Jul 2006 02:36:01 +0200 Subject: [PATCH] fs/ocfs2/dlm/dlmmaster.c: unexport dlm_migrate_lockres This patch removes the unused EXPORT_SYMBOL_GPL(dlm_migrate_lockres). Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 1b8346dd0572..9503240ef0e5 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2375,7 +2375,6 @@ leave: mlog(0, "returning %d\n", ret); return ret; } -EXPORT_SYMBOL_GPL(dlm_migrate_lockres); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) { -- cgit v1.2.3 From 101ebf256de54e78e6d3277adacf656e125a2c5a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 2 May 2006 17:54:45 -0700 Subject: ocfs2: limit cluster bitmap information saved at mount We were storing cluster count on the ocfs2_super structure, but never actually using it so remove that. Also, we don't want to populate the uptodate cache with the unlocked block read - it is technically safe as is, but we should change it for correctness. Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 1 - fs/ocfs2/super.c | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index cd4a6f253d13..d52100d49b6c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -197,7 +197,6 @@ struct ocfs2_super struct ocfs2_node_map recovery_map; struct ocfs2_node_map umount_map; - u32 num_clusters; u64 root_blkno; u64 system_dir_blkno; u64 bitmap_blkno; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 382706a67ffd..d17e33e66a1e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1442,8 +1442,13 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; + /* We don't have a cluster lock on the bitmap here because + * we're only interested in static information and the extra + * complexity at mount time isn't worht it. Don't pass the + * inode in to the read function though as we don't want it to + * be put in the cache. */ status = ocfs2_read_block(osb, osb->bitmap_blkno, &bitmap_bh, 0, - inode); + NULL); iput(inode); if (status < 0) { mlog_errno(status); @@ -1452,7 +1457,6 @@ static int ocfs2_initialize_super(struct super_block *sb, di = (struct ocfs2_dinode *) bitmap_bh->b_data; osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg); - osb->num_clusters = le32_to_cpu(di->id1.bitmap1.i_total); brelse(bitmap_bh); mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n", (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg); -- cgit v1.2.3 From 7bf72edee614e10b8d470c40a326f47bfdd69992 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 3 May 2006 17:46:50 -0700 Subject: ocfs2: better group descriptor consistency checks Try to catch corrupted group descriptors with some stronger checks placed in a couple of strategic locations. Detect a failed resizefs and refuse to allocate past what bitmap i_clusters allows. Signed-off-by: Mark Fasheh --- fs/ocfs2/suballoc.c | 114 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 94 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 195523090c87..7ac68cac041d 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -85,11 +85,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, u64 *bg_blkno); static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, int nr); -static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, - struct buffer_head *bg_bh, - unsigned int bits_wanted, - u16 *bit_off, - u16 *bits_found); static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle, struct inode *alloc_inode, struct ocfs2_group_desc *bg, @@ -143,6 +138,64 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } +/* somewhat more expensive than our other checks, so use sparingly. */ +static int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct ocfs2_group_desc *gd) +{ + unsigned int max_bits; + + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); + return -EIO; + } + + if (di->i_blkno != gd->bg_parent_dinode) { + ocfs2_error(sb, "Group descriptor # %llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EIO; + } + + max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); + if (le16_to_cpu(gd->bg_bits) > max_bits) { + ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_chain) >= + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { + ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_chain)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " + "claims that %u are free", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EIO; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EIO; + } + + return 0; +} + static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle, struct inode *alloc_inode, struct buffer_head *bg_bh, @@ -663,6 +716,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, struct buffer_head *bg_bh, unsigned int bits_wanted, + unsigned int total_bits, u16 *bit_off, u16 *bits_found) { @@ -679,10 +733,8 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; - while((offset = ocfs2_find_next_zero_bit(bitmap, - le16_to_cpu(bg->bg_bits), - start)) != -1) { - if (offset == le16_to_cpu(bg->bg_bits)) + while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) { + if (offset == total_bits) break; if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) { @@ -911,14 +963,35 @@ static int ocfs2_cluster_group_search(struct inode *inode, { int search = -ENOSPC; int ret; - struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; u16 tmp_off, tmp_found; + unsigned int max_bits, gd_cluster_off; BUG_ON(!ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) { + if (gd->bg_free_bits_count) { + max_bits = le16_to_cpu(gd->bg_bits); + + /* Tail groups in cluster bitmaps which aren't cpg + * aligned are prone to partial extention by a failed + * fs resize. If the file system resize never got to + * update the dinode cluster count, then we don't want + * to trust any clusters past it, regardless of what + * the group descriptor says. */ + gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(gd->bg_blkno)); + if ((gd_cluster_off + max_bits) > + OCFS2_I(inode)->ip_clusters) { + max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; + mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", + (unsigned long long)le64_to_cpu(gd->bg_blkno), + le16_to_cpu(gd->bg_bits), + OCFS2_I(inode)->ip_clusters, max_bits); + } + ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, + max_bits, &tmp_off, &tmp_found); if (ret) return ret; @@ -951,6 +1024,7 @@ static int ocfs2_block_group_search(struct inode *inode, if (bg->bg_free_bits_count) ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, + le16_to_cpu(bg->bg_bits), bit_off, bits_found); return ret; @@ -988,9 +1062,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); + if (status) { + mlog_errno(status); goto bail; } @@ -1018,9 +1092,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); + if (status) { + mlog_errno(status); goto bail; } } @@ -1494,9 +1568,9 @@ static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle, } group = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(group)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group); - status = -EIO; + status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); + if (status) { + mlog_errno(status); goto bail; } BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); -- cgit v1.2.3 From 883d4cae4a2b01a05193cf2665c77b7489a8b6a0 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 5 Jun 2006 16:41:00 -0400 Subject: ocfs2: allocation hints Record the most recently used allocation group on the allocation context, so that subsequent allocations can attempt to optimize for contiguousness. Local alloc especially should benefit from this as the current chain search tends to let it spew across the disk. Signed-off-by: Mark Fasheh --- fs/ocfs2/localalloc.c | 8 +++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/suballoc.c | 147 +++++++++++++++++++++++++++++++++++++++++++++----- fs/ocfs2/suballoc.h | 2 + 4 files changed, 145 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0d1973ea32b0..1f17a4d08287 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -840,6 +840,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, mlog(0, "Allocating %u clusters for a new window.\n", ocfs2_local_alloc_window_bits(osb)); + + /* Instruct the allocation code to try the most recently used + * cluster group. We'll re-record the group used this pass + * below. */ + ac->ac_last_group = osb->la_last_gd; + /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ @@ -852,6 +858,8 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, goto bail; } + osb->la_last_gd = ac->ac_last_group; + la->la_bm_off = cpu_to_le32(cluster_off); alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count); /* just in case... In the future when we find space ourselves, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d52100d49b6c..0462a7f4e21b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -236,6 +236,7 @@ struct ocfs2_super enum ocfs2_local_alloc_state local_alloc_state; struct buffer_head *local_alloc_bh; + u64 la_last_gd; /* Next two fields are for local node slot recovery during * mount. */ diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 7ac68cac041d..9d91e66f51a9 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -70,12 +70,6 @@ static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, u16 *bit_off, u16 *bits_found); -static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, - u32 bits_wanted, - u32 min_bits, - u16 *bit_off, - unsigned int *num_bits, - u64 *bg_blkno); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, u32 bits_wanted, @@ -1030,12 +1024,103 @@ static int ocfs2_block_group_search(struct inode *inode, return ret; } +static int ocfs2_alloc_dinode_update_counts(struct inode *inode, + struct ocfs2_journal_handle *handle, + struct buffer_head *di_bh, + u32 num_bits, + u16 chain) +{ + int ret; + u32 tmp_used; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; + struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; + + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); + di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); + le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); + +out: + return ret; +} + +static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, + u32 bits_wanted, + u32 min_bits, + u16 *bit_off, + unsigned int *num_bits, + u64 gd_blkno, + u16 *bits_left) +{ + int ret; + u16 found; + struct buffer_head *group_bh = NULL; + struct ocfs2_group_desc *gd; + struct inode *alloc_inode = ac->ac_inode; + struct ocfs2_journal_handle *handle = ac->ac_handle; + + ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, + &group_bh, OCFS2_BH_CACHED, alloc_inode); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + gd = (struct ocfs2_group_desc *) group_bh->b_data; + if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { + OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); + ret = -EIO; + goto out; + } + + ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, + bit_off, &found); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + *num_bits = found; + + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, + *num_bits, + le16_to_cpu(gd->bg_chain)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh, + *bit_off, *num_bits); + if (ret < 0) + mlog_errno(ret); + + *bits_left = le16_to_cpu(gd->bg_free_bits_count); + +out: + brelse(group_bh); + + return ret; +} + static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, u32 bits_wanted, u32 min_bits, u16 *bit_off, unsigned int *num_bits, - u64 *bg_blkno) + u64 *bg_blkno, + u16 *bits_left) { int status; u16 chain, tmp_bits; @@ -1173,6 +1258,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, (unsigned long long)fe->i_blkno); *bg_blkno = le64_to_cpu(bg->bg_blkno); + *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: if (group_bh) brelse(group_bh); @@ -1194,6 +1280,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, { int status; u16 victim, i; + u16 bits_left = 0; + u64 hint_blkno = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1220,6 +1308,28 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } + if (hint_blkno) { + /* Attempt to short-circuit the usual search mechanism + * by jumping straight to the most recently used + * allocation group. This helps us mantain some + * contiguousness across allocations. */ + status = ocfs2_search_one_group(ac, bits_wanted, min_bits, + bit_off, num_bits, + hint_blkno, &bits_left); + if (!status) { + /* Be careful to update *bg_blkno here as the + * caller is expecting it to be filled in, and + * ocfs2_search_one_group() won't do that for + * us. */ + *bg_blkno = hint_blkno; + goto set_hint; + } + if (status < 0 && status != -ENOSPC) { + mlog_errno(status); + goto bail; + } + } + cl = (struct ocfs2_chain_list *) &fe->id2.i_chain; victim = ocfs2_find_victim_chain(cl); @@ -1227,9 +1337,9 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_allow_chain_relink = 1; status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off, - num_bits, bg_blkno); + num_bits, bg_blkno, &bits_left); if (!status) - goto bail; + goto set_hint; if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1251,8 +1361,8 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, ac->ac_chain = i; status = ocfs2_search_chain(ac, bits_wanted, min_bits, - bit_off, num_bits, - bg_blkno); + bit_off, num_bits, bg_blkno, + &bits_left); if (!status) break; if (status < 0 && status != -ENOSPC) { @@ -1260,8 +1370,19 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, goto bail; } } -bail: +set_hint: + if (status != -ENOSPC) { + /* If the next search of this group is not likely to + * yield a suitable extent, then we reset the last + * group hint so as to not waste a disk read */ + if (bits_left < min_bits) + ac->ac_last_group = 0; + else + ac->ac_last_group = *bg_blkno; + } + +bail: mlog_exit(status); return status; } @@ -1415,7 +1536,7 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, { int status; unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given; - u64 bg_blkno; + u64 bg_blkno = 0; u16 bg_bit_off; mlog_entry_void(); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a76c82a7ceac..c787838d1052 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -49,6 +49,8 @@ struct ocfs2_alloc_context { u16 ac_chain; int ac_allow_chain_relink; group_search_t *ac_group_search; + + u64 ac_last_group; }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); -- cgit v1.2.3 From 0e1edbd99994270023cea5afe593f972eb09a778 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Thu, 10 Aug 2006 14:40:41 +1000 Subject: [XFS] Fix xfs_free_extent related NULL pointer dereference. We recently fixed an out-of-space deadlock in XFS, and part of that fix involved the addition of the XFS_ALLOC_FLAG_FREEING flag to some of the space allocator calls to indicate they're freeing space, not allocating it. There was a missed xfs_alloc_fix_freelist condition test that did not correctly test "flags". The same test would also test an uninitialised structure field (args->userdata) and depending on its value either would or would not return early with a critical buffer pointer set to NULL. This fixes that up, adds asserts to several places to catch future botches of this nature, and skips sections of xfs_alloc_fix_freelist that are irrelevent for the space-freeing case. SGI-PV: 955303 SGI-Modid: xfs-linux-melb:xfs-kern:26743a Signed-off-by: Nathan Scott --- fs/xfs/xfs_alloc.c | 103 ++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index eef6763f3a67..d2bbcd882a69 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1835,40 +1835,47 @@ xfs_alloc_fix_freelist( &agbp))) return error; if (!pag->pagf_init) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } } else agbp = NULL; - /* If this is a metadata preferred pag and we are user data + /* + * If this is a metadata preferred pag and we are user data * then try somewhere else if we are not being asked to * try harder at this point */ - if (pag->pagf_metadata && args->userdata && flags) { + if (pag->pagf_metadata && args->userdata && + (flags & XFS_ALLOC_FLAG_TRYLOCK)) { + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); - if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (!(flags & XFS_ALLOC_FLAG_FREEING) && - (int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < - (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + need = XFS_MIN_FREELIST_PAG(pag, mp); + delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; + /* + * If it looks like there isn't a long enough extent, or enough + * total blocks, reject it. + */ + longest = (pag->pagf_longest > delta) ? + (pag->pagf_longest - delta) : + (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(pag->pagf_freeblks + pag->pagf_flcount - + need - args->total) < (int)args->minleft)) { + if (agbp) + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } } + /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. @@ -1878,6 +1885,8 @@ xfs_alloc_fix_freelist( &agbp))) return error; if (agbp == NULL) { + ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); + ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); args->agbp = NULL; return 0; } @@ -1887,22 +1896,24 @@ xfs_alloc_fix_freelist( */ agf = XFS_BUF_TO_AGF(agbp); need = XFS_MIN_FREELIST(agf, mp); - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; /* * If there isn't enough total or single-extent, reject it. */ - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? (longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if (args->minlen + args->alignment + args->minalignslop - 1 > longest || - (!(flags & XFS_ALLOC_FLAG_FREEING) && - (int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + if (!(flags & XFS_ALLOC_FLAG_FREEING)) { + delta = need > be32_to_cpu(agf->agf_flcount) ? + (need - be32_to_cpu(agf->agf_flcount)) : 0; + longest = be32_to_cpu(agf->agf_longest); + longest = (longest > delta) ? (longest - delta) : + (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); + if ((args->minlen + args->alignment + args->minalignslop - 1) > + longest || + ((int)(be32_to_cpu(agf->agf_freeblks) + + be32_to_cpu(agf->agf_flcount) - need - args->total) < + (int)args->minleft)) { + xfs_trans_brelse(tp, agbp); + args->agbp = NULL; + return 0; + } } /* * Make the freelist shorter if it's too long. @@ -1950,12 +1961,11 @@ xfs_alloc_fix_freelist( * on a completely full ag. */ if (targs.agbno == NULLAGBLOCK) { - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; - } - break; + if (flags & XFS_ALLOC_FLAG_FREEING) + break; + xfs_trans_brelse(tp, agflbp); + args->agbp = NULL; + return 0; } /* * Put each allocated block on the list. @@ -2442,31 +2452,26 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len) /* length of extent */ { -#ifdef DEBUG - xfs_agf_t *agf; /* a.g. freespace header */ -#endif - xfs_alloc_arg_t args; /* allocation argument structure */ + xfs_alloc_arg_t args; int error; ASSERT(len != 0); + memset(&args, 0, sizeof(xfs_alloc_arg_t)); args.tp = tp; args.mp = tp->t_mountp; args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - args.alignment = 1; - args.minlen = args.minleft = args.minalignslop = 0; down_read(&args.mp->m_peraglock); args.pag = &args.mp->m_perag[args.agno]; if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG ASSERT(args.agbp != NULL); - agf = XFS_BUF_TO_AGF(args.agbp); - ASSERT(args.agbno + len <= be32_to_cpu(agf->agf_length)); + ASSERT((args.agbno + len) <= + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); #endif - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, - len, 0); + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: up_read(&args.mp->m_peraglock); return error; -- cgit v1.2.3 From 1725cd0ae07bb31f68803edcc5bdc99952c7d2f4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 13 Aug 2006 23:24:17 -0700 Subject: [PATCH] adfs error message fix Don't use NULL as a printf control string. Fixes bug #6889. Cc: Ralph Corderoy Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/adfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/adfs/super.c b/fs/adfs/super.c index ba1c88af49fe..82011019494c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -308,7 +308,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di if (adfs_checkmap(sb, dm)) return dm; - adfs_error(sb, NULL, "map corrupted"); + adfs_error(sb, "map corrupted"); error_free: while (--zone >= 0) -- cgit v1.2.3 From 95f8797f42b058333d1e6f0d1dcd8edf5dc6c244 Mon Sep 17 00:00:00 2001 From: Dan Bastone Date: Sun, 13 Aug 2006 23:24:18 -0700 Subject: [PATCH] initialize parts of udf inode earlier in create Eric says: > I saw an oops down this path when trying to create a new file on a UDF > filesystem which was internally marked as readonly, but mounted rw: > > udf_create > udf_new_inode > new_inode > alloc_inode > udf_alloc_inode > udf_new_block > returns EIO due to readonlyness > iput (on error) I ran into the same issue today, but when listing a directory with invalid/corrupt entries: udf_lookup udf_iget get_new_inode_fast alloc_inode udf_alloc_inode __udf_read_inode fails for any reason iput (on error) ... The following patch to udf_alloc_inode() should take care of both (and other similar) cases, but I've only tested it with udf_lookup(). Signed-off-by: Dan Bastone Cc: Eric Sandeen Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/udf/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 4df822c881b6..7de172efa084 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -115,6 +115,13 @@ static struct inode *udf_alloc_inode(struct super_block *sb) ei = (struct udf_inode_info *)kmem_cache_alloc(udf_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; + + ei->i_unique = 0; + ei->i_lenExtents = 0; + ei->i_next_alloc_block = 0; + ei->i_next_alloc_goal = 0; + ei->i_strat4096 = 0; + return &ei->vfs_inode; } -- cgit v1.2.3 From 1d7ea7324ae7a59f8e17e4ba76a2707c1e6f24d2 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sun, 13 Aug 2006 23:24:27 -0700 Subject: [PATCH] fuse: fix error case in fuse_readpages Don't let fuse_readpages leave the @pages list not empty when exiting on error. [akpm@osdl.org: kernel-doc fixes] Signed-off-by: Alexander Zarochentsev Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/fuse/file.c | 10 ++++++++-- include/linux/mm.h | 1 + mm/swap.c | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 63614ed16336..5c4fcd1dbf59 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -395,14 +395,16 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_readpages_data data; int err; + err = -EIO; if (is_bad_inode(inode)) - return -EIO; + goto clean_pages_up; data.file = file; data.inode = inode; data.req = fuse_get_req(fc); + err = PTR_ERR(data.req); if (IS_ERR(data.req)) - return PTR_ERR(data.req); + goto clean_pages_up; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -412,6 +414,10 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, fuse_put_request(fc, data.req); } return err; + +clean_pages_up: + put_pages_list(pages); + return err; } static size_t fuse_send_write(struct fuse_req *req, struct file *file, diff --git a/include/linux/mm.h b/include/linux/mm.h index 990957e0929f..f0b135cd86da 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -336,6 +336,7 @@ static inline void init_page_count(struct page *page) } void put_page(struct page *page); +void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/mm/swap.c b/mm/swap.c index 8fd095c4ae51..687686a61f7c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the -- cgit v1.2.3 From 74361cb6828398a96167b3234e186fbd731e5f30 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Aug 2006 08:54:48 -0700 Subject: [PATCH] fcntl(F_SETSIG) fix fcntl(F_SETSIG) no longer works on leases because lease_release_private_callback() gets called as the lease is copied in order to initialise it. The problem is that lease_alloc() performs an unnecessary initialisation, which sets the lease_manager_ops. Avoid the problem by allocating the target lease structure using locks_alloc_lock(). Signed-off-by: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/locks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index b0b41a64e10b..d7c53392cac1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1421,8 +1421,9 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) if (!leases_enable) goto out; - error = lease_alloc(filp, arg, &fl); - if (error) + error = -ENOMEM; + fl = locks_alloc_lock(); + if (fl == NULL) goto out; locks_copy_lock(fl, lease); @@ -1430,6 +1431,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) locks_insert_lock(before, fl); *flp = fl; + error = 0; out: return error; } -- cgit v1.2.3 From 78bd4d484f81a611ef6ff02f909e576cb9aac7f2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 08:33:23 +0200 Subject: [PATCH] sys_ioprio_set: minor do_each_thread+break fix From include/linux/sched.h: * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/ioprio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 93aa5715f224..3db31038e9ab 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -111,9 +111,9 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio) continue; ret = set_task_ioprio(p, ioprio); if (ret) - break; + goto free_uid; } while_each_thread(g, p); - +free_uid: if (who) free_uid(user); break; -- cgit v1.2.3 From 9f83e45eb54fc7198dc59fc63255341851ba4c48 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 08:34:15 +0200 Subject: [PATCH] Fix current_io_context() vs set_task_ioprio() race I know nothing about io scheduler, but I suspect set_task_ioprio() is not safe. current_io_context() initializes "struct io_context", then sets ->io_context. set_task_ioprio() running on another cpu may see the changes out of order, so ->set_ioprio(ioc) may use io_context which was not initialized properly. Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- block/ll_rw_blk.c | 2 ++ fs/ioprio.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'fs') diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 61d6b3c65b66..ddd9253f9d55 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3628,6 +3628,8 @@ struct io_context *current_io_context(gfp_t gfp_flags) ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; ret->cic_root.rb_node = NULL; + /* make sure set_task_ioprio() sees the settings above */ + smp_wmb(); tsk->io_context = ret; } diff --git a/fs/ioprio.c b/fs/ioprio.c index 3db31038e9ab..06578311c63f 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -44,6 +44,9 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task->ioprio = ioprio; ioc = task->io_context; + /* see wmb() in current_io_context() */ + smp_read_barrier_depends(); + if (ioc && ioc->set_ioprio) ioc->set_ioprio(ioc, ioprio); -- cgit v1.2.3 From e014ff8d4285b81f0de0719d8eee72bc50bfd4be Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2006 10:02:50 +0200 Subject: [PATCH] uninline ioprio_best() Saves 376 bytes (5 callers) for me. Signed-off-by: Oleg Nesterov Signed-off-by: Jens Axboe --- fs/ioprio.c | 23 +++++++++++++++++++++++ include/linux/ioprio.h | 23 +---------------------- 2 files changed, 24 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ioprio.c b/fs/ioprio.c index 06578311c63f..78b1deae3fa2 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -140,6 +140,29 @@ out: return ret; } +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); + unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + + if (!ioprio_valid(aprio)) + return bprio; + if (!ioprio_valid(bprio)) + return aprio; + + if (aclass == IOPRIO_CLASS_NONE) + aclass = IOPRIO_CLASS_BE; + if (bclass == IOPRIO_CLASS_NONE) + bclass = IOPRIO_CLASS_BE; + + if (aclass == bclass) + return min(aprio, bprio); + if (aclass > bclass) + return bprio; + else + return aprio; +} + asmlinkage long sys_ioprio_get(int which, int who) { struct task_struct *g, *p; diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 88d5961f7a3f..8e2042b9d471 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -59,27 +59,6 @@ static inline int task_nice_ioprio(struct task_struct *task) /* * For inheritance, return the highest of the two given priorities */ -static inline int ioprio_best(unsigned short aprio, unsigned short bprio) -{ - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); - - if (!ioprio_valid(aprio)) - return bprio; - if (!ioprio_valid(bprio)) - return aprio; - - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; - - if (aclass == bclass) - return min(aprio, bprio); - if (aclass > bclass) - return bprio; - else - return aprio; -} +extern int ioprio_best(unsigned short aprio, unsigned short bprio); #endif -- cgit v1.2.3 From 00a2b0f6dd2372842df73de72d51621b539fea44 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2006 13:56:26 +0200 Subject: Fix possible UDF deadlock and memory corruption (CVE-2006-4145) UDF code is not really ready to handle extents larger that 1GB. This is the easy way to forbid creating those. Also truncation code did not count with the case when there are no extents in the file and we are extending the file. Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/super.c | 2 +- fs/udf/truncate.c | 64 +++++++++++++++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/udf/super.c b/fs/udf/super.c index 7de172efa084..fcce1a21a51b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1659,7 +1659,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) iput(inode); goto error_out; } - sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_maxbytes = 1<<30; return 0; error_out: diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index e1b0e8cfecb4..0abd66ce36ea 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -239,37 +239,51 @@ void udf_truncate_extents(struct inode * inode) { if (offset) { - extoffset -= adsize; - etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1); - if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) - { - extoffset -= adsize; - elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset); - udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0); + /* + * OK, there is not extent covering inode->i_size and + * no extent above inode->i_size => truncate is + * extending the file by 'offset'. + */ + if ((!bh && extoffset == udf_file_entry_alloc_offset(inode)) || + (bh && extoffset == sizeof(struct allocExtDesc))) { + /* File has no extents at all! */ + memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; + udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } - else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) - { - kernel_lb_addr neloc = { 0, 0 }; + else { extoffset -= adsize; - nelen = EXT_NOT_RECORDED_NOT_ALLOCATED | - ((elen + offset + inode->i_sb->s_blocksize - 1) & - ~(inode->i_sb->s_blocksize - 1)); - udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); - udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1); - } - else - { - if (elen & (inode->i_sb->s_blocksize - 1)) + etype = udf_next_aext(inode, &bloc, &extoffset, &eloc, &elen, &bh, 1); + if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) + { + extoffset -= adsize; + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | (elen + offset); + udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 0); + } + else if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { + kernel_lb_addr neloc = { 0, 0 }; extoffset -= adsize; - elen = EXT_RECORDED_ALLOCATED | - ((elen + inode->i_sb->s_blocksize - 1) & + nelen = EXT_NOT_RECORDED_NOT_ALLOCATED | + ((elen + offset + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); - udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1); + udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); + udf_add_aext(inode, &bloc, &extoffset, eloc, (etype << 30) | elen, &bh, 1); + } + else + { + if (elen & (inode->i_sb->s_blocksize - 1)) + { + extoffset -= adsize; + elen = EXT_RECORDED_ALLOCATED | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + udf_write_aext(inode, bloc, &extoffset, eloc, elen, bh, 1); + } + memset(&eloc, 0x00, sizeof(kernel_lb_addr)); + elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; + udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } - memset(&eloc, 0x00, sizeof(kernel_lb_addr)); - elen = EXT_NOT_RECORDED_NOT_ALLOCATED | offset; - udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &bh, 1); } } } -- cgit v1.2.3 From ddeff520f02b92128132c282c350fa72afffb84a Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Wed, 9 Aug 2006 13:53:47 -0400 Subject: NFS: Fix a potential deadlock in nfs_release_page nfs_wb_page() waits on request completion and, as a result, is not safe to be called from nfs_release_page() invoked by VM scanner as part of GFP_NOFS allocation. Fix possible deadlock by analyzing gfp mask and refusing to release page if __GFP_FS is not set. Signed-off-by: Nikita Danilov Signed-off-by: Trond Myklebust (cherry picked from 374d969debfb290bafcb41d28918dc6f7e43ce31 commit) --- fs/nfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc2b874ad5a4..48e892880d5b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -312,7 +312,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { - return !nfs_wb_page(page->mapping->host, page); + if (gfp & __GFP_FS) + return !nfs_wb_page(page->mapping->host, page); + else + /* + * Avoid deadlock on nfs_wait_on_request(). + */ + return 0; } const struct address_space_operations nfs_file_aops = { -- cgit v1.2.3 From a634904a7de0d3a0bc606f608007a34e8c05bfee Mon Sep 17 00:00:00 2001 From: ASANO Masahiro Date: Tue, 22 Aug 2006 20:06:02 -0400 Subject: VFS: add lookup hint for network file systems I'm trying to speeding up mkdir(2) for network file systems. A typical mkdir(2) calls two inode_operations: lookup and mkdir. The lookup operation would fail with ENOENT in common case. I think it is unnecessary because the subsequent mkdir operation can check it. In case of creat(2), lookup operation is called with the LOOKUP_CREATE flag, so individual filesystem can omit real lookup. e.g. nfs_lookup(). Here is a sample patch which uses LOOKUP_CREATE and O_EXCL on mkdir, symlink and mknod. This uses the gadget for creat(2). And here is the result of a benchmark on NFSv3. mkdir(2) 10,000 times: original 50.5 sec patched 29.0 sec Signed-off-by: ASANO Masahiro Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust (cherry picked from fab7bf44449b29f9d5572a5dd8adcf7c91d5bf0f commit) --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 55a131230f94..863166441bf3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1767,6 +1767,8 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; + nd->flags |= LOOKUP_CREATE; + nd->intent.open.flags = O_EXCL; /* * Do the final lookup. -- cgit v1.2.3 From 5d67476fff2df6ff12f60b540fd0e74cf2a668f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 31 Jul 2006 14:11:48 -0700 Subject: SUNRPC: make rpc_unlink() take a dentry argument instead of a path Signe-off-by: Trond Myklebust (cherry picked from 88bf6d811b01a4be7fd507d18bf5f1c527989089 commit) --- fs/nfs/idmap.c | 3 +-- include/linux/sunrpc/rpc_pipe_fs.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/rpc_pipe.c | 20 ++++++-------------- 4 files changed, 9 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index b81e7ed3c902..df0be1214358 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -130,9 +130,8 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; + rpc_unlink(idmap->idmap_dentry); dput(idmap->idmap_dentry); - idmap->idmap_dentry = NULL; - rpc_unlink(idmap->idmap_path); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 2c2189cb30aa..04d2767d5ef7 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -44,7 +44,7 @@ extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); extern int rpc_rmdir(char *); extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); -extern int rpc_unlink(char *); +extern int rpc_unlink(struct dentry *); extern struct vfsmount *rpc_get_mount(void); extern void rpc_put_mount(void); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4a9aa9393b97..beaa7b848246 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -718,7 +718,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->path); + rpc_unlink(gss_auth->dentry); dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a3bd2db2e024..9144f2767b66 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -746,22 +746,15 @@ err_dput: } int -rpc_unlink(char *path) +rpc_unlink(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; + struct dentry *parent; struct inode *dir; - int error; + int error = 0; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; + parent = dget_parent(dentry); + dir = parent->d_inode; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.dentry, nd.last.len); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); @@ -769,9 +762,8 @@ rpc_unlink(char *path) } dput(dentry); inode_dir_notify(dir, DN_DELETE); -out_release: mutex_unlock(&dir->i_mutex); - rpc_release_path(&nd); + dput(parent); return error; } -- cgit v1.2.3 From 8f8e7a50f450fcb86a5b2ffb94543c57a14f8260 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Aug 2006 13:11:15 -0400 Subject: SUNRPC: Fix dentry refcounting issues with users of rpc_pipefs rpc_unlink() and rpc_rmdir() will dput the dentry reference for you. Signed-off-by: Trond Myklebust (cherry picked from a05a57effa71a1f67ccbfc52335c10c8b85f3f6a commit) --- fs/nfs/idmap.c | 1 - net/sunrpc/auth_gss/auth_gss.c | 1 - net/sunrpc/clnt.c | 15 ++++++--------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index df0be1214358..07a5dd57646e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -131,7 +131,6 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; rpc_unlink(idmap->idmap_dentry); - dput(idmap->idmap_dentry); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index beaa7b848246..ef1cf5b476c8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -719,7 +719,6 @@ gss_destroy(struct rpc_auth *auth) gss_auth = container_of(auth, struct gss_auth, rpc_auth); rpc_unlink(gss_auth->dentry); - dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d307556872db..d9eac7069101 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,7 +184,6 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, out_no_auth: if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); - dput(clnt->cl_dentry); rpc_put_mount(); } out_no_path: @@ -251,10 +250,8 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; - if (!IS_ERR(new->cl_dentry)) { + if (!IS_ERR(new->cl_dentry)) dget(new->cl_dentry); - rpc_get_mount(); - } rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); @@ -317,11 +314,15 @@ rpc_destroy_client(struct rpc_clnt *clnt) clnt->cl_auth = NULL; } if (clnt->cl_parent != clnt) { + if (!IS_ERR(clnt->cl_dentry)) + dput(clnt->cl_dentry); rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (!IS_ERR(clnt->cl_dentry)) + if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); + rpc_put_mount(); + } if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; @@ -331,10 +332,6 @@ rpc_destroy_client(struct rpc_clnt *clnt) out_free: rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; - if (!IS_ERR(clnt->cl_dentry)) { - dput(clnt->cl_dentry); - rpc_put_mount(); - } kfree(clnt); return 0; } -- cgit v1.2.3 From 01df9c5e918ae5559f2d96da0143f8bfbb9e6171 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Aug 2006 11:58:57 -0400 Subject: LOCKD: Fix a deadlock in nlm_traverse_files() nlm_traverse_files() is not allowed to hold the nlm_file_mutex while calling nlm_inspect file, since it may end up calling nlm_release_file() when releaseing the blocks. Signed-off-by: Trond Myklebust (cherry picked from e558d3cde986e04f68afe8c790ad68ef4b94587a commit) --- fs/lockd/svcsubs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2a4df9b3779a..01b4db9e5466 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -237,19 +237,22 @@ static int nlm_traverse_files(struct nlm_host *host, int action) { struct nlm_file *file, **fp; - int i; + int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { fp = nlm_files + i; while ((file = *fp) != NULL) { + file->f_count++; + mutex_unlock(&nlm_file_mutex); + /* Traverse locks, blocks and shares of this file * and update file->f_locks count */ - if (nlm_inspect_file(host, file, action)) { - mutex_unlock(&nlm_file_mutex); - return 1; - } + if (nlm_inspect_file(host, file, action)) + ret = 1; + mutex_lock(&nlm_file_mutex); + file->f_count--; /* No more references to this file. Let go of it. */ if (!file->f_blocks && !file->f_locks && !file->f_shares && !file->f_count) { @@ -262,7 +265,7 @@ nlm_traverse_files(struct nlm_host *host, int action) } } mutex_unlock(&nlm_file_mutex); - return 0; + return ret; } /* -- cgit v1.2.3 From 79558f3610efd7928e8882b2eaca3093b283630e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 13:44:32 -0400 Subject: NFS: Fix issue with EIO on NFS read The problem is that we may be caching writes that would extend the file and create a hole in the region that we are reading. In this case, we need to detect the eof from the server, ensure that we zero out the pages that are part of the hole and mark them as up to date. Signed-off-by: Trond Myklebust (cherry picked from 856b603b01b99146918c093969b6cb1b1b0f1c01 commit) --- fs/nfs/read.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 65c0c5b32351..da9cf11c326f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -116,10 +116,17 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; base &= ~PAGE_CACHE_MASK; pglen = PAGE_CACHE_SIZE - base; - if (pglen < remainder) + for (;;) { + if (remainder <= pglen) { + memclear_highpage_flush(*pages, base, remainder); + break; + } memclear_highpage_flush(*pages, base, pglen); - else - memclear_highpage_flush(*pages, base, remainder); + pages++; + remainder -= pglen; + pglen = PAGE_CACHE_SIZE; + base = 0; + } } /* @@ -476,6 +483,8 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) unsigned int base = data->args.pgbase; struct page **pages; + if (data->res.eof) + count = data->args.count; if (unlikely(count == 0)) return; pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; @@ -483,11 +492,7 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageUptodate(*pages); - /* - * Was this an eof or a short read? If the latter, don't mark the page - * as uptodate yet. - */ - if (count > 0 && (data->res.eof || data->args.count == data->res.count)) + if (count != 0) SetPageUptodate(*pages); } @@ -502,6 +507,8 @@ static void nfs_readpage_set_pages_error(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageError(*pages); + if (count != 0) + SetPageError(*pages); } /* -- cgit v1.2.3 From e8896495bca8490a427409e0886d63d05419ec65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 24 Aug 2006 15:44:19 -0400 Subject: NFS: Check lengths more thoroughly in NFS4 readdir XDR decode Check the bounds of length specifiers more thoroughly in the XDR decoding of NFS4 readdir reply data. Currently, if the server returns a bitmap or attr length that causes the current decode point pointer to wrap, this could go undetected (consider a small "negative" length on a 32-bit machine). Also add a check into the main XDR decode handler to make sure that the amount of data is a multiple of four bytes (as specified by RFC-1014). This makes sure that we can do u32* pointer subtraction in the NFS client without risking an undefined result (the result is undefined if the pointers are not correctly aligned with respect to one another). Signed-Off-By: David Howells Signed-off-by: Trond Myklebust (cherry picked from 5861fddd64a7eaf7e8b1a9997455a24e7f688092 commit) --- fs/nfs/nfs4xdr.c | 21 +++++++++++---------- net/sunrpc/clnt.c | 11 +++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1750d996f49f..730ec8fb31c6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3355,7 +3355,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n struct kvec *iov = rcvbuf->head; unsigned int nr, pglen = rcvbuf->page_len; uint32_t *end, *entry, *p, *kaddr; - uint32_t len, attrlen; + uint32_t len, attrlen, xlen; int hdrlen, recvd, status; status = decode_op_hdr(xdr, OP_READDIR); @@ -3377,10 +3377,10 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0); - end = (uint32_t *) ((char *)p + pglen + readdir->pgbase); + end = p + ((pglen + readdir->pgbase) >> 2); entry = p; for (nr = 0; *p++; nr++) { - if (p + 3 > end) + if (end - p < 3) goto short_pkt; dprintk("cookie = %Lu, ", *((unsigned long long *)p)); p += 2; /* cookie */ @@ -3389,18 +3389,19 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); goto err_unmap; } - dprintk("filename = %*s\n", len, (char *)p); - p += XDR_QUADLEN(len); - if (p + 1 > end) + xlen = XDR_QUADLEN(len); + if (end - p < xlen + 1) goto short_pkt; + dprintk("filename = %*s\n", len, (char *)p); + p += xlen; len = ntohl(*p++); /* bitmap length */ - p += len; - if (p + 1 > end) + if (end - p < len + 1) goto short_pkt; + p += len; attrlen = XDR_QUADLEN(ntohl(*p++)); - p += attrlen; /* attributes */ - if (p + 2 > end) + if (end - p < attrlen + 2) goto short_pkt; + p += attrlen; /* attributes */ entry = p; } if (!nr && (entry[0] != 0 || entry[1] == 0)) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d9eac7069101..3e19d321067a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1181,6 +1181,17 @@ call_verify(struct rpc_task *task) u32 *p = iov->iov_base, n; int error = -EACCES; + if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) { + /* RFC-1014 says that the representation of XDR data must be a + * multiple of four bytes + * - if it isn't pointer subtraction in the NFS client may give + * undefined results + */ + printk(KERN_WARNING + "call_verify: XDR representation not a multiple of" + " 4 bytes: 0x%x\n", task->tk_rqstp->rq_rcv_buf.len); + goto out_eio; + } if ((len -= 3) < 0) goto out_overflow; p += 1; /* skip XID */ -- cgit v1.2.3 From 16b4289c7460ba9c04af40c574949dcca9029658 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 24 Aug 2006 12:27:15 -0400 Subject: NFSv4: Add v4 exception handling for the ACL functions. This is needed in order to handle any NFS4ERR_DELAY errors that might be returned by the server. It also ensures that we map the NFSv4 errors before they are returned to userland. Signed-off-by: Trond Myklebust (cherry picked from 71c12b3f0abc7501f6ed231a6d17bc9c05a238dc commit) --- fs/nfs/nfs4proc.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e6ee97f19d81..153898e1331f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2668,7 +2668,7 @@ out: nfs4_set_cached_acl(inode, acl); } -static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_getaclargs args = { @@ -2721,6 +2721,19 @@ out_free: return ret; } +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); @@ -2737,7 +2750,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) return nfs4_get_acl_uncached(inode, buf, buflen); } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; @@ -2763,6 +2776,18 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen return ret; } +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + __nfs4_proc_set_acl(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) { -- cgit v1.2.3 From a343bb7750e6a098909c34f5c5dfddbc4fa40053 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: VFS: Fix access("file", X_OK) in the presence of ACLs Currently, the access() call will return incorrect information on NFS if there exists an ACL that grants execute access to the user on a regular file. The reason the information is incorrect is that the VFS overrides this execute access in open_exec() by checking (inode->i_mode & 0111). This patch propagates the VFS execute bit check back into the generic permission() call. Signed-off-by: Trond Myklebust (cherry picked from 64cbae98848c4c99851cb0a405f0b4982cd76c1e commit) --- fs/namei.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 863166441bf3..432d6bc6fab0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,10 +227,10 @@ int generic_permission(struct inode *inode, int mask, int permission(struct inode *inode, int mask, struct nameidata *nd) { + umode_t mode = inode->i_mode; int retval, submask; if (mask & MAY_WRITE) { - umode_t mode = inode->i_mode; /* * Nobody gets write access to a read-only fs. @@ -247,6 +247,13 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) } + /* + * MAY_EXEC on regular files requires special handling: We override + * filesystem execute permissions if the mode bits aren't set. + */ + if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO)) + return -EACCES; + /* Ordinary permission routines do not understand MAY_APPEND. */ submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) -- cgit v1.2.3 From 9167b0b9a0ab7907191523f5a0528e3b9c288e21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: VFS: Remove redundant open-coded mode bit check in prepare_binfmt(). The check in prepare_binfmt() for inode->i_mode & 0111 is redundant, since open_exec() will already have done that. Signed-off-by: Trond Myklebust (cherry picked from 822dec482ced07af32c378cd936d77345786572b commit) --- fs/exec.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 8344ba73a2a6..a6f64a98ac50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -922,12 +922,6 @@ int prepare_binprm(struct linux_binprm *bprm) int retval; mode = inode->i_mode; - /* - * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, - * generic_permission lets a non-executable through - */ - if (!(mode & 0111)) /* with at least _one_ execute bit set */ - return -EACCES; if (bprm->file->f_op == NULL) return -EACCES; -- cgit v1.2.3 From a969fd5a4e162c4485ae8f3e49d674656a18fa36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:04 -0400 Subject: VFS: Remove redundant open-coded mode bit checks in open_exec(). The check in open_exec() for inode->i_mode & 0111 has been made redundant by the fix to permission(). Signed-off-by: Trond Myklebust (cherry picked from 1d3741c5d991686699f100b65b9956f7ee7ae0ae commit) --- fs/exec.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index a6f64a98ac50..f7aabfeca033 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -486,8 +486,6 @@ struct file *open_exec(const char *name) if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && S_ISREG(inode->i_mode)) { int err = vfs_permission(&nd, MAY_EXEC); - if (!err && !(inode->i_mode & 0111)) - err = -EACCES; file = ERR_PTR(err); if (!err) { file = nameidata_to_filp(&nd, O_RDONLY); -- cgit v1.2.3 From 81a42d298d8bd1b96be4bd459494f25fdd99b594 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 25 Aug 2006 15:58:57 -0700 Subject: [DISKLABEL] SUN: Fix signed int usage for sector count The current sun disklabel code uses a signed int for the sector count. When partitions larger than 1 TB are used, the cast to a sector_t causes the partition sizes to be invalid: # cat /proc/paritions | grep sdan 66 112 2146435072 sdan 66 115 9223372036853660736 sdan3 66 120 9223372036853660736 sdan8 This patch switches the sector count to an unsigned int to fix this. Signed-off-by: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- fs/partitions/sun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c index abe91ca03edf..0a5927c806ca 100644 --- a/fs/partitions/sun.c +++ b/fs/partitions/sun.c @@ -74,7 +74,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev) spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); for (i = 0; i < 8; i++, p++) { unsigned long st_sector; - int num_sectors; + unsigned int num_sectors; st_sector = be32_to_cpu(p->start_cylinder) * spc; num_sectors = be32_to_cpu(p->num_sectors); -- cgit v1.2.3 From 6946bd636364effce06ea46fe8f8cd6e2edb004e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 27 Aug 2006 01:23:31 -0700 Subject: [PATCH] lockdep: fix blkdev_open() warning On Wed, 2006-08-09 at 07:57 +0200, Rolf Eike Beer wrote: > ============================================= > [ INFO: possible recursive locking detected ] > --------------------------------------------- > parted/7929 is trying to acquire lock: > (&bdev->bd_mutex){--..}, at: [] __blkdev_put+0x1e/0x13c > > but task is already holding lock: > (&bdev->bd_mutex){--..}, at: [] do_open+0x72/0x3a8 > > other info that might help us debug this: > 1 lock held by parted/7929: > #0: (&bdev->bd_mutex){--..}, at: [] do_open+0x72/0x3a8 > stack backtrace: > [] show_trace_log_lvl+0x58/0x15b > [] show_trace+0xd/0x10 > [] dump_stack+0x17/0x1a > [] __lock_acquire+0x753/0x99c > [] lock_acquire+0x4a/0x6a > [] mutex_lock_nested+0xc8/0x20c > [] __blkdev_put+0x1e/0x13c > [] blkdev_put+0xa/0xc > [] do_open+0x336/0x3a8 > [] blkdev_open+0x1f/0x4c > [] __dentry_open+0xc7/0x1aa > [] nameidata_to_filp+0x1c/0x2e > [] do_filp_open+0x2e/0x35 > [] do_sys_open+0x38/0x68 > [] sys_open+0x16/0x18 > [] sysenter_past_esp+0x56/0x8d OK, I'm having a look here; its all new to me so bear with me. blkdev_open() calls do_open(bdev, ...,BD_MUTEX_NORMAL) and takes mutex_lock_nested(&bdev->bd_mutex, BD_MUTEX_NORMAL) then something fails, and we're thrown to: out_first: where if (bdev != bdev->bd_contains) blkdev_put(bdev->bd_contains) which is __blkdev_put(bdev->bd_contains, BD_MUTEX_NORMAL) which does mutex_lock_nested(&bdev->bd_contains->bd_mutex, BD_MUTEX_NORMAL) <--- lockdep trigger When going to out_first, dbev->bd_contains is either bdev or whole, and since we take the branch it must be whole. So it seems to me the following patch would be the right one: [akpm@osdl.org: compile fix] Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Acked-by: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 114 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 56 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 37534573960b..045f98854f14 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -884,6 +884,61 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); +static int __blkdev_put(struct block_device *bdev, unsigned int subclass) +{ + int ret = 0; + struct inode *bd_inode = bdev->bd_inode; + struct gendisk *disk = bdev->bd_disk; + + mutex_lock_nested(&bdev->bd_mutex, subclass); + lock_kernel(); + if (!--bdev->bd_openers) { + sync_blockdev(bdev); + kill_bdev(bdev); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(bd_inode, NULL); + } else { + mutex_lock_nested(&bdev->bd_contains->bd_mutex, + subclass + 1); + bdev->bd_contains->bd_part_count--; + mutex_unlock(&bdev->bd_contains->bd_mutex); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + + if (bdev->bd_contains != bdev) { + kobject_put(&bdev->bd_part->kobj); + bdev->bd_part = NULL; + } + bdev->bd_disk = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, subclass + 1); + bdev->bd_contains = NULL; + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} + +int blkdev_put(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_NORMAL); +} +EXPORT_SYMBOL(blkdev_put); + +int blkdev_put_partition(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_PARTITION); +} +EXPORT_SYMBOL(blkdev_put_partition); + static int blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags); @@ -980,7 +1035,7 @@ out_first: bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) - blkdev_put(bdev->bd_contains); + __blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE); bdev->bd_contains = NULL; put_disk(disk); module_put(owner); @@ -1079,63 +1134,6 @@ static int blkdev_open(struct inode * inode, struct file * filp) return res; } -static int __blkdev_put(struct block_device *bdev, unsigned int subclass) -{ - int ret = 0; - struct inode *bd_inode = bdev->bd_inode; - struct gendisk *disk = bdev->bd_disk; - - mutex_lock_nested(&bdev->bd_mutex, subclass); - lock_kernel(); - if (!--bdev->bd_openers) { - sync_blockdev(bdev); - kill_bdev(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) - ret = disk->fops->release(bd_inode, NULL); - } else { - mutex_lock_nested(&bdev->bd_contains->bd_mutex, - subclass + 1); - bdev->bd_contains->bd_part_count--; - mutex_unlock(&bdev->bd_contains->bd_mutex); - } - if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->kobj); - bdev->bd_part = NULL; - } - bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, subclass + 1); - bdev->bd_contains = NULL; - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - return ret; -} - -int blkdev_put(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_NORMAL); -} - -EXPORT_SYMBOL(blkdev_put); - -int blkdev_put_partition(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_PARTITION); -} - -EXPORT_SYMBOL(blkdev_put_partition); - static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); -- cgit v1.2.3 From f5fb09fa3392ad43fbcfc2f4580752f383ab5996 Mon Sep 17 00:00:00 2001 From: Andries Brouwer Date: Sun, 27 Aug 2006 01:23:42 -0700 Subject: [PATCH] Fix for minix crash Mounting a (corrupt) minix filesystem with zero s_zmap_blocks gives a spectacular crash on my 2.6.17.8 system, no doubt because minix/inode.c does an unconditional minix_set_bit(0,sbi->s_zmap[0]->b_data); [akpm@osdl.org: make labels conistent while we're there] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 9ea91c5eeb7b..330ff9fc7cf0 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -204,6 +204,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) /* * Allocate the buffer map to keep the superblock small. */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); map = kmalloc(i, GFP_KERNEL); if (!map) @@ -263,7 +265,7 @@ out_no_root: out_no_bitmap: printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); - out_freemap: +out_freemap: for (i = 0; i < sbi->s_imap_blocks; i++) brelse(sbi->s_imap[i]); for (i = 0; i < sbi->s_zmap_blocks; i++) @@ -276,11 +278,16 @@ out_no_map: printk("MINIX-fs: can't allocate map\n"); goto out_release; +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + out_no_fs: if (!silent) printk("VFS: Can't find a Minix or Minix V2 filesystem " "on device %s\n", s->s_id); - out_release: +out_release: brelse(bh); goto out; @@ -290,7 +297,7 @@ out_bad_hblock: out_bad_sb: printk("MINIX-fs: unable to read superblock\n"); - out: +out: s->s_fs_info = NULL; kfree(sbi); return -EINVAL; -- cgit v1.2.3 From 607eb266aea9dd2abe515985e12c5cd8b32546e8 Mon Sep 17 00:00:00 2001 From: Andries Brouwer Date: Sun, 27 Aug 2006 01:23:43 -0700 Subject: [PATCH] ext2: prevent div-by-zero on corrupted fs Mounting an ext2 filesystem with zero s_inodes_per_group will cause a divide error. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f2702cda9779..681dea8f9532 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -775,7 +775,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_INODE_SIZE(sb) == 0) goto cantfind_ext2; sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) goto cantfind_ext2; sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; -- cgit v1.2.3 From 08fb306fe63d98eb86e3b16f4cc21816fa47f18e Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Sun, 27 Aug 2006 01:23:44 -0700 Subject: [PATCH] ext3 filesystem bogus ENOSPC with reservation fix To handle the earlier bogus ENOSPC error caused by filesystem full of block reservation, current code falls back to non block reservation, starts to allocate block(s) from the goal allocation block group as if there is no block reservation. Current code needs to re-load the corresponding block group descriptor for the initial goal block group in this case. The patch fixes this. Signed-off-by: Mingming Cao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/balloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a504a40d6d29..063d994bda0b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1269,12 +1269,12 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, goal = le32_to_cpu(es->s_first_data_block); group_no = (goal - le32_to_cpu(es->s_first_data_block)) / EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); if (!gdp) goto io_error; - goal_group = group_no; -retry: free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* * if there is not enough free blocks to make a new resevation @@ -1349,7 +1349,7 @@ retry: if (my_rsv) { my_rsv = NULL; group_no = goal_group; - goto retry; + goto retry_alloc; } /* No space left on the device */ *errp = -ENOSPC; -- cgit v1.2.3 From c37336b078ba9d2ff38c535b194996a7ad6e69f8 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sun, 27 Aug 2006 01:23:45 -0700 Subject: [PATCH] ufs: write to hole in big file On UFS, this scenario: open(O_TRUNC) lseek(1024 * 1024 * 80) write("A") lseek(1024 * 2) write("A") may cause access to invalid address. This happened because of "goal" is calculated in wrong way in block allocation path, as I see this problem exists also in 2.4. We use construction like this i_data[lastfrag], i_data array of pointers to direct blocks, indirect and so on, it has ceratain size ~20 elements, and lastfrag may have value for example 40000. Also this patch fixes related to handling such scenario issues, wrong zeroing metadata, in case of block(not fragment) allocation, and wrong goal calculation, when we allocate block Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/inode.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e7c8615beb65..30c6e8a9446c 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -169,18 +169,20 @@ static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh) static struct buffer_head * ufs_clear_frags(struct inode *inode, sector_t beg, - unsigned int n) + unsigned int n, sector_t want) { - struct buffer_head *res, *bh; + struct buffer_head *res = NULL, *bh; sector_t end = beg + n; - res = sb_getblk(inode->i_sb, beg); - ufs_clear_frag(inode, res); - for (++beg; beg < end; ++beg) { + for (; beg < end; ++beg) { bh = sb_getblk(inode->i_sb, beg); ufs_clear_frag(inode, bh); - brelse(bh); + if (want != beg) + brelse(bh); + else + res = bh; } + BUG_ON(!res); return res; } @@ -265,7 +267,9 @@ repeat: lastfrag = ufsi->i_lastfrag; } - goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]); + if (tmp) + goal = tmp + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, err, locked_page); @@ -277,13 +281,15 @@ repeat: tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err, locked_page); - } + } else /* (lastblock > block) */ { /* * We will allocate new block before last allocated block */ - else /* (lastblock > block) */ { - if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) - goal = tmp + uspi->s_fpb; + if (block) { + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]); + if (tmp) + goal = tmp + uspi->s_fpb; + } tmp = ufs_new_fragments(inode, p, fragment - blockoff, goal, uspi->s_fpb, err, locked_page); } @@ -296,7 +302,7 @@ repeat: } if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, required); + result = ufs_clear_frags(inode, tmp, required, tmp + blockoff); } else { *phys = tmp + blockoff; result = NULL; @@ -383,7 +389,7 @@ repeat: } } - if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]) + uspi->s_fpb)) + if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; @@ -397,7 +403,8 @@ repeat: if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb); + result = ufs_clear_frags(inode, tmp, uspi->s_fpb, + tmp + blockoff); } else { *phys = tmp + blockoff; *new = 1; -- cgit v1.2.3 From ecdc63948763586e101108dfe1ba316ec069fe39 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Sun, 27 Aug 2006 01:23:46 -0700 Subject: [PATCH] ufs: truncate correction 1) When we allocated last fragment in ufs_truncate, we read page, check if block mapped to address, and if not trying to allocate it. This is wrong behaviour, fragment may be NOT allocated, but mapped, this happened because of "block map" function not checked allocated fragment or not, it just take address of the first fragment in the block, add offset of fragment and return result, this is correct behaviour in almost all situation except call from ufs_truncate. 2) Almost all implementation of UFS, which I can investigate have such "defect": if you have full disk, and try truncate file, for example 3GB to 2MB, and have hole in this region, truncate return -ENOSPC. I tried evade from this problem, but "block allocation" algorithm is tied to right value of i_lastfrag, and fix of this corner case may slow down of ordinaries scenarios, so this patch makes behavior of "truncate" operations similar to what other UFS implementations do. Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/truncate.c | 77 ++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index c9b55872079b..ea11d04c41a0 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -375,17 +375,15 @@ static int ufs_alloc_lastblock(struct inode *inode) int err = 0; struct address_space *mapping = inode->i_mapping; struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); unsigned lastfrag, i, end; struct page *lastpage; struct buffer_head *bh; lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - if (!lastfrag) { - ufsi->i_lastfrag = 0; + if (!lastfrag) goto out; - } + lastfrag--; lastpage = ufs_get_locked_page(mapping, lastfrag >> @@ -400,25 +398,25 @@ static int ufs_alloc_lastblock(struct inode *inode) for (i = 0; i < end; ++i) bh = bh->b_this_page; - if (!buffer_mapped(bh)) { - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); } + out_unlock: ufs_put_locked_page(lastpage); out: @@ -440,23 +438,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - if (inode->i_size > old_i_size) { - /* - * if we expand file we should care about - * allocation of block for last byte first of all - */ - err = ufs_alloc_lastblock(inode); + err = ufs_alloc_lastblock(inode); - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - /* - * go away, because of we expand file, and we do not - * need free blocks, and zeroizes page - */ - lock_kernel(); - goto almost_end; + if (err) { + i_size_write(inode, old_i_size); + goto out; } block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); @@ -477,21 +463,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) yield(); } - if (inode->i_size < old_i_size) { - /* - * now we should have enough space - * to allocate block for last byte - */ - err = ufs_alloc_lastblock(inode); - if (err) - /* - * looks like all the same - we have no space, - * but we truncate file already - */ - inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize; - } -almost_end: inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); mark_inode_dirty(inode); out: -- cgit v1.2.3 From 45f17e0c2ae05c133a348452690de0e5fa863293 Mon Sep 17 00:00:00 2001 From: Masoud Asgharifard Sharbiani Date: Sun, 27 Aug 2006 01:23:48 -0700 Subject: [PATCH] eventpoll.c compile fix Fix two compile failures in eventpoll.c code which would happen if DEBUG_EPOLL is bigger than zero. Signed-off-by: Masoud Sharbiani Cc: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 19ffb043abbc..3a3567433b92 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1168,7 +1168,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", - current, ep, epi->file, error)); + current, ep, epi->ffd.file, error)); return error; } @@ -1236,7 +1236,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->file, epi, ep)); + current, epi->ffd.file, epi, ep)); write_lock_irqsave(&ep->lock, flags); -- cgit v1.2.3 From ea817398e68dfa25612229fda7fc74580cf915fb Mon Sep 17 00:00:00 2001 From: Badari Pulavarty Date: Sun, 27 Aug 2006 01:23:52 -0700 Subject: [PATCH] Manage jbd allocations from its own slabs JBD currently allocates commit and frozen buffers from slabs. With CONFIG_SLAB_DEBUG, its possible for an allocation to cross the page boundary causing IO problems. https://bugzilla.redhat.com/bugzilla/show_bug.cgi?id=200127 So, instead of allocating these from regular slabs - manage allocation from its own slabs and disable slab debug for these slabs. [akpm@osdl.org: cleanups] Signed-off-by: Badari Pulavarty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 ++-- fs/jbd/journal.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++---- fs/jbd/transaction.c | 9 ++--- include/linux/jbd.h | 3 ++ 4 files changed, 97 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 0971814c38b8..42da60784311 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -261,7 +261,7 @@ void journal_commit_transaction(journal_t *journal) struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } @@ -745,14 +745,14 @@ restart_loop: * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 8c9b28dff119..f66724ce443a 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int journal_create_jbd_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -328,10 +329,10 @@ repeat: char *tmp; jbd_unlock_bh_state(bh_in); - tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); + tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS); jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { - kfree(tmp); + jbd_slab_free(tmp, bh_in->b_size); goto repeat; } @@ -1069,17 +1070,17 @@ static int load_superblock(journal_t *journal) int journal_load(journal_t *journal) { int err; + journal_superblock_t *sb; err = load_superblock(journal); if (err) return err; + sb = journal->j_superblock; /* If this is a V2 superblock, then we have to check the * features flags on it. */ if (journal->j_format_version >= 2) { - journal_superblock_t *sb = journal->j_superblock; - if ((sb->s_feature_ro_compat & ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -1090,6 +1091,13 @@ int journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (journal_recover(journal)) @@ -1611,6 +1619,77 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) return kmalloc(size, flags | (retry ? __GFP_NOFAIL : 0)); } +/* + * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed + * and allocate frozen and commit buffers from these slabs. + * + * Reason for doing this is to avoid, SLAB_DEBUG - since it could + * cause bh to cross page boundary. + */ + +#define JBD_MAX_SLABS 5 +#define JBD_SLAB_INDEX(size) (size >> 11) + +static kmem_cache_t *jbd_slab[JBD_MAX_SLABS]; +static const char *jbd_slab_names[JBD_MAX_SLABS] = { + "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k" +}; + +static void journal_destroy_jbd_slabs(void) +{ + int i; + + for (i = 0; i < JBD_MAX_SLABS; i++) { + if (jbd_slab[i]) + kmem_cache_destroy(jbd_slab[i]); + jbd_slab[i] = NULL; + } +} + +static int journal_create_jbd_slab(size_t slab_size) +{ + int i = JBD_SLAB_INDEX(slab_size); + + BUG_ON(i >= JBD_MAX_SLABS); + + /* + * Check if we already have a slab created for this size + */ + if (jbd_slab[i]) + return 0; + + /* + * Create a slab and force alignment to be same as slabsize - + * this will make sure that allocations won't cross the page + * boundary. + */ + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], + slab_size, slab_size, 0, NULL, NULL); + if (!jbd_slab[i]) { + printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +void * jbd_slab_alloc(size_t size, gfp_t flags) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL); +} + +void jbd_slab_free(void *ptr, size_t size) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + kmem_cache_free(jbd_slab[idx], ptr); +} + /* * Journal_head storage management */ @@ -1799,13 +1878,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", __FUNCTION__); - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", __FUNCTION__); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; jh->b_bh = NULL; /* debug, really */ @@ -1961,6 +2040,7 @@ static void journal_destroy_caches(void) journal_destroy_revoke_caches(); journal_destroy_journal_head_cache(); journal_destroy_handle_cache(); + journal_destroy_jbd_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 508b2ea91f43..de2e4cbbf79a 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -666,8 +666,9 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, - GFP_NOFS); + frozen_buffer = + jbd_slab_alloc(jh2bh(jh)->b_size, + GFP_NOFS); if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", @@ -879,7 +880,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) { - committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed data\n", __FUNCTION__); @@ -906,7 +907,7 @@ repeat: out: journal_put_journal_head(jh); if (unlikely(committed_data)) - kfree(committed_data); + jbd_slab_free(committed_data, bh->b_size); return err; } diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 20eb34403d0c..a04c154c5207 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -72,6 +72,9 @@ extern int journal_enable_debug; #endif extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry); +extern void * jbd_slab_alloc(size_t size, gfp_t flags); +extern void jbd_slab_free(void *ptr, size_t size); + #define jbd_kmalloc(size, flags) \ __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) #define jbd_rep_kmalloc(size, flags) \ -- cgit v1.2.3 From 4df46240a1312161e3c794f6ace50ef7eb5ff3d7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 27 Aug 2006 01:23:56 -0700 Subject: [PATCH] lockdep: annotate reiserfs reiserfs seems to have another locking level layer for the i_mutex due to the xattrs-are-a-directory thing. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 2 +- include/linux/fs.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 39fedaa88a0c..d935fb9394e3 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -424,7 +424,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); // down(&inode->i_zombie); res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 25610205c90d..555bc195c420 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -570,13 +570,14 @@ struct inode { * 3: quota file * * The locking order between these classes is - * parent -> child -> normal -> quota + * parent -> child -> normal -> xattr -> quota */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, + I_MUTEX_XATTR, I_MUTEX_QUOTA }; -- cgit v1.2.3 From 513627d7fec6fcb7b3d56ce355cb4d192c76b530 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 27 Aug 2006 01:23:57 -0700 Subject: [PATCH] fix up lockdep trace in fs/exec.c This fixes the locking error noticed by lockdep: ============================================= [ INFO: possible recursive locking detected ] --------------------------------------------- init/1 is trying to acquire lock: (&sighand->siglock){....}, at: [] flush_old_exec+0x3ae/0x859 but task is already holding lock: (&sighand->siglock){....}, at: [] flush_old_exec+0x39e/0x859 other info that might help us debug this: 2 locks held by init/1: #0: (tasklist_lock){..--}, at: [] flush_old_exec+0x38e/0x859 #1: (&sighand->siglock){....}, at: [] flush_old_exec+0x39e/0x859 stack backtrace: [] show_trace_log_lvl+0x54/0xfd [] show_trace+0xd/0x10 [] dump_stack+0x19/0x1b [] __lock_acquire+0x773/0x997 [] lock_acquire+0x4b/0x6c [] _spin_lock+0x19/0x28 [] flush_old_exec+0x3ae/0x859 [] load_elf_binary+0x4aa/0x1628 [] search_binary_handler+0xa7/0x24e [] do_execve+0x15b/0x1f9 [] sys_execve+0x29/0x4d [] syscall_call+0x7/0xb Signed-off-by: Arjan van de Ven Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index f7aabfeca033..54135df2a966 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -751,7 +751,7 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - spin_lock(&newsighand->siglock); + spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING); rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); -- cgit v1.2.3 From f5ef68da5fda5e095b585ea5ecdd42af3c8695f7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 27 Aug 2006 01:23:58 -0700 Subject: [PATCH] /proc/meminfo: don't put spaces in names None of the other /proc/meminfo lines have a space in the identifier. This post-2.6.17 addition has the potential to break existing parsers, so use an underscore instead (like Committed_AS). Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 2 +- fs/proc/proc_misc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/drivers/base/node.c b/drivers/base/node.c index d7de1753e094..e9b0957f15d1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -64,7 +64,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" "Node %d PageTables: %8lu kB\n" - "Node %d NFS Unstable: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 9f2cfc30f9cf..942156225447 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -169,7 +169,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "PageTables: %8lu kB\n" - "NFS Unstable: %8lu kB\n" + "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" -- cgit v1.2.3