summary refs log tree commit diff
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-11-08 11:54:53 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-11-08 11:54:53 -0800
commit a7bcf21e60c73cb7f7c13fad928967d7e47c3cac (patch)
tree e24ae9d2c35508f68016b8cde848b7608e737b32
parent 5398a64c63a69a0ac33dbae458ea4aab0dc23f14 (diff)
parent 7ff9c073dd4d7200399076554f7ab9b876f196f6 (diff)
download lwn-a7bcf21e60c73cb7f7c13fad928967d7e47c3cac.tar.gz
download lwn-a7bcf21e60c73cb7f7c13fad928967d7e47c3cac.zip
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Add new ext4 inode tracepoints
  ext4: Don't call sb_issue_discard() in ext4_free_blocks()
  ext4: do not try to grab the s_umount semaphore in ext4_quota_off
  ext4: fix potential race when freeing ext4_io_page structures
  ext4: handle writeback of inodes which are being freed
  ext4: initialize the percpu counters before replaying the journal
  ext4: "ret" may be used uninitialized in ext4_lazyinit_thread()
  ext4: fix lazyinit hang after removing request
-rw-r--r-- fs/ext4/ext4.h 4
-rw-r--r-- fs/ext4/inode.c 3
-rw-r--r-- fs/ext4/mballoc.c 2
-rw-r--r-- fs/ext4/page-io.c 97
-rw-r--r-- fs/ext4/super.c 102
-rw-r--r-- include/trace/events/ext4.h 97
6 files changed, 214 insertions, 91 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8b5dd6369f82..6a5edea2d70b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,7 +177,7 @@ struct mpage_da_data {
struct ext4_io_page {
struct page *p_page;
- int p_count;
+ atomic_t p_count;
};
#define MAX_IO_PAGES 128
@@ -858,6 +858,7 @@ struct ext4_inode_info {
spinlock_t i_completed_io_lock;
/* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio;
+ atomic_t i_ioend_count; /* Number of outstanding io_end structs */
/*
* Transactions that contain inode's metadata needed to complete
@@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
/* page-io.c */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
+extern void ext4_ioend_wait(struct inode *);
extern void ext4_free_io_end(ext4_io_end_t *io);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern int ext4_end_io_nolock(ext4_io_end_t *io);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4d78342f3bf0..bdbe69902207 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -53,6 +53,7 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ trace_ext4_begin_ordered_truncate(inode, new_size);
return jbd2_journal_begin_ordered_truncate(
EXT4_SB(inode->i_sb)->s_journal,
&EXT4_I(inode)->jinode,
@@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode)
handle_t *handle;
int err;
+ trace_ext4_evict_inode(inode);
if (inode->i_nlink) {
truncate_inode_pages(&inode->i_data, 0);
goto no_delete;
@@ -5647,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
int err, ret;
might_sleep();
+ trace_ext4_mark_inode_dirty(inode, _RET_IP_);
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (ext4_handle_valid(handle) &&
EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c58eba34724a..5b4d4e3a4d58 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,8 +4640,6 @@ do_more:
* with group lock held. generate_buddy look at
* them with group lock_held
*/
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, block_group, bit, count);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count);
mb_free_blocks(inode, &e4b, bit, count);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 46a7d6a9d976..7f5451cd1d38 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,8 +32,14 @@
static struct kmem_cache *io_page_cachep, *io_end_cachep;
+#define WQ_HASH_SZ 37
+#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
+static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
+
int __init ext4_init_pageio(void)
{
+ int i;
+
io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
if (io_page_cachep == NULL)
return -ENOMEM;
@@ -42,6 +48,8 @@ int __init ext4_init_pageio(void)
kmem_cache_destroy(io_page_cachep);
return -ENOMEM;
}
+ for (i = 0; i < WQ_HASH_SZ; i++)
+ init_waitqueue_head(&ioend_wq[i]);
return 0;
}
@@ -52,24 +60,37 @@ void ext4_exit_pageio(void)
kmem_cache_destroy(io_page_cachep);
}
+void ext4_ioend_wait(struct inode *inode)
+{
+ wait_queue_head_t *wq = to_ioend_wq(inode);
+
+ wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
+}
+
+static void put_io_page(struct ext4_io_page *io_page)
+{
+ if (atomic_dec_and_test(&io_page->p_count)) {
+ end_page_writeback(io_page->p_page);
+ put_page(io_page->p_page);
+ kmem_cache_free(io_page_cachep, io_page);
+ }
+}
+
void ext4_free_io_end(ext4_io_end_t *io)
{
int i;
+ wait_queue_head_t *wq;
BUG_ON(!io);
if (io->page)
put_page(io->page);
- for (i = 0; i < io->num_io_pages; i++) {
- if (--io->pages[i]->p_count == 0) {
- struct page *page = io->pages[i]->p_page;
-
- end_page_writeback(page);
- put_page(page);
- kmem_cache_free(io_page_cachep, io->pages[i]);
- }
- }
+ for (i = 0; i < io->num_io_pages; i++)
+ put_io_page(io->pages[i]);
io->num_io_pages = 0;
- iput(io->inode);
+ wq = to_ioend_wq(io->inode);
+ if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
+ waitqueue_active(wq))
+ wake_up_all(wq);
kmem_cache_free(io_end_cachep, io);
}
@@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
io = kmem_cache_alloc(io_end_cachep, flags);
if (io) {
memset(io, 0, sizeof(*io));
- io->inode = igrab(inode);
- BUG_ON(!io->inode);
+ atomic_inc(&EXT4_I(inode)->i_ioend_count);
+ io->inode = inode;
INIT_WORK(&io->work, ext4_end_io_work);
INIT_LIST_HEAD(&io->list);
}
@@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error)
struct workqueue_struct *wq;
struct inode *inode;
unsigned long flags;
- ext4_fsblk_t err_block;
int i;
BUG_ON(!io_end);
- inode = io_end->inode;
bio->bi_private = NULL;
bio->bi_end_io = NULL;
if (test_bit(BIO_UPTODATE, &bio->bi_flags))
error = 0;
- err_block = bio->bi_sector >> (inode->i_blkbits - 9);
bio_put(bio);
- if (!(inode->i_sb->s_flags & MS_ACTIVE)) {
- pr_err("sb umounted, discard end_io request for inode %lu\n",
- io_end->inode->i_ino);
- ext4_free_io_end(io_end);
- return;
- }
-
- if (error) {
- io_end->flag |= EXT4_IO_END_ERROR;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
- "(offset %llu size %ld starting block %llu)",
- inode->i_ino,
- (unsigned long long) io_end->offset,
- (long) io_end->size,
- (unsigned long long) err_block);
- }
-
for (i = 0; i < io_end->num_io_pages; i++) {
struct page *page = io_end->pages[i]->p_page;
struct buffer_head *bh, *head;
@@ -236,13 +237,7 @@ static void ext4_end_bio(struct bio *bio, int error)
} while (bh != head);
}
- if (--io_end->pages[i]->p_count == 0) {
- struct page *page = io_end->pages[i]->p_page;
-
- end_page_writeback(page);
- put_page(page);
- kmem_cache_free(io_page_cachep, io_end->pages[i]);
- }
+ put_io_page(io_end->pages[i]);
/*
* If this is a partial write which happened to make
@@ -254,8 +249,19 @@ static void ext4_end_bio(struct bio *bio, int error)
if (!partial_write)
SetPageUptodate(page);
}
-
io_end->num_io_pages = 0;
+ inode = io_end->inode;
+
+ if (error) {
+ io_end->flag |= EXT4_IO_END_ERROR;
+ ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ "(offset %llu size %ld starting block %llu)",
+ inode->i_ino,
+ (unsigned long long) io_end->offset,
+ (long) io_end->size,
+ (unsigned long long)
+ bio->bi_sector >> (inode->i_blkbits - 9));
+ }
/* Add the io_end to per-inode completed io list*/
spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
@@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io,
bio->bi_private = io->io_end = io_end;
bio->bi_end_io = ext4_end_bio;
- io_end->inode = inode;
io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
io->io_bio = bio;
@@ -360,7 +365,7 @@ submit_and_retry:
if ((io_end->num_io_pages == 0) ||
(io_end->pages[io_end->num_io_pages-1] != io_page)) {
io_end->pages[io_end->num_io_pages++] = io_page;
- io_page->p_count++;
+ atomic_inc(&io_page->p_count);
}
return 0;
}
@@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
return -ENOMEM;
}
io_page->p_page = page;
- io_page->p_count = 0;
+ atomic_set(&io_page->p_count, 1);
get_page(page);
for (bh = head = page_buffers(page), block_start = 0;
@@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
* PageWriteback bit from the page to prevent the system from
* wedging later on.
*/
- if (io_page->p_count == 0) {
- put_page(page);
- end_page_writeback(page);
- kmem_cache_free(io_page_cachep, io_page);
- }
+ put_io_page(io_page);
return ret;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 40131b777af6..61182fe6254e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
+ atomic_set(&ei->i_ioend_count, 0);
return &ei->vfs_inode;
}
+static int ext4_drop_inode(struct inode *inode)
+{
+ int drop = generic_drop_inode(inode);
+
+ trace_ext4_drop_inode(inode, drop);
+ return drop;
+}
+
static void ext4_destroy_inode(struct inode *inode)
{
+ ext4_ioend_wait(inode);
if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): orphan list check failed!",
@@ -1173,6 +1183,7 @@ static const struct super_operations ext4_sops = {
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
.put_super = ext4_put_super,
.sync_fs = ext4_sync_fs,
@@ -1194,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = {
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
+ .drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
.write_super = ext4_write_super,
.put_super = ext4_put_super,
@@ -2699,7 +2711,6 @@ static int ext4_lazyinit_thread(void *arg)
struct ext4_li_request *elr;
unsigned long next_wakeup;
DEFINE_WAIT(wait);
- int ret;
BUG_ON(NULL == eli);
@@ -2723,13 +2734,12 @@ cont_thread:
elr = list_entry(pos, struct ext4_li_request,
lr_request);
- if (time_after_eq(jiffies, elr->lr_next_sched))
- ret = ext4_run_li_request(elr);
-
- if (ret) {
- ret = 0;
- ext4_remove_li_request(elr);
- continue;
+ if (time_after_eq(jiffies, elr->lr_next_sched)) {
+ if (ext4_run_li_request(elr) != 0) {
+ /* error, remove the lazy_init job */
+ ext4_remove_li_request(elr);
+ continue;
+ }
}
if (time_before(elr->lr_next_sched, next_wakeup))
@@ -2740,7 +2750,8 @@ cont_thread:
if (freezing(current))
refrigerator();
- if (time_after_eq(jiffies, next_wakeup)) {
+ if ((time_after_eq(jiffies, next_wakeup)) ||
+ (MAX_JIFFY_OFFSET == next_wakeup)) {
cond_resched();
continue;
}
@@ -3348,6 +3359,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
+ err = percpu_counter_init(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ }
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ }
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "insufficient memory");
+ goto failed_mount3;
+ }
+
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_max_writeback_mb_bump = 128;
@@ -3446,22 +3475,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
-no_journal:
- err = percpu_counter_init(&sbi->s_freeblocks_counter,
- ext4_count_free_blocks(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_freeinodes_counter,
- ext4_count_free_inodes(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_dirs_counter,
- ext4_count_dirs(sb));
- if (!err)
- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
- if (err) {
- ext4_msg(sb, KERN_ERR, "insufficient memory");
- goto failed_mount_wq;
- }
+ /*
+ * The journal may have updated the bg summary counts, so we
+ * need to update the global counters.
+ */
+ percpu_counter_set(&sbi->s_freeblocks_counter,
+ ext4_count_free_blocks(sb));
+ percpu_counter_set(&sbi->s_freeinodes_counter,
+ ext4_count_free_inodes(sb));
+ percpu_counter_set(&sbi->s_dirs_counter,
+ ext4_count_dirs(sb));
+ percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
+no_journal:
EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
if (!EXT4_SB(sb)->dio_unwritten_wq) {
printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
@@ -3611,10 +3637,6 @@ failed_mount_wq:
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
- percpu_counter_destroy(&sbi->s_freeblocks_counter);
- percpu_counter_destroy(&sbi->s_freeinodes_counter);
- percpu_counter_destroy(&sbi->s_dirs_counter);
- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount3:
if (sbi->s_flex_groups) {
if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3622,6 +3644,10 @@ failed_mount3:
else
kfree(sbi->s_flex_groups);
}
+ percpu_counter_destroy(&sbi->s_freeblocks_counter);
+ percpu_counter_destroy(&sbi->s_freeinodes_counter);
+ percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -3949,13 +3975,11 @@ static int ext4_commit_super(struct super_block *sb, int sync)
else
es->s_kbytes_written =
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter))
- ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeblocks_counter));
- if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
- es->s_free_inodes_count =
- cpu_to_le32(percpu_counter_sum_positive(
- &EXT4_SB(sb)->s_freeinodes_counter));
+ ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeblocks_counter));
+ es->s_free_inodes_count =
+ cpu_to_le32(percpu_counter_sum_positive(
+ &EXT4_SB(sb)->s_freeinodes_counter));
sb->s_dirt = 0;
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
@@ -4556,12 +4580,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
static int ext4_quota_off(struct super_block *sb, int type)
{
- /* Force all delayed allocation blocks to be allocated */
- if (test_opt(sb, DELALLOC)) {
- down_read(&sb->s_umount);
+ /* Force all delayed allocation blocks to be allocated.
+ * Caller already holds s_umount sem */
+ if (test_opt(sb, DELALLOC))
sync_filesystem(sb);
- up_read(&sb->s_umount);
- }
return dquot_quota_off(sb, type);
}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 289010d3270b..e5e345fb2a5c 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode,
(unsigned long) __entry->dir, __entry->mode)
);
+TRACE_EVENT(ext4_evict_inode,
+ TP_PROTO(struct inode *inode),
+
+ TP_ARGS(inode),
+
+ TP_STRUCT__entry(
+ __field( int, dev_major )
+ __field( int, dev_minor )
+ __field( ino_t, ino )
+ __field( int, nlink )
+ ),
+
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+ __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+ __entry->ino = inode->i_ino;
+ __entry->nlink = inode->i_nlink;
+ ),
+
+ TP_printk("dev %d,%d ino %lu nlink %d",
+ __entry->dev_major, __entry->dev_minor,
+ (unsigned long) __entry->ino, __entry->nlink)
+);
+
+TRACE_EVENT(ext4_drop_inode,
+ TP_PROTO(struct inode *inode, int drop),
+
+ TP_ARGS(inode, drop),
+
+ TP_STRUCT__entry(
+ __field( int, dev_major )
+ __field( int, dev_minor )
+ __field( ino_t, ino )
+ __field( int, drop )
+ ),
+
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+ __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+ __entry->ino = inode->i_ino;
+ __entry->drop = drop;
+ ),
+
+ TP_printk("dev %d,%d ino %lu drop %d",
+ __entry->dev_major, __entry->dev_minor,
+ (unsigned long) __entry->ino, __entry->drop)
+);
+
+TRACE_EVENT(ext4_mark_inode_dirty,
+ TP_PROTO(struct inode *inode, unsigned long IP),
+
+ TP_ARGS(inode, IP),
+
+ TP_STRUCT__entry(
+ __field( int, dev_major )
+ __field( int, dev_minor )
+ __field( ino_t, ino )
+ __field(unsigned long, ip )
+ ),
+
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+ __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+ __entry->ino = inode->i_ino;
+ __entry->ip = IP;
+ ),
+
+ TP_printk("dev %d,%d ino %lu caller %pF",
+ __entry->dev_major, __entry->dev_minor,
+ (unsigned long) __entry->ino, (void *)__entry->ip)
+);
+
+TRACE_EVENT(ext4_begin_ordered_truncate,
+ TP_PROTO(struct inode *inode, loff_t new_size),
+
+ TP_ARGS(inode, new_size),
+
+ TP_STRUCT__entry(
+ __field( int, dev_major )
+ __field( int, dev_minor )
+ __field( ino_t, ino )
+ __field( loff_t, new_size )
+ ),
+
+ TP_fast_assign(
+ __entry->dev_major = MAJOR(inode->i_sb->s_dev);
+ __entry->dev_minor = MINOR(inode->i_sb->s_dev);
+ __entry->ino = inode->i_ino;
+ __entry->new_size = new_size;
+ ),
+
+ TP_printk("dev %d,%d ino %lu new_size %lld",
+ __entry->dev_major, __entry->dev_minor,
+ (unsigned long) __entry->ino,
+ (long long) __entry->new_size)
+);
+
DECLARE_EVENT_CLASS(ext4__write_begin,
TP_PROTO(struct inode *inode, loff_t pos, unsigned int len,