diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-14 10:03:36 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-14 10:03:36 -0800 |
commit | 37222e1c9ee3ce587f5b41fed868bd8a592a992f (patch) | |
tree | b65f22a1e20286185463ca1a2889e593d963a393 /drivers | |
parent | 76b8f82cde2d9c13ef0c9a9aa2581b9b30b68e8c (diff) | |
parent | 06e3c817b750c131a20e82eed57a17841ea88ed2 (diff) | |
download | lwn-37222e1c9ee3ce587f5b41fed868bd8a592a992f.tar.gz lwn-37222e1c9ee3ce587f5b41fed868bd8a592a992f.zip |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (27 commits)
md: add 'recovery_start' per-device sysfs attribute
md: rcu_read_lock() walk of mddev->disks in md_do_sync()
md: integrate spares into array at earliest opportunity.
md: move compat_ioctl handling into md.c
md: revise Kconfig help for MD_MULTIPATH
md: add MODULE_DESCRIPTION for all md related modules.
raid: improve MD/raid10 handling of correctable read errors.
md/raid10: print more useful messages on device failure.
md/bitmap: update dirty flag when bitmap bits are explicitly set.
md: Support write-intent bitmaps with externally managed metadata.
md/bitmap: move setting of daemon_lastrun out of bitmap_read_sb
md: support updating bitmap parameters via sysfs.
md: factor out parsing of fixed-point numbers
md: support bitmap offset appropriate for external-metadata arrays.
md: remove needless setting of thread->timeout in raid10_quiesce
md: change daemon_sleep to be in 'jiffies' rather than 'seconds'.
md: move offset, daemon_sleep and chunksize out of bitmap structure
md: collect bitmap-specific fields into one structure.
md/raid1: add takeover support for raid5->raid1
md: add honouring of suspend_{lo,hi} to raid1.
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/Kconfig | 9 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 449 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 19 | ||||
-rw-r--r-- | drivers/md/faulty.c | 1 | ||||
-rw-r--r-- | drivers/md/linear.c | 3 | ||||
-rw-r--r-- | drivers/md/md.c | 393 | ||||
-rw-r--r-- | drivers/md/md.h | 51 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 3 | ||||
-rw-r--r-- | drivers/md/raid1.c | 217 | ||||
-rw-r--r-- | drivers/md/raid1.h | 5 | ||||
-rw-r--r-- | drivers/md/raid10.c | 116 | ||||
-rw-r--r-- | drivers/md/raid5.c | 63 | ||||
-rw-r--r-- | drivers/md/raid6algos.c | 20 |
14 files changed, 1059 insertions, 293 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 2158377a1359..acb3a4e404ff 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -185,11 +185,10 @@ config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD help - Multipath-IO is the ability of certain devices to address the same - physical disk over multiple 'IO paths'. The code ensures that such - paths can be defined and handled at runtime, and ensures that a - transparent failover to the backup path(s) happens if a IO errors - arrives on the primary path. + MD_MULTIPATH provides a simple multi-path personality for use + the MD framework. It is not under active development. New + projects should consider using DM_MULTIPATH which has more + features and more testing. If unsure, say N. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 60e2b322db11..26ac8aad0b19 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -212,7 +212,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) */ /* IO operations when bitmap is stored near all superblocks */ -static struct page *read_sb_page(mddev_t *mddev, long offset, +static struct page *read_sb_page(mddev_t *mddev, loff_t offset, struct page *page, unsigned long index, int size) { @@ -287,27 +287,36 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, bdev_logical_block_size(rdev->bdev)); /* Just make sure we aren't corrupting data or * metadata */ - if (bitmap->offset < 0) { + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + (page->index *(PAGE_SIZE/512)) > + rdev->data_offset && + rdev->sb_start + offset < + rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512)) + goto bad_alignment; + } else if (offset < 0) { /* DATA BITMAP METADATA */ - if (bitmap->offset + if (offset + (long)(page->index * (PAGE_SIZE/512)) + size/512 > 0) /* bitmap runs in to metadata */ goto bad_alignment; if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + bitmap->offset) + > rdev->sb_start + offset) /* data runs in to bitmap */ goto bad_alignment; } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ if (rdev->sb_start - + bitmap->offset + + offset + page->index*(PAGE_SIZE/512) + size/512 > rdev->data_offset) /* bitmap runs in to data */ @@ -316,7 +325,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) /* DATA METADATA BITMAP - no problems */ } md_super_write(mddev, rdev, - rdev->sb_start + bitmap->offset + rdev->sb_start + offset + page->index * (PAGE_SIZE/512), size, page); @@ -488,6 +497,8 @@ void bitmap_update_sb(struct bitmap *bitmap) if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ return; + if (bitmap->mddev->bitmap_info.external) + return; spin_lock_irqsave(&bitmap->lock, flags); if (!bitmap->sb_page) { /* no superblock */ spin_unlock_irqrestore(&bitmap->lock, flags); @@ -501,6 +512,9 @@ void bitmap_update_sb(struct bitmap *bitmap) bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); } + /* Just in case these have been changed via sysfs: */ + sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); + sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); kunmap_atomic(sb, KM_USER0); write_page(bitmap, bitmap->sb_page, 1); } @@ -550,7 +564,8 @@ static int bitmap_read_sb(struct bitmap *bitmap) bitmap->sb_page = read_page(bitmap->file, 0, bitmap, bytes); } else { - bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, + bitmap->sb_page = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, NULL, 0, sizeof(bitmap_super_t)); } @@ -563,7 +578,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) sb = (bitmap_super_t *)kmap_atomic(bitmap->sb_page, KM_USER0); chunksize = le32_to_cpu(sb->chunksize); - daemon_sleep = le32_to_cpu(sb->daemon_sleep); + daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); /* verify that the bitmap-specific fields are valid */ @@ -576,7 +591,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) reason = "bitmap chunksize too small"; else if ((1 << ffz(~chunksize)) != chunksize) reason = "bitmap chunksize not a power of 2"; - else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT / HZ) + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) reason = "daemon sleep period out of range"; else if (write_behind > COUNTER_MAX) reason = "write-behind limit out of range (0 - 16383)"; @@ -610,10 +625,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) } success: /* assign fields using values from superblock */ - bitmap->chunksize = chunksize; - bitmap->daemon_sleep = daemon_sleep; - bitmap->daemon_lastrun = jiffies; - bitmap->max_write_behind = write_behind; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; bitmap->flags |= le32_to_cpu(sb->state); if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) bitmap->flags |= BITMAP_HOSTENDIAN; @@ -664,16 +678,26 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, * general bitmap file operations */ +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ /* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(unsigned long chunk) +static inline unsigned long file_page_index(struct bitmap *bitmap, unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; } /* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(unsigned long chunk) +static inline unsigned long file_page_offset(struct bitmap *bitmap, unsigned long chunk) { - return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); + if (!bitmap->mddev->bitmap_info.external) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); } /* @@ -686,8 +710,9 @@ static inline unsigned long file_page_offset(unsigned long chunk) static inline struct page *filemap_get_page(struct bitmap *bitmap, unsigned long chunk) { - if (file_page_index(chunk) >= bitmap->file_pages) return NULL; - return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; + if (file_page_index(bitmap, chunk) >= bitmap->file_pages) return NULL; + return bitmap->filemap[file_page_index(bitmap, chunk) + - file_page_index(bitmap, 0)]; } @@ -710,7 +735,7 @@ static void bitmap_file_unmap(struct bitmap *bitmap) spin_unlock_irqrestore(&bitmap->lock, flags); while (pages--) - if (map[pages]->index != 0) /* 0 is sb_page, release it below */ + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ free_buffers(map[pages]); kfree(map); kfree(attr); @@ -821,7 +846,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) page = filemap_get_page(bitmap, chunk); if (!page) return; - bit = file_page_offset(chunk); + bit = file_page_offset(bitmap, chunk); /* set the bit */ kaddr = kmap_atomic(page, KM_USER0); @@ -907,7 +932,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) chunks = bitmap->chunks; file = bitmap->file; - BUG_ON(!file && !bitmap->offset); + BUG_ON(!file && !bitmap->mddev->bitmap_info.offset); #ifdef INJECT_FAULTS_3 outofdate = 1; @@ -919,14 +944,17 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) "recovery\n", bmname(bitmap)); bytes = (chunks + 7) / 8; + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); - num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; + + num_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE; - if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { + if (file && i_size_read(file->f_mapping->host) < bytes) { printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", bmname(bitmap), (unsigned long) i_size_read(file->f_mapping->host), - bytes + sizeof(bitmap_super_t)); + bytes); goto err; } @@ -947,17 +975,16 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) for (i = 0; i < chunks; i++) { int b; - index = file_page_index(i); - bit = file_page_offset(i); + index = file_page_index(bitmap, i); + bit = file_page_offset(bitmap, i); if (index != oldindex) { /* this is a new page, read it in */ int count; /* unmap the old page, we're done with it */ if (index == num_pages-1) - count = bytes + sizeof(bitmap_super_t) - - index * PAGE_SIZE; + count = bytes - index * PAGE_SIZE; else count = PAGE_SIZE; - if (index == 0) { + if (index == 0 && bitmap->sb_page) { /* * if we're here then the superblock page * contains some bits (PAGE_SIZE != sizeof sb) @@ -967,14 +994,15 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) offset = sizeof(bitmap_super_t); if (!file) read_sb_page(bitmap->mddev, - bitmap->offset, + bitmap->mddev->bitmap_info.offset, page, index, count); } else if (file) { page = read_page(file, index, bitmap, count); offset = 0; } else { - page = read_sb_page(bitmap->mddev, bitmap->offset, + page = read_sb_page(bitmap->mddev, + bitmap->mddev->bitmap_info.offset, NULL, index, count); offset = 0; @@ -1078,23 +1106,32 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, * out to disk */ -void bitmap_daemon_work(struct bitmap *bitmap) +void bitmap_daemon_work(mddev_t *mddev) { + struct bitmap *bitmap; unsigned long j; unsigned long flags; struct page *page = NULL, *lastpage = NULL; int blocks; void *paddr; - if (bitmap == NULL) + /* Use a mutex to guard daemon_work against + * bitmap_destroy. + */ + mutex_lock(&mddev->bitmap_info.mutex); + bitmap = mddev->bitmap; + if (bitmap == NULL) { + mutex_unlock(&mddev->bitmap_info.mutex); return; - if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) + } + if (time_before(jiffies, bitmap->daemon_lastrun + + bitmap->mddev->bitmap_info.daemon_sleep)) goto done; bitmap->daemon_lastrun = jiffies; if (bitmap->allclean) { bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - return; + goto done; } bitmap->allclean = 1; @@ -1142,7 +1179,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) /* We are possibly going to clear some bits, so make * sure that events_cleared is up-to-date. */ - if (bitmap->need_sync) { + if (bitmap->need_sync && + bitmap->mddev->bitmap_info.external == 0) { bitmap_super_t *sb; bitmap->need_sync = 0; sb = kmap_atomic(bitmap->sb_page, KM_USER0); @@ -1152,7 +1190,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) write_page(bitmap, bitmap->sb_page, 1); } spin_lock_irqsave(&bitmap->lock, flags); - clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + if (!bitmap->need_sync) + clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); } bmc = bitmap_get_counter(bitmap, (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), @@ -1167,7 +1206,7 @@ void bitmap_daemon_work(struct bitmap *bitmap) if (*bmc == 2) { *bmc=1; /* maybe clear the bit next time */ set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); - } else if (*bmc == 1) { + } else if (*bmc == 1 && !bitmap->need_sync) { /* we can clear the bit */ *bmc = 0; bitmap_count_page(bitmap, @@ -1177,9 +1216,11 @@ void bitmap_daemon_work(struct bitmap *bitmap) /* clear the bit */ paddr = kmap_atomic(page, KM_USER0); if (bitmap->flags & BITMAP_HOSTENDIAN) - clear_bit(file_page_offset(j), paddr); + clear_bit(file_page_offset(bitmap, j), + paddr); else - ext2_clear_bit(file_page_offset(j), paddr); + ext2_clear_bit(file_page_offset(bitmap, j), + paddr); kunmap_atomic(paddr, KM_USER0); } } else @@ -1202,7 +1243,9 @@ void bitmap_daemon_work(struct bitmap *bitmap) done: if (bitmap->allclean == 0) - bitmap->mddev->thread->timeout = bitmap->daemon_sleep * HZ; + bitmap->mddev->thread->timeout = + bitmap->mddev->bitmap_info.daemon_sleep; + mutex_unlock(&mddev->bitmap_info.mutex); } static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, @@ -1332,6 +1375,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto bitmap->events_cleared < bitmap->mddev->events) { bitmap->events_cleared = bitmap->mddev->events; bitmap->need_sync = 1; + sysfs_notify_dirent(bitmap->sysfs_can_clear); } if (!success && ! (*bmc & NEEDED_MASK)) @@ -1470,7 +1514,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) return; } if (time_before(jiffies, (bitmap->last_end_sync - + bitmap->daemon_sleep * HZ))) + + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, atomic_read(&bitmap->mddev->recovery_active) == 0); @@ -1522,6 +1566,12 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); bitmap_set_memory_bits(bitmap, sec, 1); bitmap_file_set_bit(bitmap, sec); + if (sec < bitmap->mddev->recovery_cp) + /* We are asserting that the array is dirty, + * so move the recovery_cp address back so + * that it is obvious that it is dirty + */ + bitmap->mddev->recovery_cp = sec; } } @@ -1531,7 +1581,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) void bitmap_flush(mddev_t *mddev) { struct bitmap *bitmap = mddev->bitmap; - int sleep; + long sleep; if (!bitmap) /* there was no bitmap */ return; @@ -1539,12 +1589,13 @@ void bitmap_flush(mddev_t *mddev) /* run the daemon_work three time to ensure everything is flushed * that can be */ - sleep = bitmap->daemon_sleep; - bitmap->daemon_sleep = 0; - bitmap_daemon_work(bitmap); - bitmap_daemon_work(bitmap); - bitmap_daemon_work(bitmap); - bitmap->daemon_sleep = sleep; + sleep = mddev->bitmap_info.daemon_sleep * 2; + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); bitmap_update_sb(bitmap); } @@ -1574,6 +1625,7 @@ static void bitmap_free(struct bitmap *bitmap) kfree(bp); kfree(bitmap); } + void bitmap_destroy(mddev_t *mddev) { struct bitmap *bitmap = mddev->bitmap; @@ -1581,10 +1633,15 @@ void bitmap_destroy(mddev_t *mddev) if (!bitmap) /* there was no bitmap */ return; + mutex_lock(&mddev->bitmap_info.mutex); mddev->bitmap = NULL; /* disconnect from the md device */ + mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + bitmap_free(bitmap); } @@ -1598,16 +1655,17 @@ int bitmap_create(mddev_t *mddev) sector_t blocks = mddev->resync_max_sectors; unsigned long chunks; unsigned long pages; - struct file *file = mddev->bitmap_file; + struct file *file = mddev->bitmap_info.file; int err; sector_t start; + struct sysfs_dirent *bm; BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ + if (!file && !mddev->bitmap_info.offset) /* bitmap disabled, nothing to do */ return 0; - BUG_ON(file && mddev->bitmap_offset); + BUG_ON(file && mddev->bitmap_info.offset); bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) @@ -1620,8 +1678,14 @@ int bitmap_create(mddev_t *mddev) bitmap->mddev = mddev; + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + bitmap->file = file; - bitmap->offset = mddev->bitmap_offset; if (file) { get_file(file); /* As future accesses to this file will use bmap, @@ -1630,12 +1694,22 @@ int bitmap_create(mddev_t *mddev) */ vfs_fsync(file, file->f_dentry, 1); } - /* read superblock from bitmap file (this sets bitmap->chunksize) */ - err = bitmap_read_sb(bitmap); + /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ + if (!mddev->bitmap_info.external) + err = bitmap_read_sb(bitmap); + else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. */ + err = -EINVAL; + } if (err) goto error; - bitmap->chunkshift = ffz(~bitmap->chunksize); + bitmap->daemon_lastrun = jiffies; + bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); /* now that chunksize and chunkshift are set, we can use these macros */ chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> @@ -1677,7 +1751,8 @@ int bitmap_create(mddev_t *mddev) mddev->bitmap = bitmap; - mddev->thread->timeout = bitmap->daemon_sleep * HZ; + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); bitmap_update_sb(bitmap); @@ -1688,6 +1763,264 @@ int bitmap_create(mddev_t *mddev) return err; } +static ssize_t +location_show(mddev_t *mddev, char *page) +{ + ssize_t len; + if (mddev->bitmap_info.file) { + len = sprintf(page, "file"); + } else if (mddev->bitmap_info.offset) { + len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); + } else + len = sprintf(page, "none"); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +location_store(mddev_t *mddev, const char *buf, size_t len) +{ + + if (mddev->pers) { + if (!mddev->pers->quiesce) + return -EBUSY; + if (mddev->recovery || mddev->sync_thread) + return -EBUSY; + } + + if (mddev->bitmap || mddev->bitmap_info.file || + mddev->bitmap_info.offset) { + /* bitmap already configured. Only option is to clear it */ + if (strncmp(buf, "none", 4) != 0) + return -EBUSY; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } + mddev->bitmap_info.offset = 0; + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + mddev->bitmap_info.file = NULL; + restore_bitmap_write_access(f); + fput(f); + } + } else { + /* No bitmap, OK to set a location */ + long long offset; + if (strncmp(buf, "none", 4) == 0) + /* nothing to be done */; + else if (strncmp(buf, "file:", 5) == 0) { + /* Not supported yet */ + return -EINVAL; + } else { + int rv; + if (buf[0] == '+') + rv = strict_strtoll(buf+1, 10, &offset); + else + rv = strict_strtoll(buf, 10, &offset); + if (rv) + return rv; + if (offset == 0) + return -EINVAL; + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && + offset != mddev->bitmap_info.default_offset) + return -EINVAL; + mddev->bitmap_info.offset = offset; + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + rv = bitmap_create(mddev); + if (rv) { + bitmap_destroy(mddev); + mddev->bitmap_info.offset = 0; + } + mddev->pers->quiesce(mddev, 0); + if (rv) + return rv; + } + } + } + if (!mddev->external) { + /* Ensure new bitmap info is stored in + * metadata promptly. + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + md_wakeup_thread(mddev->thread); + } + return len; +} + +static struct md_sysfs_entry bitmap_location = +__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); + +static ssize_t +timeout_show(mddev_t *mddev, char *page) +{ + ssize_t len; + unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; + unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; + + len = sprintf(page, "%lu", secs); + if (jifs) + len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +timeout_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* timeout can be set at any time */ + unsigned long timeout; + int rv = strict_strtoul_scaled(buf, &timeout, 4); + if (rv) + return rv; + + /* just to make sure we don't overflow... */ + if (timeout >= LONG_MAX / HZ) + return -EINVAL; + + timeout = timeout * HZ / 10000; + + if (timeout >= MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT-1; + if (timeout < 1) + timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; + if (mddev->thread) { + /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then + * the bitmap is all clean and we don't need to + * adjust the timeout right now + */ + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + mddev->thread->timeout = timeout; + md_wakeup_thread(mddev->thread); + } + } + return len; +} + +static struct md_sysfs_entry bitmap_timeout = +__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); + +static ssize_t +backlog_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); +} + +static ssize_t +backlog_store(mddev_t *mddev, const char *buf, size_t len) +{ + unsigned long backlog; + int rv = strict_strtoul(buf, 10, &backlog); + if (rv) + return rv; + if (backlog > COUNTER_MAX) + return -EINVAL; + mddev->bitmap_info.max_write_behind = backlog; + return len; +} + +static struct md_sysfs_entry bitmap_backlog = +__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); + +static ssize_t +chunksize_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); +} + +static ssize_t +chunksize_store(mddev_t *mddev, const char *buf, size_t len) +{ + /* Can only be changed when no bitmap is active */ + int rv; + unsigned long csize; + if (mddev->bitmap) + return -EBUSY; + rv = strict_strtoul(buf, 10, &csize); + if (rv) + return rv; + if (csize < 512 || + !is_power_of_2(csize)) + return -EINVAL; + mddev->bitmap_info.chunksize = csize; + return len; +} + +static struct md_sysfs_entry bitmap_chunksize = +__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); + +static ssize_t metadata_show(mddev_t *mddev, char *page) +{ + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? "external" : "internal")); +} + +static ssize_t metadata_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if (strncmp(buf, "internal", 8) == 0) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(mddev_t *mddev, char *page) +{ + int len; + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? + "false" : "true")); + else + len = sprintf(page, "\n"); + return len; +} + +static ssize_t can_clear_store(mddev_t *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + +static struct attribute *md_bitmap_attrs[] = { + &bitmap_location.attr, + &bitmap_timeout.attr, + &bitmap_backlog.attr, + &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, + NULL +}; +struct attribute_group md_bitmap_group = { + .name = "bitmap", + .attrs = md_bitmap_attrs, +}; + + /* the bitmap API -- for raid personalities */ EXPORT_SYMBOL(bitmap_startwrite); EXPORT_SYMBOL(bitmap_endwrite); diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index e98900671ca9..cb821d76d1b4 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -106,7 +106,7 @@ typedef __u16 bitmap_counter_t; #define BITMAP_BLOCK_SHIFT 9 /* how many blocks per chunk? (this is variable) */ -#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) @@ -118,16 +118,6 @@ typedef __u16 bitmap_counter_t; (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ - -/* map chunks (bits) to file pages - offset by the size of the superblock */ -#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) - #endif /* @@ -209,7 +199,6 @@ struct bitmap { int counter_bits; /* how many bits per block counter */ /* bitmap chunksize -- how much data does each bit represent? */ - unsigned long chunksize; unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ unsigned long chunks; /* total number of data chunks for the array */ @@ -226,7 +215,6 @@ struct bitmap { /* bitmap spinlock */ spinlock_t lock; - long offset; /* offset from superblock if file is NULL */ struct file *file; /* backing disk file */ struct page *sb_page; /* cached copy of the bitmap file superblock */ struct page **filemap; /* list of cache pages for the file */ @@ -238,7 +226,6 @@ struct bitmap { int allclean; - unsigned long max_write_behind; /* write-behind mode */ atomic_t behind_writes; /* @@ -246,7 +233,6 @@ struct bitmap { * file, cleaning up bits and flushing out pages to disk as necessary */ unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long daemon_sleep; /* how many seconds between updates? */ unsigned long last_end_sync; /* when we lasted called end_sync to * update bitmap with resync progress */ @@ -254,6 +240,7 @@ struct bitmap { wait_queue_head_t write_wait; wait_queue_head_t overflow_wait; + struct sysfs_dirent *sysfs_can_clear; }; /* the bitmap API */ @@ -282,7 +269,7 @@ void bitmap_close_sync(struct bitmap *bitmap); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct bitmap *bitmap); +void bitmap_daemon_work(mddev_t *mddev); #endif #endif diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 87d88dbb667f..713acd02ab39 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -360,6 +360,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fault injection personality for MD"); MODULE_ALIAS("md-personality-10"); /* faulty */ MODULE_ALIAS("md-faulty"); MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 1ceceb334d5e..00435bd20699 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -292,7 +292,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio) int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - bio_endio(bio, -EOPNOTSUPP); + md_barrier_request(mddev, bio); return 0; } @@ -383,6 +383,7 @@ static void linear_exit (void) module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD"); MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ MODULE_ALIAS("md-linear"); MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md.c b/drivers/md/md.c index 5f154ef1e4be..e1f3c1715cca 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -44,6 +44,7 @@ #include <linux/random.h> #include <linux/reboot.h> #include <linux/file.h> +#include <linux/compat.h> #include <linux/delay.h> #include <linux/raid/md_p.h> #include <linux/raid/md_u.h> @@ -68,6 +69,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } /* + * Default number of read corrections we'll attempt on an rdev + * before ejecting it from the array. We divide the read error + * count by 2 for every hour elapsed between read errors. + */ +#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. * Increase it if you want to have more _guaranteed_ speed. Note that @@ -213,12 +220,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) return 0; } rcu_read_lock(); - if (mddev->suspended) { + if (mddev->suspended || mddev->barrier) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!mddev->suspended && !mddev->barrier) break; rcu_read_unlock(); schedule(); @@ -260,10 +267,110 @@ static void mddev_resume(mddev_t *mddev) int mddev_congested(mddev_t *mddev, int bits) { + if (mddev->barrier) + return 1; return mddev->suspended; } EXPORT_SYMBOL(mddev_congested); +/* + * Generic barrier handling for md + */ + +#define POST_REQUEST_BARRIER ((void*)1) + +static void md_end_barrier(struct bio *bio, int err) +{ + mdk_rdev_t *rdev = bio->bi_private; + mddev_t *mddev = rdev->mddev; + if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER) + set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags); + + rdev_dec_pending(rdev, mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) { + if (mddev->barrier == POST_REQUEST_BARRIER) { + /* This was a post-request barrier */ + mddev->barrier = NULL; + wake_up(&mddev->sb_wait); + } else + /* The pre-request barrier has finished */ + schedule_work(&mddev->barrier_work); + } + bio_put(bio); +} + +static void submit_barriers(mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* Take two references, one is dropped + * when request finishes, one after + * we reclaim rcu_read_lock + */ + struct bio *bi; + atomic_inc(&rdev->nr_pending); + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + bi = bio_alloc(GFP_KERNEL, 0); + bi->bi_end_io = md_end_barrier; + bi->bi_private = rdev; + bi->bi_bdev = rdev->bdev; + atomic_inc(&mddev->flush_pending); + submit_bio(WRITE_BARRIER, bi); + rcu_read_lock(); + rdev_dec_pending(rdev, mddev); + } + rcu_read_unlock(); +} + +static void md_submit_barrier(struct work_struct *ws) +{ + mddev_t *mddev = container_of(ws, mddev_t, barrier_work); + struct bio *bio = mddev->barrier; + + atomic_set(&mddev->flush_pending, 1); + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + bio_endio(bio, -EOPNOTSUPP); + else if (bio->bi_size == 0) + /* an empty barrier - all done */ + bio_endio(bio, 0); + else { + bio->bi_rw &= ~(1<<BIO_RW_BARRIER); + if (mddev->pers->make_request(mddev->queue, bio)) + generic_make_request(bio); + mddev->barrier = POST_REQUEST_BARRIER; + submit_barriers(mddev); + } + if (atomic_dec_and_test(&mddev->flush_pending)) { + mddev->barrier = NULL; + wake_up(&mddev->sb_wait); + } +} + +void md_barrier_request(mddev_t *mddev, struct bio *bio) +{ + spin_lock_irq(&mddev->write_lock); + wait_event_lock_irq(mddev->sb_wait, + !mddev->barrier, + mddev->write_lock, /*nothing*/); + mddev->barrier = bio; + spin_unlock_irq(&mddev->write_lock); + + atomic_set(&mddev->flush_pending, 1); + INIT_WORK(&mddev->barrier_work, md_submit_barrier); + + submit_barriers(mddev); + + if (atomic_dec_and_test(&mddev->flush_pending)) + schedule_work(&mddev->barrier_work); +} +EXPORT_SYMBOL(md_barrier_request); static inline mddev_t *mddev_get(mddev_t *mddev) { @@ -363,6 +470,7 @@ static mddev_t * mddev_find(dev_t unit) mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); + mutex_init(&new->bitmap_info.mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); @@ -370,6 +478,7 @@ static mddev_t * mddev_find(dev_t unit) atomic_set(&new->openers, 0); atomic_set(&new->active_io, 0); spin_lock_init(&new->write_lock); + atomic_set(&new->flush_pending, 0); init_waitqueue_head(&new->sb_wait); init_waitqueue_head(&new->recovery_wait); new->reshape_position = MaxSector; @@ -748,7 +857,7 @@ struct super_type { */ int md_check_no_bitmap(mddev_t *mddev) { - if (!mddev->bitmap_file && !mddev->bitmap_offset) + if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) return 0; printk(KERN_ERR "%s: bitmaps are not supported for %s\n", mdname(mddev), mddev->pers->name); @@ -876,8 +985,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->raid_disks = sb->raid_disks; mddev->dev_sectors = sb->size * 2; mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; if (mddev->minor_version >= 91) { mddev->reshape_position = sb->reshape_position; @@ -911,8 +1020,9 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = MD_SB_DISKS; if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && - mddev->bitmap_file == NULL) - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.file == NULL) + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; } else if (mddev->pers == NULL) { /* Insist on good event counter while assembling */ @@ -1029,7 +1139,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_sectors << 9; - if (mddev->bitmap && mddev->bitmap_file == NULL) + if (mddev->bitmap && mddev->bitmap_info.file == NULL) sb->state |= (1<<MD_SB_BITMAP_PRESENT); sb->disks[0].state = (1<<MD_DISK_REMOVED); @@ -1107,7 +1217,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) { if (num_sectors && num_sectors < rdev->mddev->dev_sectors) return 0; /* component must fit device */ - if (rdev->mddev->bitmap_offset) + if (rdev->mddev->bitmap_info.offset) return 0; /* can't move bitmap */ rdev->sb_start = calc_dev_sboffset(rdev->bdev); if (!num_sectors || num_sectors > rdev->sb_start) @@ -1286,8 +1396,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->raid_disks = le32_to_cpu(sb->raid_disks); mddev->dev_sectors = le64_to_cpu(sb->size); mddev->events = ev1; - mddev->bitmap_offset = 0; - mddev->default_bitmap_offset = 1024 >> 9; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 1024 >> 9; mddev->recovery_cp = le64_to_cpu(sb->resync_offset); memcpy(mddev->uuid, sb->set_uuid, 16); @@ -1295,8 +1405,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) mddev->max_disks = (4096-256)/2; if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && - mddev->bitmap_file == NULL ) - mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + mddev->bitmap_info.file == NULL ) + mddev->bitmap_info.offset = + (__s32)le32_to_cpu(sb->bitmap_offset); if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { mddev->reshape_position = le64_to_cpu(sb->reshape_position); @@ -1390,19 +1501,17 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->level = cpu_to_le32(mddev->level); sb->layout = cpu_to_le32(mddev->layout); - if (mddev->bitmap && mddev->bitmap_file == NULL) { - sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + if (mddev->bitmap && mddev->bitmap_info.file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags)) { - if (rdev->recovery_offset > 0) { - sb->feature_map |= - cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); - sb->recovery_offset = - cpu_to_le64(rdev->recovery_offset); - } + sb->feature_map |= + cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); + sb->recovery_offset = + cpu_to_le64(rdev->recovery_offset); } if (mddev->reshape_position != MaxSector) { @@ -1436,7 +1545,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->dev_roles[i] = cpu_to_le16(0xfffe); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); - else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) + else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else sb->dev_roles[i] = cpu_to_le16(0xffff); @@ -1458,7 +1567,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) max_sectors -= rdev->data_offset; if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - } else if (rdev->mddev->bitmap_offset) { + } else if (rdev->mddev->bitmap_info.offset) { /* minor version 0 with bitmap we can't move */ return 0; } else { @@ -2442,12 +2551,49 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) static struct rdev_sysfs_entry rdev_size = __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); + +static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page) +{ + unsigned long long recovery_start = rdev->recovery_offset; + + if (test_bit(In_sync, &rdev->flags) || + recovery_start == MaxSector) + return sprintf(page, "none\n"); + + return sprintf(page, "%llu\n", recovery_start); +} + +static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len) +{ + unsigned long long recovery_start; + + if (cmd_match(buf, "none")) + recovery_start = MaxSector; + else if (strict_strtoull(buf, 10, &recovery_start)) + return -EINVAL; + + if (rdev->mddev->pers && + rdev->raid_disk >= 0) + return -EBUSY; + + rdev->recovery_offset = recovery_start; + if (recovery_start == MaxSector) + set_bit(In_sync, &rdev->flags); + else + clear_bit(In_sync, &rdev->flags); + return len; +} + +static struct rdev_sysfs_entry rdev_recovery_start = +__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); + static struct attribute *rdev_default_attrs[] = { &rdev_state.attr, &rdev_errors.attr, &rdev_slot.attr, &rdev_offset.attr, &rdev_size.attr, + &rdev_recovery_start.attr, NULL, }; static ssize_t @@ -2549,6 +2695,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi rdev->flags = 0; rdev->data_offset = 0; rdev->sb_events = 0; + rdev->last_read_error.tv_sec = 0; + rdev->last_read_error.tv_nsec = 0; atomic_set(&rdev->nr_pending, 0); atomic_set(&rdev->read_errors, 0); atomic_set(&rdev->corrected_errors, 0); @@ -2659,6 +2807,47 @@ static void analyze_sbs(mddev_t * mddev) } } +/* Read a fixed-point number. + * Numbers in sysfs attributes should be in "standard" units where + * possible, so time should be in seconds. + * However we internally use a a much smaller unit such as + * milliseconds or jiffies. + * This function takes a decimal number with a possible fractional + * component, and produces an integer which is the result of + * multiplying that number by 10^'scale'. + * all without any floating-point arithmetic. + */ +int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) +{ + unsigned long result = 0; + long decimals = -1; + while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { + if (*cp == '.') + decimals = 0; + else if (decimals < scale) { + unsigned int value; + value = *cp - '0'; + result = result * 10 + value; + if (decimals >= 0) + decimals++; + } + cp++; + } + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + if (decimals < 0) + decimals = 0; + while (decimals < scale) { + result *= 10; + decimals ++; + } + *res = result; + return 0; +} + + static void md_safemode_timeout(unsigned long data); static ssize_t @@ -2670,31 +2859,10 @@ safe_delay_show(mddev_t *mddev, char *page) static ssize_t safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) { - int scale=1; - int dot=0; - int i; unsigned long msec; - char buf[30]; - /* remove a period, and count digits after it */ - if (len >= sizeof(buf)) - return -EINVAL; - strlcpy(buf, cbuf, sizeof(buf)); - for (i=0; i<len; i++) { - if (dot) { - if (isdigit(buf[i])) { - buf[i-1] = buf[i]; - scale *= 10; - } - buf[i] = 0; - } else if (buf[i] == '.') { - dot=1; - buf[i] = 0; - } - } - if (strict_strtoul(buf, 10, &msec) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; - msec = (msec * 1000) / scale; if (msec == 0) mddev->safemode_delay = 0; else { @@ -2970,7 +3138,9 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len) if (mddev->pers) return -EBUSY; - if (!*buf || (*e && *e != '\n')) + if (cmd_match(buf, "none")) + n = MaxSector; + else if (!*buf || (*e && *e != '\n')) return -EINVAL; mddev->recovery_cp = n; @@ -3166,6 +3336,29 @@ static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t +max_corrected_read_errors_show(mddev_t *mddev, char *page) { + return sprintf(page, "%d\n", + atomic_read(&mddev->max_corr_read_errors)); +} + +static ssize_t +max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len) +{ + char *e; + unsigned long n = simple_strtoul(buf, &e, 10); + + if (*buf && (*e == 0 || *e == '\n')) { + atomic_set(&mddev->max_corr_read_errors, n); + return len; + } + return -EINVAL; +} + +static struct md_sysfs_entry max_corr_read_errors = +__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, + max_corrected_read_errors_store); + +static ssize_t null_show(mddev_t *mddev, char *page) { return -EINVAL; @@ -3790,6 +3983,7 @@ static struct attribute *md_default_attrs[] = { &md_array_state.attr, &md_reshape_position.attr, &md_array_size.attr, + &max_corr_read_errors.attr, NULL, }; @@ -3894,6 +4088,7 @@ static void mddev_delayed_delete(struct work_struct *ws) mddev->sysfs_action = NULL; mddev->private = NULL; } + sysfs_remove_group(&mddev->kobj, &md_bitmap_group); kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -3985,6 +4180,8 @@ static int md_alloc(dev_t dev, char *name) disk->disk_name); error = 0; } + if (sysfs_create_group(&mddev->kobj, &md_bitmap_group)) + printk(KERN_DEBUG "pointless warning\n"); abort: mutex_unlock(&disks_mutex); if (!error) { @@ -4206,6 +4403,8 @@ static int do_md_run(mddev_t * mddev) mddev->ro = 0; atomic_set(&mddev->writes_pending,0); + atomic_set(&mddev->max_corr_read_errors, + MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; mddev->safemode_timer.function = md_safemode_timeout; mddev->safemode_timer.data = (unsigned long) mddev; @@ -4310,7 +4509,7 @@ static int deny_bitmap_write_access(struct file * file) return 0; } -static void restore_bitmap_write_access(struct file *file) +void restore_bitmap_write_access(struct file *file) { struct inode *inode = file->f_mapping->host; @@ -4405,12 +4604,12 @@ out: printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); bitmap_destroy(mddev); - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; } - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; /* make sure all md_delayed_delete calls have finished */ flush_scheduled_work(); @@ -4451,6 +4650,11 @@ out: mddev->degraded = 0; mddev->barriers_work = 0; mddev->safemode = 0; + mddev->bitmap_info.offset = 0; + mddev->bitmap_info.default_offset = 0; + mddev->bitmap_info.chunksize = 0; + mddev->bitmap_info.daemon_sleep = 0; + mddev->bitmap_info.max_write_behind = 0; kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); if (mddev->hold_active == UNTIL_STOP) mddev->hold_active = 0; @@ -4636,7 +4840,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg) info.state = 0; if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) info.state = (1<<MD_SB_BITMAP_PRESENT); info.active_disks = insync; info.working_disks = working; @@ -4994,23 +5198,23 @@ static int set_bitmap_file(mddev_t *mddev, int fd) if (fd >= 0) { if (mddev->bitmap) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_file = fget(fd); + mddev->bitmap_info.file = fget(fd); - if (mddev->bitmap_file == NULL) { + if (mddev->bitmap_info.file == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - err = deny_bitmap_write_access(mddev->bitmap_file); + err = deny_bitmap_write_access(mddev->bitmap_info.file); if (err) { printk(KERN_ERR "%s: error: bitmap file is already in use\n", mdname(mddev)); - fput(mddev->bitmap_file); - mddev->bitmap_file = NULL; + fput(mddev->bitmap_info.file); + mddev->bitmap_info.file = NULL; return err; } - mddev->bitmap_offset = 0; /* file overrides offset */ + mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ err = 0; @@ -5025,11 +5229,11 @@ static int set_bitmap_file(mddev_t *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_file) { - restore_bitmap_write_access(mddev->bitmap_file); - fput(mddev->bitmap_file); + if (mddev->bitmap_info.file) { + restore_bitmap_write_access(mddev->bitmap_info.file); + fput(mddev->bitmap_info.file); } - mddev->bitmap_file = NULL; + mddev->bitmap_info.file = NULL; } return err; @@ -5096,8 +5300,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) mddev->flags = 0; set_bit(MD_CHANGE_DEVS, &mddev->flags); - mddev->default_bitmap_offset = MD_SB_BYTES >> 9; - mddev->bitmap_offset = 0; + mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; + mddev->bitmap_info.offset = 0; mddev->reshape_position = MaxSector; @@ -5197,7 +5401,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) int state = 0; /* calculate expected state,ignoring low bits */ - if (mddev->bitmap && mddev->bitmap_offset) + if (mddev->bitmap && mddev->bitmap_info.offset) state |= (1 << MD_SB_BITMAP_PRESENT); if (mddev->major_version != info->major_version || @@ -5256,9 +5460,10 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) /* add the bitmap */ if (mddev->bitmap) return -EEXIST; - if (mddev->default_bitmap_offset == 0) + if (mddev->bitmap_info.default_offset == 0) return -EINVAL; - mddev->bitmap_offset = mddev->default_bitmap_offset; + mddev->bitmap_info.offset = + mddev->bitmap_info.default_offset; mddev->pers->quiesce(mddev, 1); rv = bitmap_create(mddev); if (rv) @@ -5273,7 +5478,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); mddev->pers->quiesce(mddev, 0); - mddev->bitmap_offset = 0; + mddev->bitmap_info.offset = 0; } } md_update_sb(mddev, 1); @@ -5524,6 +5729,25 @@ done: abort: return err; } +#ifdef CONFIG_COMPAT +static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case HOT_REMOVE_DISK: + case HOT_ADD_DISK: + case SET_DISK_FAULTY: + case SET_BITMAP_FILE: + /* These take in integer arg, do not convert */ + break; + default: + arg = (unsigned long)compat_ptr(arg); + break; + } + + return md_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ static int md_open(struct block_device *bdev, fmode_t mode) { @@ -5589,6 +5813,9 @@ static const struct block_device_operations md_fops = .open = md_open, .release = md_release, .ioctl = md_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = md_compat_ioctl, +#endif .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, @@ -5982,14 +6209,14 @@ static int md_seq_show(struct seq_file *seq, void *v) unsigned long chunk_kb; unsigned long flags; spin_lock_irqsave(&bitmap->lock, flags); - chunk_kb = bitmap->chunksize >> 10; + chunk_kb = mddev->bitmap_info.chunksize >> 10; seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " "%lu%s chunk", bitmap->pages - bitmap->missing_pages, bitmap->pages, (bitmap->pages - bitmap->missing_pages) << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->chunksize, + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, chunk_kb ? "KB" : "B"); if (bitmap->file) { seq_printf(seq, ", file: "); @@ -6338,12 +6565,14 @@ void md_do_sync(mddev_t *mddev) /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; - list_for_each_entry(rdev, &mddev->disks, same_set) + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) j = rdev->recovery_offset; + rcu_read_unlock(); } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -6380,6 +6609,7 @@ void md_do_sync(mddev_t *mddev) desc, mdname(mddev)); mddev->curr_resync = j; } + mddev->curr_resync_completed = mddev->curr_resync; while (j < max_sectors) { sector_t sectors; @@ -6512,22 +6742,29 @@ void md_do_sync(mddev_t *mddev) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - list_for_each_entry(rdev, &mddev->disks, same_set) + rcu_read_lock(); + list_for_each_entry_rcu(rdev, &mddev->disks, same_set) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); } } set_bit(MD_CHANGE_DEVS, &mddev->flags); skip: + if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + /* We completed so min/max setting can be forgotten if used. */ + if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = 0; + mddev->resync_max = MaxSector; + } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; - mddev->curr_resync_completed = 0; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) - /* We completed so max setting can be forgotten. */ - mddev->resync_max = MaxSector; + mddev->curr_resync_completed = 0; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); @@ -6590,6 +6827,7 @@ static int remove_and_add_spares(mddev_t *mddev) nm, mdname(mddev)); spares++; md_new_event(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); } else break; } @@ -6625,7 +6863,7 @@ void md_check_recovery(mddev_t *mddev) if (mddev->bitmap) - bitmap_daemon_work(mddev->bitmap); + bitmap_daemon_work(mddev); if (mddev->ro) return; @@ -6995,5 +7233,6 @@ EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); EXPORT_SYMBOL(md_check_recovery); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD RAID framework"); MODULE_ALIAS("md"); MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); diff --git a/drivers/md/md.h b/drivers/md/md.h index f184b69ef337..8e4c75c00d46 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -97,6 +97,9 @@ struct mdk_rdev_s atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ + struct timespec last_read_error; /* monotonic time since our + * last read error + */ atomic_t corrected_errors; /* number of corrected read errors, * for reporting to userspace and storing * in superblock. @@ -280,17 +283,38 @@ struct mddev_s unsigned int max_write_behind; /* 0 = sync */ struct bitmap *bitmap; /* the bitmap for the device */ - struct file *bitmap_file; /* the bitmap file */ - long bitmap_offset; /* offset from superblock of - * start of bitmap. May be - * negative, but not '0' - */ - long default_bitmap_offset; /* this is the offset to use when - * hot-adding a bitmap. It should - * eventually be settable by sysfs. - */ - + struct { + struct file *file; /* the bitmap file */ + loff_t offset; /* offset from superblock of + * start of bitmap. May be + * negative, but not '0' + * For external metadata, offset + * from start of device. + */ + loff_t default_offset; /* this is the offset to use when + * hot-adding a bitmap. It should + * eventually be settable by sysfs. + */ + struct mutex mutex; + unsigned long chunksize; + unsigned long daemon_sleep; /* how many seconds between updates? */ + unsigned long max_write_behind; /* write-behind mode */ + int external; + } bitmap_info; + + atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; + + /* Generic barrier handling. + * If there is a pending barrier request, all other + * writes are blocked while the devices are flushed. + * The last to finish a flush schedules a worker to + * submit the barrier request (without the barrier flag), + * then submit more flush requests. + */ + struct bio *barrier; + atomic_t flush_pending; + struct work_struct barrier_work; }; @@ -353,7 +377,7 @@ struct md_sysfs_entry { ssize_t (*show)(mddev_t *, char *); ssize_t (*store)(mddev_t *, const char *, size_t); }; - +extern struct attribute_group md_bitmap_group; static inline char * mdname (mddev_t * mddev) { @@ -431,6 +455,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); extern int mddev_congested(mddev_t *mddev, int bits); +extern void md_barrier_request(mddev_t *mddev, struct bio *bio); extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, sector_t sector, int size, struct page *page); extern void md_super_wait(mddev_t *mddev); @@ -443,6 +468,8 @@ extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); extern int md_check_no_bitmap(mddev_t *mddev); extern int md_integrity_register(mddev_t *mddev); -void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); +extern void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); +extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); +extern void restore_bitmap_write_access(struct file *file); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index ee7646f974a0..32a662fc55c9 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -145,7 +145,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio) int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - bio_endio(bio, -EOPNOTSUPP); + md_barrier_request(mddev, bio); return 0; } @@ -581,6 +581,7 @@ static void __exit multipath_exit (void) module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("simple multi-path personality for MD"); MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ MODULE_ALIAS("md-multipath"); MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d3a4ce06015a..77605cdceaf1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -453,7 +453,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio) int cpu; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - bio_endio(bio, -EOPNOTSUPP); + md_barrier_request(mddev, bio); return 0; } @@ -567,6 +567,7 @@ static void raid0_exit (void) module_init(raid0_init); module_exit(raid0_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID0 (striping) personality for MD"); MODULE_ALIAS("md-personality-2"); /* RAID0 */ MODULE_ALIAS("md-raid0"); MODULE_ALIAS("md-level-0"); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e07ce2e033a9..859bd3ffe435 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -677,6 +677,7 @@ static void raise_barrier(conf_t *conf) static void lower_barrier(conf_t *conf) { unsigned long flags; + BUG_ON(conf->barrier <= 0); spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; spin_unlock_irqrestore(&conf->resync_lock, flags); @@ -801,6 +802,25 @@ static int make_request(struct request_queue *q, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ + if (bio_data_dir(bio) == WRITE && + bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && + bio->bi_sector < mddev->suspend_hi) { + /* As the suspend_* range is controlled by + * userspace, we want an interruptible + * wait. + */ + DEFINE_WAIT(w); + for (;;) { + flush_signals(current); + prepare_to_wait(&conf->wait_barrier, + &w, TASK_INTERRUPTIBLE); + if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || + bio->bi_sector >= mddev->suspend_hi) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } if (unlikely(!mddev->barriers_work && bio_rw_flagged(bio, BIO_RW_BARRIER))) { if (rw == WRITE) @@ -923,7 +943,8 @@ static int make_request(struct request_queue *q, struct bio * bio) /* do behind I/O ? */ if (bitmap && - atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && + (atomic_read(&bitmap->behind_writes) + < mddev->bitmap_info.max_write_behind) && (behind_pages = alloc_behind_pages(bio)) != NULL) set_bit(R1BIO_BehindIO, &r1_bio->state); @@ -1941,74 +1962,48 @@ static sector_t raid1_size(mddev_t *mddev, sector_t sectors, int raid_disks) return mddev->dev_sectors; } -static int run(mddev_t *mddev) +static conf_t *setup_conf(mddev_t *mddev) { conf_t *conf; - int i, j, disk_idx; + int i; mirror_info_t *disk; mdk_rdev_t *rdev; + int err = -ENOMEM; - if (mddev->level != 1) { - printk("raid1: %s: raid level not set to mirroring (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - if (mddev->reshape_position != MaxSector) { - printk("raid1: %s: reshape_position set but not supported\n", - mdname(mddev)); - goto out; - } - /* - * copy the already verified devices into our private RAID1 - * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] - */ conf = kzalloc(sizeof(conf_t), GFP_KERNEL); - mddev->private = conf; if (!conf) - goto out_no_mem; + goto abort; conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, GFP_KERNEL); if (!conf->mirrors) - goto out_no_mem; + goto abort; conf->tmppage = alloc_page(GFP_KERNEL); if (!conf->tmppage) - goto out_no_mem; + goto abort; - conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); + conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL); if (!conf->poolinfo) - goto out_no_mem; - conf->poolinfo->mddev = NULL; + goto abort; conf->poolinfo->raid_disks = mddev->raid_disks; conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, r1bio_pool_free, conf->poolinfo); if (!conf->r1bio_pool) - goto out_no_mem; + goto abort; + conf->poolinfo->mddev = mddev; spin_lock_init(&conf->device_lock); - mddev->queue->queue_lock = &conf->device_lock; - list_for_each_entry(rdev, &mddev->disks, same_set) { - disk_idx = rdev->raid_disk; + int disk_idx = rdev->raid_disk; if (disk_idx >= mddev->raid_disks || disk_idx < 0) continue; disk = conf->mirrors + disk_idx; disk->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - /* as we don't honour merge_bvec_fn, we must never risk - * violating it, so limit ->max_sector to one PAGE, as - * a one page request is never in violation. - */ - if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) - blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; } @@ -2022,8 +2017,7 @@ static int run(mddev_t *mddev) bio_list_init(&conf->pending_bio_list); bio_list_init(&conf->flushing_bio_list); - - mddev->degraded = 0; + conf->last_used = -1; for (i = 0; i < conf->raid_disks; i++) { disk = conf->mirrors + i; @@ -2031,38 +2025,97 @@ static int run(mddev_t *mddev) if (!disk->rdev || !test_bit(In_sync, &disk->rdev->flags)) { disk->head_position = 0; - mddev->degraded++; if (disk->rdev) conf->fullsync = 1; - } + } else if (conf->last_used < 0) + /* + * The first working device is used as a + * starting point to read balancing. + */ + conf->last_used = i; } - if (mddev->degraded == conf->raid_disks) { + + err = -EIO; + if (conf->last_used < 0) { printk(KERN_ERR "raid1: no operational mirrors for %s\n", - mdname(mddev)); - goto out_free_conf; + mdname(mddev)); + goto abort; } - if (conf->raid_disks - mddev->degraded == 1) - mddev->recovery_cp = MaxSector; + err = -ENOMEM; + conf->thread = md_register_thread(raid1d, mddev, NULL); + if (!conf->thread) { + printk(KERN_ERR + "raid1: couldn't allocate thread for %s\n", + mdname(mddev)); + goto abort; + } + + return conf; + + abort: + if (conf) { + if (conf->r1bio_pool) + mempool_destroy(conf->r1bio_pool); + kfree(conf->mirrors); + safe_put_page(conf->tmppage); + kfree(conf->poolinfo); + kfree(conf); + } + return ERR_PTR(err); +} +static int run(mddev_t *mddev) +{ + conf_t *conf; + int i; + mdk_rdev_t *rdev; + + if (mddev->level != 1) { + printk("raid1: %s: raid level not set to mirroring (%d)\n", + mdname(mddev), mddev->level); + return -EIO; + } + if (mddev->reshape_position != MaxSector) { + printk("raid1: %s: reshape_position set but not supported\n", + mdname(mddev)); + return -EIO; + } /* - * find the first working one and use it as a starting point - * to read balancing. + * copy the already verified devices into our private RAID1 + * bookkeeping area. [whatever we allocate in run(), + * should be freed in stop()] */ - for (j = 0; j < conf->raid_disks && - (!conf->mirrors[j].rdev || - !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) - /* nothing */; - conf->last_used = j; + if (mddev->private == NULL) + conf = setup_conf(mddev); + else + conf = mddev->private; + if (IS_ERR(conf)) + return PTR_ERR(conf); - mddev->thread = md_register_thread(raid1d, mddev, NULL); - if (!mddev->thread) { - printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; + mddev->queue->queue_lock = &conf->device_lock; + list_for_each_entry(rdev, &mddev->disks, same_set) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + /* as we don't honour merge_bvec_fn, we must never risk + * violating it, so limit ->max_sector to one PAGE, as + * a one page request is never in violation. + */ + if (rdev->bdev->bd_disk->queue->merge_bvec_fn && + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); } + mddev->degraded = 0; + for (i=0; i < conf->raid_disks; i++) + if (conf->mirrors[i].rdev == NULL || + !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || + test_bit(Faulty, &conf->mirrors[i].rdev->flags)) + mddev->degraded++; + + if (conf->raid_disks - mddev->degraded == 1) + mddev->recovery_cp = MaxSector; + if (mddev->recovery_cp != MaxSector) printk(KERN_NOTICE "raid1: %s is not clean" " -- starting background reconstruction\n", @@ -2071,9 +2124,14 @@ static int run(mddev_t *mddev) "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); + /* * Ok, everything is just fine now */ + mddev->thread = conf->thread; + conf->thread = NULL; + mddev->private = conf; + md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); mddev->queue->unplug_fn = raid1_unplug; @@ -2081,23 +2139,6 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; md_integrity_register(mddev); return 0; - -out_no_mem: - printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", - mdname(mddev)); - -out_free_conf: - if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - kfree(conf->poolinfo); - kfree(conf); - mddev->private = NULL; - } -out: - return -EIO; } static int stop(mddev_t *mddev) @@ -2271,6 +2312,9 @@ static void raid1_quiesce(mddev_t *mddev, int state) conf_t *conf = mddev->private; switch(state) { + case 2: /* wake for suspend */ + wake_up(&conf->wait_barrier); + break; case 1: raise_barrier(conf); break; @@ -2280,6 +2324,23 @@ static void raid1_quiesce(mddev_t *mddev, int state) } } +static void *raid1_takeover(mddev_t *mddev) +{ + /* raid1 can take over: + * raid5 with 2 devices, any layout or chunk size + */ + if (mddev->level == 5 && mddev->raid_disks == 2) { + conf_t *conf; + mddev->new_level = 1; + mddev->new_layout = 0; + mddev->new_chunk_sectors = 0; + conf = setup_conf(mddev); + if (!IS_ERR(conf)) + conf->barrier = 1; + return conf; + } + return ERR_PTR(-EINVAL); +} static struct mdk_personality raid1_personality = { @@ -2299,6 +2360,7 @@ static struct mdk_personality raid1_personality = .size = raid1_size, .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, + .takeover = raid1_takeover, }; static int __init raid_init(void) @@ -2314,6 +2376,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD"); MODULE_ALIAS("md-personality-3"); /* RAID1 */ MODULE_ALIAS("md-raid1"); MODULE_ALIAS("md-level-1"); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index e87b84deff68..5f2d443ae28a 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -59,6 +59,11 @@ struct r1_private_data_s { mempool_t *r1bio_pool; mempool_t *r1buf_pool; + + /* When taking over an array from a different personality, we store + * the new thread here until we fully activate the array. + */ + struct mdk_thread_s *thread; }; typedef struct r1_private_data_s conf_t; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2cb7b87b440..d119b7b75e71 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -804,7 +804,7 @@ static int make_request(struct request_queue *q, struct bio * bio) mdk_rdev_t *blocked_rdev; if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { - bio_endio(bio, -EOPNOTSUPP); + md_barrier_request(mddev, bio); return 0; } @@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) /* + * Used by fix_read_error() to decay the per rdev read_errors. + * We halve the read error count for every hour that has elapsed + * since the last recorded read error. + * + */ +static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) +{ + struct timespec cur_time_mon; + unsigned long hours_since_last; + unsigned int read_errors = atomic_read(&rdev->read_errors); + + ktime_get_ts(&cur_time_mon); + + if (rdev->last_read_error.tv_sec == 0 && + rdev->last_read_error.tv_nsec == 0) { + /* first time we've seen a read error */ + rdev->last_read_error = cur_time_mon; + return; + } + + hours_since_last = (cur_time_mon.tv_sec - + rdev->last_read_error.tv_sec) / 3600; + + rdev->last_read_error = cur_time_mon; + + /* + * if hours_since_last is > the number of bits in read_errors + * just set read errors to 0. We do this to avoid + * overflowing the shift of read_errors by hours_since_last. + */ + if (hours_since_last >= 8 * sizeof(read_errors)) + atomic_set(&rdev->read_errors, 0); + else + atomic_set(&rdev->read_errors, read_errors >> hours_since_last); +} + +/* * This is a kernel thread which: * * 1. Retries failed read operations on working mirrors. @@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) int sect = 0; /* Offset from r10_bio->sector */ int sectors = r10_bio->sectors; mdk_rdev_t*rdev; + int max_read_errors = atomic_read(&mddev->max_corr_read_errors); + + rcu_read_lock(); + { + int d = r10_bio->devs[r10_bio->read_slot].devnum; + char b[BDEVNAME_SIZE]; + int cur_read_error_count = 0; + + rdev = rcu_dereference(conf->mirrors[d].rdev); + bdevname(rdev->bdev, b); + + if (test_bit(Faulty, &rdev->flags)) { + rcu_read_unlock(); + /* drive has already been failed, just ignore any + more fix_read_error() attempts */ + return; + } + + check_decay_read_errors(mddev, rdev); + atomic_inc(&rdev->read_errors); + cur_read_error_count = atomic_read(&rdev->read_errors); + if (cur_read_error_count > max_read_errors) { + rcu_read_unlock(); + printk(KERN_NOTICE + "raid10: %s: Raid device exceeded " + "read_error threshold " + "[cur %d:max %d]\n", + b, cur_read_error_count, max_read_errors); + printk(KERN_NOTICE + "raid10: %s: Failing raid " + "device\n", b); + md_error(mddev, conf->mirrors[d].rdev); + return; + } + } + rcu_read_unlock(); + while(sectors) { int s = sectors; int sl = r10_bio->read_slot; @@ -1488,6 +1562,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) /* write it back and re-read */ rcu_read_lock(); while (sl != r10_bio->read_slot) { + char b[BDEVNAME_SIZE]; int d; if (sl==0) sl = conf->copies; @@ -1503,9 +1578,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) r10_bio->devs[sl].addr + sect + rdev->data_offset, s<<9, conf->tmppage, WRITE) - == 0) + == 0) { /* Well, this device is dead */ + printk(KERN_NOTICE + "raid10:%s: read correction " + "write failed" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "raid10:%s: failing " + "drive\n", + bdevname(rdev->bdev, b)); md_error(mddev, rdev); + } rdev_dec_pending(rdev, mddev); rcu_read_lock(); } @@ -1526,10 +1613,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) if (sync_page_io(rdev->bdev, r10_bio->devs[sl].addr + sect + rdev->data_offset, - s<<9, conf->tmppage, READ) == 0) + s<<9, conf->tmppage, + READ) == 0) { /* Well, this device is dead */ + printk(KERN_NOTICE + "raid10:%s: unable to read back " + "corrected sectors" + " (%d sectors at %llu on %s)\n", + mdname(mddev), s, + (unsigned long long)(sect+ + rdev->data_offset), + bdevname(rdev->bdev, b)); + printk(KERN_NOTICE "raid10:%s: failing drive\n", + bdevname(rdev->bdev, b)); + md_error(mddev, rdev); - else + } else { printk(KERN_INFO "raid10:%s: read error corrected" " (%d sectors at %llu on %s)\n", @@ -1537,6 +1636,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) (unsigned long long)(sect+ rdev->data_offset), bdevname(rdev->bdev, b)); + } rdev_dec_pending(rdev, mddev); rcu_read_lock(); @@ -2275,13 +2375,6 @@ static void raid10_quiesce(mddev_t *mddev, int state) lower_barrier(conf); break; } - if (mddev->thread) { - if (mddev->bitmap) - mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; - else - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - md_wakeup_thread(mddev->thread); - } } static struct mdk_personality raid10_personality = @@ -2315,6 +2408,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); MODULE_ALIAS("md-personality-9"); /* RAID10 */ MODULE_ALIAS("md-raid10"); MODULE_ALIAS("md-level-10"); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d29215d966da..e84204eb12df 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2947,6 +2947,7 @@ static void handle_stripe5(struct stripe_head *sh) struct r5dev *dev; mdk_rdev_t *blocked_rdev = NULL; int prexor; + int dec_preread_active = 0; memset(&s, 0, sizeof(s)); pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " @@ -3096,12 +3097,8 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_INSYNC, &sh->state); } } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything @@ -3208,6 +3205,16 @@ static void handle_stripe5(struct stripe_head *sh) ops_run_io(sh, &s); + if (dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a barrier, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } return_io(return_bi); } @@ -3221,6 +3228,7 @@ static void handle_stripe6(struct stripe_head *sh) struct r6_state r6s; struct r5dev *dev, *pdev, *qdev; mdk_rdev_t *blocked_rdev = NULL; + int dec_preread_active = 0; pr_debug("handling stripe %llu, state=%#lx cnt=%d, " "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", @@ -3358,7 +3366,6 @@ static void handle_stripe6(struct stripe_head *sh) * completed */ if (sh->reconstruct_state == reconstruct_state_drain_result) { - int qd_idx = sh->qd_idx; sh->reconstruct_state = reconstruct_state_idle; /* All the 'written' buffers and the parity blocks are ready to @@ -3380,12 +3387,8 @@ static void handle_stripe6(struct stripe_head *sh) set_bit(STRIPE_INSYNC, &sh->state); } } - if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - atomic_dec(&conf->preread_active_stripes); - if (atomic_read(&conf->preread_active_stripes) < - IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } + if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + dec_preread_active = 1; } /* Now to consider new write requests and what else, if anything @@ -3494,6 +3497,18 @@ static void handle_stripe6(struct stripe_head *sh) ops_run_io(sh, &s); + + if (dec_preread_active) { + /* We delay this until after ops_run_io so that if make_request + * is waiting on a barrier, it won't continue until the writes + * have actually been submitted. + */ + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } + return_io(return_bi); } @@ -3741,7 +3756,7 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; raid5_conf_t *conf = mddev->private; - unsigned int dd_idx; + int dd_idx; struct bio* align_bi; mdk_rdev_t *rdev; @@ -3866,7 +3881,13 @@ static int make_request(struct request_queue *q, struct bio * bi) int cpu, remaining; if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { - bio_endio(bi, -EOPNOTSUPP); + /* Drain all pending writes. We only really need + * to ensure they have been submitted, but this is + * easier. + */ + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + md_barrier_request(mddev, bi); return 0; } @@ -3990,6 +4011,9 @@ static int make_request(struct request_queue *q, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); + if (mddev->barrier && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); release_stripe(sh); } else { /* cannot get stripe for read-ahead, just give-up */ @@ -4009,6 +4033,14 @@ static int make_request(struct request_queue *q, struct bio * bi) bio_endio(bi, 0); } + + if (mddev->barrier) { + /* We need to wait for the stripes to all be handled. + * So: wait for preread_active_stripes to drop to 0. + */ + wait_event(mddev->thread->wqueue, + atomic_read(&conf->preread_active_stripes) == 0); + } return 0; } @@ -5860,6 +5892,7 @@ static void raid5_exit(void) module_init(raid5_init); module_exit(raid5_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID4/5/6 (striping with parity) personality for MD"); MODULE_ALIAS("md-personality-4"); /* RAID5 */ MODULE_ALIAS("md-raid5"); MODULE_ALIAS("md-raid4"); diff --git a/drivers/md/raid6algos.c b/drivers/md/raid6algos.c index 866215ac7f25..bffc61bff5ab 100644 --- a/drivers/md/raid6algos.c +++ b/drivers/md/raid6algos.c @@ -31,25 +31,6 @@ EXPORT_SYMBOL(raid6_empty_zero_page); struct raid6_calls raid6_call; EXPORT_SYMBOL_GPL(raid6_call); -/* Various routine sets */ -extern const struct raid6_calls raid6_intx1; -extern const struct raid6_calls raid6_intx2; -extern const struct raid6_calls raid6_intx4; -extern const struct raid6_calls raid6_intx8; -extern const struct raid6_calls raid6_intx16; -extern const struct raid6_calls raid6_intx32; -extern const struct raid6_calls raid6_mmxx1; -extern const struct raid6_calls raid6_mmxx2; -extern const struct raid6_calls raid6_sse1x1; -extern const struct raid6_calls raid6_sse1x2; -extern const struct raid6_calls raid6_sse2x1; -extern const struct raid6_calls raid6_sse2x2; -extern const struct raid6_calls raid6_sse2x4; -extern const struct raid6_calls raid6_altivec1; -extern const struct raid6_calls raid6_altivec2; -extern const struct raid6_calls raid6_altivec4; -extern const struct raid6_calls raid6_altivec8; - const struct raid6_calls * const raid6_algos[] = { &raid6_intx1, &raid6_intx2, @@ -169,3 +150,4 @@ static void raid6_exit(void) subsys_initcall(raid6_select_algo); module_exit(raid6_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); |