diff options
author | Jaroslav Kysela <perex@petra> | 2005-06-22 12:19:24 +0200 |
---|---|---|
committer | Jaroslav Kysela <perex@petra> | 2005-06-22 12:19:24 +0200 |
commit | da04b128cf0d74dd4cab270c53d9264e70f9203e (patch) | |
tree | 095355c32dfd709236a85b497d3bd461d7cdfe8a /drivers/md | |
parent | fae6ec69c84d71b1d5bda9ede1a262c1681684aa (diff) | |
parent | 2a5a68b840cbab31baab2d9b2e1e6de3b289ae1e (diff) | |
download | lwn-da04b128cf0d74dd4cab270c53d9264e70f9203e.tar.gz lwn-da04b128cf0d74dd4cab270c53d9264e70f9203e.zip |
Merge with rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Makefile | 3 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 1586 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 3 | ||||
-rw-r--r-- | drivers/md/linear.c | 3 | ||||
-rw-r--r-- | drivers/md/md.c | 525 | ||||
-rw-r--r-- | drivers/md/multipath.c | 3 | ||||
-rw-r--r-- | drivers/md/raid0.c | 12 | ||||
-rw-r--r-- | drivers/md/raid1.c | 242 | ||||
-rw-r--r-- | drivers/md/raid10.c | 30 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 | ||||
-rw-r--r-- | drivers/md/raid6main.c | 12 |
11 files changed, 2212 insertions, 219 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 90de9c146a5f..d3efedf6a6ad 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ dm-multipath-objs := dm-hw-handler.o dm-path-selector.o dm-mpath.o dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-mirror-objs := dm-log.o dm-raid1.o +md-mod-objs := md.o bitmap.o raid6-objs := raid6main.o raid6algos.o raid6recov.o raid6tables.o \ raid6int1.o raid6int2.o raid6int4.o \ raid6int8.o raid6int16.o raid6int32.o \ @@ -28,7 +29,7 @@ obj-$(CONFIG_MD_RAID5) += raid5.o xor.o obj-$(CONFIG_MD_RAID6) += raid6.o xor.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o -obj-$(CONFIG_BLK_DEV_MD) += md.o +obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_CRYPT) += dm-crypt.o obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c new file mode 100644 index 000000000000..95980ad6b27b --- /dev/null +++ b/drivers/md/bitmap.c @@ -0,0 +1,1586 @@ +/* + * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * bitmap_create - sets up the bitmap structure + * bitmap_destroy - destroys the bitmap structure + * + * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: + * - added disk storage for bitmap + * - changes to allow various bitmap chunk sizes + * - added bitmap daemon (to asynchronously clear bitmap bits from disk) + */ + +/* + * Still to do: + * + * flush after percent set rather than just time based. (maybe both). + * wait if count gets too high, wake when it drops to half. + * allow bitmap to be mirrored with superblock (before or after...) + * allow hot-add to re-instate a current device. + * allow hot-add of bitmap after quiessing device + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/config.h> +#include <linux/timer.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/buffer_head.h> +#include <linux/raid/md.h> +#include <linux/raid/bitmap.h> + +/* debug macros */ + +#define DEBUG 0 + +#if DEBUG +/* these are for debugging purposes only! */ + +/* define one and only one of these */ +#define INJECT_FAULTS_1 0 /* cause bitmap_alloc_page to fail always */ +#define INJECT_FAULTS_2 0 /* cause bitmap file to be kicked when first bit set*/ +#define INJECT_FAULTS_3 0 /* treat bitmap file as kicked at init time */ +#define INJECT_FAULTS_4 0 /* undef */ +#define INJECT_FAULTS_5 0 /* undef */ +#define INJECT_FAULTS_6 0 + +/* if these are defined, the driver will fail! debug only */ +#define INJECT_FATAL_FAULT_1 0 /* fail kmalloc, causing bitmap_create to fail */ +#define INJECT_FATAL_FAULT_2 0 /* undef */ +#define INJECT_FATAL_FAULT_3 0 /* undef */ +#endif + +//#define DPRINTK PRINTK /* set this NULL to avoid verbose debug output */ +#define DPRINTK(x...) do { } while(0) + +#ifndef PRINTK +# if DEBUG > 0 +# define PRINTK(x...) printk(KERN_DEBUG x) +# else +# define PRINTK(x...) +# endif +#endif + +static inline char * bmname(struct bitmap *bitmap) +{ + return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; +} + + +/* + * test if the bitmap is active + */ +int bitmap_active(struct bitmap *bitmap) +{ + unsigned long flags; + int res = 0; + + if (!bitmap) + return res; + spin_lock_irqsave(&bitmap->lock, flags); + res = bitmap->flags & BITMAP_ACTIVE; + spin_unlock_irqrestore(&bitmap->lock, flags); + return res; +} + +#define WRITE_POOL_SIZE 256 +/* mempool for queueing pending writes on the bitmap file */ +static void *write_pool_alloc(unsigned int gfp_flags, void *data) +{ + return kmalloc(sizeof(struct page_list), gfp_flags); +} + +static void write_pool_free(void *ptr, void *data) +{ + kfree(ptr); +} + +/* + * just a placeholder - calls kmalloc for bitmap pages + */ +static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) +{ + unsigned char *page; + +#if INJECT_FAULTS_1 + page = NULL; +#else + page = kmalloc(PAGE_SIZE, GFP_NOIO); +#endif + if (!page) + printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); + else + PRINTK("%s: bitmap_alloc_page: allocated page at %p\n", + bmname(bitmap), page); + return page; +} + +/* + * for now just a placeholder -- just calls kfree for bitmap pages + */ +static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) +{ + PRINTK("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page); + kfree(page); +} + +/* + * check a page and, if necessary, allocate it (or hijack it if the alloc fails) + * + * 1) check to see if this page is allocated, if it's not then try to alloc + * 2) if the alloc fails, set the page's hijacked flag so we'll use the + * page pointer directly as a counter + * + * if we find our page, we increment the page's refcount so that it stays + * allocated while we're using it + */ +static int bitmap_checkpage(struct bitmap *bitmap, unsigned long page, int create) +{ + unsigned char *mappage; + + if (page >= bitmap->pages) { + printk(KERN_ALERT + "%s: invalid bitmap page request: %lu (> %lu)\n", + bmname(bitmap), page, bitmap->pages-1); + return -EINVAL; + } + + + if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ + return 0; + + if (bitmap->bp[page].map) /* page is already allocated, just return */ + return 0; + + if (!create) + return -ENOENT; + + spin_unlock_irq(&bitmap->lock); + + /* this page has not been allocated yet */ + + if ((mappage = bitmap_alloc_page(bitmap)) == NULL) { + PRINTK("%s: bitmap map page allocation failed, hijacking\n", + bmname(bitmap)); + /* failed - set the hijacked flag so that we can use the + * pointer as a counter */ + spin_lock_irq(&bitmap->lock); + if (!bitmap->bp[page].map) + bitmap->bp[page].hijacked = 1; + goto out; + } + + /* got a page */ + + spin_lock_irq(&bitmap->lock); + + /* recheck the page */ + + if (bitmap->bp[page].map || bitmap->bp[page].hijacked) { + /* somebody beat us to getting the page */ + bitmap_free_page(bitmap, mappage); + return 0; + } + + /* no page was in place and we have one, so install it */ + + memset(mappage, 0, PAGE_SIZE); + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; +out: + return 0; +} + + +/* if page is completely empty, put it back on the free list, or dealloc it */ +/* if page was hijacked, unmark the flag so it might get alloced next time */ +/* Note: lock should be held when calling this */ +static inline void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) +{ + char *ptr; + + if (bitmap->bp[page].count) /* page is still busy */ + return; + + /* page is no longer in use, it can be released */ + + if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ + bitmap->bp[page].hijacked = 0; + bitmap->bp[page].map = NULL; + return; + } + + /* normal case, free the page */ + +#if 0 +/* actually ... let's not. We will probably need the page again exactly when + * memory is tight and we are flusing to disk + */ + return; +#else + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + bitmap_free_page(bitmap, ptr); + return; +#endif +} + + +/* + * bitmap file handling - read and write the bitmap file and its superblock + */ + +/* copy the pathname of a file to a buffer */ +char *file_path(struct file *file, char *buf, int count) +{ + struct dentry *d; + struct vfsmount *v; + + if (!buf) + return NULL; + + d = file->f_dentry; + v = file->f_vfsmnt; + + buf = d_path(d, v, buf, count); + + return IS_ERR(buf) ? NULL : buf; +} + +/* + * basic page I/O operations + */ + +/* IO operations when bitmap is stored near all superblocks */ +static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long index) +{ + /* choose a good rdev and read the page from there */ + + mdk_rdev_t *rdev; + struct list_head *tmp; + struct page *page = alloc_page(GFP_KERNEL); + sector_t target; + + if (!page) + return ERR_PTR(-ENOMEM); + do { + ITERATE_RDEV(mddev, rdev, tmp) + if (rdev->in_sync && !rdev->faulty) + goto found; + return ERR_PTR(-EIO); + + found: + target = (rdev->sb_offset << 1) + offset + index * (PAGE_SIZE/512); + + } while (!sync_page_io(rdev->bdev, target, PAGE_SIZE, page, READ)); + + page->index = index; + return page; +} + +static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev, rdev, tmp) + if (rdev->in_sync && !rdev->faulty) + md_super_write(mddev, rdev, + (rdev->sb_offset<<1) + offset + + page->index * (PAGE_SIZE/512), + PAGE_SIZE, + page); + + if (wait) + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + return 0; +} + +/* + * write out a page to a file + */ +static int write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + int ret = -ENOMEM; + + if (bitmap->file == NULL) + return write_sb_page(bitmap->mddev, bitmap->offset, page, wait); + + if (wait) + lock_page(page); + else { + if (TestSetPageLocked(page)) + return -EAGAIN; /* already locked */ + if (PageWriteback(page)) { + unlock_page(page); + return -EAGAIN; + } + } + + ret = page->mapping->a_ops->prepare_write(NULL, page, 0, PAGE_SIZE); + if (!ret) + ret = page->mapping->a_ops->commit_write(NULL, page, 0, + PAGE_SIZE); + if (ret) { + unlock_page(page); + return ret; + } + + set_page_dirty(page); /* force it to be written out */ + + if (!wait) { + /* add to list to be waited for by daemon */ + struct page_list *item = mempool_alloc(bitmap->write_pool, GFP_NOIO); + item->page = page; + page_cache_get(page); + spin_lock(&bitmap->write_lock); + list_add(&item->list, &bitmap->complete_pages); + spin_unlock(&bitmap->write_lock); + md_wakeup_thread(bitmap->writeback_daemon); + } + return write_one_page(page, wait); +} + +/* read a page from a file, pinning it into cache, and return bytes_read */ +static struct page *read_page(struct file *file, unsigned long index, + unsigned long *bytes_read) +{ + struct inode *inode = file->f_mapping->host; + struct page *page = NULL; + loff_t isize = i_size_read(inode); + unsigned long end_index = isize >> PAGE_CACHE_SHIFT; + + PRINTK("read bitmap file (%dB @ %Lu)\n", (int)PAGE_CACHE_SIZE, + (unsigned long long)index << PAGE_CACHE_SHIFT); + + page = read_cache_page(inode->i_mapping, index, + (filler_t *)inode->i_mapping->a_ops->readpage, file); + if (IS_ERR(page)) + goto out; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + goto out; + } + + if (index > end_index) /* we have read beyond EOF */ + *bytes_read = 0; + else if (index == end_index) /* possible short read */ + *bytes_read = isize & ~PAGE_CACHE_MASK; + else + *bytes_read = PAGE_CACHE_SIZE; /* got a full page */ +out: + if (IS_ERR(page)) + printk(KERN_ALERT "md: bitmap read error: (%dB @ %Lu): %ld\n", + (int)PAGE_CACHE_SIZE, + (unsigned long long)index << PAGE_CACHE_SHIFT, + PTR_ERR(page)); + return page; +} + +/* + * bitmap file superblock operations + */ + +/* update the event counter and sync the superblock to disk */ +int bitmap_update_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + unsigned long flags; + + if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ + return 0; + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap->sb_page) { /* no superblock */ + spin_unlock_irqrestore(&bitmap->lock, flags); + return 0; + } + spin_unlock_irqrestore(&bitmap->lock, flags); + sb = (bitmap_super_t *)kmap(bitmap->sb_page); + sb->events = cpu_to_le64(bitmap->mddev->events); + if (!bitmap->mddev->degraded) + sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + kunmap(bitmap->sb_page); + return write_page(bitmap, bitmap->sb_page, 1); +} + +/* print out the bitmap file superblock */ +void bitmap_print_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->sb_page) + return; + sb = (bitmap_super_t *)kmap(bitmap->sb_page); + printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); + printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); + printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); + printk(KERN_DEBUG " uuid: %08x.%08x.%08x.%08x\n", + *(__u32 *)(sb->uuid+0), + *(__u32 *)(sb->uuid+4), + *(__u32 *)(sb->uuid+8), + *(__u32 *)(sb->uuid+12)); + printk(KERN_DEBUG " events: %llu\n", + (unsigned long long) le64_to_cpu(sb->events)); + printk(KERN_DEBUG "events cleared: %llu\n", + (unsigned long long) le64_to_cpu(sb->events_cleared)); + printk(KERN_DEBUG " state: %08x\n", le32_to_cpu(sb->state)); + printk(KERN_DEBUG " chunksize: %d B\n", le32_to_cpu(sb->chunksize)); + printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + printk(KERN_DEBUG " sync size: %llu KB\n", + (unsigned long long)le64_to_cpu(sb->sync_size)/2); + kunmap(bitmap->sb_page); +} + +/* read the superblock from the bitmap file and initialize some bitmap fields */ +static int bitmap_read_sb(struct bitmap *bitmap) +{ + char *reason = NULL; + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep; + unsigned long bytes_read; + unsigned long long events; + int err = -EINVAL; + + /* page 0 is the superblock, read it... */ + if (bitmap->file) + bitmap->sb_page = read_page(bitmap->file, 0, &bytes_read); + else { + bitmap->sb_page = read_sb_page(bitmap->mddev, bitmap->offset, 0); + bytes_read = PAGE_SIZE; + } + if (IS_ERR(bitmap->sb_page)) { + err = PTR_ERR(bitmap->sb_page); + bitmap->sb_page = NULL; + return err; + } + + sb = (bitmap_super_t *)kmap(bitmap->sb_page); + + if (bytes_read < sizeof(*sb)) { /* short read */ + printk(KERN_INFO "%s: bitmap file superblock truncated\n", + bmname(bitmap)); + err = -ENOSPC; + goto out; + } + + chunksize = le32_to_cpu(sb->chunksize); + daemon_sleep = le32_to_cpu(sb->daemon_sleep); + + /* verify that the bitmap-specific fields are valid */ + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) + reason = "bad magic"; + else if (sb->version != cpu_to_le32(BITMAP_MAJOR)) + reason = "unrecognized superblock version"; + else if (chunksize < 512 || chunksize > (1024 * 1024 * 4)) + reason = "bitmap chunksize out of range (512B - 4MB)"; + else if ((1 << ffz(~chunksize)) != chunksize) + reason = "bitmap chunksize not a power of 2"; + else if (daemon_sleep < 1 || daemon_sleep > 15) + reason = "daemon sleep period out of range"; + if (reason) { + printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", + bmname(bitmap), reason); + goto out; + } + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + if (!bitmap->mddev->persistent) + goto success; + + /* + * if we have a persistent array superblock, compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (events < bitmap->mddev->events) { + printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " + "-- forcing full recovery\n", bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + sb->state |= BITMAP_STALE; + } +success: + /* assign fields using values from superblock */ + bitmap->chunksize = chunksize; + bitmap->daemon_sleep = daemon_sleep; + bitmap->flags |= sb->state; + bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + err = 0; +out: + kunmap(bitmap->sb_page); + if (err) + bitmap_print_sb(bitmap); + return err; +} + +enum bitmap_mask_op { + MASK_SET, + MASK_UNSET +}; + +/* record the state of the bitmap in the superblock */ +static void bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, + enum bitmap_mask_op op) +{ + bitmap_super_t *sb; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap || !bitmap->sb_page) { /* can't set the state */ + spin_unlock_irqrestore(&bitmap->lock, flags); + return; + } + page_cache_get(bitmap->sb_page); + spin_unlock_irqrestore(&bitmap->lock, flags); + sb = (bitmap_super_t *)kmap(bitmap->sb_page); + switch (op) { + case MASK_SET: sb->state |= bits; + break; + case MASK_UNSET: sb->state &= ~bits; + break; + default: BUG(); + } + kunmap(bitmap->sb_page); + page_cache_release(bitmap->sb_page); +} + +/* + * general bitmap file operations + */ + +/* calculate the index of the page that contains this bit */ +static inline unsigned long file_page_index(unsigned long chunk) +{ + return CHUNK_BIT_OFFSET(chunk) >> PAGE_BIT_SHIFT; +} + +/* calculate the (bit) offset of this bit within a page */ +static inline unsigned long file_page_offset(unsigned long chunk) +{ + return CHUNK_BIT_OFFSET(chunk) & (PAGE_BITS - 1); +} + +/* + * return a pointer to the page in the filemap that contains the given bit + * + * this lookup is complicated by the fact that the bitmap sb might be exactly + * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page + * 0 or page 1 + */ +static inline struct page *filemap_get_page(struct bitmap *bitmap, + unsigned long chunk) +{ + return bitmap->filemap[file_page_index(chunk) - file_page_index(0)]; +} + + +static void bitmap_file_unmap(struct bitmap *bitmap) +{ + struct page **map, *sb_page; + unsigned long *attr; + int pages; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + map = bitmap->filemap; + bitmap->filemap = NULL; + attr = bitmap->filemap_attr; + bitmap->filemap_attr = NULL; + pages = bitmap->file_pages; + bitmap->file_pages = 0; + sb_page = bitmap->sb_page; + bitmap->sb_page = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + + while (pages--) + if (map[pages]->index != 0) /* 0 is sb_page, release it below */ + page_cache_release(map[pages]); + kfree(map); + kfree(attr); + + if (sb_page) + page_cache_release(sb_page); +} + +static void bitmap_stop_daemons(struct bitmap *bitmap); + +/* dequeue the next item in a page list -- don't call from irq context */ +static struct page_list *dequeue_page(struct bitmap *bitmap) +{ + struct page_list *item = NULL; + struct list_head *head = &bitmap->complete_pages; + + spin_lock(&bitmap->write_lock); + if (list_empty(head)) + goto out; + item = list_entry(head->prev, struct page_list, list); + list_del(head->prev); +out: + spin_unlock(&bitmap->write_lock); + return item; +} + +static void drain_write_queues(struct bitmap *bitmap) +{ + struct page_list *item; + + while ((item = dequeue_page(bitmap))) { + /* don't bother to wait */ + page_cache_release(item->page); + mempool_free(item, bitmap->write_pool); + } + + wake_up(&bitmap->write_wait); +} + +static void bitmap_file_put(struct bitmap *bitmap) +{ + struct file *file; + struct inode *inode; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + file = bitmap->file; + bitmap->file = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + + bitmap_stop_daemons(bitmap); + + drain_write_queues(bitmap); + + bitmap_file_unmap(bitmap); + + if (file) { + inode = file->f_mapping->host; + spin_lock(&inode->i_lock); + atomic_set(&inode->i_writecount, 1); /* allow writes again */ + spin_unlock(&inode->i_lock); + fput(file); + } +} + + +/* + * bitmap_file_kick - if an error occurs while manipulating the bitmap file + * then it is no longer reliable, so we stop using it and we mark the file + * as failed in the superblock + */ +static void bitmap_file_kick(struct bitmap *bitmap) +{ + char *path, *ptr = NULL; + + bitmap_mask_state(bitmap, BITMAP_STALE, MASK_SET); + bitmap_update_sb(bitmap); + + if (bitmap->file) { + path = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (path) + ptr = file_path(bitmap->file, path, PAGE_SIZE); + + printk(KERN_ALERT "%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), ptr ? ptr : ""); + + kfree(path); + } + + bitmap_file_put(bitmap); + + return; +} + +enum bitmap_page_attr { + BITMAP_PAGE_DIRTY = 1, // there are set bits that need to be synced + BITMAP_PAGE_CLEAN = 2, // there are bits that might need to be cleared + BITMAP_PAGE_NEEDWRITE=4, // there are cleared bits that need to be synced +}; + +static inline void set_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) +{ + bitmap->filemap_attr[page->index] |= attr; +} + +static inline void clear_page_attr(struct bitmap *bitmap, struct page *page, + enum bitmap_page_attr attr) +{ + bitmap->filemap_attr[page->index] &= ~attr; +} + +static inline unsigned long get_page_attr(struct bitmap *bitmap, struct page *page) +{ + return bitmap->filemap_attr[page->index]; +} + +/* + * bitmap_file_set_bit -- called before performing a write to the md device + * to set (and eventually sync) a particular bit in the bitmap file + * + * we set the bit immediately, then we record the page number so that + * when an unplug occurs, we can flush the dirty pages out to disk + */ +static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *kaddr; + unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); + + if (!bitmap->filemap) { + return; + } + + page = filemap_get_page(bitmap, chunk); + bit = file_page_offset(chunk); + + + /* make sure the page stays cached until it gets written out */ + if (! (get_page_attr(bitmap, page) & BITMAP_PAGE_DIRTY)) + page_cache_get(page); + + /* set the bit */ + kaddr = kmap_atomic(page, KM_USER0); + set_bit(bit, kaddr); + kunmap_atomic(kaddr, KM_USER0); + PRINTK("set file bit %lu page %lu\n", bit, page->index); + + /* record page number so it gets flushed to disk when unplug occurs */ + set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + +} + +/* this gets called when the md device is ready to unplug its underlying + * (slave) device queues -- before we let any writes go down, we need to + * sync the dirty pages of the bitmap file to disk */ +int bitmap_unplug(struct bitmap *bitmap) +{ + unsigned long i, attr, flags; + struct page *page; + int wait = 0; + int err; + + if (!bitmap) + return 0; + + /* look at each page to see if there are any set bits that need to be + * flushed out to disk */ + for (i = 0; i < bitmap->file_pages; i++) { + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap->filemap) { + spin_unlock_irqrestore(&bitmap->lock, flags); + return 0; + } + page = bitmap->filemap[i]; + attr = get_page_attr(bitmap, page); + clear_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); + clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + if ((attr & BITMAP_PAGE_DIRTY)) + wait = 1; + spin_unlock_irqrestore(&bitmap->lock, flags); + + if (attr & (BITMAP_PAGE_DIRTY | BITMAP_PAGE_NEEDWRITE)) { + err = write_page(bitmap, page, 0); + if (err == -EAGAIN) { + if (attr & BITMAP_PAGE_DIRTY) + err = write_page(bitmap, page, 1); + else + err = 0; + } + if (err) + return 1; + } + } + if (wait) { /* if any writes were performed, we need to wait on them */ + if (bitmap->file) { + spin_lock_irq(&bitmap->write_lock); + wait_event_lock_irq(bitmap->write_wait, + list_empty(&bitmap->complete_pages), bitmap->write_lock, + wake_up_process(bitmap->writeback_daemon->tsk)); + spin_unlock_irq(&bitmap->write_lock); + } else + wait_event(bitmap->mddev->sb_wait, + atomic_read(&bitmap->mddev->pending_writes)==0); + } + return 0; +} + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int in_sync); +/* * bitmap_init_from_disk -- called at bitmap_create time to initialize + * the in-memory bitmap from the on-disk bitmap -- also, sets up the + * memory mapping of the bitmap file + * Special cases: + * if there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as + * 1's in order to cause a full resync. + */ +static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync) +{ + unsigned long i, chunks, index, oldindex, bit; + struct page *page = NULL, *oldpage = NULL; + unsigned long num_pages, bit_cnt = 0; + struct file *file; + unsigned long bytes, offset, dummy; + int outofdate; + int ret = -ENOSPC; + + chunks = bitmap->chunks; + file = bitmap->file; + + BUG_ON(!file && !bitmap->offset); + +#if INJECT_FAULTS_3 + outofdate = 1; +#else + outofdate = bitmap->flags & BITMAP_STALE; +#endif + if (outofdate) + printk(KERN_INFO "%s: bitmap file is out of date, doing full " + "recovery\n", bmname(bitmap)); + + bytes = (chunks + 7) / 8; + + num_pages = (bytes + sizeof(bitmap_super_t) + PAGE_SIZE - 1) / PAGE_SIZE; + + if (file && i_size_read(file->f_mapping->host) < bytes + sizeof(bitmap_super_t)) { + printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n", + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + bytes + sizeof(bitmap_super_t)); + goto out; + } + + ret = -ENOMEM; + + bitmap->filemap = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!bitmap->filemap) + goto out; + + bitmap->filemap_attr = kmalloc(sizeof(long) * num_pages, GFP_KERNEL); + if (!bitmap->filemap_attr) + goto out; + + memset(bitmap->filemap_attr, 0, sizeof(long) * num_pages); + + oldindex = ~0L; + + for (i = 0; i < chunks; i++) { + index = file_page_index(i); + bit = file_page_offset(i); + if (index != oldindex) { /* this is a new page, read it in */ + /* unmap the old page, we're done with it */ + if (oldpage != NULL) + kunmap(oldpage); + if (index == 0) { + /* + * if we're here then the superblock page + * contains some bits (PAGE_SIZE != sizeof sb) + * we've already read it in, so just use it + */ + page = bitmap->sb_page; + offset = sizeof(bitmap_super_t); + } else if (file) { + page = read_page(file, index, &dummy); + offset = 0; + } else { + page = read_sb_page(bitmap->mddev, bitmap->offset, index); + offset = 0; + } + if (IS_ERR(page)) { /* read error */ + ret = PTR_ERR(page); + goto out; + } + + oldindex = index; + oldpage = page; + kmap(page); + + if (outofdate) { + /* + * if bitmap is out of date, dirty the + * whole page and write it out + */ + memset(page_address(page) + offset, 0xff, + PAGE_SIZE - offset); + ret = write_page(bitmap, page, 1); + if (ret) { + kunmap(page); + /* release, page not in filemap yet */ + page_cache_release(page); + goto out; + } + } + + bitmap->filemap[bitmap->file_pages++] = page; + } + if (test_bit(bit, page_address(page))) { + /* if the disk bit is set, set the memory bit */ + bitmap_set_memory_bits(bitmap, + i << CHUNK_BLOCK_SHIFT(bitmap), 1, in_sync); + bit_cnt++; + } + } + + /* everything went OK */ + ret = 0; + bitmap_mask_state(bitmap, BITMAP_STALE, MASK_UNSET); + + if (page) /* unmap the last page */ + kunmap(page); + + if (bit_cnt) { /* Kick recovery if any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); + md_wakeup_thread(bitmap->mddev->thread); + } + +out: + printk(KERN_INFO "%s: bitmap initialized from disk: " + "read %lu/%lu pages, set %lu bits, status: %d\n", + bmname(bitmap), bitmap->file_pages, num_pages, bit_cnt, ret); + + return ret; +} + +void bitmap_write_all(struct bitmap *bitmap) +{ + /* We don't actually write all bitmap blocks here, + * just flag them as needing to be written + */ + + unsigned long chunks = bitmap->chunks; + unsigned long bytes = (chunks+7)/8 + sizeof(bitmap_super_t); + unsigned long num_pages = (bytes + PAGE_SIZE-1) / PAGE_SIZE; + while (num_pages--) + bitmap->filemap_attr[num_pages] |= BITMAP_PAGE_NEEDWRITE; +} + + +static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) +{ + sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + bitmap->bp[page].count += inc; +/* + if (page == 0) printk("count page 0, offset %llu: %d gives %d\n", + (unsigned long long)offset, inc, bitmap->bp[page].count); +*/ + bitmap_checkfree(bitmap, page); +} +static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, + sector_t offset, int *blocks, + int create); + +/* + * bitmap daemon -- periodically wakes up to clean bits and flush pages + * out to disk + */ + +int bitmap_daemon_work(struct bitmap *bitmap) +{ + unsigned long j; + unsigned long flags; + struct page *page = NULL, *lastpage = NULL; + int err = 0; + int blocks; + int attr; + + if (bitmap == NULL) + return 0; + if (time_before(jiffies, bitmap->daemon_lastrun + bitmap->daemon_sleep*HZ)) + return 0; + bitmap->daemon_lastrun = jiffies; + + for (j = 0; j < bitmap->chunks; j++) { + bitmap_counter_t *bmc; + spin_lock_irqsave(&bitmap->lock, flags); + if (!bitmap->filemap) { + /* error or shutdown */ + spin_unlock_irqrestore(&bitmap->lock, flags); + break; + } + + page = filemap_get_page(bitmap, j); + + if (page != lastpage) { + /* skip this page unless it's marked as needing cleaning */ + if (!((attr=get_page_attr(bitmap, page)) & BITMAP_PAGE_CLEAN)) { + if (attr & BITMAP_PAGE_NEEDWRITE) { + page_cache_get(page); + clear_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + } + spin_unlock_irqrestore(&bitmap->lock, flags); + if (attr & BITMAP_PAGE_NEEDWRITE) { + switch (write_page(bitmap, page, 0)) { + case -EAGAIN: + set_page_attr(bitmap, page, BITMAP_PAGE_NEEDWRITE); + break; + case 0: + break; + default: + bitmap_file_kick(bitmap); + } + page_cache_release(page); + } + continue; + } + + /* grab the new page, sync and release the old */ + page_cache_get(page); + if (lastpage != NULL) { + if (get_page_attr(bitmap, lastpage) & BITMAP_PAGE_NEEDWRITE) { + clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + err = write_page(bitmap, lastpage, 0); + if (err == -EAGAIN) { + err = 0; + set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + } + } else { + set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + } + kunmap(lastpage); + page_cache_release(lastpage); + if (err) + bitmap_file_kick(bitmap); + } else + spin_unlock_irqrestore(&bitmap->lock, flags); + lastpage = page; + kmap(page); +/* + printk("bitmap clean at page %lu\n", j); +*/ + spin_lock_irqsave(&bitmap->lock, flags); + clear_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + } + bmc = bitmap_get_counter(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), + &blocks, 0); + if (bmc) { +/* + if (j < 100) printk("bitmap: j=%lu, *bmc = 0x%x\n", j, *bmc); +*/ + if (*bmc == 2) { + *bmc=1; /* maybe clear the bit next time */ + set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + } else if (*bmc == 1) { + /* we can clear the bit */ + *bmc = 0; + bitmap_count_page(bitmap, j << CHUNK_BLOCK_SHIFT(bitmap), + -1); + + /* clear the bit */ + clear_bit(file_page_offset(j), page_address(page)); + } + } + spin_unlock_irqrestore(&bitmap->lock, flags); + } + + /* now sync the final page */ + if (lastpage != NULL) { + kunmap(lastpage); + spin_lock_irqsave(&bitmap->lock, flags); + if (get_page_attr(bitmap, lastpage) &BITMAP_PAGE_NEEDWRITE) { + clear_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + err = write_page(bitmap, lastpage, 0); + if (err == -EAGAIN) { + set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + err = 0; + } + } else { + set_page_attr(bitmap, lastpage, BITMAP_PAGE_NEEDWRITE); + spin_unlock_irqrestore(&bitmap->lock, flags); + } + + page_cache_release(lastpage); + } + + return err; +} + +static void daemon_exit(struct bitmap *bitmap, mdk_thread_t **daemon) +{ + mdk_thread_t *dmn; + unsigned long flags; + + /* if no one is waiting on us, we'll free the md thread struct + * and exit, otherwise we let the waiter clean things up */ + spin_lock_irqsave(&bitmap->lock, flags); + if ((dmn = *daemon)) { /* no one is waiting, cleanup and exit */ + *daemon = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + kfree(dmn); + complete_and_exit(NULL, 0); /* do_exit not exported */ + } + spin_unlock_irqrestore(&bitmap->lock, flags); +} + +static void bitmap_writeback_daemon(mddev_t *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + struct page *page; + struct page_list *item; + int err = 0; + + if (signal_pending(current)) { + printk(KERN_INFO + "%s: bitmap writeback daemon got signal, exiting...\n", + bmname(bitmap)); + err = -EINTR; + goto out; + } + + PRINTK("%s: bitmap writeback daemon woke up...\n", bmname(bitmap)); + /* wait on bitmap page writebacks */ + while ((item = dequeue_page(bitmap))) { + page = item->page; + mempool_free(item, bitmap->write_pool); + PRINTK("wait on page writeback: %p\n", page); + wait_on_page_writeback(page); + PRINTK("finished page writeback: %p\n", page); + + err = PageError(page); + page_cache_release(page); + if (err) { + printk(KERN_WARNING "%s: bitmap file writeback " + "failed (page %lu): %d\n", + bmname(bitmap), page->index, err); + bitmap_file_kick(bitmap); + goto out; + } + } + out: + wake_up(&bitmap->write_wait); + if (err) { + printk(KERN_INFO "%s: bitmap writeback daemon exiting (%d)\n", + bmname(bitmap), err); + daemon_exit(bitmap, &bitmap->writeback_daemon); + } +} + +static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr, + void (*func)(mddev_t *), char *name) +{ + mdk_thread_t *daemon; + unsigned long flags; + char namebuf[32]; + + spin_lock_irqsave(&bitmap->lock, flags); + *ptr = NULL; + + if (!bitmap->file) /* no need for daemon if there's no backing file */ + goto out_unlock; + + spin_unlock_irqrestore(&bitmap->lock, flags); + +#if INJECT_FATAL_FAULT_2 + daemon = NULL; +#else + sprintf(namebuf, "%%s_%s", name); + daemon = md_register_thread(func, bitmap->mddev, namebuf); +#endif + if (!daemon) { + printk(KERN_ERR "%s: failed to start bitmap daemon\n", + bmname(bitmap)); + return -ECHILD; + } + + spin_lock_irqsave(&bitmap->lock, flags); + *ptr = daemon; + + md_wakeup_thread(daemon); /* start it running */ + + PRINTK("%s: %s daemon (pid %d) started...\n", + bmname(bitmap), name, daemon->tsk->pid); +out_unlock: + spin_unlock_irqrestore(&bitmap->lock, flags); + return 0; +} + +static int bitmap_start_daemons(struct bitmap *bitmap) +{ + int err = bitmap_start_daemon(bitmap, &bitmap->writeback_daemon, + bitmap_writeback_daemon, "bitmap_wb"); + return err; +} + +static void bitmap_stop_daemon(struct bitmap *bitmap, mdk_thread_t **ptr) +{ + mdk_thread_t *daemon; + unsigned long flags; + + spin_lock_irqsave(&bitmap->lock, flags); + daemon = *ptr; + *ptr = NULL; + spin_unlock_irqrestore(&bitmap->lock, flags); + if (daemon) + md_unregister_thread(daemon); /* destroy the thread */ +} + +static void bitmap_stop_daemons(struct bitmap *bitmap) +{ + /* the daemons can't stop themselves... they'll just exit instead... */ + if (bitmap->writeback_daemon && + current->pid != bitmap->writeback_daemon->tsk->pid) + bitmap_stop_daemon(bitmap, &bitmap->writeback_daemon); +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, + sector_t offset, int *blocks, + int create) +{ + /* If 'create', we might release the lock and reclaim it. + * The lock must have been taken with interrupts enabled. + * If !create, we don't release the lock. + */ + sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; + sector_t csize; + + if (bitmap_checkpage(bitmap, page, create) < 0) { + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); + *blocks = csize - (offset & (csize- 1)); + return NULL; + } + /* now locked ... */ + + if (bitmap->bp[page].hijacked) { /* hijacked pointer */ + /* should we use the first or second counter field + * of the hijacked pointer? */ + int hi = (pageoff > PAGE_COUNTER_MASK); + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + + PAGE_COUNTER_SHIFT - 1); + *blocks = csize - (offset & (csize- 1)); + return &((bitmap_counter_t *) + &bitmap->bp[page].map)[hi]; + } else { /* page is allocated */ + csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); + *blocks = csize - (offset & (csize- 1)); + return (bitmap_counter_t *) + &(bitmap->bp[page].map[pageoff]); + } +} + +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) +{ + if (!bitmap) return 0; + while (sectors) { + int blocks; + bitmap_counter_t *bmc; + + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, &blocks, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->lock); + return 0; + } + + switch(*bmc) { + case 0: + bitmap_file_set_bit(bitmap, offset); + bitmap_count_page(bitmap,offset, 1); + blk_plug_device(bitmap->mddev->queue); + /* fall through */ + case 1: + *bmc = 2; + } + if ((*bmc & COUNTER_MAX) == COUNTER_MAX) BUG(); + (*bmc)++; + + spin_unlock_irq(&bitmap->lock); + + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else sectors = 0; + } + return 0; +} + +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success) +{ + if (!bitmap) return; + while (sectors) { + int blocks; + unsigned long flags; + bitmap_counter_t *bmc; + + spin_lock_irqsave(&bitmap->lock, flags); + bmc = bitmap_get_counter(bitmap, offset, &blocks, 0); + if (!bmc) { + spin_unlock_irqrestore(&bitmap->lock, flags); + return; + } + + if (!success && ! (*bmc & NEEDED_MASK)) + *bmc |= NEEDED_MASK; + + (*bmc)--; + if (*bmc <= 2) { + set_page_attr(bitmap, + filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + BITMAP_PAGE_CLEAN); + } + spin_unlock_irqrestore(&bitmap->lock, flags); + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else sectors = 0; + } +} + +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks) +{ + bitmap_counter_t *bmc; + int rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ + *blocks = 1024; + return 1; /* always resync if no bitmap */ + } + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + rv = 0; + if (bmc) { + /* locked */ + if (RESYNC(*bmc)) + rv = 1; + else if (NEEDED(*bmc)) { + rv = 1; + *bmc |= RESYNC_MASK; + *bmc &= ~NEEDED_MASK; + } + } + spin_unlock_irq(&bitmap->lock); + return rv; +} + +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted) +{ + bitmap_counter_t *bmc; + unsigned long flags; +/* + if (offset == 0) printk("bitmap_end_sync 0 (%d)\n", aborted); +*/ if (bitmap == NULL) { + *blocks = 1024; + return; + } + spin_lock_irqsave(&bitmap->lock, flags); + bmc = bitmap_get_counter(bitmap, offset, blocks, 0); + if (bmc == NULL) + goto unlock; + /* locked */ +/* + if (offset == 0) printk("bitmap_end sync found 0x%x, blocks %d\n", *bmc, *blocks); +*/ + if (RESYNC(*bmc)) { + *bmc &= ~RESYNC_MASK; + + if (!NEEDED(*bmc) && aborted) + *bmc |= NEEDED_MASK; + else { + if (*bmc <= 2) { + set_page_attr(bitmap, + filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), + BITMAP_PAGE_CLEAN); + } + } + } + unlock: + spin_unlock_irqrestore(&bitmap->lock, flags); +} + +void bitmap_close_sync(struct bitmap *bitmap) +{ + /* Sync has finished, and any bitmap chunks that weren't synced + * properly have been aborted. It remains to us to clear the + * RESYNC bit wherever it is still on + */ + sector_t sector = 0; + int blocks; + if (!bitmap) return; + while (sector < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, sector, &blocks, 0); +/* + if (sector < 500) printk("bitmap_close_sync: sec %llu blks %d\n", + (unsigned long long)sector, blocks); +*/ sector += blocks; + } +} + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int in_sync) +{ + /* For each chunk covered by any of these sectors, set the + * counter to 1 and set resync_needed unless in_sync. They should all + * be 0 at this point + */ + while (sectors) { + int secs; + bitmap_counter_t *bmc; + spin_lock_irq(&bitmap->lock); + bmc = bitmap_get_counter(bitmap, offset, &secs, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->lock); + return; + } + if (! *bmc) { + struct page *page; + *bmc = 1 | (in_sync? 0 : NEEDED_MASK); + bitmap_count_page(bitmap, offset, 1); + page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); + set_page_attr(bitmap, page, BITMAP_PAGE_CLEAN); + } + spin_unlock_irq(&bitmap->lock); + if (sectors > secs) + sectors -= secs; + else + sectors = 0; + } +} + +/* + * free memory that was allocated + */ +void bitmap_destroy(mddev_t *mddev) +{ + unsigned long k, pages; + struct bitmap_page *bp; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) /* there was no bitmap */ + return; + + mddev->bitmap = NULL; /* disconnect from the md device */ + + /* release the bitmap file and kill the daemon */ + bitmap_file_put(bitmap); + + bp = bitmap->bp; + pages = bitmap->pages; + + /* free all allocated memory */ + + mempool_destroy(bitmap->write_pool); + + if (bp) /* deallocate the page memory */ + for (k = 0; k < pages; k++) + if (bp[k].map && !bp[k].hijacked) + kfree(bp[k].map); + kfree(bp); + kfree(bitmap); +} + +/* + * initialize the bitmap structure + * if this returns an error, bitmap_destroy must be called to do clean up + */ +int bitmap_create(mddev_t *mddev) +{ + struct bitmap *bitmap; + unsigned long blocks = mddev->resync_max_sectors; + unsigned long chunks; + unsigned long pages; + struct file *file = mddev->bitmap_file; + int err; + + BUG_ON(sizeof(bitmap_super_t) != 256); + + if (!file && !mddev->bitmap_offset) /* bitmap disabled, nothing to do */ + return 0; + + BUG_ON(file && mddev->bitmap_offset); + + bitmap = kmalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0, sizeof(*bitmap)); + + spin_lock_init(&bitmap->lock); + bitmap->mddev = mddev; + mddev->bitmap = bitmap; + + spin_lock_init(&bitmap->write_lock); + INIT_LIST_HEAD(&bitmap->complete_pages); + init_waitqueue_head(&bitmap->write_wait); + bitmap->write_pool = mempool_create(WRITE_POOL_SIZE, write_pool_alloc, + write_pool_free, NULL); + if (!bitmap->write_pool) + return -ENOMEM; + + bitmap->file = file; + bitmap->offset = mddev->bitmap_offset; + if (file) get_file(file); + /* read superblock from bitmap file (this sets bitmap->chunksize) */ + err = bitmap_read_sb(bitmap); + if (err) + return err; + + bitmap->chunkshift = find_first_bit(&bitmap->chunksize, + sizeof(bitmap->chunksize)); + + /* now that chunksize and chunkshift are set, we can use these macros */ + chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) / + CHUNK_BLOCK_RATIO(bitmap); + pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; + + BUG_ON(!pages); + + bitmap->chunks = chunks; + bitmap->pages = pages; + bitmap->missing_pages = pages; + bitmap->counter_bits = COUNTER_BITS; + + bitmap->syncchunk = ~0UL; + +#if INJECT_FATAL_FAULT_1 + bitmap->bp = NULL; +#else + bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL); +#endif + if (!bitmap->bp) + return -ENOMEM; + memset(bitmap->bp, 0, pages * sizeof(*bitmap->bp)); + + bitmap->flags |= BITMAP_ACTIVE; + + /* now that we have some pages available, initialize the in-memory + * bitmap from the on-disk bitmap */ + err = bitmap_init_from_disk(bitmap, mddev->recovery_cp == MaxSector); + if (err) + return err; + + printk(KERN_INFO "created bitmap (%lu pages) for device %s\n", + pages, bmname(bitmap)); + + /* kick off the bitmap daemons */ + err = bitmap_start_daemons(bitmap); + if (err) + return err; + return bitmap_update_sb(bitmap); +} + +/* the bitmap API -- for raid personalities */ +EXPORT_SYMBOL(bitmap_startwrite); +EXPORT_SYMBOL(bitmap_endwrite); +EXPORT_SYMBOL(bitmap_start_sync); +EXPORT_SYMBOL(bitmap_end_sync); +EXPORT_SYMBOL(bitmap_unplug); +EXPORT_SYMBOL(bitmap_close_sync); +EXPORT_SYMBOL(bitmap_daemon_work); diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0dd6c2b5391b..d0a4bab220e5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -704,8 +704,7 @@ static void crypt_dtr(struct dm_target *ti) mempool_destroy(cc->page_pool); mempool_destroy(cc->io_pool); - if (cc->iv_mode) - kfree(cc->iv_mode); + kfree(cc->iv_mode); if (cc->iv_gen_ops && cc->iv_gen_ops->dtr) cc->iv_gen_ops->dtr(cc); crypto_free_tfm(cc->tfm); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b1941b887f46..8d740013d74d 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -217,8 +217,7 @@ static int linear_run (mddev_t *mddev) return 0; out: - if (conf) - kfree(conf); + kfree(conf); return 1; } diff --git a/drivers/md/md.c b/drivers/md/md.c index d899204d3743..0c6b5b6baff6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -19,6 +19,9 @@ Neil Brown <neilb@cse.unsw.edu.au>. + - persistent bitmap code + Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) @@ -33,6 +36,7 @@ #include <linux/config.h> #include <linux/linkage.h> #include <linux/raid/md.h> +#include <linux/raid/bitmap.h> #include <linux/sysctl.h> #include <linux/devfs_fs_kernel.h> #include <linux/buffer_head.h> /* for invalidate_bdev */ @@ -40,6 +44,8 @@ #include <linux/init.h> +#include <linux/file.h> + #ifdef CONFIG_KMOD #include <linux/kmod.h> #endif @@ -189,8 +195,7 @@ static mddev_t * mddev_find(dev_t unit) if (mddev->unit == unit) { mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (new) - kfree(new); + kfree(new); return mddev; } @@ -218,6 +223,8 @@ static mddev_t * mddev_find(dev_t unit) INIT_LIST_HEAD(&new->all_mddevs); init_timer(&new->safemode_timer); atomic_set(&new->active, 1); + spin_lock_init(&new->write_lock); + init_waitqueue_head(&new->sb_wait); new->queue = blk_alloc_queue(GFP_KERNEL); if (!new->queue) { @@ -320,6 +327,40 @@ static void free_disk_sb(mdk_rdev_t * rdev) } +static int super_written(struct bio *bio, unsigned int bytes_done, int error) +{ + mdk_rdev_t *rdev = bio->bi_private; + if (bio->bi_size) + return 1; + + if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) + md_error(rdev->mddev, rdev); + + if (atomic_dec_and_test(&rdev->mddev->pending_writes)) + wake_up(&rdev->mddev->sb_wait); + return 0; +} + +void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, + sector_t sector, int size, struct page *page) +{ + /* write first size bytes of page to sector of rdev + * Increment mddev->pending_writes before returning + * and decrement it on completion, waking up sb_wait + * if zero is reached. + * If an error occurred, call md_error + */ + struct bio *bio = bio_alloc(GFP_NOIO, 1); + + bio->bi_bdev = rdev->bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); + bio->bi_private = rdev; + bio->bi_end_io = super_written; + atomic_inc(&mddev->pending_writes); + submit_bio((1<<BIO_RW)|(1<<BIO_RW_SYNC), bio); +} + static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) { if (bio->bi_size) @@ -329,7 +370,7 @@ static int bi_complete(struct bio *bio, unsigned int bytes_done, int error) return 0; } -static int sync_page_io(struct block_device *bdev, sector_t sector, int size, +int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw) { struct bio *bio = bio_alloc(GFP_NOIO, 1); @@ -416,11 +457,8 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) ret = 1; abort: - if (tmp1) - kfree(tmp1); - if (tmp2) - kfree(tmp2); - + kfree(tmp1); + kfree(tmp2); return ret; } @@ -569,6 +607,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) mdp_disk_t *desc; mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + rdev->raid_disk = -1; + rdev->in_sync = 0; if (mddev->raid_disks == 0) { mddev->major_version = 0; mddev->minor_version = sb->minor_version; @@ -599,16 +639,35 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) memcpy(mddev->uuid+12,&sb->set_uuid3, 4); mddev->max_disks = MD_SB_DISKS; - } else { - __u64 ev1; - ev1 = md_event(sb); + + if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && + mddev->bitmap_file == NULL) { + if (mddev->level != 1) { + /* FIXME use a better test */ + printk(KERN_WARNING "md: bitmaps only support for raid1\n"); + return -EINVAL; + } + mddev->bitmap_offset = (MD_SB_BYTES >> 9); + } + + } else if (mddev->pers == NULL) { + /* Insist on good event counter while assembling */ + __u64 ev1 = md_event(sb); ++ev1; if (ev1 < mddev->events) return -EINVAL; - } + } else if (mddev->bitmap) { + /* if adding to array with a bitmap, then we can accept an + * older device ... but not too old. + */ + __u64 ev1 = md_event(sb); + if (ev1 < mddev->bitmap->events_cleared) + return 0; + } else /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; + if (mddev->level != LEVEL_MULTIPATH) { - rdev->raid_disk = -1; - rdev->in_sync = rdev->faulty = 0; + rdev->faulty = 0; desc = sb->disks + rdev->desc_nr; if (desc->state & (1<<MD_DISK_FAULTY)) @@ -618,7 +677,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) rdev->in_sync = 1; rdev->raid_disk = desc->raid_disk; } - } + } else /* MULTIPATH are always insync */ + rdev->in_sync = 1; return 0; } @@ -683,6 +743,9 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->layout = mddev->layout; sb->chunk_size = mddev->chunk_size; + if (mddev->bitmap && mddev->bitmap_file == NULL) + sb->state |= (1<<MD_SB_BITMAP_PRESENT); + sb->disks[0].state = (1<<MD_DISK_REMOVED); ITERATE_RDEV(mddev,rdev2,tmp) { mdp_disk_t *d; @@ -780,7 +843,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) case 0: sb_offset = rdev->bdev->bd_inode->i_size >> 9; sb_offset -= 8*2; - sb_offset &= ~(4*2-1); + sb_offset &= ~(sector_t)(4*2-1); /* convert from sectors to K */ sb_offset /= 2; break; @@ -860,6 +923,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) { struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); + rdev->raid_disk = -1; + rdev->in_sync = 0; if (mddev->raid_disks == 0) { mddev->major_version = 1; mddev->patch_version = 0; @@ -877,13 +942,30 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) memcpy(mddev->uuid, sb->set_uuid, 16); mddev->max_disks = (4096-256)/2; - } else { - __u64 ev1; - ev1 = le64_to_cpu(sb->events); + + if ((le32_to_cpu(sb->feature_map) & 1) && + mddev->bitmap_file == NULL ) { + if (mddev->level != 1) { + printk(KERN_WARNING "md: bitmaps only supported for raid1\n"); + return -EINVAL; + } + mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); + } + } else if (mddev->pers == NULL) { + /* Insist of good event counter while assembling */ + __u64 ev1 = le64_to_cpu(sb->events); ++ev1; if (ev1 < mddev->events) return -EINVAL; - } + } else if (mddev->bitmap) { + /* If adding to array with a bitmap, then we can accept an + * older device, but not too old. + */ + __u64 ev1 = le64_to_cpu(sb->events); + if (ev1 < mddev->bitmap->events_cleared) + return 0; + } else /* just a hot-add of a new device, leave raid_disk at -1 */ + return 0; if (mddev->level != LEVEL_MULTIPATH) { int role; @@ -891,14 +973,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ - rdev->in_sync = 0; rdev->faulty = 0; - rdev->raid_disk = -1; break; case 0xfffe: /* faulty */ - rdev->in_sync = 0; rdev->faulty = 1; - rdev->raid_disk = -1; break; default: rdev->in_sync = 1; @@ -906,7 +984,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) rdev->raid_disk = role; break; } - } + } else /* MULTIPATH are always insync */ + rdev->in_sync = 1; + return 0; } @@ -933,6 +1013,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) else sb->resync_offset = cpu_to_le64(0); + if (mddev->bitmap && mddev->bitmap_file == NULL) { + sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); + sb->feature_map = cpu_to_le32(1); + } + max_dev = 0; ITERATE_RDEV(mddev,rdev2,tmp) if (rdev2->desc_nr+1 > max_dev) @@ -1196,8 +1281,11 @@ void md_print_devices(void) printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: **********************************\n"); ITERATE_MDDEV(mddev,tmp) { - printk("%s: ", mdname(mddev)); + if (mddev->bitmap) + bitmap_print_sb(mddev->bitmap); + else + printk("%s: ", mdname(mddev)); ITERATE_RDEV(mddev,rdev,tmp2) printk("<%s>", bdevname(rdev->bdev,b)); printk("\n"); @@ -1210,30 +1298,6 @@ void md_print_devices(void) } -static int write_disk_sb(mdk_rdev_t * rdev) -{ - char b[BDEVNAME_SIZE]; - if (!rdev->sb_loaded) { - MD_BUG(); - return 1; - } - if (rdev->faulty) { - MD_BUG(); - return 1; - } - - dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)rdev->sb_offset); - - if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE)) - return 0; - - printk("md: write_disk_sb failed for device %s\n", - bdevname(rdev->bdev,b)); - return 1; -} - static void sync_sbs(mddev_t * mddev) { mdk_rdev_t *rdev; @@ -1248,12 +1312,14 @@ static void sync_sbs(mddev_t * mddev) static void md_update_sb(mddev_t * mddev) { - int err, count = 100; + int err; struct list_head *tmp; mdk_rdev_t *rdev; + int sync_req; - mddev->sb_dirty = 0; repeat: + spin_lock(&mddev->write_lock); + sync_req = mddev->in_sync; mddev->utime = get_seconds(); mddev->events ++; @@ -1266,20 +1332,26 @@ repeat: MD_BUG(); mddev->events --; } + mddev->sb_dirty = 2; sync_sbs(mddev); /* * do not write anything to disk if using * nonpersistent superblocks */ - if (!mddev->persistent) + if (!mddev->persistent) { + mddev->sb_dirty = 0; + spin_unlock(&mddev->write_lock); + wake_up(&mddev->sb_wait); return; + } + spin_unlock(&mddev->write_lock); dprintk(KERN_INFO "md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev),mddev->in_sync); - err = 0; + err = bitmap_update_sb(mddev->bitmap); ITERATE_RDEV(mddev,rdev,tmp) { char b[BDEVNAME_SIZE]; dprintk(KERN_INFO "md: "); @@ -1288,22 +1360,32 @@ repeat: dprintk("%s ", bdevname(rdev->bdev,b)); if (!rdev->faulty) { - err += write_disk_sb(rdev); + md_super_write(mddev,rdev, + rdev->sb_offset<<1, MD_SB_BYTES, + rdev->sb_page); + dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", + bdevname(rdev->bdev,b), + (unsigned long long)rdev->sb_offset); + } else dprintk(")\n"); - if (!err && mddev->level == LEVEL_MULTIPATH) + if (mddev->level == LEVEL_MULTIPATH) /* only need to write one superblock... */ break; } - if (err) { - if (--count) { - printk(KERN_ERR "md: errors occurred during superblock" - " update, repeating\n"); - goto repeat; - } - printk(KERN_ERR \ - "md: excessive errors occurred during superblock update, exiting\n"); + wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); + /* if there was a failure, sb_dirty was set to 1, and we re-write super */ + + spin_lock(&mddev->write_lock); + if (mddev->in_sync != sync_req|| mddev->sb_dirty == 1) { + /* have to write it out again */ + spin_unlock(&mddev->write_lock); + goto repeat; } + mddev->sb_dirty = 0; + spin_unlock(&mddev->write_lock); + wake_up(&mddev->sb_wait); + } /* @@ -1607,12 +1689,19 @@ static int do_md_run(mddev_t * mddev) mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */ - err = mddev->pers->run(mddev); + /* before we start the array running, initialise the bitmap */ + err = bitmap_create(mddev); + if (err) + printk(KERN_ERR "%s: failed to create bitmap (%d)\n", + mdname(mddev), err); + else + err = mddev->pers->run(mddev); if (err) { printk(KERN_ERR "md: pers->run() failed ...\n"); module_put(mddev->pers->owner); mddev->pers = NULL; - return -EINVAL; + bitmap_destroy(mddev); + return err; } atomic_set(&mddev->writes_pending,0); mddev->safemode = 0; @@ -1725,6 +1814,14 @@ static int do_md_stop(mddev_t * mddev, int ro) if (ro) set_disk_ro(disk, 1); } + + bitmap_destroy(mddev); + if (mddev->bitmap_file) { + atomic_set(&mddev->bitmap_file->f_dentry->d_inode->i_writecount, 1); + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + } + /* * Free resources if final stop */ @@ -1983,6 +2080,42 @@ static int get_array_info(mddev_t * mddev, void __user * arg) return 0; } +static int get_bitmap_file(mddev_t * mddev, void * arg) +{ + mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ + char *ptr, *buf = NULL; + int err = -ENOMEM; + + file = kmalloc(sizeof(*file), GFP_KERNEL); + if (!file) + goto out; + + /* bitmap disabled, zero the first byte and copy out */ + if (!mddev->bitmap || !mddev->bitmap->file) { + file->pathname[0] = '\0'; + goto copy_out; + } + + buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); + if (!buf) + goto out; + + ptr = file_path(mddev->bitmap->file, buf, sizeof(file->pathname)); + if (!ptr) + goto out; + + strcpy(file->pathname, ptr); + +copy_out: + err = 0; + if (copy_to_user(arg, file, sizeof(*file))) + err = -EFAULT; +out: + kfree(buf); + kfree(file); + return err; +} + static int get_disk_info(mddev_t * mddev, void __user * arg) { mdu_disk_info_t info; @@ -2078,11 +2211,25 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) PTR_ERR(rdev)); return PTR_ERR(rdev); } + /* set save_raid_disk if appropriate */ + if (!mddev->persistent) { + if (info->state & (1<<MD_DISK_SYNC) && + info->raid_disk < mddev->raid_disks) + rdev->raid_disk = info->raid_disk; + else + rdev->raid_disk = -1; + } else + super_types[mddev->major_version]. + validate_super(mddev, rdev); + rdev->saved_raid_disk = rdev->raid_disk; + rdev->in_sync = 0; /* just to be sure */ rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) export_rdev(rdev); + + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (mddev->thread) md_wakeup_thread(mddev->thread); return err; @@ -2256,6 +2403,49 @@ abort_export: return err; } +/* similar to deny_write_access, but accounts for our holding a reference + * to the file ourselves */ +static int deny_bitmap_write_access(struct file * file) +{ + struct inode *inode = file->f_mapping->host; + + spin_lock(&inode->i_lock); + if (atomic_read(&inode->i_writecount) > 1) { + spin_unlock(&inode->i_lock); + return -ETXTBSY; + } + atomic_set(&inode->i_writecount, -1); + spin_unlock(&inode->i_lock); + + return 0; +} + +static int set_bitmap_file(mddev_t *mddev, int fd) +{ + int err; + + if (mddev->pers) + return -EBUSY; + + mddev->bitmap_file = fget(fd); + + if (mddev->bitmap_file == NULL) { + printk(KERN_ERR "%s: error: failed to get bitmap file\n", + mdname(mddev)); + return -EBADF; + } + + err = deny_bitmap_write_access(mddev->bitmap_file); + if (err) { + printk(KERN_ERR "%s: error: bitmap file is already in use\n", + mdname(mddev)); + fput(mddev->bitmap_file); + mddev->bitmap_file = NULL; + } else + mddev->bitmap_offset = 0; /* file overrides offset */ + return err; +} + /* * set_array_info is used two different ways * The original usage is when creating a new array. @@ -2567,8 +2757,10 @@ static int md_ioctl(struct inode *inode, struct file *file, /* * Commands querying/configuring an existing array: */ - /* if we are initialised yet, only ADD_NEW_DISK or STOP_ARRAY is allowed */ - if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) { + /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, + * RUN_ARRAY, and SET_BITMAP_FILE are allowed */ + if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY + && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE) { err = -ENODEV; goto abort_unlock; } @@ -2582,6 +2774,10 @@ static int md_ioctl(struct inode *inode, struct file *file, err = get_array_info(mddev, argp); goto done_unlock; + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, (void *)arg); + goto done_unlock; + case GET_DISK_INFO: err = get_disk_info(mddev, argp); goto done_unlock; @@ -2662,6 +2858,10 @@ static int md_ioctl(struct inode *inode, struct file *file, err = do_md_run (mddev); goto done_unlock; + case SET_BITMAP_FILE: + err = set_bitmap_file(mddev, (int)arg); + goto done_unlock; + default: if (_IOC_TYPE(cmd) == MD_MAJOR) printk(KERN_WARNING "md: %s(pid %d) used" @@ -2773,8 +2973,9 @@ static int md_thread(void * arg) while (thread->run) { void (*run)(mddev_t *); - wait_event_interruptible(thread->wqueue, - test_bit(THREAD_WAKEUP, &thread->flags)); + wait_event_interruptible_timeout(thread->wqueue, + test_bit(THREAD_WAKEUP, &thread->flags), + thread->timeout); if (current->flags & PF_FREEZE) refrigerator(PF_FREEZE); @@ -2820,6 +3021,7 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, thread->run = run; thread->mddev = mddev; thread->name = name; + thread->timeout = MAX_SCHEDULE_TIMEOUT; ret = kernel_thread(md_thread, thread, 0); if (ret < 0) { kfree(thread); @@ -2858,13 +3060,13 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) if (!rdev || rdev->faulty) return; - +/* dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", mdname(mddev), MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), __builtin_return_address(0),__builtin_return_address(1), __builtin_return_address(2),__builtin_return_address(3)); - +*/ if (!mddev->pers->error_handler) return; mddev->pers->error_handler(mddev,rdev); @@ -3018,6 +3220,7 @@ static int md_seq_show(struct seq_file *seq, void *v) struct list_head *tmp2; mdk_rdev_t *rdev; int i; + struct bitmap *bitmap; if (v == (void*)1) { seq_printf(seq, "Personalities : "); @@ -3070,10 +3273,35 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->pers) { mddev->pers->status (seq, mddev); seq_printf(seq, "\n "); - if (mddev->curr_resync > 2) + if (mddev->curr_resync > 2) { status_resync (seq, mddev); - else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) - seq_printf(seq, " resync=DELAYED"); + seq_printf(seq, "\n "); + } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + seq_printf(seq, " resync=DELAYED\n "); + } else + seq_printf(seq, "\n "); + + if ((bitmap = mddev->bitmap)) { + unsigned long chunk_kb; + unsigned long flags; + spin_lock_irqsave(&bitmap->lock, flags); + chunk_kb = bitmap->chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + bitmap->pages - bitmap->missing_pages, + bitmap->pages, + (bitmap->pages - bitmap->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->file) { + seq_printf(seq, ", file: "); + seq_path(seq, bitmap->file->f_vfsmnt, + bitmap->file->f_dentry," \t\n"); + } + + seq_printf(seq, "\n"); + spin_unlock_irqrestore(&bitmap->lock, flags); } seq_printf(seq, "\n"); @@ -3176,19 +3404,28 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok) } -void md_write_start(mddev_t *mddev) +/* md_write_start(mddev, bi) + * If we need to update some array metadata (e.g. 'active' flag + * in superblock) before writing, schedule a superblock update + * and wait for it to complete. + */ +void md_write_start(mddev_t *mddev, struct bio *bi) { - if (!atomic_read(&mddev->writes_pending)) { - mddev_lock_uninterruptible(mddev); + DEFINE_WAIT(w); + if (bio_data_dir(bi) != WRITE) + return; + + atomic_inc(&mddev->writes_pending); + if (mddev->in_sync) { + spin_lock(&mddev->write_lock); if (mddev->in_sync) { mddev->in_sync = 0; - del_timer(&mddev->safemode_timer); - md_update_sb(mddev); + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); } - atomic_inc(&mddev->writes_pending); - mddev_unlock(mddev); - } else - atomic_inc(&mddev->writes_pending); + spin_unlock(&mddev->write_lock); + } + wait_event(mddev->sb_wait, mddev->sb_dirty==0); } void md_write_end(mddev_t *mddev) @@ -3201,37 +3438,6 @@ void md_write_end(mddev_t *mddev) } } -static inline void md_enter_safemode(mddev_t *mddev) -{ - if (!mddev->safemode) return; - if (mddev->safemode == 2 && - (atomic_read(&mddev->writes_pending) || mddev->in_sync || - mddev->recovery_cp != MaxSector)) - return; /* avoid the lock */ - mddev_lock_uninterruptible(mddev); - if (mddev->safemode && !atomic_read(&mddev->writes_pending) && - !mddev->in_sync && mddev->recovery_cp == MaxSector) { - mddev->in_sync = 1; - md_update_sb(mddev); - } - mddev_unlock(mddev); - - if (mddev->safemode == 1) - mddev->safemode = 0; -} - -void md_handle_safemode(mddev_t *mddev) -{ - if (signal_pending(current)) { - printk(KERN_INFO "md: %s in immediate safe mode\n", - mdname(mddev)); - mddev->safemode = 2; - flush_signals(current); - } - md_enter_safemode(mddev); -} - - static DECLARE_WAIT_QUEUE_HEAD(resync_wait); #define SYNC_MARKS 10 @@ -3241,12 +3447,13 @@ static void md_do_sync(mddev_t *mddev) mddev_t *mddev2; unsigned int currspeed = 0, window; - sector_t max_sectors,j; + sector_t max_sectors,j, io_sectors; unsigned long mark[SYNC_MARKS]; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; struct list_head *tmp; sector_t last_check; + int skipped = 0; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -3312,7 +3519,7 @@ static void md_do_sync(mddev_t *mddev) if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) /* resync follows the size requested by the personality, - * which default to physical size, but can be virtual size + * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; else @@ -3327,13 +3534,15 @@ static void md_do_sync(mddev_t *mddev) sysctl_speed_limit_max); is_mddev_idle(mddev); /* this also initializes IO event counters */ - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) + /* we don't use the checkpoint if there's a bitmap */ + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && !mddev->bitmap) j = mddev->recovery_cp; else j = 0; + io_sectors = 0; for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; - mark_cnt[m] = j; + mark_cnt[m] = io_sectors; } last_mark = 0; mddev->resync_mark = mark[last_mark]; @@ -3358,21 +3567,29 @@ static void md_do_sync(mddev_t *mddev) } while (j < max_sectors) { - int sectors; + sector_t sectors; - sectors = mddev->pers->sync_request(mddev, j, currspeed < sysctl_speed_limit_min); - if (sectors < 0) { + skipped = 0; + sectors = mddev->pers->sync_request(mddev, j, &skipped, + currspeed < sysctl_speed_limit_min); + if (sectors == 0) { set_bit(MD_RECOVERY_ERR, &mddev->recovery); goto out; } - atomic_add(sectors, &mddev->recovery_active); + + if (!skipped) { /* actual IO requested */ + io_sectors += sectors; + atomic_add(sectors, &mddev->recovery_active); + } + j += sectors; if (j>1) mddev->curr_resync = j; - if (last_check + window > j || j == max_sectors) + + if (last_check + window > io_sectors || j == max_sectors) continue; - last_check = j; + last_check = io_sectors; if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) || test_bit(MD_RECOVERY_ERR, &mddev->recovery)) @@ -3386,7 +3603,7 @@ static void md_do_sync(mddev_t *mddev) mddev->resync_mark = mark[next]; mddev->resync_mark_cnt = mark_cnt[next]; mark[next] = jiffies; - mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); last_mark = next; } @@ -3413,7 +3630,8 @@ static void md_do_sync(mddev_t *mddev) mddev->queue->unplug_fn(mddev->queue); cond_resched(); - currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 + /((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > sysctl_speed_limit_min) { if ((currspeed > sysctl_speed_limit_max) || @@ -3433,7 +3651,7 @@ static void md_do_sync(mddev_t *mddev) wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); /* tell personality that we are finished */ - mddev->pers->sync_request(mddev, max_sectors, 1); + mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) && mddev->curr_resync > 2 && @@ -3447,7 +3665,6 @@ static void md_do_sync(mddev_t *mddev) mddev->recovery_cp = MaxSector; } - md_enter_safemode(mddev); skip: mddev->curr_resync = 0; wake_up(&resync_wait); @@ -3484,20 +3701,48 @@ void md_check_recovery(mddev_t *mddev) struct list_head *rtmp; - dprintk(KERN_INFO "md: recovery thread got woken up ...\n"); + if (mddev->bitmap) + bitmap_daemon_work(mddev->bitmap); if (mddev->ro) return; + + if (signal_pending(current)) { + if (mddev->pers->sync_request) { + printk(KERN_INFO "md: %s in immediate safe mode\n", + mdname(mddev)); + mddev->safemode = 2; + } + flush_signals(current); + } + if ( ! ( mddev->sb_dirty || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || - test_bit(MD_RECOVERY_DONE, &mddev->recovery) + test_bit(MD_RECOVERY_DONE, &mddev->recovery) || + (mddev->safemode == 1) || + (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) + && !mddev->in_sync && mddev->recovery_cp == MaxSector) )) return; + if (mddev_trylock(mddev)==0) { int spares =0; + + spin_lock(&mddev->write_lock); + if (mddev->safemode && !atomic_read(&mddev->writes_pending) && + !mddev->in_sync && mddev->recovery_cp == MaxSector) { + mddev->in_sync = 1; + mddev->sb_dirty = 1; + } + if (mddev->safemode == 1) + mddev->safemode = 0; + spin_unlock(&mddev->write_lock); + if (mddev->sb_dirty) md_update_sb(mddev); + + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { /* resync/recovery still happening */ @@ -3515,6 +3760,14 @@ void md_check_recovery(mddev_t *mddev) mddev->pers->spare_active(mddev); } md_update_sb(mddev); + + /* if array is no-longer degraded, then any saved_raid_disk + * information must be scrapped + */ + if (!mddev->degraded) + ITERATE_RDEV(mddev,rdev,rtmp) + rdev->saved_raid_disk = -1; + mddev->recovery = 0; /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -3557,6 +3810,13 @@ void md_check_recovery(mddev_t *mddev) set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); if (!spares) set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + if (spares && mddev->bitmap && ! mddev->bitmap->file) { + /* We are adding a device or devices to an array + * which has the bitmap stored on all devices. + * So make sure all bitmap pages get written + */ + bitmap_write_all(mddev->bitmap); + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "%s_resync"); @@ -3624,6 +3884,8 @@ static int __init md_init(void) " MD_SB_DISKS=%d\n", MD_MAJOR_VERSION, MD_MINOR_VERSION, MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS); + printk(KERN_INFO "md: bitmap version %d.%d\n", BITMAP_MAJOR, + BITMAP_MINOR); if (register_blkdev(MAJOR_NR, "md")) return -1; @@ -3739,7 +4001,6 @@ EXPORT_SYMBOL(md_error); EXPORT_SYMBOL(md_done_sync); EXPORT_SYMBOL(md_write_start); EXPORT_SYMBOL(md_write_end); -EXPORT_SYMBOL(md_handle_safemode); EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_unregister_thread); EXPORT_SYMBOL(md_wakeup_thread); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 2ae2d709cb15..2d2ca7fa0265 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -533,8 +533,7 @@ static int multipath_run (mddev_t *mddev) out_free_conf: if (conf->pool) mempool_destroy(conf->pool); - if (conf->multipaths) - kfree(conf->multipaths); + kfree(conf->multipaths); kfree(conf); mddev->private = NULL; out: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e7d934eca06f..e11dd14d0b43 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -371,10 +371,8 @@ static int raid0_run (mddev_t *mddev) return 0; out_free_conf: - if (conf->strip_zone) - kfree(conf->strip_zone); - if (conf->devlist) - kfree (conf->devlist); + kfree(conf->strip_zone); + kfree(conf->devlist); kfree(conf); mddev->private = NULL; out: @@ -386,11 +384,11 @@ static int raid0_stop (mddev_t *mddev) raid0_conf_t *conf = mddev_to_conf(mddev); blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ - kfree (conf->hash_table); + kfree(conf->hash_table); conf->hash_table = NULL; - kfree (conf->strip_zone); + kfree(conf->strip_zone); conf->strip_zone = NULL; - kfree (conf); + kfree(conf); mddev->private = NULL; return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1db5de52d376..ff1dbec864af 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -12,6 +12,15 @@ * Fixes to reconstruction by Jakob stergaard" <jakob@ostenfeld.dk> * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> * + * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support + * bitmapped intelligence in resync: + * + * - bitmap marked during normal i/o + * - bitmap used to skip nondirty blocks during sync + * + * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: + * - persistent bitmap code + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) @@ -22,7 +31,16 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include "dm-bio-list.h" #include <linux/raid/raid1.h> +#include <linux/raid/bitmap.h> + +#define DEBUG 0 +#if DEBUG +#define PRINTK(x...) printk(x) +#else +#define PRINTK(x...) +#endif /* * Number of guaranteed r1bios in case of extreme VM load: @@ -287,9 +305,11 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int /* * this branch is our 'one mirror IO has finished' event handler: */ - if (!uptodate) + if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); - else + /* an I/O failed, we can't clear the bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } else /* * Set R1BIO_Uptodate in our master bio, so that * we will return a good error code for to the higher @@ -309,6 +329,10 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int * already. */ if (atomic_dec_and_test(&r1_bio->remaining)) { + /* clear the bitmap if all writes complete successfully */ + bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, + r1_bio->sectors, + !test_bit(R1BIO_Degraded, &r1_bio->state)); md_write_end(r1_bio->mddev); raid_end_bio_io(r1_bio); } @@ -458,7 +482,10 @@ static void unplug_slaves(mddev_t *mddev) static void raid1_unplug(request_queue_t *q) { - unplug_slaves(q->queuedata); + mddev_t *mddev = q->queuedata; + + unplug_slaves(mddev); + md_wakeup_thread(mddev->thread); } static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, @@ -501,16 +528,16 @@ static void device_barrier(conf_t *conf, sector_t sect) { spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); if (!conf->barrier++) { wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); if (conf->nr_pending) BUG(); } wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); conf->next_resync = sect; spin_unlock_irq(&conf->resync_lock); } @@ -522,14 +549,20 @@ static int make_request(request_queue_t *q, struct bio * bio) mirror_info_t *mirror; r1bio_t *r1_bio; struct bio *read_bio; - int i, disks; + int i, targets = 0, disks; mdk_rdev_t *rdev; + struct bitmap *bitmap = mddev->bitmap; + unsigned long flags; + struct bio_list bl; + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. * Continue immediately if no resync is active currently. */ + md_write_start(mddev, bio); /* wait on superblock update early */ + spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, ); conf->nr_pending++; @@ -552,7 +585,7 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->master_bio = bio; r1_bio->sectors = bio->bi_size >> 9; - + r1_bio->state = 0; r1_bio->mddev = mddev; r1_bio->sector = bio->bi_sector; @@ -595,6 +628,13 @@ static int make_request(request_queue_t *q, struct bio * bio) * bios[x] to bio */ disks = conf->raid_disks; +#if 0 + { static int first=1; + if (first) printk("First Write sector %llu disks %d\n", + (unsigned long long)r1_bio->sector, disks); + first = 0; + } +#endif rcu_read_lock(); for (i = 0; i < disks; i++) { if ((rdev=conf->mirrors[i].rdev) != NULL && @@ -605,13 +645,21 @@ static int make_request(request_queue_t *q, struct bio * bio) r1_bio->bios[i] = NULL; } else r1_bio->bios[i] = bio; + targets++; } else r1_bio->bios[i] = NULL; } rcu_read_unlock(); - atomic_set(&r1_bio->remaining, 1); - md_write_start(mddev); + if (targets < conf->raid_disks) { + /* array is degraded, we will not clear the bitmap + * on I/O completion (see raid1_end_write_request) */ + set_bit(R1BIO_Degraded, &r1_bio->state); + } + + atomic_set(&r1_bio->remaining, 0); + + bio_list_init(&bl); for (i = 0; i < disks; i++) { struct bio *mbio; if (!r1_bio->bios[i]) @@ -627,14 +675,23 @@ static int make_request(request_queue_t *q, struct bio * bio) mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - generic_make_request(mbio); - } - if (atomic_dec_and_test(&r1_bio->remaining)) { - md_write_end(mddev); - raid_end_bio_io(r1_bio); + bio_list_add(&bl, mbio); } + bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); + spin_lock_irqsave(&conf->device_lock, flags); + bio_list_merge(&conf->pending_bio_list, &bl); + bio_list_init(&bl); + + blk_plug_device(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); + +#if 0 + while ((bio = bio_list_pop(&bl)) != NULL) + generic_make_request(bio); +#endif + return 0; } @@ -714,7 +771,7 @@ static void close_sync(conf_t *conf) { spin_lock_irq(&conf->resync_lock); wait_event_lock_irq(conf->wait_resume, !conf->barrier, - conf->resync_lock, unplug_slaves(conf->mddev)); + conf->resync_lock, raid1_unplug(conf->mddev->queue)); spin_unlock_irq(&conf->resync_lock); if (conf->barrier) BUG(); @@ -754,9 +811,12 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { conf_t *conf = mddev->private; int found = 0; - int mirror; + int mirror = 0; mirror_info_t *p; + if (rdev->saved_raid_disk >= 0 && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + mirror = rdev->saved_raid_disk; for (mirror=0; mirror < mddev->raid_disks; mirror++) if ( !(p=conf->mirrors+mirror)->rdev) { @@ -773,6 +833,8 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) p->head_position = 0; rdev->raid_disk = mirror; found = 1; + if (rdev->saved_raid_disk != mirror) + conf->fullsync = 1; p->rdev = rdev; break; } @@ -828,10 +890,11 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (!uptodate) + if (!uptodate) { md_error(r1_bio->mddev, conf->mirrors[r1_bio->read_disk].rdev); - else + set_bit(R1BIO_Degraded, &r1_bio->state); + } else set_bit(R1BIO_Uptodate, &r1_bio->state); rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev); reschedule_retry(r1_bio); @@ -855,8 +918,10 @@ static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) mirror = i; break; } - if (!uptodate) + if (!uptodate) { md_error(mddev, conf->mirrors[mirror].rdev); + set_bit(R1BIO_Degraded, &r1_bio->state); + } update_head_pos(mirror, r1_bio); if (atomic_dec_and_test(&r1_bio->remaining)) { @@ -876,6 +941,9 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) bio = r1_bio->bios[r1_bio->read_disk]; +/* + if (r1_bio->sector == 0) printk("First sync write startss\n"); +*/ /* * schedule writes */ @@ -903,10 +971,12 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) atomic_inc(&conf->mirrors[i].rdev->nr_pending); atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); + generic_make_request(wbio); } if (atomic_dec_and_test(&r1_bio->remaining)) { + /* if we're here, all write(s) have completed, so clean up */ md_done_sync(mddev, r1_bio->sectors, 1); put_buf(r1_bio); } @@ -931,11 +1001,30 @@ static void raid1d(mddev_t *mddev) mdk_rdev_t *rdev; md_check_recovery(mddev); - md_handle_safemode(mddev); for (;;) { char b[BDEVNAME_SIZE]; spin_lock_irqsave(&conf->device_lock, flags); + + if (conf->pending_bio_list.head) { + bio = bio_list_get(&conf->pending_bio_list); + blk_remove_plug(mddev->queue); + spin_unlock_irqrestore(&conf->device_lock, flags); + /* flush any pending bitmap writes to disk before proceeding w/ I/O */ + if (bitmap_unplug(mddev->bitmap) != 0) + printk("%s: bitmap file write failed!\n", mdname(mddev)); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + unplug = 1; + + continue; + } + if (list_empty(head)) break; r1_bio = list_entry(head->prev, r1bio_t, retry_list); @@ -1009,7 +1098,7 @@ static int init_resync(conf_t *conf) * that can be installed to exclude normal IO requests. */ -static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { conf_t *conf = mddev_to_conf(mddev); mirror_info_t *mirror; @@ -1019,17 +1108,43 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) int disk; int i; int write_targets = 0; + int sync_blocks; if (!conf->r1buf_pool) + { +/* + printk("sync start - bitmap %p\n", mddev->bitmap); +*/ if (init_resync(conf)) - return -ENOMEM; + return 0; + } max_sector = mddev->size << 1; if (sector_nr >= max_sector) { + /* If we aborted, we need to abort the + * sync on the 'current' bitmap chunk (there will + * only be one in raid1 resync. + * We can find the current addess in mddev->curr_resync + */ + if (!conf->fullsync) { + if (mddev->curr_resync < max_sector) + bitmap_end_sync(mddev->bitmap, + mddev->curr_resync, + &sync_blocks, 1); + bitmap_close_sync(mddev->bitmap); + } + if (mddev->curr_resync >= max_sector) + conf->fullsync = 0; close_sync(conf); return 0; } + if (!conf->fullsync && + !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) { + /* We can skip this block, and probably several more */ + *skipped = 1; + return sync_blocks; + } /* * If there is non-resync activity waiting for us then * put in a delay to throttle resync. @@ -1068,6 +1183,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) r1_bio->mddev = mddev; r1_bio->sector = sector_nr; + r1_bio->state = 0; set_bit(R1BIO_IsSync, &r1_bio->state); r1_bio->read_disk = disk; @@ -1102,18 +1218,24 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) bio->bi_bdev = conf->mirrors[i].rdev->bdev; bio->bi_private = r1_bio; } + + if (write_targets + 1 < conf->raid_disks) + /* array degraded, can't clear bitmap */ + set_bit(R1BIO_Degraded, &r1_bio->state); + if (write_targets == 0) { /* There is nowhere to write, so all non-sync * drives must be failed - so we are finished */ - int rv = max_sector - sector_nr; - md_done_sync(mddev, rv, 1); + sector_t rv = max_sector - sector_nr; + *skipped = 1; put_buf(r1_bio); rdev_dec_pending(conf->mirrors[disk].rdev, mddev); return rv; } nr_sectors = 0; + sync_blocks = 0; do { struct page *page; int len = PAGE_SIZE; @@ -1121,6 +1243,17 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) len = (max_sector - sector_nr) << 9; if (len == 0) break; + if (!conf->fullsync) { + if (sync_blocks == 0) { + if (!bitmap_start_sync(mddev->bitmap, + sector_nr, &sync_blocks)) + break; + if (sync_blocks < (PAGE_SIZE>>9)) + BUG(); + if (len > (sync_blocks<<9)) len = sync_blocks<<9; + } + } + for (i=0 ; i < conf->raid_disks; i++) { bio = r1_bio->bios[i]; if (bio->bi_end_io) { @@ -1143,6 +1276,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) } nr_sectors += len>>9; sector_nr += len>>9; + sync_blocks -= (len>>9); } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); bio_full: bio = r1_bio->bios[disk]; @@ -1231,6 +1365,9 @@ static int run(mddev_t *mddev) init_waitqueue_head(&conf->wait_idle); init_waitqueue_head(&conf->wait_resume); + bio_list_init(&conf->pending_bio_list); + bio_list_init(&conf->flushing_bio_list); + if (!conf->working_disks) { printk(KERN_ERR "raid1: no operational mirrors for %s\n", mdname(mddev)); @@ -1259,16 +1396,15 @@ static int run(mddev_t *mddev) conf->last_used = j; - - { - mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); - if (!mddev->thread) { - printk(KERN_ERR - "raid1: couldn't allocate thread for %s\n", - mdname(mddev)); - goto out_free_conf; - } + mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); + if (!mddev->thread) { + printk(KERN_ERR + "raid1: couldn't allocate thread for %s\n", + mdname(mddev)); + goto out_free_conf; } + if (mddev->bitmap) mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; + printk(KERN_INFO "raid1: raid set %s active with %d out of %d mirrors\n", mdname(mddev), mddev->raid_disks - mddev->degraded, @@ -1291,10 +1427,8 @@ out_free_conf: if (conf) { if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); - if (conf->mirrors) - kfree(conf->mirrors); - if (conf->poolinfo) - kfree(conf->poolinfo); + kfree(conf->mirrors); + kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; } @@ -1311,10 +1445,8 @@ static int stop(mddev_t *mddev) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); - if (conf->mirrors) - kfree(conf->mirrors); - if (conf->poolinfo) - kfree(conf->poolinfo); + kfree(conf->mirrors); + kfree(conf->poolinfo); kfree(conf); mddev->private = NULL; return 0; @@ -1349,17 +1481,26 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) * We allocate a new r1bio_pool if we can. * Then raise a device barrier and wait until all IO stops. * Then resize conf->mirrors and swap in the new r1bio pool. + * + * At the same time, we "pack" the devices so that all the missing + * devices have the higher raid_disk numbers. */ mempool_t *newpool, *oldpool; struct pool_info *newpoolinfo; mirror_info_t *newmirrors; conf_t *conf = mddev_to_conf(mddev); + int cnt; - int d; + int d, d2; - for (d= raid_disks; d < conf->raid_disks; d++) - if (conf->mirrors[d].rdev) + if (raid_disks < conf->raid_disks) { + cnt=0; + for (d= 0; d < conf->raid_disks; d++) + if (conf->mirrors[d].rdev) + cnt++; + if (cnt > raid_disks) return -EBUSY; + } newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL); if (!newpoolinfo) @@ -1384,14 +1525,18 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) spin_lock_irq(&conf->resync_lock); conf->barrier++; wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, - conf->resync_lock, unplug_slaves(mddev)); + conf->resync_lock, raid1_unplug(mddev->queue)); spin_unlock_irq(&conf->resync_lock); /* ok, everything is stopped */ oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; - for (d=0; d < raid_disks && d < conf->raid_disks; d++) - newmirrors[d] = conf->mirrors[d]; + + for (d=d2=0; d < conf->raid_disks; d++) + if (conf->mirrors[d].rdev) { + conf->mirrors[d].rdev->raid_disk = d2; + newmirrors[d2++].rdev = conf->mirrors[d].rdev; + } kfree(conf->mirrors); conf->mirrors = newmirrors; kfree(conf->poolinfo); @@ -1400,6 +1545,7 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks) mddev->degraded += (raid_disks - conf->raid_disks); conf->raid_disks = mddev->raid_disks = raid_disks; + conf->last_used = 0; /* just make sure it is in-range */ spin_lock_irq(&conf->resync_lock); conf->barrier--; spin_unlock_irq(&conf->resync_lock); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3c37be6423d7..62ebb1bc72be 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -700,6 +700,8 @@ static int make_request(request_queue_t *q, struct bio * bio) return 0; } + md_write_start(mddev, bio); + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -774,7 +776,7 @@ static int make_request(request_queue_t *q, struct bio * bio) rcu_read_unlock(); atomic_set(&r10_bio->remaining, 1); - md_write_start(mddev); + for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; @@ -1216,7 +1218,6 @@ static void raid10d(mddev_t *mddev) mdk_rdev_t *rdev; md_check_recovery(mddev); - md_handle_safemode(mddev); for (;;) { char b[BDEVNAME_SIZE]; @@ -1319,7 +1320,7 @@ static int init_resync(conf_t *conf) * */ -static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) +static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { conf_t *conf = mddev_to_conf(mddev); r10bio_t *r10_bio; @@ -1333,7 +1334,7 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) if (!conf->r10buf_pool) if (init_resync(conf)) - return -ENOMEM; + return 0; skipped: max_sector = mddev->size << 1; @@ -1341,15 +1342,15 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { close_sync(conf); + *skipped = 1; return sectors_skipped; } if (chunks_skipped >= conf->raid_disks) { /* if there has been nothing to do on any drive, * then there is nothing to do at all.. */ - sector_t sec = max_sector - sector_nr; - md_done_sync(mddev, sec, 1); - return sec + sectors_skipped; + *skipped = 1; + return (max_sector - sector_nr) + sectors_skipped; } /* make sure whole request will fit in a chunk - if chunks @@ -1563,17 +1564,22 @@ static int sync_request(mddev_t *mddev, sector_t sector_nr, int go_faster) } } + if (sectors_skipped) + /* pretend they weren't skipped, it makes + * no important difference in this case + */ + md_done_sync(mddev, sectors_skipped, 1); + return sectors_skipped + nr_sectors; giveup: /* There is nowhere to write, so all non-sync * drives must be failed, so try the next chunk... */ { - int sec = max_sector - sector_nr; + sector_t sec = max_sector - sector_nr; sectors_skipped += sec; chunks_skipped ++; sector_nr = max_sector; - md_done_sync(mddev, sec, 1); goto skipped; } } @@ -1731,8 +1737,7 @@ static int run(mddev_t *mddev) out_free_conf: if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); - if (conf->mirrors) - kfree(conf->mirrors); + kfree(conf->mirrors); kfree(conf); mddev->private = NULL; out: @@ -1748,8 +1753,7 @@ static int stop(mddev_t *mddev) blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); - if (conf->mirrors) - kfree(conf->mirrors); + kfree(conf->mirrors); kfree(conf); mddev->private = NULL; return 0; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3cb11ac232fa..93a9726cc2d6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1411,6 +1411,8 @@ static int make_request (request_queue_t *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; + md_write_start(mddev, bi); + if (bio_data_dir(bi)==WRITE) { disk_stat_inc(mddev->gendisk, writes); disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); @@ -1423,8 +1425,7 @@ static int make_request (request_queue_t *q, struct bio * bi) last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - if ( bio_data_dir(bi) == WRITE ) - md_write_start(mddev); + for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); @@ -1475,7 +1476,7 @@ static int make_request (request_queue_t *q, struct bio * bi) } /* FIXME go_faster isn't used */ -static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) +static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; struct stripe_head *sh; @@ -1498,8 +1499,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) * nothing we can do. */ if (mddev->degraded >= 1 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - int rv = (mddev->size << 1) - sector_nr; - md_done_sync(mddev, rv, 1); + sector_t rv = (mddev->size << 1) - sector_nr; + *skipped = 1; return rv; } @@ -1546,7 +1547,6 @@ static void raid5d (mddev_t *mddev) PRINTK("+++ raid5d active\n"); md_check_recovery(mddev); - md_handle_safemode(mddev); handled = 0; spin_lock_irq(&conf->device_lock); diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c index 908edd78a792..f62ea1a73d0d 100644 --- a/drivers/md/raid6main.c +++ b/drivers/md/raid6main.c @@ -1570,6 +1570,8 @@ static int make_request (request_queue_t *q, struct bio * bi) sector_t logical_sector, last_sector; struct stripe_head *sh; + md_write_start(mddev, bi); + if (bio_data_dir(bi)==WRITE) { disk_stat_inc(mddev->gendisk, writes); disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi)); @@ -1583,8 +1585,7 @@ static int make_request (request_queue_t *q, struct bio * bi) bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ - if ( bio_data_dir(bi) == WRITE ) - md_write_start(mddev); + for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); @@ -1634,7 +1635,7 @@ static int make_request (request_queue_t *q, struct bio * bi) } /* FIXME go_faster isn't used */ -static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) +static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) { raid6_conf_t *conf = (raid6_conf_t *) mddev->private; struct stripe_head *sh; @@ -1657,8 +1658,8 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster) * nothing we can do. */ if (mddev->degraded >= 2 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - int rv = (mddev->size << 1) - sector_nr; - md_done_sync(mddev, rv, 1); + sector_t rv = (mddev->size << 1) - sector_nr; + *skipped = 1; return rv; } @@ -1705,7 +1706,6 @@ static void raid6d (mddev_t *mddev) PRINTK("+++ raid6d active\n"); md_check_recovery(mddev); - md_handle_safemode(mddev); handled = 0; spin_lock_irq(&conf->device_lock); |