From 4b8164b91d9fdff4dbac0a742d076bdff7fda21b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 31 Jan 2015 20:08:47 -0500 Subject: new helper: dup_iter() Copy iter and kmemdup the underlying array for the copy. Returns a pointer to result of kmemdup() to be kfree()'d later. Signed-off-by: Al Viro --- mm/iov_iter.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/iov_iter.c b/mm/iov_iter.c index 827732047da1..9d96e283520c 100644 --- a/mm/iov_iter.c +++ b/mm/iov_iter.c @@ -751,3 +751,18 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return npages; } EXPORT_SYMBOL(iov_iter_npages); + +const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) +{ + *new = *old; + if (new->type & ITER_BVEC) + return new->bvec = kmemdup(new->bvec, + new->nr_segs * sizeof(struct bio_vec), + flags); + else + /* iovec and kvec have identical layout */ + return new->iov = kmemdup(new->iov, + new->nr_segs * sizeof(struct iovec), + flags); +} +EXPORT_SYMBOL(dup_iter); -- cgit v1.2.3 From d879cb83417a71c435f1263e1160a9fce8e95d87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 16:05:55 -0500 Subject: move iov_iter.c from mm/ to lib/ Signed-off-by: Al Viro --- lib/Makefile | 2 +- lib/iov_iter.c | 768 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/Makefile | 2 +- mm/iov_iter.c | 768 --------------------------------------------------------- 4 files changed, 770 insertions(+), 770 deletions(-) create mode 100644 lib/iov_iter.c delete mode 100644 mm/iov_iter.c (limited to 'mm') diff --git a/lib/Makefile b/lib/Makefile index 87eb3bffc283..58f74d2dd396 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o diff --git a/lib/iov_iter.c b/lib/iov_iter.c new file mode 100644 index 000000000000..9d96e283520c --- /dev/null +++ b/lib/iov_iter.c @@ -0,0 +1,768 @@ +#include +#include +#include +#include +#include +#include + +#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ + size_t left; \ + size_t wanted = n; \ + __p = i->iov; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } else { \ + left = 0; \ + } \ + while (unlikely(!left && n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + left = (STEP); \ + __v.iov_len -= left; \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted - n; \ +} + +#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->kvec; \ + __v.iov_len = min(n, __p->iov_len - skip); \ + if (likely(__v.iov_len)) { \ + __v.iov_base = __p->iov_base + skip; \ + (void)(STEP); \ + skip += __v.iov_len; \ + n -= __v.iov_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.iov_len = min(n, __p->iov_len); \ + if (unlikely(!__v.iov_len)) \ + continue; \ + __v.iov_base = __p->iov_base; \ + (void)(STEP); \ + skip = __v.iov_len; \ + n -= __v.iov_len; \ + } \ + n = wanted; \ +} + +#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ + size_t wanted = n; \ + __p = i->bvec; \ + __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ + if (likely(__v.bv_len)) { \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset + skip; \ + (void)(STEP); \ + skip += __v.bv_len; \ + n -= __v.bv_len; \ + } \ + while (unlikely(n)) { \ + __p++; \ + __v.bv_len = min_t(size_t, n, __p->bv_len); \ + if (unlikely(!__v.bv_len)) \ + continue; \ + __v.bv_page = __p->bv_page; \ + __v.bv_offset = __p->bv_offset; \ + (void)(STEP); \ + skip = __v.bv_len; \ + n -= __v.bv_len; \ + } \ + n = wanted; \ +} + +#define iterate_all_kinds(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ +} + +#define iterate_and_advance(i, n, v, I, B, K) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + const struct bio_vec *bvec; \ + struct bio_vec v; \ + iterate_bvec(i, n, v, bvec, skip, (B)) \ + if (skip == bvec->bv_len) { \ + bvec++; \ + skip = 0; \ + } \ + i->nr_segs -= bvec - i->bvec; \ + i->bvec = bvec; \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + if (skip == kvec->iov_len) { \ + kvec++; \ + skip = 0; \ + } \ + i->nr_segs -= kvec - i->kvec; \ + i->kvec = kvec; \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + if (skip == iov->iov_len) { \ + iov++; \ + skip = 0; \ + } \ + i->nr_segs -= iov - i->iov; \ + i->iov = iov; \ + } \ + i->count -= n; \ + i->iov_offset = skip; \ +} + +static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *to; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_readable(buf, copy)) { + kaddr = kmap_atomic(page); + to = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user_inatomic(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = to - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + to = kaddr + offset; + left = __copy_from_user(to, buf, copy); + copy -= left; + skip += copy; + to += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_from_user(to, buf, copy); + copy -= left; + skip = copy; + to += copy; + bytes -= copy; + } + kunmap(page); +done: + if (skip == iov->iov_len) { + iov++; + skip = 0; + } + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + if (!(i->type & (ITER_BVEC|ITER_KVEC))) { + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); + } + return 0; +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +void iov_iter_init(struct iov_iter *i, int direction, + const struct iovec *iov, unsigned long nr_segs, + size_t count) +{ + /* It will get better. Eventually... */ + if (segment_eq(get_fs(), KERNEL_DS)) { + direction |= ITER_KVEC; + i->type = direction; + i->kvec = (struct kvec *)iov; + } else { + i->type = direction; + i->iov = iov; + } + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_init); + +static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) +{ + char *from = kmap_atomic(page); + memcpy(to, from + offset, len); + kunmap_atomic(from); +} + +static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) +{ + char *to = kmap_atomic(page); + memcpy(to + offset, from, len); + kunmap_atomic(to); +} + +static void memzero_page(struct page *page, size_t offset, size_t len) +{ + char *addr = kmap_atomic(page); + memset(addr + offset, 0, len); + kunmap_atomic(addr); +} + +size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + char *from = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, + v.iov_len), + memcpy_to_page(v.bv_page, v.bv_offset, + (from += v.bv_len) - v.bv_len, v.bv_len), + memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_to_iter); + +size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, + v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_from_iter); + +size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) +{ + char *to = addr; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(copy_from_iter_nocache); + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_to_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else + return copy_page_to_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_to_iter); + +size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + if (i->type & (ITER_BVEC|ITER_KVEC)) { + void *kaddr = kmap_atomic(page); + size_t wanted = copy_from_iter(kaddr + offset, bytes, i); + kunmap_atomic(kaddr); + return wanted; + } else + return copy_page_from_iter_iovec(page, offset, bytes, i); +} +EXPORT_SYMBOL(copy_page_from_iter); + +size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +{ + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + iterate_and_advance(i, bytes, v, + __clear_user(v.iov_base, v.iov_len), + memzero_page(v.bv_page, v.bv_offset, v.bv_len), + memset(v.iov_base, 0, v.iov_len) + ) + + return bytes; +} +EXPORT_SYMBOL(iov_iter_zero); + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr = kmap_atomic(page), *p = kaddr + offset; + iterate_all_kinds(i, bytes, v, + __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, + v.iov_base, v.iov_len), + memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, + v.bv_offset, v.bv_len), + memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) + ) + kunmap_atomic(kaddr); + return bytes; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +void iov_iter_advance(struct iov_iter *i, size_t size) +{ + iterate_and_advance(i, size, v, 0, 0, 0) +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Return the count of just the current iov_iter segment. + */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + if (i->nr_segs == 1) + return i->count; + else if (i->type & ITER_BVEC) + return min(i->count, i->bvec->bv_len - i->iov_offset); + else + return min(i->count, i->iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); + +void iov_iter_kvec(struct iov_iter *i, int direction, + const struct kvec *kvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_KVEC)); + i->type = direction; + i->kvec = kvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_kvec); + +void iov_iter_bvec(struct iov_iter *i, int direction, + const struct bio_vec *bvec, unsigned long nr_segs, + size_t count) +{ + BUG_ON(!(direction & ITER_BVEC)); + i->type = direction; + i->bvec = bvec; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count; +} +EXPORT_SYMBOL(iov_iter_bvec); + +unsigned long iov_iter_alignment(const struct iov_iter *i) +{ + unsigned long res = 0; + size_t size = i->count; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, + (res |= (unsigned long)v.iov_base | v.iov_len, 0), + res |= v.bv_offset | v.bv_len, + res |= (unsigned long)v.iov_base | v.iov_len + ) + return res; +} +EXPORT_SYMBOL(iov_iter_alignment); + +ssize_t iov_iter_get_pages(struct iov_iter *i, + struct page **pages, size_t maxsize, unsigned maxpages, + size_t *start) +{ + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + if (len > maxpages * PAGE_SIZE) + len = maxpages * PAGE_SIZE; + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); + if (unlikely(res < 0)) + return res; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + get_page(*pages = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; +} +EXPORT_SYMBOL(iov_iter_get_pages); + +static struct page **get_pages_array(size_t n) +{ + struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); + if (!p) + p = vmalloc(n * sizeof(struct page *)); + return p; +} + +ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, + struct page ***pages, size_t maxsize, + size_t *start) +{ + struct page **p; + + if (maxsize > i->count) + maxsize = i->count; + + if (!maxsize) + return 0; + + iterate_all_kinds(i, maxsize, v, ({ + unsigned long addr = (unsigned long)v.iov_base; + size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); + int n; + int res; + + addr &= ~(PAGE_SIZE - 1); + n = DIV_ROUND_UP(len, PAGE_SIZE); + p = get_pages_array(n); + if (!p) + return -ENOMEM; + res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); + if (unlikely(res < 0)) { + kvfree(p); + return res; + } + *pages = p; + return (res == n ? len : res * PAGE_SIZE) - *start; + 0;}),({ + /* can't be more than PAGE_SIZE */ + *start = v.bv_offset; + *pages = p = get_pages_array(1); + if (!p) + return -ENOMEM; + get_page(*p = v.bv_page); + return v.bv_len; + }),({ + return -EFAULT; + }) + ) + return 0; +} +EXPORT_SYMBOL(iov_iter_get_pages_alloc); + +size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *to = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_from_user(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck(p + v.bv_offset, + (to += v.bv_len) - v.bv_len, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck(v.iov_base, + (to += v.iov_len) - v.iov_len, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_from_iter); + +size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, + struct iov_iter *i) +{ + char *from = addr; + __wsum sum, next; + size_t off = 0; + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + sum = *csum; + iterate_and_advance(i, bytes, v, ({ + int err = 0; + next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0, &err); + if (!err) { + sum = csum_block_add(sum, next, off); + off += v.iov_len; + } + err ? v.iov_len : 0; + }), ({ + char *p = kmap_atomic(v.bv_page); + next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, + p + v.bv_offset, + v.bv_len, 0); + kunmap_atomic(p); + sum = csum_block_add(sum, next, off); + off += v.bv_len; + }),({ + next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, + v.iov_base, + v.iov_len, 0); + sum = csum_block_add(sum, next, off); + off += v.iov_len; + }) + ) + *csum = sum; + return bytes; +} +EXPORT_SYMBOL(csum_and_copy_to_iter); + +int iov_iter_npages(const struct iov_iter *i, int maxpages) +{ + size_t size = i->count; + int npages = 0; + + if (!size) + return 0; + + iterate_all_kinds(i, size, v, ({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + 0;}),({ + npages++; + if (npages >= maxpages) + return maxpages; + }),({ + unsigned long p = (unsigned long)v.iov_base; + npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) + - p / PAGE_SIZE; + if (npages >= maxpages) + return maxpages; + }) + ) + return npages; +} +EXPORT_SYMBOL(iov_iter_npages); + +const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) +{ + *new = *old; + if (new->type & ITER_BVEC) + return new->bvec = kmemdup(new->bvec, + new->nr_segs * sizeof(struct bio_vec), + flags); + else + /* iovec and kvec have identical layout */ + return new->iov = kmemdup(new->iov, + new->nr_segs * sizeof(struct iovec), + flags); +} +EXPORT_SYMBOL(dup_iter); diff --git a/mm/Makefile b/mm/Makefile index 3c1caa2693bd..15dbe9903c27 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,7 +21,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - iov_iter.o debug.o $(mmu-y) + debug.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/iov_iter.c b/mm/iov_iter.c deleted file mode 100644 index 9d96e283520c..000000000000 --- a/mm/iov_iter.c +++ /dev/null @@ -1,768 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ - size_t left; \ - size_t wanted = n; \ - __p = i->iov; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } else { \ - left = 0; \ - } \ - while (unlikely(!left && n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted - n; \ -} - -#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->kvec; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - (void)(STEP); \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - (void)(STEP); \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted; \ -} - -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ - continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ - (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ - } \ - n = wanted; \ -} - -#define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - } \ -} - -#define iterate_and_advance(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - if (skip == kvec->iov_len) { \ - kvec++; \ - skip = 0; \ - } \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - if (skip == iov->iov_len) { \ - iov++; \ - skip = 0; \ - } \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ - } \ - i->count -= n; \ - i->iov_offset = skip; \ -} - -static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - from = kaddr + offset; - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *to; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_readable(buf, copy)) { - kaddr = kmap_atomic(page); - to = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = to - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - to = kaddr + offset; - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); - } - return 0; -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -void iov_iter_init(struct iov_iter *i, int direction, - const struct iovec *iov, unsigned long nr_segs, - size_t count) -{ - /* It will get better. Eventually... */ - if (segment_eq(get_fs(), KERNEL_DS)) { - direction |= ITER_KVEC; - i->type = direction; - i->kvec = (struct kvec *)iov; - } else { - i->type = direction; - i->iov = iov; - } - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_init); - -static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) -{ - char *from = kmap_atomic(page); - memcpy(to, from + offset, len); - kunmap_atomic(from); -} - -static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) -{ - char *to = kmap_atomic(page); - memcpy(to + offset, from, len); - kunmap_atomic(to); -} - -static void memzero_page(struct page *page, size_t offset, size_t len) -{ - char *addr = kmap_atomic(page); - memset(addr + offset, 0, len); - kunmap_atomic(addr); -} - -size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *from = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, - v.iov_len), - memcpy_to_page(v.bv_page, v.bv_offset, - (from += v.bv_len) - v.bv_len, v.bv_len), - memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_to_iter); - -size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, - v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter); - -size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user_nocache((to += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter_nocache); - -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_to_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_to_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_to_iter); - -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_from_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_from_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_from_iter); - -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) -{ - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __clear_user(v.iov_base, v.iov_len), - memzero_page(v.bv_page, v.bv_offset, v.bv_len), - memset(v.iov_base, 0, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(iov_iter_zero); - -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr = kmap_atomic(page), *p = kaddr + offset; - iterate_all_kinds(i, bytes, v, - __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - kunmap_atomic(kaddr); - return bytes; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -void iov_iter_advance(struct iov_iter *i, size_t size) -{ - iterate_and_advance(i, size, v, 0, 0, 0) -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - if (i->nr_segs == 1) - return i->count; - else if (i->type & ITER_BVEC) - return min(i->count, i->bvec->bv_len - i->iov_offset); - else - return min(i->count, i->iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - -void iov_iter_kvec(struct iov_iter *i, int direction, - const struct kvec *kvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; - i->kvec = kvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_kvec); - -void iov_iter_bvec(struct iov_iter *i, int direction, - const struct bio_vec *bvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; - i->bvec = bvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_bvec); - -unsigned long iov_iter_alignment(const struct iov_iter *i) -{ - unsigned long res = 0; - size_t size = i->count; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, - (res |= (unsigned long)v.iov_base | v.iov_len, 0), - res |= v.bv_offset | v.bv_len, - res |= (unsigned long)v.iov_base | v.iov_len - ) - return res; -} -EXPORT_SYMBOL(iov_iter_alignment); - -ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); - if (unlikely(res < 0)) - return res; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - get_page(*pages = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages); - -static struct page **get_pages_array(size_t n) -{ - struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - return p; -} - -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - struct page **p; - - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - p = get_pages_array(n); - if (!p) - return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); - if (unlikely(res < 0)) { - kvfree(p); - return res; - } - *pages = p; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - *pages = p = get_pages_array(1); - if (!p) - return -ENOMEM; - get_page(*p = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages_alloc); - -size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *to = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_from_user(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck(p + v.bv_offset, - (to += v.bv_len) - v.bv_len, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_from_iter); - -size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *from = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, - p + v.bv_offset, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_to_iter); - -int iov_iter_npages(const struct iov_iter *i, int maxpages) -{ - size_t size = i->count; - int npages = 0; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, ({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - 0;}),({ - npages++; - if (npages >= maxpages) - return maxpages; - }),({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - }) - ) - return npages; -} -EXPORT_SYMBOL(iov_iter_npages); - -const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) -{ - *new = *old; - if (new->type & ITER_BVEC) - return new->bvec = kmemdup(new->bvec, - new->nr_segs * sizeof(struct bio_vec), - flags); - else - /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, - new->nr_segs * sizeof(struct iovec), - flags); -} -EXPORT_SYMBOL(dup_iter); -- cgit v1.2.3 From 7d70e15480c0450d2bfafaad338a32e884fc215e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 4 Mar 2015 10:37:43 -0500 Subject: writeback: add missing INITIAL_JIFFIES init in global_update_bandwidth() global_update_bandwidth() uses static variable update_time as the timestamp for the last update but forgets to initialize it to INITIALIZE_JIFFIES. This means that global_dirty_limit will be 5 mins into the future on 32bit and some large amount jiffies into the past on 64bit. This isn't critical as the only effect is that global_dirty_limit won't be updated for the first 5 mins after booting on 32bit machines, especially given the auxiliary nature of global_dirty_limit's role - protecting against global dirty threshold's sudden dips; however, it does lead to unintended suboptimal behavior. Fix it. Fixes: c42843f2f0bb ("writeback: introduce smoothed global dirty limit") Signed-off-by: Tejun Heo Acked-by: Jan Kara Cc: Wu Fengguang Cc: Jens Axboe Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45e187b2d971..b4fd980a93eb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -922,7 +922,7 @@ static void global_update_bandwidth(unsigned long thresh, unsigned long now) { static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time; + static unsigned long update_time = INITIAL_JIFFIES; /* * check locklessly first to optimize away locking for the most time -- cgit v1.2.3 From 53da3bc2ba9e4899f32707b5cd7d18421b943687 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 12 Mar 2015 08:45:46 -0700 Subject: mm: fix up numa read-only thread grouping logic Dave Chinner reported that commit 4d9424669946 ("mm: convert p[te|md]_mknonnuma and remaining page table manipulations") slowed down his xfsrepair test enormously. In particular, it was using more system time due to extra TLB flushing. The ultimate reason turns out to be how the change to use the regular page table accessor functions broke the NUMA grouping logic. The old special mknuma/mknonnuma code accessed the page table present bit and the magic NUMA bit directly, while the new code just changes the page protections using PROT_NONE and the regular vma protections. That sounds equivalent, and from a fault standpoint it really is, but a subtle side effect is that the *other* protection bits of the page table entries also change. And the code to decide how to group the NUMA entries together used the writable bit to decide whether a particular page was likely to be shared read-only or not. And with the change to make the NUMA handling use the regular permission setting functions, that writable bit was basically always cleared for private mappings due to COW. So even if the page actually ends up being written to in the end, the NUMA balancing would act as if it was always shared RO. This code is a heuristic anyway, so the fix - at least for now - is to instead check whether the page is dirty rather than writable. The bit doesn't change with protection changes. NOTE! This also adds a FIXME comment to revisit this issue, Not only should we probably re-visit the whole "is this a shared read-only page" heuristic (we might want to take the vma permissions into account and base this more on those than the per-page ones, and also look at whether the particular access that triggers it is a write or not), but the whole COW issue shows that we should think about the NUMA fault handling some more. For example, maybe we should do the early-COW thing that a regular fault does. Or maybe we should accept that while using the same bits as PROTNONE was a good thing (and got rid of the specual NUMA bit), we might still want to just preseve the other protection bits across NUMA faulting. Those are bigger questions, left for later. This just fixes up the heuristic so that it at least approximates working again. More analysis and work needed. Reported-by: Dave Chinner Tested-by: Mel Gorman Cc: Andrew Morton Cc: Aneesh Kumar Cc: Ingo Molnar , Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- mm/memory.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fc00c8cb5a82..89b9075f8c11 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1295,8 +1295,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Avoid grouping on DSO/COW pages in specific and RO pages * in general, RO pages shouldn't hurt as much anyway since * they can be in shared cache state. + * + * FIXME! This checks "pmd_dirty()" as an approximation of + * "is this a read-only page", since checking "pmd_write()" + * is even more broken. We haven't actually turned this into + * a writable page, so pmd_write() will always be false. */ - if (!pmd_write(pmd)) + if (!pmd_dirty(pmd)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 8068893697bb..411144f977b1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3072,8 +3072,13 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Avoid grouping on DSO/COW pages in specific and RO pages * in general, RO pages shouldn't hurt as much anyway since * they can be in shared cache state. + * + * FIXME! This checks "pmd_dirty()" as an approximation of + * "is this a read-only page", since checking "pmd_write()" + * is even more broken. We haven't actually turned this into + * a writable page, so pmd_write() will always be false. */ - if (!pte_write(pte)) + if (!pte_dirty(pte)) flags |= TNF_NO_GROUP; /* -- cgit v1.2.3 From ba68bc0115ebfc37f911db4e87bf5f7991f89698 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 7 Mar 2015 15:20:48 +0000 Subject: mm: thp: Return the correct value for change_huge_pmd The wrong value is being returned by change_huge_pmd since commit 10c1045f28e8 ("mm: numa: avoid unnecessary TLB flushes when setting NUMA hinting entries") which allows a fallthrough that tries to adjust non-existent PTEs. This patch corrects it. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89b9075f8c11..626e93db28ba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1487,6 +1487,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + ret = 1; /* * Avoid trapping faults against the zero page. The read-only @@ -1495,11 +1496,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa && is_huge_zero_pmd(*pmd)) { spin_unlock(ptl); - return 0; + return ret; } if (!prot_numa || !pmd_protnone(*pmd)) { - ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; -- cgit v1.2.3 From e009d5dc0a94a7133e5f1c083732d760bfd038e6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 12 Mar 2015 16:25:52 -0700 Subject: mm, oom: do not fail __GFP_NOFAIL allocation if oom killer is disabled Tetsuo Handa has pointed out that __GFP_NOFAIL allocations might fail after OOM killer is disabled if the allocation is performed by a kernel thread. This behavior was introduced from the very beginning by 7f33d49a2ed5 ("mm, PM/Freezer: Disable OOM killer when tasks are frozen"). This means that the basic contract for the allocation request is broken and the context requesting such an allocation might blow up unexpectedly. There are basically two ways forward. 1) move oom_killer_disable after kernel threads are frozen. This has a risk that the OOM victim wouldn't be able to finish because it would depend on an already frozen kernel thread. This would be really tricky to debug. 2) do not fail GFP_NOFAIL allocation no matter what and risk a potential Freezable kernel threads will loop and fail the suspend. Incidental allocations after kernel threads are frozen will at least dump a warning - if we are lucky and the serial console is still active of course... This patch implements the later option because it is safer. We would see warning rather than allocation failures for the kernel threads which would blow up otherwise and have a higher chances to identify __GFP_NOFAIL users from deeper pm code. Signed-off-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Tetsuo Handa Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7abfa70cdc1a..40e29429e7b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,7 +2373,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) + || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); -- cgit v1.2.3 From 44fc80573cc760a7154f41fd0a958ee10eba1a81 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 12 Mar 2015 16:25:54 -0700 Subject: mm, hugetlb: close race when setting PageTail for gigantic pages Now that gigantic pages are dynamically allocatable, care must be taken to ensure that p->first_page is valid before setting PageTail. If this isn't done, then it is possible to race and have compound_head() return NULL. Signed-off-by: David Rientjes Acked-by: Davidlohr Bueso Cc: Luiz Capitulino Cc: Joonsoo Kim Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a9ac6c26832..c41b2a0ee273 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -917,7 +917,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -933,6 +932,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } -- cgit v1.2.3 From 850fc430f47aad52092deaaeb32b99f97f0e6aca Mon Sep 17 00:00:00 2001 From: Danesh Petigara Date: Thu, 12 Mar 2015 16:25:57 -0700 Subject: mm: cma: fix CMA aligned offset calculation The CMA aligned offset calculation is incorrect for non-zero order_per_bit values. For example, if cma->order_per_bit=1, cma->base_pfn= 0x2f800000 and align_order=12, the function returns a value of 0x17c00 instead of 0x400. This patch fixes the CMA aligned offset calculation. The previous calculation was wrong and would return too-large values for the offset, so that when cma_alloc looks for free pages in the bitmap with the requested alignment > order_per_bit, it starts too far into the bitmap and so CMA allocations will fail despite there actually being plenty of free pages remaining. It will also probably have the wrong alignment. With this change, we will get the correct offset into the bitmap. One affected user is powerpc KVM, which has kvm_cma->order_per_bit set to KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, or 18 - 12 = 6. [gregory.0xf0@gmail.com: changelog additions] Signed-off-by: Danesh Petigara Reviewed-by: Gregory Fong Acked-by: Michal Nazarewicz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 75016fd1de90..68ecb7a42983 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -64,15 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) return (1UL << (align_order - cma->order_per_bit)) - 1; } +/* + * Find a PFN aligned to the specified order and return an offset represented in + * order_per_bits. + */ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) { - unsigned int alignment; - if (align_order <= cma->order_per_bit) return 0; - alignment = 1UL << (align_order - cma->order_per_bit); - return ALIGN(cma->base_pfn, alignment) - - (cma->base_pfn >> cma->order_per_bit); + + return (ALIGN(cma->base_pfn, (1UL << align_order)) + - cma->base_pfn) >> cma->order_per_bit; } static unsigned long cma_bitmap_maxno(struct cma *cma) -- cgit v1.2.3 From 5b8bf30721980b254be7a07315c353b3a3175b74 Mon Sep 17 00:00:00 2001 From: gchen gchen Date: Thu, 12 Mar 2015 16:26:05 -0700 Subject: mm/nommu.c: export symbol max_mapnr Several modules may need max_mapnr, so export, the related error with allmodconfig under c6x: MODPOST 3327 modules ERROR: "max_mapnr" [fs/pstore/ramoops.ko] undefined! ERROR: "max_mapnr" [drivers/media/v4l2-core/videobuf2-dma-contig.ko] undefined! Signed-off-by: Chen Gang Cc: Mark Salter Cc: Aurelien Jacquiot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 3e67e7538ecf..3fba2dc97c44 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,6 +62,7 @@ void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -- cgit v1.2.3 From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:11 -0700 Subject: kasan, module, vmalloc: rework shadow allocation for modules Current approach in handling shadow memory for modules is broken. Shadow memory could be freed only after memory shadow corresponds it is no longer used. vfree() called from interrupt context could use memory its freeing to store 'struct llist_node' in it: void vfree(const void *addr) { ... if (unlikely(in_interrupt())) { struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); Later this list node used in free_work() which actually frees memory. Currently module_memfree() called in interrupt context will free shadow before freeing module's memory which could provoke kernel crash. So shadow memory should be freed after module's memory. However, such deallocation order could race with kasan_module_alloc() in module_alloc(). Free shadow right before releasing vm area. At this point vfree()'d memory is not used anymore and yet not available for other allocations. New VM_KASAN flag used to indicate that vm area has dynamically allocated shadow memory so kasan frees shadow only if it was previously allocated. Signed-off-by: Andrey Ryabinin Acked-by: Rusty Russell Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 +++-- include/linux/vmalloc.h | 1 + kernel/module.c | 2 -- mm/kasan/kasan.c | 14 +++++++++++--- mm/vmalloc.c | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72ba725ddf9c..5fa48a21d73e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -5,6 +5,7 @@ struct kmem_cache; struct page; +struct vm_struct; #ifdef CONFIG_KASAN @@ -52,7 +53,7 @@ void kasan_slab_free(struct kmem_cache *s, void *object); #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) int kasan_module_alloc(void *addr, size_t size); -void kasan_module_free(void *addr); +void kasan_free_shadow(const struct vm_struct *vm); #else /* CONFIG_KASAN */ @@ -82,7 +83,7 @@ static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_module_free(void *addr) {} +static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7d7acb35603d..0ec598381f97 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -17,6 +17,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/kernel/module.c b/kernel/module.c index cc93cf68653c..b3d634ed06c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); - kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..936d81661c47 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..49abccf29a29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; -- cgit v1.2.3 From a5a6579db33af91f4f5134e14be758dc71c1b694 Mon Sep 17 00:00:00 2001 From: Jeff Vander Stoep Date: Thu, 12 Mar 2015 16:26:17 -0700 Subject: mm: reorder can_do_mlock to fix audit denial A userspace call to mmap(MAP_LOCKED) may result in the successful locking of memory while also producing a confusing audit log denial. can_do_mlock checks capable and rlimit. If either of these return positive can_do_mlock returns true. The capable check leads to an LSM hook used by apparmour and selinux which produce the audit denial. Reordering so rlimit is checked first eliminates the denial on success, only recording a denial when the lock is unsuccessful as a result of the denial. Signed-off-by: Jeff Vander Stoep Acked-by: Nick Kralevich Cc: Jeff Vander Stoep Cc: Sasha Levin Cc: "Paul E. McKenney" Cc: Rik van Riel Cc: Vlastimil Babka Cc: Paul Cassella Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 73cf0987088c..8a54cd214925 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -26,10 +26,10 @@ int can_do_mlock(void) { - if (capable(CAP_IPC_LOCK)) - return 1; if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; + if (capable(CAP_IPC_LOCK)) + return 1; return 0; } EXPORT_SYMBOL(can_do_mlock); -- cgit v1.2.3 From 7feee590bb18ffc42636975f74c2c3120ce1901c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 12 Mar 2015 16:26:19 -0700 Subject: memcg: disable hierarchy support if bound to the legacy cgroup hierarchy If the memory cgroup controller is initially mounted in the scope of the default cgroup hierarchy and then remounted to a legacy hierarchy, it will still have hierarchy support enabled, which is incorrect. We should disable hierarchy support if bound to the legacy cgroup hierarchy. Signed-off-by: Vladimir Davydov Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9fe07692eaad..b34ef4a32a3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * on for the root memcg is enough. */ if (cgroup_on_dfl(root_css->cgroup)) - mem_cgroup_from_css(root_css)->use_hierarchy = true; + root_mem_cgroup->use_hierarchy = true; + else + root_mem_cgroup->use_hierarchy = false; } static u64 memory_current_read(struct cgroup_subsys_state *css, -- cgit v1.2.3 From c72efb658f7c8b27ca3d0efb5cfd5ded9fcac89e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:18:48 -0400 Subject: writeback: fix possible underflow in write bandwidth calculation From 1ebf33901ecc75d9496862dceb1ef0377980587c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 23 Mar 2015 00:08:19 -0400 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") introduced account_page_redirty() which reverts stat updates for a redirtied page, making BDI_DIRTIED no longer monotonically increasing. bdi_update_write_bandwidth() uses the delta in BDI_DIRTIED as the basis for bandwidth calculation. While unlikely, since the above patch, the newer value may be lower than the recorded past value and underflow the bandwidth calculation leading to a wild result. Fix it by subtracing min of the old and new values when calculating delta. AFAIK, there hasn't been any report of it happening but the resulting erratic behavior would be non-critical and temporary, so it's possible that the issue is happening without being reported. The risk of the fix is very low, so tagged for -stable. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Fixes: 2f800fbd777b ("writeback: fix dirtied pages accounting on redirty") Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b4fd980a93eb..644bcb665773 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period + * + * @written may have decreased due to account_page_redirty(). + * Avoid underflowing @bw calculation. */ - bw = written - bdi->written_stamp; + bw = written - min(written, bdi->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); -- cgit v1.2.3 From 3fe89b3e2a7bbf3e97657104b9b33a9d81b950b3 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Wed, 25 Mar 2015 15:55:11 -0700 Subject: mm: fix anon_vma->degree underflow in anon_vma endless growing prevention I have constantly stumbled upon "kernel BUG at mm/rmap.c:399!" after upgrading to 3.19 and had no luck with 4.0-rc1 neither. So, after looking into new logic introduced by commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy"), I found chances are that unlink_anon_vmas() is called without incrementing dst->anon_vma->degree in anon_vma_clone() due to allocation failure. If dst->anon_vma is not NULL in error path, its degree will be incorrectly decremented in unlink_anon_vmas() and eventually underflow when exiting as a result of another call to unlink_anon_vmas(). That's how "kernel BUG at mm/rmap.c:399!" is triggered for me. This patch fixes the underflow by dropping dst->anon_vma when allocation fails. It's safe to do so regardless of original value of dst->anon_vma because dst->anon_vma doesn't have valid meaning if anon_vma_clone() fails. Besides, callers don't care dst->anon_vma in such case neither. Also suggested by Michal Hocko, we can clean up vma_adjust() a bit as anon_vma_clone() now does the work. [akpm@linux-foundation.org: tweak comment] Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Signed-off-by: Leon Yu Signed-off-by: Konstantin Khlebnikov Reviewed-by: Michal Hocko Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 +--- mm/rmap.c | 7 +++++++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index da9990acc08b..9ec50a368634 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end); importer->anon_vma = exporter->anon_vma; error = anon_vma_clone(importer, exporter); - if (error) { - importer->anon_vma = NULL; + if (error) return error; - } } } diff --git a/mm/rmap.c b/mm/rmap.c index 5e3e09081164..c161a14b6a8f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return 0; enomem_failure: + /* + * dst->anon_vma is dropped here otherwise its degree can be incorrectly + * decremented in unlink_anon_vmas(). + * We can safely do this because callers of anon_vma_clone() don't care + * about dst->anon_vma if anon_vma_clone() failed. + */ + dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } -- cgit v1.2.3 From f683739539e819e9b821a197d80e52258510837b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 25 Mar 2015 15:55:14 -0700 Subject: mm/pagewalk.c: prevent positive return value of walk_page_test() from being passed to callers walk_page_test() is purely pagewalk's internal stuff, and its positive return values are not intended to be passed to the callers of pagewalk. However, in the current code if the last vma in the do-while loop in walk_page_range() happens to return a positive value, it leaks outside walk_page_range(). So the user visible effect is invalid/unexpected return value (according to the reporter, mbind() causes it.) This patch fixes it simply by reinitializing the return value after checked. Another exposed interface, walk_page_vma(), already returns 0 for such cases so no problem. Fixes: fafaa4264eba ("pagewalk: improve vma handling") Signed-off-by: Naoya Horiguchi Signed-off-by: Kazutomo Yoshii Reported-by: Kazutomo Yoshii Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pagewalk.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 75c1f2878519..29f2f8b853ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end, vma = vma->vm_next; err = walk_page_test(start, next, walk); - if (err > 0) + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. + */ + err = 0; continue; + } if (err < 0) break; } -- cgit v1.2.3 From b0dc3a342af36f95a68fe229b8f0f73552c5ca08 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Mar 2015 15:55:20 -0700 Subject: mm/memory hotplug: postpone the reset of obsolete pgdat Qiu Xishi reported the following BUG when testing hot-add/hot-remove node under stress condition: BUG: unable to handle kernel paging request at 0000000000025f60 IP: next_online_pgdat+0x1/0x50 PGD 0 Oops: 0000 [#1] SMP ACPI: Device does not support D3cold Modules linked in: fuse nls_iso8859_1 nls_cp437 vfat fat loop dm_mod coretemp mperf crc32c_intel ghash_clmulni_intel aesni_intel ablk_helper cryptd lrw gf128mul glue_helper aes_x86_64 pcspkr microcode igb dca i2c_algo_bit ipv6 megaraid_sas iTCO_wdt i2c_i801 i2c_core iTCO_vendor_support tg3 sg hwmon ptp lpc_ich pps_core mfd_core acpi_pad rtc_cmos button ext3 jbd mbcache sd_mod crc_t10dif scsi_dh_alua scsi_dh_rdac scsi_dh_hp_sw scsi_dh_emc scsi_dh ahci libahci libata scsi_mod [last unloaded: rasf] CPU: 23 PID: 238 Comm: kworker/23:1 Tainted: G O 3.10.15-5885-euler0302 #1 Hardware name: HUAWEI TECHNOLOGIES CO.,LTD. Huawei N1/Huawei N1, BIOS V100R001 03/02/2015 Workqueue: events vmstat_update task: ffffa800d32c0000 ti: ffffa800d32ae000 task.ti: ffffa800d32ae000 RIP: 0010: next_online_pgdat+0x1/0x50 RSP: 0018:ffffa800d32afce8 EFLAGS: 00010286 RAX: 0000000000001440 RBX: ffffffff81da53b8 RCX: 0000000000000082 RDX: 0000000000000000 RSI: 0000000000000082 RDI: 0000000000000000 RBP: ffffa800d32afd28 R08: ffffffff81c93bfc R09: ffffffff81cbdc96 R10: 00000000000040ec R11: 00000000000000a0 R12: ffffa800fffb3440 R13: ffffa800d32afd38 R14: 0000000000000017 R15: ffffa800e6616800 FS: 0000000000000000(0000) GS:ffffa800e6600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000025f60 CR3: 0000000001a0b000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: refresh_cpu_vm_stats+0xd0/0x140 vmstat_update+0x11/0x50 process_one_work+0x194/0x3d0 worker_thread+0x12b/0x410 kthread+0xc6/0xd0 ret_from_fork+0x7c/0xb0 The cause is the "memset(pgdat, 0, sizeof(*pgdat))" at the end of try_offline_node, which will reset all the content of pgdat to 0, as the pgdat is accessed lock-free, so that the users still using the pgdat will panic, such as the vmstat_update routine. process A: offline node XX: vmstat_updat() refresh_cpu_vm_stats() for_each_populated_zone() find online node XX cond_resched() offline cpu and memory, then try_offline_node() node_set_offline(nid), and memset(pgdat, 0, sizeof(*pgdat)) zone = next_zone(zone) pg_data_t *pgdat = zone->zone_pgdat; // here pgdat is NULL now next_online_pgdat(pgdat) next_online_node(pgdat->node_id); // NULL pointer access So the solution here is postponing the reset of obsolete pgdat from try_offline_node() to hotadd_new_pgdat(), and just resetting pgdat->nr_zones and pgdat->classzone_idx to be 0 rather than the memset 0 to avoid breaking pointer information in pgdat. Signed-off-by: Gu Zheng Reported-by: Xishi Qiu Suggested-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Tang Chen Cc: Xie XiuQi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fab10795bea..65842d688b7c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1092,6 +1092,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return NULL; arch_refresh_nodedata(nid, pgdat); + } else { + /* Reset the nr_zones and classzone_idx to 0 before reuse */ + pgdat->nr_zones = 0; + pgdat->classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1977,15 +1981,6 @@ void try_offline_node(int nid) if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } - - /* - * Since there is no way to guarentee the address of pgdat/zone is not - * on stack of any kernel threads or used by other kernel objects - * without reference counting or other symchronizing method, do not - * reset node_data and free pgdat here. Just reset it to 0 and reuse - * the memory when the node is online again. - */ - memset(pgdat, 0, sizeof(*pgdat)); } EXPORT_SYMBOL(try_offline_node); -- cgit v1.2.3 From 859b7a0e89120505c304d7afbbe90325abaa0a6b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 25 Mar 2015 15:55:23 -0700 Subject: mm/slub: fix lockups on PREEMPT && !SMP kernels Commit 9aabf810a67c ("mm/slub: optimize alloc/free fastpath by removing preemption on/off") introduced an occasional hang for kernels built with CONFIG_PREEMPT && !CONFIG_SMP. The problem is the following loop the patch introduced to slab_alloc_node and slab_free: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); GCC 4.9 has been observed to hoist the load of c and c->tid above the loop for !SMP kernels (as in this case raw_cpu_ptr(x) is compile-time constant and does not force a reload). On arm64 the generated assembly looks like: ldr x4, [x0,#8] loop: ldr x1, [x0,#8] cmp x1, x4 b.ne loop If the thread is preempted between the load of c->tid (into x1) and tid (into x4), and an allocation or free occurs in another thread (bumping the cpu_slab's tid), the thread will be stuck in the loop until s->cpu_slab->tid wraps, which may be forever in the absence of allocations/frees on the same CPU. This patch changes the loop condition to access c->tid with READ_ONCE. This ensures that the value is reloaded even when the compiler would otherwise assume it could cache the value, and also ensures that the load will not be torn. Signed-off-by: Mark Rutland Cc: Catalin Marinas Acked-by: Christoph Lameter Cc: David Rientjes Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6832c4eab104..82c473780c91 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2449,7 +2449,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* * Irqless object alloc/free algorithm used here depends on sequence @@ -2718,7 +2719,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); -- cgit v1.2.3 From cfa869438282be84ad4110bba5027ef1fbbe71e4 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 25 Mar 2015 15:55:26 -0700 Subject: mm/page_alloc.c: call kernel_map_pages in unset_migrateype_isolate Commit 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") changed the logic of unset_migratetype_isolate to check the buddy allocator and explicitly call __free_pages to merge. The page that is being freed in this path never had prep_new_page called so set_page_refcounted is called explicitly but there is no call to kernel_map_pages. With the default kernel_map_pages this is mostly harmless but if kernel_map_pages does any manipulation of the page tables (unmapping or setting pages to read only) this may trigger a fault: alloc_contig_range test_pages_isolated(ceb00, ced00) failed Unable to handle kernel paging request at virtual address ffffffc0cec00000 pgd = ffffffc045fc4000 [ffffffc0cec00000] *pgd=0000000000000000 Internal error: Oops: 9600004f [#1] PREEMPT SMP Modules linked in: exfatfs CPU: 1 PID: 23237 Comm: TimedEventQueue Not tainted 3.10.49-gc72ad36-dirty #1 task: ffffffc03de52100 ti: ffffffc015388000 task.ti: ffffffc015388000 PC is at memset+0xc8/0x1c0 LR is at kernel_map_pages+0x1ec/0x244 Fix this by calling kernel_map_pages to ensure the page is set in the page table properly Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Signed-off-by: Laura Abbott Cc: Naoya Horiguchi Cc: Mel Gorman Acked-by: Rik van Riel Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Xishi Qiu Cc: Vladimir Davydov Acked-by: Joonsoo Kim Cc: Gioh Kim Cc: Michal Nazarewicz Cc: Marek Szyprowski Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 72f5ac381ab3..755a42c76eb4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); + kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); isolated_page = page; } -- cgit v1.2.3 From bea66fbd11af1ca98ae26855eea41eda8582923e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:37 -0700 Subject: mm: numa: group related processes based on VMA flags instead of page table flags These are three follow-on patches based on the xfsrepair workload Dave Chinner reported was problematic in 4.0-rc1 due to changes in page table management -- https://lkml.org/lkml/2015/3/1/226. Much of the problem was reduced by commit 53da3bc2ba9e ("mm: fix up numa read-only thread grouping logic") and commit ba68bc0115eb ("mm: thp: Return the correct value for change_huge_pmd"). It was known that the performance in 3.19 was still better even if is far less safe. This series aims to restore the performance without compromising on safety. For the test of this mail, I'm comparing 3.19 against 4.0-rc4 and the three patches applied on top autonumabench 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanilla vmwrite-v5r8 preserve-v5r8 slowscan-v5r8 Time System-NUMA01 124.00 ( 0.00%) 161.86 (-30.53%) 107.13 ( 13.60%) 103.13 ( 16.83%) 145.01 (-16.94%) Time System-NUMA01_THEADLOCAL 115.54 ( 0.00%) 107.64 ( 6.84%) 131.87 (-14.13%) 83.30 ( 27.90%) 92.35 ( 20.07%) Time System-NUMA02 9.35 ( 0.00%) 10.44 (-11.66%) 8.95 ( 4.28%) 10.72 (-14.65%) 8.16 ( 12.73%) Time System-NUMA02_SMT 3.87 ( 0.00%) 4.63 (-19.64%) 4.57 (-18.09%) 3.99 ( -3.10%) 3.36 ( 13.18%) Time Elapsed-NUMA01 570.06 ( 0.00%) 567.82 ( 0.39%) 515.78 ( 9.52%) 517.26 ( 9.26%) 543.80 ( 4.61%) Time Elapsed-NUMA01_THEADLOCAL 393.69 ( 0.00%) 384.83 ( 2.25%) 384.10 ( 2.44%) 384.31 ( 2.38%) 380.73 ( 3.29%) Time Elapsed-NUMA02 49.09 ( 0.00%) 49.33 ( -0.49%) 48.86 ( 0.47%) 48.78 ( 0.63%) 50.94 ( -3.77%) Time Elapsed-NUMA02_SMT 47.51 ( 0.00%) 47.15 ( 0.76%) 47.98 ( -0.99%) 48.12 ( -1.28%) 49.56 ( -4.31%) 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 User 46334.60 46391.94 44383.95 43971.89 44372.12 System 252.84 284.66 252.61 201.24 249.00 Elapsed 1062.14 1050.96 998.68 1000.94 1026.78 Overall the system CPU usage is comparable and the test is naturally a bit variable. The slowing of the scanner hurts numa01 but on this machine it is an adverse workload and patches that dramatically help it often hurt absolutely everything else. Due to patch 2, the fault activity is interesting 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Minor Faults 2097811 2656646 2597249 1981230 1636841 Major Faults 362 450 365 364 365 Note the impact preserving the write bit across protection updates and fault reduces faults. NUMA alloc hit 1229008 1217015 1191660 1178322 1199681 NUMA alloc miss 0 0 0 0 0 NUMA interleave hit 0 0 0 0 0 NUMA alloc local 1228514 1216317 1190871 1177448 1199021 NUMA base PTE updates 245706197 240041607 238195516 244704842 115012800 NUMA huge PMD updates 479530 468448 464868 477573 224487 NUMA page range updates 491225557 479886983 476207932 489222218 229950144 NUMA hint faults 659753 656503 641678 656926 294842 NUMA hint local faults 381604 373963 360478 337585 186249 NUMA hint local percent 57 56 56 51 63 NUMA pages migrated 5412140 6374899 6266530 5277468 5755096 AutoNUMA cost 5121% 5083% 4994% 5097% 2388% Here the impact of slowing the PTE scanner on migratrion failures is obvious as "NUMA base PTE updates" and "NUMA huge PMD updates" are massively reduced even though the headline performance is very similar. As xfsrepair was the reported workload here is the impact of the series on it. xfsrepair 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanilla vmwrite-v5r8 preserve-v5r8 slowscan-v5r8 Min real-fsmark 1183.29 ( 0.00%) 1165.73 ( 1.48%) 1152.78 ( 2.58%) 1153.64 ( 2.51%) 1177.62 ( 0.48%) Min syst-fsmark 4107.85 ( 0.00%) 4027.75 ( 1.95%) 3986.74 ( 2.95%) 3979.16 ( 3.13%) 4048.76 ( 1.44%) Min real-xfsrepair 441.51 ( 0.00%) 463.96 ( -5.08%) 449.50 ( -1.81%) 440.08 ( 0.32%) 439.87 ( 0.37%) Min syst-xfsrepair 195.76 ( 0.00%) 278.47 (-42.25%) 262.34 (-34.01%) 203.70 ( -4.06%) 143.64 ( 26.62%) Amean real-fsmark 1188.30 ( 0.00%) 1177.34 ( 0.92%) 1157.97 ( 2.55%) 1158.21 ( 2.53%) 1182.22 ( 0.51%) Amean syst-fsmark 4111.37 ( 0.00%) 4055.70 ( 1.35%) 3987.19 ( 3.02%) 3998.72 ( 2.74%) 4061.69 ( 1.21%) Amean real-xfsrepair 450.88 ( 0.00%) 468.32 ( -3.87%) 454.14 ( -0.72%) 442.36 ( 1.89%) 440.59 ( 2.28%) Amean syst-xfsrepair 199.66 ( 0.00%) 290.60 (-45.55%) 277.20 (-38.84%) 204.68 ( -2.51%) 150.55 ( 24.60%) Stddev real-fsmark 4.12 ( 0.00%) 10.82 (-162.29%) 4.14 ( -0.28%) 5.98 (-45.05%) 4.60 (-11.53%) Stddev syst-fsmark 2.63 ( 0.00%) 20.32 (-671.82%) 0.37 ( 85.89%) 16.47 (-525.59%) 15.05 (-471.79%) Stddev real-xfsrepair 6.87 ( 0.00%) 4.55 ( 33.75%) 3.46 ( 49.58%) 1.78 ( 74.12%) 0.52 ( 92.50%) Stddev syst-xfsrepair 3.02 ( 0.00%) 10.30 (-241.37%) 13.17 (-336.37%) 0.71 ( 76.63%) 5.00 (-65.61%) CoeffVar real-fsmark 0.35 ( 0.00%) 0.92 (-164.73%) 0.36 ( -2.91%) 0.52 (-48.82%) 0.39 (-12.10%) CoeffVar syst-fsmark 0.06 ( 0.00%) 0.50 (-682.41%) 0.01 ( 85.45%) 0.41 (-543.22%) 0.37 (-478.78%) CoeffVar real-xfsrepair 1.52 ( 0.00%) 0.97 ( 36.21%) 0.76 ( 49.94%) 0.40 ( 73.62%) 0.12 ( 92.33%) CoeffVar syst-xfsrepair 1.51 ( 0.00%) 3.54 (-134.54%) 4.75 (-214.31%) 0.34 ( 77.20%) 3.32 (-119.63%) Max real-fsmark 1193.39 ( 0.00%) 1191.77 ( 0.14%) 1162.90 ( 2.55%) 1166.66 ( 2.24%) 1188.50 ( 0.41%) Max syst-fsmark 4114.18 ( 0.00%) 4075.45 ( 0.94%) 3987.65 ( 3.08%) 4019.45 ( 2.30%) 4082.80 ( 0.76%) Max real-xfsrepair 457.80 ( 0.00%) 474.60 ( -3.67%) 457.82 ( -0.00%) 444.42 ( 2.92%) 441.03 ( 3.66%) Max syst-xfsrepair 203.11 ( 0.00%) 303.65 (-49.50%) 294.35 (-44.92%) 205.33 ( -1.09%) 155.28 ( 23.55%) The really relevant lines as syst-xfsrepair which is the system CPU usage when running xfsrepair. Note that on my machine the overhead was 45% higher on 4.0-rc4 which may be part of what Dave is seeing. Once we preserve the write bit across faults, it's only 2.51% higher on average. With the full series applied, system CPU usage is 24.6% lower on average. Again, the impact of preserving the write bit on minor faults is obvious and the impact of slowing scanning after migration failures is obvious on the PTE updates. Note also that the number of pages migrated is much reduced even though the headline performance is comparable. 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Minor Faults 153466827 254507978 249163829 153501373 105737890 Major Faults 610 702 690 649 724 NUMA base PTE updates 217735049 210756527 217729596 216937111 144344993 NUMA huge PMD updates 129294 85044 106921 127246 79887 NUMA pages migrated 21938995 29705270 28594162 22687324 16258075 3.19.0 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 4.0.0-rc4 vanilla vanillavmwrite-v5r8preserve-v5r8slowscan-v5r8 Mean sdb-avgqusz 13.47 2.54 2.55 2.47 2.49 Mean sdb-avgrqsz 202.32 140.22 139.50 139.02 138.12 Mean sdb-await 25.92 5.09 5.33 5.02 5.22 Mean sdb-r_await 4.71 0.19 0.83 0.51 0.11 Mean sdb-w_await 104.13 5.21 5.38 5.05 5.32 Mean sdb-svctm 0.59 0.13 0.14 0.13 0.14 Mean sdb-rrqm 0.16 0.00 0.00 0.00 0.00 Mean sdb-wrqm 3.59 1799.43 1826.84 1812.21 1785.67 Max sdb-avgqusz 111.06 12.13 14.05 11.66 15.60 Max sdb-avgrqsz 255.60 190.34 190.01 187.33 191.78 Max sdb-await 168.24 39.28 49.22 44.64 65.62 Max sdb-r_await 660.00 52.00 280.00 76.00 12.00 Max sdb-w_await 7804.00 39.28 49.22 44.64 65.62 Max sdb-svctm 4.00 2.82 2.86 1.98 2.84 Max sdb-rrqm 8.30 0.00 0.00 0.00 0.00 Max sdb-wrqm 34.20 5372.80 5278.60 5386.60 5546.15 FWIW, I also checked SPECjbb in different configurations but it's similar observations -- minor faults lower, PTE update activity lower and performance is roughly comparable against 3.19. This patch (of 3): Threads that share writable data within pages are grouped together as related tasks. This decision is based on whether the PTE is marked dirty which is subject to timing races between the PTE scanner update and when the application writes the page. If the page is file-backed, then background flushes and sync also affect placement. This is unpredictable behaviour which is impossible to reason about so this patch makes grouping decisions based on the VMA flags. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 13 ++----------- mm/memory.c | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 626e93db28ba..2f12e9fcf1a2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1291,17 +1291,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, flags |= TNF_FAULT_LOCAL; } - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - * - * FIXME! This checks "pmd_dirty()" as an approximation of - * "is this a read-only page", since checking "pmd_write()" - * is even more broken. We haven't actually turned this into - * a writable page, so pmd_write() will always be false. - */ - if (!pmd_dirty(pmd)) + /* See similar comment in do_numa_page for explanation */ + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* diff --git a/mm/memory.c b/mm/memory.c index 411144f977b1..20beb6647dba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3069,16 +3069,19 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. * - * FIXME! This checks "pmd_dirty()" as an approximation of - * "is this a read-only page", since checking "pmd_write()" - * is even more broken. We haven't actually turned this into - * a writable page, so pmd_write() will always be false. + * TODO: Note that the ideal here would be to avoid a situation where a + * NUMA fault is taken immediately followed by a write fault in + * some cases which would have lower overhead overall but would be + * invasive as the fault paths would need to be unified. */ - if (!pte_dirty(pte)) + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* -- cgit v1.2.3 From b191f9b106ea1a24a711dbebb2925d3313da5852 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:40 -0700 Subject: mm: numa: preserve PTE write permissions across a NUMA hinting fault Protecting a PTE to trap a NUMA hinting fault clears the writable bit and further faults are needed after trapping a NUMA hinting fault to set the writable bit again. This patch preserves the writable bit when trapping NUMA hinting faults. The impact is obvious from the number of minor faults trapped during the basis balancing benchmark and the system CPU usage; autonumabench 4.0.0-rc4 4.0.0-rc4 baseline preserve Time System-NUMA01 107.13 ( 0.00%) 103.13 ( 3.73%) Time System-NUMA01_THEADLOCAL 131.87 ( 0.00%) 83.30 ( 36.83%) Time System-NUMA02 8.95 ( 0.00%) 10.72 (-19.78%) Time System-NUMA02_SMT 4.57 ( 0.00%) 3.99 ( 12.69%) Time Elapsed-NUMA01 515.78 ( 0.00%) 517.26 ( -0.29%) Time Elapsed-NUMA01_THEADLOCAL 384.10 ( 0.00%) 384.31 ( -0.05%) Time Elapsed-NUMA02 48.86 ( 0.00%) 48.78 ( 0.16%) Time Elapsed-NUMA02_SMT 47.98 ( 0.00%) 48.12 ( -0.29%) 4.0.0-rc4 4.0.0-rc4 baseline preserve User 44383.95 43971.89 System 252.61 201.24 Elapsed 998.68 1000.94 Minor Faults 2597249 1981230 Major Faults 365 364 There is a similar drop in system CPU usage using Dave Chinner's xfsrepair workload 4.0.0-rc4 4.0.0-rc4 baseline preserve Amean real-xfsrepair 454.14 ( 0.00%) 442.36 ( 2.60%) Amean syst-xfsrepair 277.20 ( 0.00%) 204.68 ( 26.16%) The patch looks hacky but the alternatives looked worse. The tidest was to rewalk the page tables after a hinting fault but it was more complex than this approach and the performance was worse. It's not generally safe to just mark the page writable during the fault if it's a write fault as it may have been read-only for COW so that approach was discarded. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 9 ++++++++- mm/memory.c | 8 +++----- mm/mprotect.c | 3 +++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f12e9fcf1a2..0a42d1521aa4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + bool was_writable; int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -1354,7 +1355,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); + was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + if (was_writable) + pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); @@ -1478,6 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; /* @@ -1493,9 +1498,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (!prot_numa || !pmd_protnone(*pmd)) { entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); + BUG_ON(!preserve_write && pmd_write(entry)); } spin_unlock(ptl); } diff --git a/mm/memory.c b/mm/memory.c index 20beb6647dba..d20e12da3a3c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int last_cpupid; int target_nid; bool migrated = false; + bool was_writable = pte_write(pte); int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Make it present again */ pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3075,11 +3078,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * to it but pte_write gets cleared during protection updates and * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. - * - * TODO: Note that the ideal here would be to avoid a situation where a - * NUMA fault is taken immediately followed by a write fault in - * some cases which would have lower overhead overall but would be - * invasive as the fault paths would need to be unified. */ if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; diff --git a/mm/mprotect.c b/mm/mprotect.c index 44727811bf4c..88584838e704 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + if (preserve_write) + ptent = pte_mkwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && -- cgit v1.2.3 From 074c238177a75f5e79af3b2cb6a84e54823ef950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:42 -0700 Subject: mm: numa: slow PTE scan rate if migration failures occur Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++---- kernel/sched/fair.c | 8 ++++++-- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..a419b65770d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1625,11 +1625,11 @@ struct task_struct { /* * numa_faults_locality tracks if faults recorded during the last - * scan window were remote/local. The task scan period is adapted - * based on the locality of the faults with different weights - * depending on whether they were shared or private faults + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults */ - unsigned long numa_faults_locality[2]; + unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1719,6 +1719,7 @@ struct task_struct { #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 #define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a42d1521aa4..51b3e7c64622 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: diff --git a/mm/memory.c b/mm/memory.c index d20e12da3a3c..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) -- cgit v1.2.3 From b7b04004ecd9e58cdc6c6ff92f251d5ac5c0adb2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:45 -0700 Subject: mm: numa: mark huge PTEs young when clearing NUMA hinting faults Base PTEs are marked young when the NUMA hinting information is cleared but the same does not happen for huge pages which this patch addresses. Note that migrated pages are not marked young as the base page migration code does not assume that migrated pages have been referenced. This could be addressed but beyond the scope of this series which is aimed at Dave Chinners shrink workload that is unlikely to be affected by this issue. Signed-off-by: Mel Gorman Cc: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 51b3e7c64622..6817b0350c71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1358,6 +1358,7 @@ clear_pmdnuma: BUG_ON(!PageLocked(page)); was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); -- cgit v1.2.3