author     Linus Torvalds <torvalds@linux-foundation.org>  2020-10-16 11:31:55 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-10-16 11:31:55 -0700
commit     c4cf498dc0241fa2d758dba177634268446afb06
tree       77e86dd4f211c6681ff9fbab481295732dc23422 /fs
parent     9ff9b0d392ea08090cd1780fb196f36dbb586529
parent     4d0e9df5e43dba52d38b251e3b909df8fa1110be
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"155 patches.
Subsystems affected by this patch series: mm (dax, debug, thp,
readahead, page-poison, util, memory-hotplug, zram, cleanups), misc,
core-kernel, get_maintainer, MAINTAINERS, lib, bitops, checkpatch,
binfmt, ramfs, autofs, nilfs, rapidio, panic, relay, kgdb, ubsan,
romfs, and fault-injection"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (155 commits)
lib, uaccess: add failure injection to usercopy functions
lib, include/linux: add usercopy failure capability
ROMFS: support inode blocks calculation
ubsan: introduce CONFIG_UBSAN_LOCAL_BOUNDS for Clang
sched.h: drop in_ubsan field when UBSAN is in trap mode
scripts/gdb/tasks: add headers and improve spacing format
scripts/gdb/proc: add struct mount & struct super_block addr in lx-mounts command
kernel/relay.c: drop unneeded initialization
panic: dump registers on panic_on_warn
rapidio: fix the missed put_device() for rio_mport_add_riodev
rapidio: fix error handling path
nilfs2: fix some kernel-doc warnings for nilfs2
autofs: harden ioctl table
ramfs: fix nommu mmap with gaps in the page cache
mm: remove the now-unnecessary mmget_still_valid() hack
mm/gup: take mmap_lock in get_dump_page()
binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot
coredump: rework elf/elf_fdpic vma_dump_size() into common helper
coredump: refactor page range dumping into common helper
coredump: let dump_emit() bail out on short writes
...
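The bulk of the fs/ churn in this merge comes from the coredump rework visible below: rather than walking current->mm's live VMA list while writing the core file, both ELF dumpers now take a snapshot of per-VMA metadata (start, end, flags, dump size) under mmap_lock via dump_vma_snapshot(), then write the dump from that snapshot. As a minimal userspace sketch of the same snapshot-then-iterate pattern (the region types and lock are hypothetical stand-ins for vm_area_struct and core_vma_metadata, not kernel code):

#include <pthread.h>
#include <stdlib.h>

/* Hypothetical stand-ins for vm_area_struct / core_vma_metadata. */
struct region {
	unsigned long start, end, flags;
	struct region *next;
};

struct region_meta {
	unsigned long start, end, flags;
	unsigned long dump_size;
};

/*
 * Copy the fields the dumper needs while holding the lock, so the dump
 * loop can run (and block in I/O) without pinning the lock or chasing
 * a list that may change underneath it.
 */
static struct region_meta *snapshot_regions(struct region *head,
					    pthread_mutex_t *lock, int *count)
{
	struct region_meta *meta;
	struct region *r;
	int n = 0, i = 0;

	pthread_mutex_lock(lock);
	for (r = head; r; r = r->next)
		n++;
	meta = calloc(n, sizeof(*meta));
	if (meta) {
		for (r = head; r; r = r->next, i++) {
			meta[i].start = r->start;
			meta[i].end = r->end;
			meta[i].flags = r->flags;
			/* dump policy (cf. vma_dump_size()) would go here */
			meta[i].dump_size = r->end - r->start;
		}
	}
	pthread_mutex_unlock(lock);

	*count = meta ? n : 0;
	return meta;
}

Once the snapshot exists, the writer iterates meta[0..count) with no lock held, which is how elf_core_dump() and elf_fdpic_core_dump() consume vma_meta after this series.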
Diffstat (limited to 'fs')
-rw-r--r--  fs/autofs/dev-ioctl.c  |   8
-rw-r--r--  fs/binfmt_elf.c        | 263
-rw-r--r--  fs/binfmt_elf_fdpic.c  | 162
-rw-r--r--  fs/configfs/dir.c      |   2
-rw-r--r--  fs/configfs/file.c     |   2
-rw-r--r--  fs/coredump.c          | 236
-rw-r--r--  fs/ext4/verity.c       |   4
-rw-r--r--  fs/f2fs/verity.c       |   4
-rw-r--r--  fs/inode.c             |   2
-rw-r--r--  fs/nilfs2/bmap.c       |   2
-rw-r--r--  fs/nilfs2/cpfile.c     |   6
-rw-r--r--  fs/nilfs2/page.c       |   1
-rw-r--r--  fs/nilfs2/sufile.c     |   4
-rw-r--r--  fs/proc/task_mmu.c     |  18
-rw-r--r--  fs/ramfs/file-nommu.c  |   2
-rw-r--r--  fs/romfs/super.c       |   1
-rw-r--r--  fs/userfaultfd.c       |  28
17 files changed, 330 insertions, 415 deletions
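The first hunk below is a Spectre-v1 hardening of autofs's ioctl dispatch: after the architectural bounds check, the table index is clamped with array_index_nospec() so a mispredicted branch cannot speculatively read past _ioctls[]. The kernel's generic fallback builds that clamp as a branchless mask; here is a rough userspace approximation of the idea (the mask trick assumes index and size stay well below LONG_MAX, as the kernel's version does too, and relies on arithmetic right shift of negative values, which common compilers provide):

#include <assert.h>
#include <stdio.h>

/*
 * Branchless clamp in the spirit of the kernel's generic
 * array_index_mask_nospec(): evaluates to all-ones when index < size
 * and to 0 otherwise, with no conditional branch to mispredict.
 */
static unsigned long index_mask(unsigned long index, unsigned long size)
{
	return ~(long)(index | (size - index - 1)) >> (sizeof(long) * 8 - 1);
}

static const char *table[] = { "version", "protover", "protosubver" };

static const char *lookup(unsigned long idx)
{
	unsigned long size = sizeof(table) / sizeof(table[0]);

	if (idx >= size)                 /* architectural bounds check ... */
		return NULL;
	idx &= index_mask(idx, size);    /* ... plus the speculation clamp */
	return table[idx];
}

int main(void)
{
	assert(lookup(99) == NULL);
	printf("%s\n", lookup(2));       /* prints "protosubver" */
	return 0;
}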
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 75105f45c51a..322b7dfb4ea0 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -8,6 +8,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> +#include <linux/nospec.h> #include "autofs_i.h" @@ -563,7 +564,7 @@ out: static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { - static ioctl_fn _ioctls[] = { + static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, @@ -581,7 +582,10 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd) }; unsigned int idx = cmd_idx(cmd); - return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx]; + if (idx >= ARRAY_SIZE(_ioctls)) + return NULL; + idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); + return _ioctls[idx]; } /* ioctl dispatcher */ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 13d053982dd7..e7e9d0cde51a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> @@ -421,6 +422,26 @@ static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) return 0; } +static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) +{ + unsigned long alignment = 0; + int i; + + for (i = 0; i < nr; i++) { + if (cmds[i].p_type == PT_LOAD) { + unsigned long p_align = cmds[i].p_align; + + /* skip non-power of two alignments as invalid */ + if (!is_power_of_2(p_align)) + continue; + alignment = max(alignment, p_align); + } + } + + /* ensure we align to at least one page */ + return ELF_PAGEALIGN(alignment); +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -1008,6 +1029,7 @@ out_free_interp: int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; + unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -1086,6 +1108,9 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment) + load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1389,126 +1414,6 @@ out: * Jeremy Fitzhardinge <jeremy@sw.oz.au> */ -/* - * The purpose of always_dump_vma() is to make sure that special kernel mappings - * that are useful for post-mortem analysis are included in every core dump. - * In that way we ensure that the core dump is fully interpretable later - * without matching up the same kernel and hardware config to see what PC values - * meant. These special mappings include - vDSO, vsyscall, and other - * architecture specific mappings - */ -static bool always_dump_vma(struct vm_area_struct *vma) -{ - /* Any vsyscall mappings? */ - if (vma == get_gate_vma(vma->vm_mm)) - return true; - - /* - * Assume that all vmas with a .name op should always be dumped. - * If this changes, a new vm_ops field can easily be added. - */ - if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) - return true; - - /* - * arch_vma_name() returns non-NULL for special architecture mappings, - * such as vDSO sections. - */ - if (arch_vma_name(vma)) - return true; - - return false; -} - -/* - * Decide what to dump of a segment, part, all or none. 
- */ -static unsigned long vma_dump_size(struct vm_area_struct *vma, - unsigned long mm_flags) -{ -#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) - - /* always dump the vdso and vsyscall sections */ - if (always_dump_vma(vma)) - goto whole; - - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - /* support for DAX */ - if (vma_is_dax(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) - goto whole; - return 0; - } - - /* Hugetlb memory check */ - if (is_vm_hugetlb_page(vma)) { - if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) - goto whole; - if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) - goto whole; - return 0; - } - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) - return 0; - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0 ? - FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) - goto whole; - return 0; - } - - /* Dump segments that have been written to. */ - if (vma->anon_vma && FILTER(ANON_PRIVATE)) - goto whole; - if (vma->vm_file == NULL) - return 0; - - if (FILTER(MAPPED_PRIVATE)) - goto whole; - - /* - * If this looks like the beginning of a DSO or executable mapping, - * check for an ELF header. If we find one, dump the first page to - * aid in determining what was mapped here. - */ - if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { - u32 __user *header = (u32 __user *) vma->vm_start; - u32 word; - /* - * Doing it this way gets the constant folded by GCC. - */ - union { - u32 cmp; - char elfmag[SELFMAG]; - } magic; - BUILD_BUG_ON(SELFMAG != sizeof word); - magic.elfmag[EI_MAG0] = ELFMAG0; - magic.elfmag[EI_MAG1] = ELFMAG1; - magic.elfmag[EI_MAG2] = ELFMAG2; - magic.elfmag[EI_MAG3] = ELFMAG3; - if (unlikely(get_user(word, header))) - word = 0; - if (word == magic.cmp) - return PAGE_SIZE; - } - -#undef FILTER - - return 0; - -whole: - return vma->vm_end - vma->vm_start; -} - /* An ELF note in memory */ struct memelfnote { @@ -2220,32 +2125,6 @@ static void free_note_info(struct elf_note_info *info) #endif -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} -/* - * Helper function for iterating across a vma list. It ensures that the caller - * will visit `gate_vma' prior to terminating the search. 
- */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) - return NULL; - return gate_vma; -} - static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, elf_addr_t e_shoff, int segs) { @@ -2272,9 +2151,8 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs, i; - size_t vma_data_size = 0; - struct vm_area_struct *vma, *gate_vma; + int vma_count, segs, i; + size_t vma_data_size; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2282,30 +2160,16 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - elf_addr_t *vma_filesz = NULL; + struct core_vma_metadata *vma_meta; + + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + return 0; - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. - */ - /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); - - gate_vma = get_gate_vma(current->mm); - if (gate_vma != NULL) - segs++; + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2343,24 +2207,6 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - /* - * Zero vma process will get ZERO_SIZE_PTR here. - * Let coredump continue for register state at least. - */ - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), - GFP_KERNEL); - if (!vma_filesz) - goto end_coredump; - - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long dump_size; - - dump_size = vma_dump_size(vma, cprm->mm_flags); - vma_filesz[i++] = dump_size; - vma_data_size += dump_size; - } - offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2381,21 +2227,23 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = vma_filesz[i++]; - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = meta->dump_size; + phdr.p_memsz = meta->end - meta->start; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -2417,28 +2265,11 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma)) { - unsigned long addr; - unsigned long end; + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - end = vma->vm_start + vma_filesz[i++]; - - for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { - struct page *page; - int stop; - - page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else - stop = !dump_skip(cprm, PAGE_SIZE); - if (stop) - goto end_coredump; - } + if (!dump_user_range(cprm, meta->start, meta->dump_size)) + goto end_coredump; } dump_truncate(cprm); @@ -2453,7 +2284,7 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_filesz); + kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 50f845702b92..be4062b8ba75 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1215,76 +1215,6 @@ struct elf_prstatus_fdpic int pr_fpvalid; /* True if math co-processor being used. */ }; -/* - * Decide whether a segment is worth dumping; default is yes to be - * sure (missing info is worse than too much; etc). - * Personally I'd include everything, and use the coredump limit... - * - * I think we should skip something. But I am not sure how. H.J. - */ -static int maydump(struct vm_area_struct *vma, unsigned long mm_flags) -{ - int dump_ok; - - /* Do not dump I/O mapped devices or special mappings */ - if (vma->vm_flags & VM_IO) { - kdcore("%08lx: %08lx: no (IO)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* If we may not read the contents, don't allow us to dump - * them either. "dump_write()" can't handle it anyway. - */ - if (!(vma->vm_flags & VM_READ)) { - kdcore("%08lx: %08lx: no (!read)", vma->vm_start, vma->vm_flags); - return 0; - } - - /* support for DAX */ - if (vma_is_dax(vma)) { - if (vma->vm_flags & VM_SHARED) { - dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } else { - dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - } - return dump_ok; - } - - /* By default, dump shared memory if mapped from an anonymous file. */ - if (vma->vm_flags & VM_SHARED) { - if (file_inode(vma->vm_file)->i_nlink == 0) { - dump_ok = test_bit(MMF_DUMP_ANON_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } - - dump_ok = test_bit(MMF_DUMP_MAPPED_SHARED, &mm_flags); - kdcore("%08lx: %08lx: %s (share)", vma->vm_start, - vma->vm_flags, dump_ok ? 
"yes" : "no"); - return dump_ok; - } - -#ifdef CONFIG_MMU - /* By default, if it hasn't been written to, don't write it out */ - if (!vma->anon_vma) { - dump_ok = test_bit(MMF_DUMP_MAPPED_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s (!anon)", vma->vm_start, - vma->vm_flags, dump_ok ? "yes" : "no"); - return dump_ok; - } -#endif - - dump_ok = test_bit(MMF_DUMP_ANON_PRIVATE, &mm_flags); - kdcore("%08lx: %08lx: %s", vma->vm_start, vma->vm_flags, - dump_ok ? "yes" : "no"); - return dump_ok; -} - /* An ELF note in memory */ struct memelfnote { @@ -1524,54 +1454,21 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, /* * dump the segments for an MMU process */ -static bool elf_fdpic_dump_segments(struct coredump_params *cprm) +static bool elf_fdpic_dump_segments(struct coredump_params *cprm, + struct core_vma_metadata *vma_meta, + int vma_count) { - struct vm_area_struct *vma; + int i; - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { -#ifdef CONFIG_MMU - unsigned long addr; -#endif + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; - if (!maydump(vma, cprm->mm_flags)) - continue; - -#ifdef CONFIG_MMU - for (addr = vma->vm_start; addr < vma->vm_end; - addr += PAGE_SIZE) { - bool res; - struct page *page = get_dump_page(addr); - if (page) { - void *kaddr = kmap(page); - res = dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); - put_page(page); - } else { - res = dump_skip(cprm, PAGE_SIZE); - } - if (!res) - return false; - } -#else - if (!dump_emit(cprm, (void *) vma->vm_start, - vma->vm_end - vma->vm_start)) + if (!dump_user_range(cprm, meta->start, meta->dump_size)) return false; -#endif } return true; } -static size_t elf_core_vma_data_size(unsigned long mm_flags) -{ - struct vm_area_struct *vma; - size_t size = 0; - - for (vma = current->mm->mmap; vma; vma = vma->vm_next) - if (maydump(vma, mm_flags)) - size += vma->vm_end - vma->vm_start; - return size; -} - /* * Actual dumper * @@ -1582,9 +1479,8 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags) static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int segs; + int vma_count, segs; int i; - struct vm_area_struct *vma; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; struct memelfnote psinfo_note, auxv_note; @@ -1598,18 +1494,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - - /* - * We no longer stop all VM operations. - * - * This is because those proceses that could possibly change map_count - * or the mmap / vma pages are now blocked in do_exit on current - * finishing this core dump. - * - * Only ptrace can touch these memory addresses, but it doesn't change - * the map_count or the pages allocated. So no possibility of crashing - * exists while dumping the mm->vm_next areas to the core file. 
- */ + struct core_vma_metadata *vma_meta = NULL; + size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1619,6 +1505,9 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) + goto end_coredump; + for (ct = current->mm->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1638,8 +1527,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = current->mm->map_count; - segs += elf_core_extra_phdrs(); + segs = vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1684,7 +1572,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += elf_core_vma_data_size(cprm->mm_flags); + offset += vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1704,23 +1592,26 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { + for (i = 0; i < vma_count; i++) { + struct core_vma_metadata *meta = vma_meta + i; struct elf_phdr phdr; size_t sz; - sz = vma->vm_end - vma->vm_start; + sz = meta->end - meta->start; phdr.p_type = PT_LOAD; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = meta->start; phdr.p_paddr = 0; - phdr.p_filesz = maydump(vma, cprm->mm_flags) ? sz : 0; + phdr.p_filesz = meta->dump_size; phdr.p_memsz = sz; offset += phdr.p_filesz; - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; - if (vma->vm_flags & VM_WRITE) + phdr.p_flags = 0; + if (meta->flags & VM_READ) + phdr.p_flags |= PF_R; + if (meta->flags & VM_WRITE) phdr.p_flags |= PF_W; - if (vma->vm_flags & VM_EXEC) + if (meta->flags & VM_EXEC) phdr.p_flags |= PF_X; phdr.p_align = ELF_EXEC_PAGESIZE; @@ -1752,7 +1643,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!dump_skip(cprm, dataoff - cprm->pos)) goto end_coredump; - if (!elf_fdpic_dump_segments(cprm)) + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1776,6 +1667,7 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } + kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ca2273727225..b0983e2a4e2c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1168,7 +1168,7 @@ EXPORT_SYMBOL(configfs_depend_item); /* * Release the dependent linkage. This is much simpler than - * configfs_depend_item() because we know that that the client driver is + * configfs_depend_item() because we know that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ void configfs_undepend_item(struct config_item *target) diff --git a/fs/configfs/file.c b/fs/configfs/file.c index fb65b706cc0d..1f0270229d7b 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -267,7 +267,7 @@ flush_write_buffer(struct file *file, struct configfs_buffer *buffer, size_t cou * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come * on the first write. 
- * Hint: if you're writing a value, first read the file, modify only the + * Hint: if you're writing a value, first read the file, modify only * the value you're changing, then write entire buffer back. */ diff --git a/fs/coredump.c b/fs/coredump.c index 76e7c10edfc0..0cd9056d79cc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -840,17 +840,17 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) ssize_t n; if (cprm->written + nr > cprm->limit) return 0; - while (nr) { - if (dump_interrupted()) - return 0; - n = __kernel_write(file, addr, nr, &pos); - if (n <= 0) - return 0; - file->f_pos = pos; - cprm->written += n; - cprm->pos += n; - nr -= n; - } + + + if (dump_interrupted()) + return 0; + n = __kernel_write(file, addr, nr, &pos); + if (n != nr) + return 0; + file->f_pos = pos; + cprm->written += n; + cprm->pos += n; + return 1; } EXPORT_SYMBOL(dump_emit); @@ -876,6 +876,40 @@ int dump_skip(struct coredump_params *cprm, size_t nr) } EXPORT_SYMBOL(dump_skip); +#ifdef CONFIG_ELF_CORE +int dump_user_range(struct coredump_params *cprm, unsigned long start, + unsigned long len) +{ + unsigned long addr; + + for (addr = start; addr < start + len; addr += PAGE_SIZE) { + struct page *page; + int stop; + + /* + * To avoid having to allocate page tables for virtual address + * ranges that have never been used yet, and also to make it + * easy to generate sparse core files, use a helper that returns + * NULL when encountering an empty page table entry that would + * otherwise have been filled with the zero page. + */ + page = get_dump_page(addr); + if (page) { + void *kaddr = kmap(page); + + stop = !dump_emit(cprm, kaddr, PAGE_SIZE); + kunmap(page); + put_page(page); + } else { + stop = !dump_skip(cprm, PAGE_SIZE); + } + if (stop) + return 0; + } + return 1; +} +#endif + int dump_align(struct coredump_params *cprm, int align) { unsigned mod = cprm->pos & (align - 1); @@ -902,3 +936,183 @@ void dump_truncate(struct coredump_params *cprm) } } EXPORT_SYMBOL(dump_truncate); + +/* + * The purpose of always_dump_vma() is to make sure that special kernel mappings + * that are useful for post-mortem analysis are included in every core dump. + * In that way we ensure that the core dump is fully interpretable later + * without matching up the same kernel and hardware config to see what PC values + * meant. These special mappings include - vDSO, vsyscall, and other + * architecture specific mappings + */ +static bool always_dump_vma(struct vm_area_struct *vma) +{ + /* Any vsyscall mappings? */ + if (vma == get_gate_vma(vma->vm_mm)) + return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + + /* + * arch_vma_name() returns non-NULL for special architecture mappings, + * such as vDSO sections. + */ + if (arch_vma_name(vma)) + return true; + + return false; +} + +/* + * Decide how much of @vma's contents should be included in a core dump. 
+ */ +static unsigned long vma_dump_size(struct vm_area_struct *vma, + unsigned long mm_flags) +{ +#define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) + + /* always dump the vdso and vsyscall sections */ + if (always_dump_vma(vma)) + goto whole; + + if (vma->vm_flags & VM_DONTDUMP) + return 0; + + /* support for DAX */ + if (vma_is_dax(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE)) + goto whole; + return 0; + } + + /* Hugetlb memory check */ + if (is_vm_hugetlb_page(vma)) { + if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED)) + goto whole; + if (!(vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_PRIVATE)) + goto whole; + return 0; + } + + /* Do not dump I/O mapped devices or special mappings */ + if (vma->vm_flags & VM_IO) + return 0; + + /* By default, dump shared memory if mapped from an anonymous file. */ + if (vma->vm_flags & VM_SHARED) { + if (file_inode(vma->vm_file)->i_nlink == 0 ? + FILTER(ANON_SHARED) : FILTER(MAPPED_SHARED)) + goto whole; + return 0; + } + + /* Dump segments that have been written to. */ + if ((!IS_ENABLED(CONFIG_MMU) || vma->anon_vma) && FILTER(ANON_PRIVATE)) + goto whole; + if (vma->vm_file == NULL) + return 0; + + if (FILTER(MAPPED_PRIVATE)) + goto whole; + + /* + * If this is the beginning of an executable file mapping, + * dump the first page to aid in determining what was mapped here. + */ + if (FILTER(ELF_HEADERS) && + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && + (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + +#undef FILTER + + return 0; + +whole: + return vma->vm_end - vma->vm_start; +} + +static struct vm_area_struct *first_vma(struct task_struct *tsk, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret = tsk->mm->mmap; + + if (ret) + return ret; + return gate_vma; +} + +/* + * Helper function for iterating across a vma list. It ensures that the caller + * will visit `gate_vma' prior to terminating the search. + */ +static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, + struct vm_area_struct *gate_vma) +{ + struct vm_area_struct *ret; + + ret = this_vma->vm_next; + if (ret) + return ret; + if (this_vma == gate_vma) + return NULL; + return gate_vma; +} + +/* + * Under the mmap_lock, take a snapshot of relevant information about the task's + * VMAs. + */ +int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, + struct core_vma_metadata **vma_meta, + size_t *vma_data_size_ptr) +{ + struct vm_area_struct *vma, *gate_vma; + struct mm_struct *mm = current->mm; + int i; + size_t vma_data_size = 0; + + /* + * Once the stack expansion code is fixed to not change VMA bounds + * under mmap_lock in read mode, this can be changed to take the + * mmap_lock in read mode. + */ + if (mmap_write_lock_killable(mm)) + return -EINTR; + + gate_vma = get_gate_vma(mm); + *vma_count = mm->map_count + (gate_vma ? 
1 : 0); + + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); + if (!*vma_meta) { + mmap_write_unlock(mm); + return -ENOMEM; + } + + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; + vma = next_vma(vma, gate_vma), i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + m->start = vma->vm_start; + m->end = vma->vm_end; + m->flags = vma->vm_flags; + m->dump_size = vma_dump_size(vma, cprm->mm_flags); + + vma_data_size += m->dump_size; + } + + mmap_write_unlock(mm); + + if (WARN_ON(i != *vma_count)) + return -EFAULT; + + *vma_data_size_ptr = vma_data_size; + return 0; +} diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index bbd5e7e0632b..5b7ba8f71153 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -349,6 +349,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -358,8 +359,7 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 9eb0dba851e8..054ec852b5ea 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -228,6 +228,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { + DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index); struct page *page; index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; @@ -237,8 +238,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, if (page) put_page(page); else if (num_ra_pages > 1) - page_cache_readahead_unbounded(inode->i_mapping, NULL, - index, num_ra_pages, 0); + page_cache_ra_unbounded(&ractl, num_ra_pages, 0); page = read_mapping_page(inode->i_mapping, index, NULL); } return page; diff --git a/fs/inode.c b/fs/inode.c index 72c4c347afb7..9d78c37b00b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; + if (sb->s_type->fs_flags & FS_THP_SUPPORT) + __set_bit(AS_THP_SUPPORT, &mapping->flags); mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); #ifdef CONFIG_READ_ONLY_THP_FOR_FS diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index e516ae389ca5..5900879d5693 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -355,7 +355,7 @@ void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap, /** * nilfs_bmap_assign - assign a new block number to a block * @bmap: bmap - * @bhp: pointer to buffer head + * @bh: pointer to buffer head * @blocknr: block number * @binfo: block information * diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 86d4d850d130..025fb082575a 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -889,7 +889,7 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) * nilfs_cpfile_change_cpmode - change checkpoint mode * @cpfile: inode of checkpoint file * @cno: checkpoint number - * @status: mode of checkpoint + * @mode: mode of checkpoint * * Description: nilfs_change_cpmode() changes the mode of the checkpoint * specified by @cno. The mode @mode is NILFS_CHECKPOINT or NILFS_SNAPSHOT. 
@@ -930,12 +930,12 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) /** * nilfs_cpfile_get_stat - get checkpoint statistics * @cpfile: inode of checkpoint file - * @stat: pointer to a structure of checkpoint statistics + * @cpstat: pointer to a structure of checkpoint statistics * * Description: nilfs_cpfile_get_stat() returns information about checkpoints. * * Return Value: On success, 0 is returned, and checkpoints information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @cpstat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b175f1330408..171fb5cd427f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -69,7 +69,6 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, /** * nilfs_forget_buffer - discard dirty state - * @inode: owner inode of the buffer * @bh: buffer head of the buffer to be discarded */ void nilfs_forget_buffer(struct buffer_head *bh) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 42ff67c0c14f..63722475e17e 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -546,13 +546,13 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, /** * nilfs_sufile_get_stat - get segment usage statistics * @sufile: inode of segment usage file - * @stat: pointer to a structure of segment usage statistics + * @sustat: pointer to a structure of segment usage statistics * * Description: nilfs_sufile_get_stat() returns information about segment * usage. * * Return Value: On success, 0 is returned, and segment usage information is - * stored in the place pointed by @stat. On error, one of the following + * stored in the place pointed by @sustat. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 846d43df3fdf..217aa2705d5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1244,24 +1244,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } - /* - * Avoid to modify vma->vm_flags - * without locked ops while the - * coredump reads the vm_flags. - */ - if (!mmget_still_valid(mm)) { - /* - * Silently return "count" - * like if get_task_mm() - * failed. FIXME: should this - * function have returned - * -ESRCH if get_task_mm() - * failed like if - * get_proc_task() fails? 
- */ - mmap_write_unlock(mm); - goto out_mm; - } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 414695454956..355523f4a4bf 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -224,7 +224,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e582d001f792..b1b7d3f5752f 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -356,6 +356,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) } i->i_mode = mode; + i->i_blocks = (i->i_size + 511) >> 9; unlock_new_inode(i); return i; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0e4a3837da52..000b457ad087 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -601,8 +601,6 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - /* no task can run (and in turn coredump) yet */ - VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -842,7 +840,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; - bool still_valid; WRITE_ONCE(ctx->released, true); @@ -858,7 +855,6 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_lock for writing. */ mmap_write_lock(mm); - still_valid = mmget_still_valid(mm); prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -869,17 +865,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); - if (still_valid) { - prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX); - if (prev) - vma = prev; - else - prev = vma; - } + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1309,8 +1303,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1511,8 +1503,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - if (!mmget_still_valid(mm)) - goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; |
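One more detail worth pulling out of the binfmt_elf.c hunk above: for ET_DYN binaries, the randomized load_bias is now rounded down to the largest power-of-two p_align among the PT_LOAD headers (maximum_alignment() skips non-power-of-two values as invalid), so a binary built with, say, 2 MiB segment alignment still lands on a 2 MiB boundary under ASLR. The round-down is the standard power-of-two mask, load_bias &= ~(alignment - 1); a tiny standalone demonstration with made-up numbers:

#include <stdio.h>

/* Round addr down to a power-of-two alignment, as load_bias &= ~(align - 1). */
static unsigned long align_down(unsigned long addr, unsigned long align)
{
	return addr & ~(align - 1);
}

int main(void)
{
	unsigned long load_bias = 0x55e3a1c45000UL; /* made-up randomized base */
	unsigned long p_align   = 0x200000UL;       /* 2 MiB from a PT_LOAD   */

	/* Prints 0x55e3a1c45000 -> 0x55e3a1c00000. */
	printf("%#lx -> %#lx\n", load_bias, align_down(load_bias, p_align));
	return 0;
}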