diff options
290 files changed, 6176 insertions, 3850 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9b0123388f18..94a24aedcdb2 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -51,6 +51,14 @@ Description: Controls the dirty page count condition for the in-place-update policies. +What: /sys/fs/f2fs/<disk>/min_seq_blocks +Date: August 2018 +Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> +Description: + Controls the dirty page count condition for batched sequential + writes in ->writepages. + + What: /sys/fs/f2fs/<disk>/min_hot_blocks Date: March 2017 Contact: "Jaegeuk Kim" <jaegeuk@kernel.org> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1746131bc9cb..184193bcb262 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom.group + A read-write single value file which exists on non-root + cgroups. The default value is "0". + + Determines whether the cgroup should be treated as + an indivisible workload by the OOM killer. If set, + all tasks belonging to the cgroup or to its descendants + (if the memory cgroup is not a leaf cgroup) are killed + together or not at all. This can be used to avoid + partial kills to guarantee workload integrity. + + Tasks with the OOM protection (oom_score_adj set to -1000) + are treated as an exception and are never killed. + + If the OOM killer is invoked in a cgroup, it's not going + to kill any tasks outside of this cgroup, regardless + memory.oom.group values of ancestor cgroups. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bffb0caa3693..970d837bd57f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3041,8 +3041,9 @@ on: enable the feature page_poison= [KNL] Boot-time parameter changing the state of - poisoning on the buddy allocator. - off: turn off poisoning + poisoning on the buddy allocator, available with + CONFIG_PAGE_POISONING=y. + off: turn off poisoning (default) on: turn on poisoning panic= [KNL] Kernel behaviour on panic: delay <timeout> diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 69f8de995739..e5edd29687b5 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,24 @@ data_flush Enable data flushing before checkpoint in order to persist data of regular and symlink. fault_injection=%d Enable fault injection in all supported types with specified injection rate. +fault_type=%d Support configuring fault injection type, should be + enabled with fault_injection option, fault type value + is shown below, it supports single or combined type. + Type_Name Type_Value + FAULT_KMALLOC 0x000000001 + FAULT_KVMALLOC 0x000000002 + FAULT_PAGE_ALLOC 0x000000004 + FAULT_PAGE_GET 0x000000008 + FAULT_ALLOC_BIO 0x000000010 + FAULT_ALLOC_NID 0x000000020 + FAULT_ORPHAN 0x000000040 + FAULT_BLOCK 0x000000080 + FAULT_DIR_DEPTH 0x000000100 + FAULT_EVICT_INODE 0x000000200 + FAULT_TRUNCATE 0x000000400 + FAULT_IO 0x000000800 + FAULT_CHECKPOINT 0x000001000 + FAULT_DISCARD 0x000002000 mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 1605acbb23b6..22b4b00dee31 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -870,6 +870,7 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +Percpu: 62080 kB HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB @@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free + Percpu: Memory allocated to the percpu allocator used to back percpu + allocations. This stat excludes the cost of metadata. .............................................................................. diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 59585030cbaf..37a679501ddc 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -38,6 +38,7 @@ show up in /proc/sys/kernel: - hung_task_panic - hung_task_check_count - hung_task_timeout_secs +- hung_task_check_interval_secs - hung_task_warnings - hyperv_record_panic_msg - kexec_load_disabled @@ -355,7 +356,7 @@ This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. hung_task_timeout_secs: -Check interval. When a task in D state did not get scheduled +When a task in D state did not get scheduled for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. @@ -364,6 +365,18 @@ Possible values to set are in range {0..LONG_MAX/HZ}. ============================================================== +hung_task_check_interval_secs: + +Hung task check interval. If hung task checking is enabled +(see hung_task_timeout_secs), the check is done every +hung_task_check_interval_secs seconds. +This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. + +0 (default): means use hung_task_timeout_secs as checking interval. +Possible values to set are in range {0..LONG_MAX/HZ}. + +============================================================== + hung_task_warnings: The maximum number of warnings to report. During a check interval @@ -451,7 +464,8 @@ Notes: 1) kernel doesn't guarantee, that new object will have desired id. So, it's up to userspace, how to handle an object with "wrong" id. 2) Toggle with non-default value will be set back to -1 by kernel after -successful IPC object allocation. +successful IPC object allocation. If an IPC object allocation syscall +fails, it is undefined if the value remains unmodified or is reset to -1. ============================================================== diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index e72853b6d725..7d73882e2c27 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -691,7 +691,7 @@ and don't use much of it. The default value is 0. See Documentation/vm/overcommit-accounting.rst and -mm/mmap.c::__vm_enough_memory() for more information. +mm/util.c::__vm_enough_memory() for more information. ============================================================== diff --git a/arch/Kconfig b/arch/Kconfig index c6148166a7b4..4426e9687d89 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -841,6 +841,16 @@ config REFCOUNT_FULL against various use-after-free conditions that can be used in security flaw exploits. +config HAVE_ARCH_PREL32_RELOCATIONS + bool + help + May be selected by an architecture if it supports place-relative + 32-bit relocations, both in the toolchain and in the module loader, + in which case relative references can be used in special sections + for PCI fixup, initcalls etc which are only half the size on 64 bit + architectures, and don't require runtime relocation on relocatable + kernels. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d9c299133111..82ab015bf42b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * atomic helpers. Insert it into the gate_vma so that it is visible * through ptrace and /proc/<pid>/mem. */ -static struct vm_area_struct gate_vma = { - .vm_start = 0xffff0000, - .vm_end = 0xffff0000 + PAGE_SIZE, - .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, -}; +static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { vma_init(&gate_vma, NULL); gate_vma.vm_page_prot = PAGE_READONLY_EXEC; + gate_vma.vm_start = 0xffff0000; + gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; + gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; return 0; } arch_initcall(gate_vma_init); diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d0a53cc6293a..29e75b47becd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -108,6 +108,7 @@ config ARM64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT + select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_STACKLEAK select HAVE_ARCH_THREAD_STRUCT_WHITELIST diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 5e89d40be8cd..0b334b671e90 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -16,6 +16,7 @@ config H8300 select OF_IRQ select OF_EARLY_FLATTREE select HAVE_MEMBLOCK + select NO_BOOTMEM select TIMER_OF select H8300_TMR8 select HAVE_KERNEL_GZIP diff --git a/arch/h8300/Makefile b/arch/h8300/Makefile index e1c02ca230cb..cc12b162c222 100644 --- a/arch/h8300/Makefile +++ b/arch/h8300/Makefile @@ -8,6 +8,8 @@ # (C) Copyright 2002-2015 Yoshinori Sato <ysato@users.sourceforge.jp> # +KBUILD_DEFCONFIG := edosk2674_defconfig + cflags-$(CONFIG_CPU_H8300H) := -mh aflags-$(CONFIG_CPU_H8300H) := -mh -Wa,--mach=h8300h ldflags-$(CONFIG_CPU_H8300H) := -mh8300helf_linux @@ -22,6 +24,8 @@ KBUILD_CFLAGS += -DUTS_SYSNAME=\"uClinux\" KBUILD_AFLAGS += $(aflags-y) LDFLAGS += $(ldflags-y) +CHECKFLAGS += -msize-long + ifeq ($(CROSS_COMPILE),) CROSS_COMPILE := h8300-unknown-linux- endif diff --git a/arch/h8300/boot/dts/h8300h_sim.dts b/arch/h8300/boot/dts/h8300h_sim.dts index f1c31cecdec8..595398b9d018 100644 --- a/arch/h8300/boot/dts/h8300h_sim.dts +++ b/arch/h8300/boot/dts/h8300h_sim.dts @@ -73,7 +73,7 @@ timer16: timer@ffff68 { compatible = "renesas,16bit-timer"; reg = <0xffff68 8>, <0xffff60 8>; - interrupts = <24 0>; + interrupts = <26 0>; renesas,channel = <0>; clocks = <&fclk>; clock-names = "fck"; diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h index ea0cb0cf6a8b..647a83bd40b7 100644 --- a/arch/h8300/include/asm/bitops.h +++ b/arch/h8300/include/asm/bitops.h @@ -29,11 +29,11 @@ static inline unsigned long ffz(unsigned long word) result = -1; __asm__("1:\n\t" - "shlr.l %2\n\t" + "shlr.l %1\n\t" "adds #1,%0\n\t" "bcs 1b" - : "=r"(result) - : "0"(result), "r"(word)); + : "=r"(result),"=r"(word) + : "0"(result), "1"(word)); return result; } @@ -66,7 +66,7 @@ H8300_GEN_BITOP(change_bit, "bnot") #undef H8300_GEN_BITOP -static inline int test_bit(int nr, const unsigned long *addr) +static inline int test_bit(int nr, const volatile unsigned long *addr) { int ret = 0; unsigned char *b_addr; @@ -162,11 +162,11 @@ static inline unsigned long __ffs(unsigned long word) result = -1; __asm__("1:\n\t" - "shlr.l %2\n\t" + "shlr.l %1\n\t" "adds #1,%0\n\t" "bcc 1b" - : "=r" (result) - : "0"(result), "r"(word)); + : "=r" (result),"=r"(word) + : "0"(result), "1"(word)); return result; } diff --git a/arch/h8300/include/asm/ptrace.h b/arch/h8300/include/asm/ptrace.h index 313cafa85380..66d383848ff1 100644 --- a/arch/h8300/include/asm/ptrace.h +++ b/arch/h8300/include/asm/ptrace.h @@ -4,6 +4,8 @@ #include <uapi/asm/ptrace.h> +struct task_struct; + #ifndef __ASSEMBLY__ #ifndef PS_S #define PS_S (0x10) diff --git a/arch/h8300/kernel/kgdb.c b/arch/h8300/kernel/kgdb.c index 602e478afbd5..1a1d30cb0609 100644 --- a/arch/h8300/kernel/kgdb.c +++ b/arch/h8300/kernel/kgdb.c @@ -129,7 +129,7 @@ void kgdb_arch_exit(void) /* Nothing to do */ } -const struct kgdb_arch arch_kgdb_ops = { +struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: trapa #2 */ .gdb_bpt_instr = { 0x57, 0x20 }, }; diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c index a4d0470c10a9..34e2df5c0d6d 100644 --- a/arch/h8300/kernel/setup.c +++ b/arch/h8300/kernel/setup.c @@ -23,7 +23,6 @@ #include <linux/init.h> #include <linux/of.h> #include <linux/of_fdt.h> -#include <linux/of_platform.h> #include <linux/of_address.h> #include <linux/clk-provider.h> #include <linux/memblock.h> @@ -71,10 +70,6 @@ void __init h8300_fdt_init(void *fdt, char *bootargs) static void __init bootmem_init(void) { - int bootmap_size; - unsigned long ram_start_pfn; - unsigned long free_ram_start_pfn; - unsigned long ram_end_pfn; struct memblock_region *region; memory_end = memory_start = 0; @@ -88,33 +83,17 @@ static void __init bootmem_init(void) if (!memory_end) panic("No memory!"); - ram_start_pfn = PFN_UP(memory_start); - /* free_ram_start_pfn is first page after kernel */ - free_ram_start_pfn = PFN_UP(__pa(_end)); - ram_end_pfn = PFN_DOWN(memblock_end_of_DRAM()); + /* setup bootmem globals (we use no_bootmem, but mm still depends on this) */ + min_low_pfn = PFN_UP(memory_start); + max_low_pfn = PFN_DOWN(memblock_end_of_DRAM()); + max_pfn = max_low_pfn; - max_pfn = ram_end_pfn; + memblock_reserve(__pa(_stext), _end - _stext); - /* - * give all the memory to the bootmap allocator, tell it to put the - * boot mem_map at the start of memory - */ - bootmap_size = init_bootmem_node(NODE_DATA(0), - free_ram_start_pfn, - 0, - ram_end_pfn); - /* - * free the usable memory, we have to make sure we do not free - * the bootmem bitmap so we then reserve it after freeing it :-) - */ - free_bootmem(PFN_PHYS(free_ram_start_pfn), - (ram_end_pfn - free_ram_start_pfn) << PAGE_SHIFT); - reserve_bootmem(PFN_PHYS(free_ram_start_pfn), bootmap_size, - BOOTMEM_DEFAULT); + early_init_fdt_reserve_self(); + early_init_fdt_scan_reserved_mem(); - for_each_memblock(reserved, region) { - reserve_bootmem(region->base, region->size, BOOTMEM_DEFAULT); - } + memblock_dump_all(); } void __init setup_arch(char **cmdline_p) @@ -188,15 +167,6 @@ const struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; -static int __init device_probe(void) -{ - of_platform_populate(NULL, NULL, NULL, NULL); - - return 0; -} - -device_initcall(device_probe); - #if defined(CONFIG_CPU_H8300H) #define get_wait(base, addr) ({ \ int baddr; \ diff --git a/arch/h8300/kernel/sim-console.c b/arch/h8300/kernel/sim-console.c index 46138f55a9ea..03aa35b1a08c 100644 --- a/arch/h8300/kernel/sim-console.c +++ b/arch/h8300/kernel/sim-console.c @@ -13,12 +13,13 @@ static void sim_write(struct console *con, const char *s, unsigned n) { - register const int fd __asm__("er0") = 1; /* stdout */ register const char *_ptr __asm__("er1") = s; register const unsigned _len __asm__("er2") = n; - __asm__(".byte 0x5e,0x00,0x00,0xc7\n\t" /* jsr @0xc7 (sys_write) */ - : : "g"(fd), "g"(_ptr), "g"(_len)); + __asm__("sub.l er0,er0\n\t" /* er0 = 1 (stdout) */ + "inc.l #1,er0\n\t" + ".byte 0x5e,0x00,0x00,0xc7\n\t" /* jsr @0xc7 (sys_write) */ + : : "g"(_ptr), "g"(_len):"er0"); } static int __init sim_setup(struct earlycon_device *device, const char *opt) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a80669209155..db0b6eebbfa5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -177,6 +177,7 @@ config PPC select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT + select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JIT if !PPC64 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 2d58c26bff9a..e6f2a38d2e61 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -45,9 +45,13 @@ config SPARC select LOCKDEP_SMALL if LOCKDEP select NEED_DMA_MAP_STATE select NEED_SG_DMA_LENGTH + select HAVE_MEMBLOCK + select NO_BOOTMEM config SPARC32 def_bool !64BIT + select ARCH_HAS_SYNC_DMA_FOR_CPU + select DMA_NONCOHERENT_OPS select GENERIC_ATOMIC64 select CLZ_TAB select HAVE_UID16 @@ -60,7 +64,6 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP - select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE @@ -79,7 +82,6 @@ config SPARC64 select IRQ_PREFLOW_FASTEOI select ARCH_HAVE_NMI_SAFE_CMPXCHG select HAVE_C_RECORDMCOUNT - select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW select HAVE_NMI diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index 966a13d2b127..e32ef20de567 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -9,10 +9,10 @@ # Copyright (C) 1998 Jakub Jelinek (jj@ultra.linux.cz) # We are not yet configured - so test on arch -ifeq ($(ARCH),sparc) - KBUILD_DEFCONFIG := sparc32_defconfig -else +ifeq ($(ARCH),sparc64) KBUILD_DEFCONFIG := sparc64_defconfig +else + KBUILD_DEFCONFIG := sparc32_defconfig endif ifeq ($(CONFIG_SPARC32),y) diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h index 12ae33daf52f..e17566376934 100644 --- a/arch/sparc/include/asm/dma-mapping.h +++ b/arch/sparc/include/asm/dma-mapping.h @@ -7,7 +7,6 @@ #include <linux/dma-debug.h> extern const struct dma_map_ops *dma_ops; -extern const struct dma_map_ops pci32_dma_ops; extern struct bus_type pci_bus_type; @@ -15,11 +14,11 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { #ifdef CONFIG_SPARC_LEON if (sparc_cpu_model == sparc_leon) - return &pci32_dma_ops; + return &dma_noncoherent_ops; #endif #if defined(CONFIG_SPARC32) && defined(CONFIG_PCI) if (bus == &pci_bus_type) - return &pci32_dma_ops; + return &dma_noncoherent_ops; #endif return dma_ops; } diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c index cca9134cfa7d..6799c93c9f27 100644 --- a/arch/sparc/kernel/ioport.c +++ b/arch/sparc/kernel/ioport.c @@ -38,6 +38,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/scatterlist.h> +#include <linux/dma-noncoherent.h> #include <linux/of_device.h> #include <asm/io.h> @@ -434,42 +435,41 @@ arch_initcall(sparc_register_ioport); /* Allocate and map kernel buffer using consistent mode DMA for a device. * hwdev should be valid struct pci_dev pointer for PCI devices. */ -static void *pci32_alloc_coherent(struct device *dev, size_t len, - dma_addr_t *pba, gfp_t gfp, - unsigned long attrs) +void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, + gfp_t gfp, unsigned long attrs) { - unsigned long len_total = PAGE_ALIGN(len); + unsigned long len_total = PAGE_ALIGN(size); void *va; struct resource *res; int order; - if (len == 0) { + if (size == 0) { return NULL; } - if (len > 256*1024) { /* __get_free_pages() limit */ + if (size > 256*1024) { /* __get_free_pages() limit */ return NULL; } order = get_order(len_total); va = (void *) __get_free_pages(gfp, order); if (va == NULL) { - printk("pci_alloc_consistent: no %ld pages\n", len_total>>PAGE_SHIFT); + printk("%s: no %ld pages\n", __func__, len_total>>PAGE_SHIFT); goto err_nopages; } if ((res = kzalloc(sizeof(struct resource), GFP_KERNEL)) == NULL) { - printk("pci_alloc_consistent: no core\n"); + printk("%s: no core\n", __func__); goto err_nomem; } if (allocate_resource(&_sparc_dvma, res, len_total, _sparc_dvma.start, _sparc_dvma.end, PAGE_SIZE, NULL, NULL) != 0) { - printk("pci_alloc_consistent: cannot occupy 0x%lx", len_total); + printk("%s: cannot occupy 0x%lx", __func__, len_total); goto err_nova; } srmmu_mapiorange(0, virt_to_phys(va), res->start, len_total); - *pba = virt_to_phys(va); /* equals virt_to_bus (R.I.P.) for us. */ + *dma_handle = virt_to_phys(va); return (void *) res->start; err_nova: @@ -481,184 +481,53 @@ err_nopages: } /* Free and unmap a consistent DMA buffer. - * cpu_addr is what was returned from pci_alloc_consistent, - * size must be the same as what as passed into pci_alloc_consistent, - * and likewise dma_addr must be the same as what *dma_addrp was set to. + * cpu_addr is what was returned arch_dma_alloc, size must be the same as what + * was passed into arch_dma_alloc, and likewise dma_addr must be the same as + * what *dma_ndler was set to. * * References to the memory and mappings associated with cpu_addr/dma_addr * past this call are illegal. */ -static void pci32_free_coherent(struct device *dev, size_t n, void *p, - dma_addr_t ba, unsigned long attrs) +void arch_dma_free(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_addr, unsigned long attrs) { struct resource *res; if ((res = lookup_resource(&_sparc_dvma, - (unsigned long)p)) == NULL) { - printk("pci_free_consistent: cannot free %p\n", p); + (unsigned long)cpu_addr)) == NULL) { + printk("%s: cannot free %p\n", __func__, cpu_addr); return; } - if (((unsigned long)p & (PAGE_SIZE-1)) != 0) { - printk("pci_free_consistent: unaligned va %p\n", p); + if (((unsigned long)cpu_addr & (PAGE_SIZE-1)) != 0) { + printk("%s: unaligned va %p\n", __func__, cpu_addr); return; } - n = PAGE_ALIGN(n); - if (resource_size(res) != n) { - printk("pci_free_consistent: region 0x%lx asked 0x%lx\n", - (long)resource_size(res), (long)n); + size = PAGE_ALIGN(size); + if (resource_size(res) != size) { + printk("%s: region 0x%lx asked 0x%zx\n", __func__, + (long)resource_size(res), size); return; } - dma_make_coherent(ba, n); - srmmu_unmapiorange((unsigned long)p, n); + dma_make_coherent(dma_addr, size); + srmmu_unmapiorange((unsigned long)cpu_addr, size); release_resource(res); kfree(res); - free_pages((unsigned long)phys_to_virt(ba), get_order(n)); -} - -/* - * Same as pci_map_single, but with pages. - */ -static dma_addr_t pci32_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - /* IIep is write-through, not flushing. */ - return page_to_phys(page) + offset; -} - -static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - dma_make_coherent(ba, PAGE_ALIGN(size)); -} - -/* Map a set of buffers described by scatterlist in streaming - * mode for DMA. This is the scatter-gather version of the - * above pci_map_single interface. Here the scatter gather list - * elements are each tagged with the appropriate dma address - * and length. They are obtained via sg_dma_{address,length}(SG). - * - * NOTE: An implementation may be able to use a smaller number of - * DMA address/length pairs than there are SG table elements. - * (for example via virtual mapping capabilities) - * The routine returns the number of addr/length pairs actually - * used, at most nents. - * - * Device ownership issues as mentioned above for pci_map_single are - * the same here. - */ -static int pci32_map_sg(struct device *device, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *sg; - int n; - - /* IIep is write-through, not flushing. */ - for_each_sg(sgl, sg, nents, n) { - sg->dma_address = sg_phys(sg); - sg->dma_length = sg->length; - } - return nents; -} - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. - */ -static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - struct scatterlist *sg; - int n; - - if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { - for_each_sg(sgl, sg, nents, n) { - dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); - } - } -} - -/* Make physical memory consistent for a single - * streaming mode DMA translation before or after a transfer. - * - * If you perform a pci_map_single() but wish to interrogate the - * buffer using the cpu, yet do not wish to teardown the PCI dma - * mapping, you must call this function before doing so. At the - * next point you give the PCI dma address back to the card, you - * must first perform a pci_dma_sync_for_device, and then the - * device again owns the buffer. - */ -static void pci32_sync_single_for_cpu(struct device *dev, dma_addr_t ba, - size_t size, enum dma_data_direction dir) -{ - if (dir != PCI_DMA_TODEVICE) { - dma_make_coherent(ba, PAGE_ALIGN(size)); - } -} - -static void pci32_sync_single_for_device(struct device *dev, dma_addr_t ba, - size_t size, enum dma_data_direction dir) -{ - if (dir != PCI_DMA_TODEVICE) { - dma_make_coherent(ba, PAGE_ALIGN(size)); - } + free_pages((unsigned long)phys_to_virt(dma_addr), get_order(size)); } -/* Make physical memory consistent for a set of streaming - * mode DMA translations after a transfer. - * - * The same as pci_dma_sync_single_* but for a scatter-gather list, - * same rules and usage. - */ -static void pci32_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir) -{ - struct scatterlist *sg; - int n; - - if (dir != PCI_DMA_TODEVICE) { - for_each_sg(sgl, sg, nents, n) { - dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); - } - } -} +/* IIep is write-through, not flushing on cpu to device transfer. */ -static void pci32_sync_sg_for_device(struct device *device, struct scatterlist *sgl, - int nents, enum dma_data_direction dir) +void arch_sync_dma_for_cpu(struct device *dev, phys_addr_t paddr, + size_t size, enum dma_data_direction dir) { - struct scatterlist *sg; - int n; - - if (dir != PCI_DMA_TODEVICE) { - for_each_sg(sgl, sg, nents, n) { - dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); - } - } + if (dir != PCI_DMA_TODEVICE) + dma_make_coherent(paddr, PAGE_ALIGN(size)); } -/* note: leon re-uses pci32_dma_ops */ -const struct dma_map_ops pci32_dma_ops = { - .alloc = pci32_alloc_coherent, - .free = pci32_free_coherent, - .map_page = pci32_map_page, - .unmap_page = pci32_unmap_page, - .map_sg = pci32_map_sg, - .unmap_sg = pci32_unmap_sg, - .sync_single_for_cpu = pci32_sync_single_for_cpu, - .sync_single_for_device = pci32_sync_single_for_device, - .sync_sg_for_cpu = pci32_sync_sg_for_cpu, - .sync_sg_for_device = pci32_sync_sg_for_device, -}; -EXPORT_SYMBOL(pci32_dma_ops); - const struct dma_map_ops *dma_ops = &sbus_dma_ops; EXPORT_SYMBOL(dma_ops); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 95fe4f081ba3..92634d4e440c 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -23,6 +23,7 @@ #include <linux/init.h> #include <linux/highmem.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/pagemap.h> #include <linux/poison.h> #include <linux/gfp.h> @@ -101,13 +102,46 @@ static unsigned long calc_max_low_pfn(void) return tmp; } +static void __init find_ramdisk(unsigned long end_of_phys_memory) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long size; + + /* Now have to check initial ramdisk, so that it won't pass + * the end of memory + */ + if (sparc_ramdisk_image) { + if (sparc_ramdisk_image >= (unsigned long)&_end - 2 * PAGE_SIZE) + sparc_ramdisk_image -= KERNBASE; + initrd_start = sparc_ramdisk_image + phys_base; + initrd_end = initrd_start + sparc_ramdisk_size; + if (initrd_end > end_of_phys_memory) { + printk(KERN_CRIT "initrd extends beyond end of memory " + "(0x%016lx > 0x%016lx)\ndisabling initrd\n", + initrd_end, end_of_phys_memory); + initrd_start = 0; + } else { + /* Reserve the initrd image area. */ + size = initrd_end - initrd_start; + memblock_reserve(initrd_start, size); + + initrd_start = (initrd_start - phys_base) + PAGE_OFFSET; + initrd_end = (initrd_end - phys_base) + PAGE_OFFSET; + } + } +#endif +} + unsigned long __init bootmem_init(unsigned long *pages_avail) { - unsigned long bootmap_size, start_pfn; - unsigned long end_of_phys_memory = 0UL; - unsigned long bootmap_pfn, bytes_avail, size; + unsigned long start_pfn, bytes_avail, size; + unsigned long end_of_phys_memory = 0; + unsigned long high_pages = 0; int i; + memblock_set_bottom_up(true); + memblock_allow_resize(); + bytes_avail = 0UL; for (i = 0; sp_banks[i].num_bytes != 0; i++) { end_of_phys_memory = sp_banks[i].base_addr + @@ -124,24 +158,25 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) if (sp_banks[i].num_bytes == 0) { sp_banks[i].base_addr = 0xdeadbeef; } else { + memblock_add(sp_banks[i].base_addr, + sp_banks[i].num_bytes); sp_banks[i+1].num_bytes = 0; sp_banks[i+1].base_addr = 0xdeadbeef; } break; } } + memblock_add(sp_banks[i].base_addr, sp_banks[i].num_bytes); } /* Start with page aligned address of last symbol in kernel - * image. + * image. */ start_pfn = (unsigned long)__pa(PAGE_ALIGN((unsigned long) &_end)); /* Now shift down to get the real physical page frame number. */ start_pfn >>= PAGE_SHIFT; - bootmap_pfn = start_pfn; - max_pfn = end_of_phys_memory >> PAGE_SHIFT; max_low_pfn = max_pfn; @@ -150,85 +185,19 @@ unsigned long __init bootmem_init(unsigned long *pages_avail) if (max_low_pfn > pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT)) { highstart_pfn = pfn_base + (SRMMU_MAXMEM >> PAGE_SHIFT); max_low_pfn = calc_max_low_pfn(); + high_pages = calc_highpages(); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - calc_highpages() >> (20 - PAGE_SHIFT)); + high_pages >> (20 - PAGE_SHIFT)); } -#ifdef CONFIG_BLK_DEV_INITRD - /* Now have to check initial ramdisk, so that bootmap does not overwrite it */ - if (sparc_ramdisk_image) { - if (sparc_ramdisk_image >= (unsigned long)&_end - 2 * PAGE_SIZE) - sparc_ramdisk_image -= KERNBASE; - initrd_start = sparc_ramdisk_image + phys_base; - initrd_end = initrd_start + sparc_ramdisk_size; - if (initrd_end > end_of_phys_memory) { - printk(KERN_CRIT "initrd extends beyond end of memory " - "(0x%016lx > 0x%016lx)\ndisabling initrd\n", - initrd_end, end_of_phys_memory); - initrd_start = 0; - } - if (initrd_start) { - if (initrd_start >= (start_pfn << PAGE_SHIFT) && - initrd_start < (start_pfn << PAGE_SHIFT) + 2 * PAGE_SIZE) - bootmap_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT; - } - } -#endif - /* Initialize the boot-time allocator. */ - bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, pfn_base, - max_low_pfn); - - /* Now register the available physical memory with the - * allocator. - */ - *pages_avail = 0; - for (i = 0; sp_banks[i].num_bytes != 0; i++) { - unsigned long curr_pfn, last_pfn; + find_ramdisk(end_of_phys_memory); - curr_pfn = sp_banks[i].base_addr >> PAGE_SHIFT; - if (curr_pfn >= max_low_pfn) - break; - - last_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT; - if (last_pfn > max_low_pfn) - last_pfn = max_low_pfn; - - /* - * .. finally, did all the rounding and playing - * around just make the area go away? - */ - if (last_pfn <= curr_pfn) - continue; - - size = (last_pfn - curr_pfn) << PAGE_SHIFT; - *pages_avail += last_pfn - curr_pfn; - - free_bootmem(sp_banks[i].base_addr, size); - } - -#ifdef CONFIG_BLK_DEV_INITRD - if (initrd_start) { - /* Reserve the initrd image area. */ - size = initrd_end - initrd_start; - reserve_bootmem(initrd_start, size, BOOTMEM_DEFAULT); - *pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT; - - initrd_start = (initrd_start - phys_base) + PAGE_OFFSET; - initrd_end = (initrd_end - phys_base) + PAGE_OFFSET; - } -#endif /* Reserve the kernel text/data/bss. */ size = (start_pfn << PAGE_SHIFT) - phys_base; - reserve_bootmem(phys_base, size, BOOTMEM_DEFAULT); - *pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT; + memblock_reserve(phys_base, size); - /* Reserve the bootmem map. We do not account for it - * in pages_avail because we will release that memory - * in free_all_bootmem. - */ - size = bootmap_size; - reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size, BOOTMEM_DEFAULT); - *pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT; + size = memblock_phys_mem_size() - memblock_reserved_size(); + *pages_avail = (size >> PAGE_SHIFT) - high_pages; return max_pfn; } @@ -322,7 +291,7 @@ void __init mem_init(void) map_high_region(start_pfn, end_pfn); } - + mem_init_print_info(NULL); } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0312f8947ce..512003f16889 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -124,6 +124,7 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT + select HAVE_ARCH_PREL32_RELOCATIONS select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 302517929932..d1e19f358b6e 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -23,11 +23,8 @@ * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h. * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL * which is meaningless and will cause compiling error in some cases. - * So do not include linux/export.h and define EXPORT_SYMBOL(sym) - * as empty. */ -#define _LINUX_EXPORT_H -#define EXPORT_SYMBOL(sym) +#define __DISABLE_EXPORTS #include "misc.h" #include "error.h" diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index de690c2d2e33..a0ab9ab61c75 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -8,5 +8,6 @@ generated-y += xen-hypercalls.h generic-y += dma-contiguous.h generic-y += early_ioremap.h +generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/x86/include/asm/export.h b/arch/x86/include/asm/export.h deleted file mode 100644 index 2a51d66689c5..000000000000 --- a/arch/x86/include/asm/export.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_64BIT -#define KSYM_ALIGN 16 -#endif -#include <asm-generic/export.h> diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index dde437f5d14f..158ad1483c43 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -108,7 +108,7 @@ static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) cx->type); } snprintf(cx->desc, - ACPI_CX_DESC_LEN, "ACPI FFH INTEL MWAIT 0x%x", + ACPI_CX_DESC_LEN, "ACPI FFH MWAIT 0x%x", cx->address); out: return retval; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 14ee9a814888..506bd2b4b8bb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7303,8 +7303,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, + bool blockable) { unsigned long apic_address; @@ -7315,6 +7316,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index a9e8633388f4..58c6efa9f9a9 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -913,7 +913,8 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, if (ret) return ret; - return bfq_io_set_weight_legacy(of_css(of), NULL, weight); + ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); + return ret ?: nbytes; } #ifdef CONFIG_DEBUG_BLK_CGROUP diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 41d9036b1822..653100fb719e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -187,11 +187,25 @@ static const int bfq_stats_min_budgets = 194; static const int bfq_default_max_budget = 16 * 1024; /* - * Async to sync throughput distribution is controlled as follows: - * when an async request is served, the entity is charged the number - * of sectors of the request, multiplied by the factor below + * When a sync request is dispatched, the queue that contains that + * request, and all the ancestor entities of that queue, are charged + * with the number of sectors of the request. In constrast, if the + * request is async, then the queue and its ancestor entities are + * charged with the number of sectors of the request, multiplied by + * the factor below. This throttles the bandwidth for async I/O, + * w.r.t. to sync I/O, and it is done to counter the tendency of async + * writes to steal I/O throughput to reads. + * + * The current value of this parameter is the result of a tuning with + * several hardware and software configurations. We tried to find the + * lowest value for which writes do not cause noticeable problems to + * reads. In fact, the lower this parameter, the stabler I/O control, + * in the following respect. The lower this parameter is, the less + * the bandwidth enjoyed by a group decreases + * - when the group does writes, w.r.t. to when it does reads; + * - when other groups do reads, w.r.t. to when they do writes. */ -static const int bfq_async_charge_factor = 10; +static const int bfq_async_charge_factor = 3; /* Default timeout values, in jiffies, approximating CFQ defaults. */ const int bfq_timeout = HZ / 8; @@ -853,16 +867,7 @@ static unsigned long bfq_serv_to_charge(struct request *rq, if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) return blk_rq_sectors(rq); - /* - * If there are no weight-raised queues, then amplify service - * by just the async charge factor; otherwise amplify service - * by twice the async charge factor, to further reduce latency - * for weight-raised queues. - */ - if (bfqq->bfqd->wr_busy_queues == 0) - return blk_rq_sectors(rq) * bfq_async_charge_factor; - - return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; + return blk_rq_sectors(rq) * bfq_async_charge_factor; } /** @@ -3298,6 +3303,27 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, */ } else entity->service = 0; + + /* + * Reset the received-service counter for every parent entity. + * Differently from what happens with bfqq->entity.service, + * the resetting of this counter never needs to be postponed + * for parent entities. In fact, in case bfqq may have a + * chance to go on being served using the last, partially + * consumed budget, bfqq->entity.service needs to be kept, + * because if bfqq then actually goes on being served using + * the same budget, the last value of bfqq->entity.service is + * needed to properly decrement bfqq->entity.budget by the + * portion already consumed. In contrast, it is not necessary + * to keep entity->service for parent entities too, because + * the bubble up of the new value of bfqq->entity.budget will + * make sure that the budgets of parent entities are correct, + * even in case bfqq and thus parent entities go on receiving + * service with the same budget. + */ + entity = entity->parent; + for_each_entity(entity) + entity->service = 0; } /* diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index dbc07b456059..ae52bff43ce4 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -130,10 +130,14 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, if (!change_without_lookup) /* lookup needed */ next_in_service = bfq_lookup_next_entity(sd, expiration); - if (next_in_service) - parent_sched_may_change = !sd->next_in_service || + if (next_in_service) { + bool new_budget_triggers_change = bfq_update_parent_budget(next_in_service); + parent_sched_may_change = !sd->next_in_service || + new_budget_triggers_change; + } + sd->next_in_service = next_in_service; if (!next_in_service) @@ -877,15 +881,11 @@ void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned long time_ms) { struct bfq_entity *entity = &bfqq->entity; - int tot_serv_to_charge = entity->service; - unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); - - if (time_ms > 0 && time_ms < timeout_ms) - tot_serv_to_charge = - (bfqd->bfq_max_budget * time_ms) / timeout_ms; - - if (tot_serv_to_charge < entity->service) - tot_serv_to_charge = entity->service; + unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); + unsigned long bounded_time_ms = min(time_ms, timeout_ms); + int serv_to_charge_for_time = + (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; + int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); /* Increase budget to avoid inconsistencies */ if (tot_serv_to_charge > entity->budget) diff --git a/block/blk-core.c b/block/blk-core.c index 12550340418d..dee56c282efb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1036,7 +1036,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, NULL); - INIT_LIST_HEAD(&q->queue_head); INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP @@ -2162,7 +2161,9 @@ static inline bool should_fail_request(struct hd_struct *part, static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { - if (part->policy && op_is_write(bio_op(bio))) { + const int op = bio_op(bio); + + if (part->policy && (op_is_write(op) && !op_is_flush(op))) { char b[BDEVNAME_SIZE]; WARN_ONCE(1, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index cf9c66c6d35a..29bfe8017a2d 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -462,50 +462,6 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) blk_mq_sched_free_tags(set, hctx, i); } -int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - struct elevator_queue *e = q->elevator; - int ret; - - if (!e) - return 0; - - ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx); - if (ret) - return ret; - - if (e->type->ops.mq.init_hctx) { - ret = e->type->ops.mq.init_hctx(hctx, hctx_idx); - if (ret) { - blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); - return ret; - } - } - - blk_mq_debugfs_register_sched_hctx(q, hctx); - - return 0; -} - -void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - struct elevator_queue *e = q->elevator; - - if (!e) - return; - - blk_mq_debugfs_unregister_sched_hctx(hctx); - - if (e->type->ops.mq.exit_hctx && hctx->sched_data) { - e->type->ops.mq.exit_hctx(hctx, hctx_idx); - hctx->sched_data = NULL; - } - - blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); -} - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 0cb8f938dff9..4e028ee42430 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -28,11 +28,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx); -void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx); - static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 816923bf874d..94e1ed667b6e 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -320,6 +320,18 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, struct blk_mq_hw_ctx *hctx; int i; + /* + * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and + * queue_hw_ctx after freeze the queue. So we could use q_usage_counter + * to avoid race with it. __blk_mq_update_nr_hw_queues will users + * synchronize_rcu to ensure all of the users go out of the critical + * section below and see zeroed q_usage_counter. + */ + rcu_read_lock(); + if (percpu_ref_is_zero(&q->q_usage_counter)) { + rcu_read_unlock(); + return; + } queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; @@ -335,7 +347,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } - + rcu_read_unlock(); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, diff --git a/block/blk-mq.c b/block/blk-mq.c index 72a0033ccee9..85a1c1a59c72 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2145,8 +2145,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); - blk_mq_sched_exit_hctx(q, hctx, hctx_idx); - if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2214,12 +2212,9 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; - if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) - goto exit_hctx; - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); if (!hctx->fq) - goto sched_exit_hctx; + goto exit_hctx; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node)) goto free_fq; @@ -2233,8 +2228,6 @@ static int blk_mq_init_hctx(struct request_queue *q, free_fq: kfree(hctx->fq); - sched_exit_hctx: - blk_mq_sched_exit_hctx(q, hctx, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2896,10 +2889,81 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) return ret; } +/* + * request_queue and elevator_type pair. + * It is just used by __blk_mq_update_nr_hw_queues to cache + * the elevator_type associated with a request_queue. + */ +struct blk_mq_qe_pair { + struct list_head node; + struct request_queue *q; + struct elevator_type *type; +}; + +/* + * Cache the elevator_type in qe pair list and switch the + * io scheduler to 'none' + */ +static bool blk_mq_elv_switch_none(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + + if (!q->elevator) + return true; + + qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); + if (!qe) + return false; + + INIT_LIST_HEAD(&qe->node); + qe->q = q; + qe->type = q->elevator->type; + list_add(&qe->node, head); + + mutex_lock(&q->sysfs_lock); + /* + * After elevator_switch_mq, the previous elevator_queue will be + * released by elevator_release. The reference of the io scheduler + * module get by elevator_get will also be put. So we need to get + * a reference of the io scheduler module here to prevent it to be + * removed. + */ + __module_get(qe->type->elevator_owner); + elevator_switch_mq(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return true; +} + +static void blk_mq_elv_switch_back(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + struct elevator_type *t = NULL; + + list_for_each_entry(qe, head, node) + if (qe->q == q) { + t = qe->type; + break; + } + + if (!t) + return; + + list_del(&qe->node); + kfree(qe); + + mutex_lock(&q->sysfs_lock); + elevator_switch_mq(q, t); + mutex_unlock(&q->sysfs_lock); +} + static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; + LIST_HEAD(head); lockdep_assert_held(&set->tag_list_lock); @@ -2910,6 +2974,18 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); + /* + * Sync with blk_mq_queue_tag_busy_iter. + */ + synchronize_rcu(); + /* + * Switch IO scheduler to 'none', cleaning up the data associated + * with the previous scheduler. We will switch back once we are done + * updating the new sw to hw queue mappings. + */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + if (!blk_mq_elv_switch_none(&head, q)) + goto switch_back; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); @@ -2918,6 +2994,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_queue_reinit(q); } +switch_back: + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_elv_switch_back(&head, q); + list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 1d94a20374fc..bb93c7c2b182 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -576,12 +576,8 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; - if (!rwb_enabled(rwb)) - return; - flags = bio_to_wbt_flags(rwb, bio); - - if (!wbt_should_throttle(rwb, bio)) { + if (!(flags & WBT_TRACKED)) { if (flags & WBT_READ) wb_timestamp(rwb, &rwb->last_issue); return; diff --git a/block/blk.h b/block/blk.h index d4d67e948920..9db4e389582c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -234,6 +234,8 @@ static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq int elevator_init(struct request_queue *); int elevator_init_mq(struct request_queue *q); +int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e); void elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q); @@ -297,7 +299,7 @@ extern int blk_update_nr_requests(struct request_queue *, unsigned int); * b) the queue had IO stats enabled when this request was started, and * c) it's a file system request */ -static inline int blk_do_io_stat(struct request *rq) +static inline bool blk_do_io_stat(struct request *rq) { return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT) && diff --git a/block/elevator.c b/block/elevator.c index fa828b5bfd4b..5ea6e7d600e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -933,16 +933,13 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static int elevator_switch_mq(struct request_queue *q, +int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e) { int ret; lockdep_assert_held(&q->sysfs_lock); - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - if (q->elevator) { if (q->elevator->registered) elv_unregister_queue(q); @@ -968,8 +965,6 @@ static int elevator_switch_mq(struct request_queue *q, blk_add_trace_msg(q, "elv switch: none"); out: - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); return ret; } @@ -1021,8 +1016,17 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) lockdep_assert_held(&q->sysfs_lock); - if (q->mq_ops) - return elevator_switch_mq(q, new_e); + if (q->mq_ops) { + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + err = elevator_switch_mq(q, new_e); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return err; + } /* * Turn on BYPASS and drain all requests w/ elevator private data. diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index c5367bf5487f..0f28a38a43ea 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -164,6 +164,7 @@ struct acpi_namespace_node { #define ANOBJ_SUBTREE_HAS_INI 0x10 /* Used to optimize device initialization */ #define ANOBJ_EVALUATED 0x20 /* Set on first evaluation of node */ #define ANOBJ_ALLOCATED_BUFFER 0x40 /* Method AML buffer is dynamic (install_method) */ +#define ANOBJ_NODE_EARLY_INIT 0x80 /* acpi_exec only: Node was create via init file (-fi) */ #define ANOBJ_IS_EXTERNAL 0x08 /* iASL only: This object created via External() */ #define ANOBJ_METHOD_NO_RETVAL 0x10 /* iASL only: Method has no return value */ diff --git a/drivers/acpi/acpica/acnamesp.h b/drivers/acpi/acpica/acnamesp.h index 3825df923480..bbb3b4d1e796 100644 --- a/drivers/acpi/acpica/acnamesp.h +++ b/drivers/acpi/acpica/acnamesp.h @@ -25,14 +25,15 @@ /* Flags for acpi_ns_lookup, acpi_ns_search_and_enter */ #define ACPI_NS_NO_UPSEARCH 0 -#define ACPI_NS_SEARCH_PARENT 0x01 -#define ACPI_NS_DONT_OPEN_SCOPE 0x02 -#define ACPI_NS_NO_PEER_SEARCH 0x04 -#define ACPI_NS_ERROR_IF_FOUND 0x08 -#define ACPI_NS_PREFIX_IS_SCOPE 0x10 -#define ACPI_NS_EXTERNAL 0x20 -#define ACPI_NS_TEMPORARY 0x40 -#define ACPI_NS_OVERRIDE_IF_FOUND 0x80 +#define ACPI_NS_SEARCH_PARENT 0x0001 +#define ACPI_NS_DONT_OPEN_SCOPE 0x0002 +#define ACPI_NS_NO_PEER_SEARCH 0x0004 +#define ACPI_NS_ERROR_IF_FOUND 0x0008 +#define ACPI_NS_PREFIX_IS_SCOPE 0x0010 +#define ACPI_NS_EXTERNAL 0x0020 +#define ACPI_NS_TEMPORARY 0x0040 +#define ACPI_NS_OVERRIDE_IF_FOUND 0x0080 +#define ACPI_NS_EARLY_INIT 0x0100 /* Flags for acpi_ns_walk_namespace */ diff --git a/drivers/acpi/acpica/acutils.h b/drivers/acpi/acpica/acutils.h index 2733cd4e418c..3374d41582b5 100644 --- a/drivers/acpi/acpica/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -180,6 +180,8 @@ char acpi_ut_remove_leading_zeros(char **string); u8 acpi_ut_detect_hex_prefix(char **string); +void acpi_ut_remove_hex_prefix(char **string); + u8 acpi_ut_detect_octal_prefix(char **string); /* diff --git a/drivers/acpi/acpica/dbinput.c b/drivers/acpi/acpica/dbinput.c index 556ff59bbbfc..3e5f95390f0d 100644 --- a/drivers/acpi/acpica/dbinput.c +++ b/drivers/acpi/acpica/dbinput.c @@ -763,7 +763,12 @@ acpi_db_command_dispatch(char *input_buffer, case CMD_DISASSEMBLE: case CMD_DISASM: +#ifdef ACPI_DISASSEMBLER (void)acpi_db_disassemble_method(acpi_gbl_db_args[1]); +#else + acpi_os_printf + ("The AML Disassembler is not configured/present\n"); +#endif break; case CMD_DUMP: @@ -872,7 +877,12 @@ acpi_db_command_dispatch(char *input_buffer, case CMD_LIST: +#ifdef ACPI_DISASSEMBLER acpi_db_disassemble_aml(acpi_gbl_db_args[1], op); +#else + acpi_os_printf + ("The AML Disassembler is not configured/present\n"); +#endif break; case CMD_LOCKS: diff --git a/drivers/acpi/acpica/dbmethod.c b/drivers/acpi/acpica/dbmethod.c index 9fcecf104ba0..d8b7a0fe92ec 100644 --- a/drivers/acpi/acpica/dbmethod.c +++ b/drivers/acpi/acpica/dbmethod.c @@ -216,6 +216,7 @@ cleanup: acpi_ut_remove_reference(obj_desc); } +#ifdef ACPI_DISASSEMBLER /******************************************************************************* * * FUNCTION: acpi_db_disassemble_aml @@ -242,9 +243,8 @@ void acpi_db_disassemble_aml(char *statements, union acpi_parse_object *op) if (statements) { num_statements = strtoul(statements, NULL, 0); } -#ifdef ACPI_DISASSEMBLER + acpi_dm_disassemble(NULL, op, num_statements); -#endif } /******************************************************************************* @@ -317,8 +317,6 @@ acpi_status acpi_db_disassemble_method(char *name) walk_state->parse_flags |= ACPI_PARSE_DISASSEMBLE; status = acpi_ps_parse_aml(walk_state); - -#ifdef ACPI_DISASSEMBLER (void)acpi_dm_parse_deferred_ops(op); /* Now we can disassemble the method */ @@ -326,7 +324,6 @@ acpi_status acpi_db_disassemble_method(char *name) acpi_gbl_dm_opt_verbose = FALSE; acpi_dm_disassemble(NULL, op, 0); acpi_gbl_dm_opt_verbose = TRUE; -#endif acpi_ps_delete_parse_tree(op); @@ -337,6 +334,7 @@ acpi_status acpi_db_disassemble_method(char *name) acpi_ut_release_owner_id(&obj_desc->method.owner_id); return (AE_OK); } +#endif /******************************************************************************* * diff --git a/drivers/acpi/acpica/dbxface.c b/drivers/acpi/acpica/dbxface.c index 4647aa8efecb..f2526726daf6 100644 --- a/drivers/acpi/acpica/dbxface.c +++ b/drivers/acpi/acpica/dbxface.c @@ -10,6 +10,7 @@ #include "amlcode.h" #include "acdebug.h" #include "acinterp.h" +#include "acparser.h" #define _COMPONENT ACPI_CA_DEBUGGER ACPI_MODULE_NAME("dbxface") @@ -262,10 +263,17 @@ acpi_db_single_step(struct acpi_walk_state *walk_state, } } - /* Now we can display it */ + /* Now we can disassemble and display it */ #ifdef ACPI_DISASSEMBLER acpi_dm_disassemble(walk_state, display_op, ACPI_UINT32_MAX); +#else + /* + * The AML Disassembler is not configured - at least we can + * display the opcode value and name + */ + acpi_os_printf("AML Opcode: %4.4X %s\n", op->common.aml_opcode, + acpi_ps_get_opcode_name(op->common.aml_opcode)); #endif if ((op->common.aml_opcode == AML_IF_OP) || diff --git a/drivers/acpi/acpica/dsfield.c b/drivers/acpi/acpica/dsfield.c index 7c937595dfcb..30fe89545d6a 100644 --- a/drivers/acpi/acpica/dsfield.c +++ b/drivers/acpi/acpica/dsfield.c @@ -15,6 +15,10 @@ #include "acnamesp.h" #include "acparser.h" +#ifdef ACPI_EXEC_APP +#include "aecommon.h" +#endif + #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsfield") @@ -259,6 +263,13 @@ acpi_ds_get_field_names(struct acpi_create_field_info *info, u64 position; union acpi_parse_object *child; +#ifdef ACPI_EXEC_APP + u64 value = 0; + union acpi_operand_object *result_desc; + union acpi_operand_object *obj_desc; + char *name_path; +#endif + ACPI_FUNCTION_TRACE_PTR(ds_get_field_names, info); /* First field starts at bit zero */ @@ -391,6 +402,25 @@ acpi_ds_get_field_names(struct acpi_create_field_info *info, if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } +#ifdef ACPI_EXEC_APP + name_path = + acpi_ns_get_external_pathname(info-> + field_node); + obj_desc = + acpi_ut_create_integer_object + (value); + if (ACPI_SUCCESS + (ae_lookup_init_file_entry + (name_path, &value))) { + acpi_ex_write_data_to_field + (obj_desc, + acpi_ns_get_attached_object + (info->field_node), + &result_desc); + } + acpi_ut_remove_reference(obj_desc); + ACPI_FREE(name_path); +#endif } } @@ -573,7 +603,9 @@ acpi_ds_init_field_objects(union acpi_parse_object *op, !(walk_state->parse_flags & ACPI_PARSE_MODULE_LEVEL)) { flags |= ACPI_NS_TEMPORARY; } - +#ifdef ACPI_EXEC_APP + flags |= ACPI_NS_OVERRIDE_IF_FOUND; +#endif /* * Walk the list of entries in the field_list * Note: field_list can be of zero length. In this case, Arg will be NULL. diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c index 3de794bcf8fa..69603ba52a3a 100644 --- a/drivers/acpi/acpica/hwregs.c +++ b/drivers/acpi/acpica/hwregs.c @@ -528,13 +528,18 @@ acpi_status acpi_hw_register_read(u32 register_id, u32 *return_value) status = acpi_hw_read(&value64, &acpi_gbl_FADT.xpm2_control_block); - value = (u32)value64; + if (ACPI_SUCCESS(status)) { + value = (u32)value64; + } break; case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ status = acpi_hw_read(&value64, &acpi_gbl_FADT.xpm_timer_block); - value = (u32)value64; + if (ACPI_SUCCESS(status)) { + value = (u32)value64; + } + break; case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index fe9d46d81750..d8b8fc2ff563 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -56,14 +56,9 @@ acpi_status acpi_hw_legacy_sleep(u8 sleep_state) if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } - /* - * If the target sleep state is S5, clear all GPEs and fixed events too - */ - if (sleep_state == ACPI_STATE_S5) { - status = acpi_hw_clear_acpi_status(); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } + status = acpi_hw_clear_acpi_status(); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); } acpi_gbl_system_awake_and_running = FALSE; diff --git a/drivers/acpi/acpica/nsaccess.c b/drivers/acpi/acpica/nsaccess.c index 83a593e2155d..e3f10afde5ff 100644 --- a/drivers/acpi/acpica/nsaccess.c +++ b/drivers/acpi/acpica/nsaccess.c @@ -558,6 +558,14 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, (char *)¤t_node->name, current_node)); } +#ifdef ACPI_EXEC_APP + if ((status == AE_ALREADY_EXISTS) && + (this_node->flags & ANOBJ_NODE_EARLY_INIT)) { + this_node->flags &= ~ANOBJ_NODE_EARLY_INIT; + status = AE_OK; + } +#endif + #ifdef ACPI_ASL_COMPILER /* * If this ACPI name already exists within the namespace as an @@ -676,6 +684,11 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, } } } +#ifdef ACPI_EXEC_APP + if (flags & ACPI_NS_EARLY_INIT) { + this_node->flags |= ANOBJ_NODE_EARLY_INIT; + } +#endif *return_node = this_node; return_ACPI_STATUS(AE_OK); diff --git a/drivers/acpi/acpica/psloop.c b/drivers/acpi/acpica/psloop.c index 44f35ab3347d..34fc2f7476ed 100644 --- a/drivers/acpi/acpica/psloop.c +++ b/drivers/acpi/acpica/psloop.c @@ -22,6 +22,7 @@ #include "acdispat.h" #include "amlcode.h" #include "acconvert.h" +#include "acnamesp.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psloop") @@ -527,12 +528,18 @@ acpi_status acpi_ps_parse_loop(struct acpi_walk_state *walk_state) if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); } - if (walk_state->opcode == AML_SCOPE_OP) { + if (acpi_ns_opens_scope + (acpi_ps_get_opcode_info + (walk_state->opcode)->object_type)) { /* - * If the scope op fails to parse, skip the body of the - * scope op because the parse failure indicates that the - * device may not exist. + * If the scope/device op fails to parse, skip the body of + * the scope op because the parse failure indicates that + * the device may not exist. */ + ACPI_ERROR((AE_INFO, + "Skip parsing opcode %s", + acpi_ps_get_opcode_name + (walk_state->opcode))); walk_state->parser_state.aml = walk_state->aml + 1; walk_state->parser_state.aml = @@ -540,8 +547,6 @@ acpi_status acpi_ps_parse_loop(struct acpi_walk_state *walk_state) (&walk_state->parser_state); walk_state->aml = walk_state->parser_state.aml; - ACPI_ERROR((AE_INFO, - "Skipping Scope block")); } continue; @@ -709,20 +714,20 @@ acpi_status acpi_ps_parse_loop(struct acpi_walk_state *walk_state) } else if ((walk_state-> parse_flags & ACPI_PARSE_MODULE_LEVEL) - && status != AE_CTRL_TRANSFER - && ACPI_FAILURE(status)) { + && (ACPI_AML_EXCEPTION(status) + || status == AE_ALREADY_EXISTS + || status == AE_NOT_FOUND)) { /* - * ACPI_PARSE_MODULE_LEVEL flag means that we are currently - * loading a table by executing it as a control method. - * However, if we encounter an error while loading the table, - * we need to keep trying to load the table rather than - * aborting the table load (setting the status to AE_OK - * continues the table load). If we get a failure at this - * point, it means that the dispatcher got an error while - * processing Op (most likely an AML operand error) or a - * control method was called from module level and the - * dispatcher returned AE_CTRL_TRANSFER. In the latter case, - * leave the status alone, there's nothing wrong with it. + * ACPI_PARSE_MODULE_LEVEL flag means that we + * are currently loading a table by executing + * it as a control method. However, if we + * encounter an error while loading the table, + * we need to keep trying to load the table + * rather than aborting the table load (setting + * the status to AE_OK continues the table + * load). If we get a failure at this point, it + * means that the dispatcher got an error while + * trying to execute the Op. */ status = AE_OK; } diff --git a/drivers/acpi/acpica/tbdata.c b/drivers/acpi/acpica/tbdata.c index 51891f9fb057..862149c8a208 100644 --- a/drivers/acpi/acpica/tbdata.c +++ b/drivers/acpi/acpica/tbdata.c @@ -516,9 +516,9 @@ acpi_tb_verify_temp_table(struct acpi_table_desc *table_desc, acpi_tb_check_duplication(table_desc, table_index); if (ACPI_FAILURE(status)) { if (status != AE_CTRL_TERMINATE) { - ACPI_EXCEPTION((AE_INFO, AE_NO_MEMORY, + ACPI_EXCEPTION((AE_INFO, status, "%4.4s 0x%8.8X%8.8X" - " Table is duplicated", + " Table is already loaded", acpi_ut_valid_nameseg (table_desc->signature. ascii) ? table_desc-> diff --git a/drivers/acpi/acpica/utdelete.c b/drivers/acpi/acpica/utdelete.c index 118f3ff1fbb5..8cc4392c61f3 100644 --- a/drivers/acpi/acpica/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -355,6 +355,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) u16 original_count; u16 new_count = 0; acpi_cpu_flags lock_flags; + char *message; ACPI_FUNCTION_NAME(ut_update_ref_count); @@ -391,6 +392,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) object, object->common.type, acpi_ut_get_object_type_name(object), new_count)); + message = "Incremement"; break; case REF_DECREMENT: @@ -420,6 +422,7 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) if (new_count == 0) { acpi_ut_delete_internal_obj(object); } + message = "Decrement"; break; default: @@ -436,8 +439,8 @@ acpi_ut_update_ref_count(union acpi_operand_object *object, u32 action) */ if (new_count > ACPI_MAX_REFERENCE_COUNT) { ACPI_WARNING((AE_INFO, - "Large Reference Count (0x%X) in object %p, Type=0x%.2X", - new_count, object, object->common.type)); + "Large Reference Count (0x%X) in object %p, Type=0x%.2X Operation=%s", + new_count, object, object->common.type, message)); } } diff --git a/drivers/acpi/acpica/utstrsuppt.c b/drivers/acpi/acpica/utstrsuppt.c index 954f8e3e35cd..05ff20049b87 100644 --- a/drivers/acpi/acpica/utstrsuppt.c +++ b/drivers/acpi/acpica/utstrsuppt.c @@ -231,14 +231,34 @@ char acpi_ut_remove_whitespace(char **string) u8 acpi_ut_detect_hex_prefix(char **string) { + char *initial_position = *string; + acpi_ut_remove_hex_prefix(string); + if (*string != initial_position) { + return (TRUE); /* String is past leading 0x */ + } + + return (FALSE); /* Not a hex string */ +} + +/******************************************************************************* + * + * FUNCTION: acpi_ut_remove_hex_prefix + * + * PARAMETERS: string - Pointer to input ASCII string + * + * RETURN: none + * + * DESCRIPTION: Remove a hex "0x" prefix + * + ******************************************************************************/ + +void acpi_ut_remove_hex_prefix(char **string) +{ if ((**string == ACPI_ASCII_ZERO) && (tolower((int)*(*string + 1)) == 'x')) { *string += 2; /* Go past the leading 0x */ - return (TRUE); } - - return (FALSE); /* Not a hex string */ } /******************************************************************************* diff --git a/drivers/acpi/acpica/utstrtoul64.c b/drivers/acpi/acpica/utstrtoul64.c index 8fadad242db6..5fde619a8bbd 100644 --- a/drivers/acpi/acpica/utstrtoul64.c +++ b/drivers/acpi/acpica/utstrtoul64.c @@ -218,7 +218,7 @@ u64 acpi_ut_implicit_strtoul64(char *string) * implicit conversions, and the "0x" prefix is "not allowed". * However, allow a "0x" prefix as an ACPI extension. */ - acpi_ut_detect_hex_prefix(&string); + acpi_ut_remove_hex_prefix(&string); if (!acpi_ut_remove_leading_zeros(&string)) { return_VALUE(0); diff --git a/drivers/acpi/pmic/intel_pmic_crc.c b/drivers/acpi/pmic/intel_pmic_crc.c index 7ffa74048107..22c9e374c923 100644 --- a/drivers/acpi/pmic/intel_pmic_crc.c +++ b/drivers/acpi/pmic/intel_pmic_crc.c @@ -25,16 +25,121 @@ #define PMIC_A0LOCK_REG 0xc5 static struct pmic_table power_table[] = { +/* { + .address = 0x00, + .reg = ??, + .bit = ??, + }, ** VSYS */ + { + .address = 0x04, + .reg = 0x63, + .bit = 0x00, + }, /* SYSX -> VSYS_SX */ + { + .address = 0x08, + .reg = 0x62, + .bit = 0x00, + }, /* SYSU -> VSYS_U */ + { + .address = 0x0c, + .reg = 0x64, + .bit = 0x00, + }, /* SYSS -> VSYS_S */ + { + .address = 0x10, + .reg = 0x6a, + .bit = 0x00, + }, /* V50S -> V5P0S */ + { + .address = 0x14, + .reg = 0x6b, + .bit = 0x00, + }, /* HOST -> VHOST, USB2/3 host */ + { + .address = 0x18, + .reg = 0x6c, + .bit = 0x00, + }, /* VBUS -> VBUS, USB2/3 OTG */ + { + .address = 0x1c, + .reg = 0x6d, + .bit = 0x00, + }, /* HDMI -> VHDMI */ +/* { + .address = 0x20, + .reg = ??, + .bit = ??, + }, ** S285 */ { .address = 0x24, .reg = 0x66, .bit = 0x00, - }, + }, /* X285 -> V2P85SX, camera */ +/* { + .address = 0x28, + .reg = ??, + .bit = ??, + }, ** V33A */ + { + .address = 0x2c, + .reg = 0x69, + .bit = 0x00, + }, /* V33S -> V3P3S, display/ssd/audio */ + { + .address = 0x30, + .reg = 0x68, + .bit = 0x00, + }, /* V33U -> V3P3U, SDIO wifi&bt */ +/* { + .address = 0x34 .. 0x40, + .reg = ??, + .bit = ??, + }, ** V33I, V18A, REFQ, V12A */ + { + .address = 0x44, + .reg = 0x5c, + .bit = 0x00, + }, /* V18S -> V1P8S, SOC/USB PHY/SIM */ { .address = 0x48, .reg = 0x5d, .bit = 0x00, - }, + }, /* V18X -> V1P8SX, eMMC/camara/audio */ + { + .address = 0x4c, + .reg = 0x5b, + .bit = 0x00, + }, /* V18U -> V1P8U, LPDDR */ + { + .address = 0x50, + .reg = 0x61, + .bit = 0x00, + }, /* V12X -> V1P2SX, SOC SFR */ + { + .address = 0x54, + .reg = 0x60, + .bit = 0x00, + }, /* V12S -> V1P2S, MIPI */ +/* { + .address = 0x58, + .reg = ??, + .bit = ??, + }, ** V10A */ + { + .address = 0x5c, + .reg = 0x56, + .bit = 0x00, + }, /* V10S -> V1P0S, SOC GFX */ + { + .address = 0x60, + .reg = 0x57, + .bit = 0x00, + }, /* V10X -> V1P0SX, SOC display/DDR IO/PCIe */ + { + .address = 0x64, + .reg = 0x59, + .bit = 0x00, + }, /* V105 -> V1P05S, L2 SRAM */ }; static struct pmic_table thermal_table[] = { diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index f99e5c883368..581312ac375f 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -2428,16 +2428,20 @@ static bool DAC960_V2_ReportDeviceConfiguration(DAC960_Controller_T { DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo = Controller->V2.LogicalDeviceInformation[LogicalDriveNumber]; - unsigned char *ReadCacheStatus[] = { "Read Cache Disabled", - "Read Cache Enabled", - "Read Ahead Enabled", - "Intelligent Read Ahead Enabled", - "-", "-", "-", "-" }; - unsigned char *WriteCacheStatus[] = { "Write Cache Disabled", - "Logical Device Read Only", - "Write Cache Enabled", - "Intelligent Write Cache Enabled", - "-", "-", "-", "-" }; + static const unsigned char *ReadCacheStatus[] = { + "Read Cache Disabled", + "Read Cache Enabled", + "Read Ahead Enabled", + "Intelligent Read Ahead Enabled", + "-", "-", "-", "-" + }; + static const unsigned char *WriteCacheStatus[] = { + "Write Cache Disabled", + "Logical Device Read Only", + "Write Cache Enabled", + "Intelligent Write Cache Enabled", + "-", "-", "-", "-" + }; unsigned char *GeometryTranslation; if (LogicalDeviceInfo == NULL) continue; switch (LogicalDeviceInfo->DriveGeometry) @@ -4339,14 +4343,16 @@ static void DAC960_V1_ProcessCompletedCommand(DAC960_Command_T *Command) static void DAC960_V2_ReadWriteError(DAC960_Command_T *Command) { DAC960_Controller_T *Controller = Command->Controller; - unsigned char *SenseErrors[] = { "NO SENSE", "RECOVERED ERROR", - "NOT READY", "MEDIUM ERROR", - "HARDWARE ERROR", "ILLEGAL REQUEST", - "UNIT ATTENTION", "DATA PROTECT", - "BLANK CHECK", "VENDOR-SPECIFIC", - "COPY ABORTED", "ABORTED COMMAND", - "EQUAL", "VOLUME OVERFLOW", - "MISCOMPARE", "RESERVED" }; + static const unsigned char *SenseErrors[] = { + "NO SENSE", "RECOVERED ERROR", + "NOT READY", "MEDIUM ERROR", + "HARDWARE ERROR", "ILLEGAL REQUEST", + "UNIT ATTENTION", "DATA PROTECT", + "BLANK CHECK", "VENDOR-SPECIFIC", + "COPY ABORTED", "ABORTED COMMAND", + "EQUAL", "VOLUME OVERFLOW", + "MISCOMPARE", "RESERVED" + }; unsigned char *CommandName = "UNKNOWN"; switch (Command->CommandType) { diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index e285413d4a75..6f1d25c1eb64 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2740,6 +2740,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_on = write_congestion_on; pd->write_congestion_off = write_congestion_off; + ret = -ENOMEM; disk = alloc_disk(1); if (!disk) goto out_mem; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c7acf74253a1..a1d6b5597c17 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -337,6 +337,7 @@ static ssize_t backing_dev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { char *file_name; + size_t sz; struct file *backing_dev = NULL; struct inode *inode; struct address_space *mapping; @@ -357,7 +358,11 @@ static ssize_t backing_dev_store(struct device *dev, goto out; } - strlcpy(file_name, buf, len); + strlcpy(file_name, buf, PATH_MAX); + /* ignore trailing newline */ + sz = strlen(file_name); + if (sz > 0 && file_name[sz - 1] == '\n') + file_name[sz - 1] = 0x00; backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0); if (IS_ERR(backing_dev)) { diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 1d50e97d49f1..6d53f7d9fc7a 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -555,12 +555,20 @@ EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_stop); void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy) { - struct policy_dbs_info *policy_dbs = policy->governor_data; + struct policy_dbs_info *policy_dbs; + + /* Protect gov->gdbs_data against cpufreq_dbs_governor_exit() */ + mutex_lock(&gov_dbs_data_mutex); + policy_dbs = policy->governor_data; + if (!policy_dbs) + goto out; mutex_lock(&policy_dbs->update_mutex); cpufreq_policy_apply_limits(policy); gov_update_sample_delay(policy_dbs, 0); - mutex_unlock(&policy_dbs->update_mutex); + +out: + mutex_unlock(&gov_dbs_data_mutex); } EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits); diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 1aef60d160eb..110483f0e3fb 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -328,9 +328,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, unsigned int polling_threshold; /* - * We want to default to C1 (hlt), not to busy polling - * unless the timer is happening really really soon, or - * C1's exit latency exceeds the user configured limit. + * Default to a physical idle state, not to busy polling, unless + * a timer is going to trigger really really soon. */ polling_threshold = max_t(unsigned int, 20, s->target_residency); if (data->next_timer_us > polling_threshold && @@ -349,14 +348,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * If the tick is already stopped, the cost of possible short * idle duration misprediction is much higher, because the CPU * may be stuck in a shallow idle state for a long time as a - * result of it. In that case say we might mispredict and try - * to force the CPU into a state for which we would have stopped - * the tick, unless a timer is going to expire really soon - * anyway. + * result of it. In that case say we might mispredict and use + * the known time till the closest timer event for the idle + * state selection. */ if (data->predicted_us < TICK_USEC) - data->predicted_us = min_t(unsigned int, TICK_USEC, - ktime_to_us(delta_next)); + data->predicted_us = ktime_to_us(delta_next); } else { /* * Use the performance multiplier and the user-configurable @@ -381,8 +378,22 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; if (idx == -1) idx = i; /* first enabled state */ - if (s->target_residency > data->predicted_us) - break; + if (s->target_residency > data->predicted_us) { + if (!tick_nohz_tick_stopped()) + break; + + /* + * If the state selected so far is shallow and this + * state's target residency matches the time till the + * closest timer event, select this one to avoid getting + * stuck in the shallow one for too long. + */ + if (drv->states[idx].target_residency < TICK_USEC && + s->target_residency <= ktime_to_us(delta_next)) + idx = i; + + goto out; + } if (s->exit_latency > latency_req) { /* * If we break out of the loop for latency reasons, use @@ -403,14 +414,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Don't stop the tick if the selected state is a polling one or if the * expected idle duration is shorter than the tick period length. */ - if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || - expected_interval < TICK_USEC) { + if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) || + expected_interval < TICK_USEC) && !tick_nohz_tick_stopped()) { unsigned int delta_next_us = ktime_to_us(delta_next); *stop_tick = false; - if (!tick_nohz_tick_stopped() && idx > 0 && - drv->states[idx].target_residency > delta_next_us) { + if (idx > 0 && drv->states[idx].target_residency > delta_next_us) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -418,8 +428,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * tick, so try to correct that. */ for (i = idx - 1; i >= 0; i--) { - if (drv->states[i].disabled || - dev->states_usage[i].disable) + if (drv->states[i].disabled || + dev->states_usage[i].disable) continue; idx = i; @@ -429,6 +439,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } } +out: data->last_state_idx = idx; return data->last_state_idx; diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index 88c322d7c71e..14c40a7750d1 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -24,6 +24,7 @@ KBUILD_CFLAGS := $(cflags-y) -DDISABLE_BRANCH_PROFILING \ -D__NO_FORTIFY \ $(call cc-option,-ffreestanding) \ $(call cc-option,-fno-stack-protector) \ + -D__DISABLE_EXPORTS GCOV_PROFILE := n KASAN_SANITIZE := n diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index a365ea2383d1..e55508b39496 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn) * * @amn: our notifier */ -static void amdgpu_mn_read_lock(struct amdgpu_mn *amn) +static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable) { - mutex_lock(&amn->read_lock); + if (blockable) + mutex_lock(&amn->read_lock); + else if (!mutex_trylock(&amn->read_lock)) + return -EAGAIN; + if (atomic_inc_return(&amn->recursion) == 1) down_read_non_owner(&amn->lock); mutex_unlock(&amn->read_lock); + + return 0; } /** @@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. */ -static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + /* TODO we should be able to split locking for interval tree and + * amdgpu_mn_invalidate_node + */ + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); amdgpu_mn_invalidate_node(node, start, end); } + + return 0; } /** @@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ -static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; struct amdgpu_bo *bo; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, amdgpu_amdkfd_evict_userptr(mem, mm); } } + + return 0; } /** diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index dcd6e230d16a..2c9b284036d1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo) mo->attached = false; } -static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, +static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct i915_mmu_notifier *mn = container_of(_mn, struct i915_mmu_notifier, mn); @@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, LIST_HEAD(cancelled); if (RB_EMPTY_ROOT(&mn->objects.rb_root)) - return; + return 0; /* interval ranges are inclusive, but invalidate range is exclusive */ end--; @@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, spin_lock(&mn->lock); it = interval_tree_iter_first(&mn->objects, start, end); while (it) { + if (!blockable) { + spin_unlock(&mn->lock); + return -EAGAIN; + } /* The mmu_object is released late when destroying the * GEM object so it is entirely possible to gain a * reference on an object in the process of being freed @@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, if (!list_empty(&cancelled)) flush_workqueue(mn->wq); + + return 0; } static const struct mmu_notifier_ops i915_gem_userptr_notifier = { diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index abd24975c9b1..f8b35df44c60 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn, * We block for all BOs between start and end to be idle and * unmap them by move them into system domain again. */ -static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, +static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + int ret = 0; /* notification is exclusive, but interval is inclusive */ end -= 1; - mutex_lock(&rmn->lock); + /* TODO we should be able to split locking for interval tree and + * the tear down. + */ + if (blockable) + mutex_lock(&rmn->lock); + else if (!mutex_trylock(&rmn->lock)) + return -EAGAIN; it = interval_tree_iter_first(&rmn->objects, start, end); while (it) { @@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; + if (!blockable) { + ret = -EAGAIN; + goto out_unlock; + } + node = container_of(it, struct radeon_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, } } +out_unlock: mutex_unlock(&rmn->lock); + + return ret; } static const struct mmu_notifier_ops radeon_mn_ops = { diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c index e52c58c29d9a..4d565b0c5a6e 100644 --- a/drivers/ide/hpt366.c +++ b/drivers/ide/hpt366.c @@ -574,13 +574,14 @@ static u8 hpt3xx_udma_filter(ide_drive_t *drive) if (!HPT370_ALLOW_ATA100_5 || check_in_drive_list(drive, bad_ata100_5)) return ATA_UDMA4; + /* else: fall through */ case HPT372 : case HPT372A: case HPT372N: case HPT374 : if (ata_id_is_sata(drive->id)) mask &= ~0x0e; - /* Fall thru */ + /* fall through */ default: return mask; } @@ -600,7 +601,7 @@ static u8 hpt3xx_mdma_filter(ide_drive_t *drive) case HPT374 : if (ata_id_is_sata(drive->id)) return 0x00; - /* Fall thru */ + /* else: fall through */ default: return 0x07; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 5bd2aafc3753..a8df300f949c 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -427,6 +427,7 @@ static int ide_floppy_get_capacity(ide_drive_t *drive) * (maintains previous driver behaviour) */ break; + /* else: fall through */ case CAPACITY_CURRENT: /* Normal Zip/LS-120 disks */ if (memcmp(cap_desc, &floppy->cap_desc, 8)) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index a444bad7a2aa..0d93e0cfbeaf 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -460,7 +460,6 @@ void do_ide_request(struct request_queue *q) struct ide_host *host = hwif->host; struct request *rq = NULL; ide_startstop_t startstop; - unsigned long queue_run_ms = 3; /* old plug delay */ spin_unlock_irq(q->queue_lock); @@ -480,9 +479,6 @@ repeat: prev_port = hwif->host->cur_port; if (drive->dev_flags & IDE_DFLAG_SLEEPING && time_after(drive->sleep, jiffies)) { - unsigned long left = jiffies - drive->sleep; - - queue_run_ms = jiffies_to_msecs(left + 1); ide_unlock_port(hwif); goto plug_device; } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 416a2f353071..3b75a7b7a284 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -142,6 +142,7 @@ static void ide_classify_atapi_dev(ide_drive_t *drive) } /* Early cdrom models used zero */ type = ide_cdrom; + /* fall through */ case ide_cdrom: drive->dev_flags |= IDE_DFLAG_REMOVABLE; #ifdef CONFIG_PPC diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aee7b46d2330..34c1165226a4 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1746,7 +1746,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) { unsigned long t; int speed; - int buffer_size; u16 *ctl = (u16 *)&tape->caps[12]; ide_debug_log(IDE_DBG_FUNC, "minor: %d", minor); @@ -1781,7 +1780,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) *ctl /= 2; tape->buffer_size = *ctl * tape->blk_size; } - buffer_size = tape->buffer_size; /* select the "best" DSC read/write polling freq */ speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 89b29028d315..c21d5c50ae3a 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -128,7 +128,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) return pre_task_out_intr(drive, cmd); } handler = task_pio_intr; - /* fall-through */ + /* fall through */ case ATA_PROT_NODATA: if (handler == NULL) handler = task_no_data_intr; @@ -140,6 +140,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) hwif->expiry = dma_ops->dma_timer_expiry; ide_execute_command(drive, cmd, ide_dma_intr, 2 * WAIT_CMD); dma_ops->dma_start(drive); + /* fall through */ default: return ide_started; } diff --git a/drivers/ide/sis5513.c b/drivers/ide/sis5513.c index c3062b53056f..024bc7ba49ee 100644 --- a/drivers/ide/sis5513.c +++ b/drivers/ide/sis5513.c @@ -494,6 +494,7 @@ static int init_chipset_sis5513(struct pci_dev *dev) pci_read_config_byte(dev, 0x09, ®); if ((reg & 0x0f) != 0x00) pci_write_config_byte(dev, 0x09, reg&0xf0); + /* fall through */ case ATA_16: /* force per drive recovery and active timings needed on ATA_33 and below chips */ diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 182436b92ba9..6ec748eccff7 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, + true, NULL); up_read(&context->umem_rwsem); } @@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, return 0; } -static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + int ret; if (!context->invalidate_range) - return; + return 0; + + if (blockable) + down_read(&context->umem_rwsem); + else if (!down_read_trylock(&context->umem_rwsem)) + return -EAGAIN; ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_start_trampoline, NULL); + invalidate_range_start_trampoline, + blockable, NULL); up_read(&context->umem_rwsem); + + return ret; } static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, @@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!context->invalidate_range) return; + /* + * TODO: we currently bail out if there is any sleepable work to be done + * in ib_umem_notifier_invalidate_range_start so we shouldn't really block + * here. But this is ugly and fragile. + */ down_read(&context->umem_rwsem); rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_end_trampoline, NULL); + invalidate_range_end_trampoline, true, NULL); up_read(&context->umem_rwsem); ib_ucontext_notifier_end_account(context); } @@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, + bool blockable, void *cookie) { int ret_val = 0; @@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; node = next) { + /* TODO move the blockable decision up to the callback */ + if (!blockable) + return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem->umem, start, last, cookie) || ret_val; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 70aceefe14d5..e1c7996c018e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -67,9 +67,9 @@ struct mmu_rb_handler { static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static void mmu_notifier_range_start(struct mmu_notifier *, +static int mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, - unsigned long, unsigned long); + unsigned long, unsigned long, bool); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); static void do_remove(struct mmu_rb_handler *handler, @@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, handler->ops->remove(handler->ops_arg, node); } -static void mmu_notifier_range_start(struct mmu_notifier *mn, +static int mmu_notifier_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); @@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn, if (added) queue_work(handler->wq, &handler->del_work); + + return 0; } /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1a87a690a4c..d216e0d2921d 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) down_read(&ctx->umem_rwsem); rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, - mr_leaf_free, imr); + mr_leaf_free, true, imr); up_read(&ctx->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); diff --git a/drivers/irqchip/irq-renesas-h8s.c b/drivers/irqchip/irq-renesas-h8s.c index aed31afb0216..85234d456638 100644 --- a/drivers/irqchip/irq-renesas-h8s.c +++ b/drivers/irqchip/irq-renesas-h8s.c @@ -12,7 +12,7 @@ #include <asm/io.h> static void *intc_baseaddr; -#define IPRA ((unsigned long)intc_baseaddr) +#define IPRA (intc_baseaddr) static const unsigned char ipr_table[] = { 0x03, 0x02, 0x01, 0x00, 0x13, 0x12, 0x11, 0x10, /* 16 - 23 */ @@ -34,7 +34,7 @@ static const unsigned char ipr_table[] = { static void h8s_disable_irq(struct irq_data *data) { int pos; - unsigned int addr; + void __iomem *addr; unsigned short pri; int irq = data->irq; @@ -48,7 +48,7 @@ static void h8s_disable_irq(struct irq_data *data) static void h8s_enable_irq(struct irq_data *data) { int pos; - unsigned int addr; + void __iomem *addr; unsigned short pri; int irq = data->irq; diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 17bf109c58e9..f6e0a8b3a61e 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -1,7 +1,8 @@ config BCACHE tristate "Block device as cache" - ---help--- + select CRC64 + help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. @@ -10,7 +11,7 @@ config BCACHE config BCACHE_DEBUG bool "Bcache debugging" depends on BCACHE - ---help--- + help Don't select this option unless you're a developer Enables extra debugging tools, allows expensive runtime checks to be @@ -20,7 +21,7 @@ config BCACHE_CLOSURES_DEBUG bool "Debug closures" depends on BCACHE select DEBUG_FS - ---help--- + help Keeps all active closures in a linked list and provides a debugfs interface to list them, which makes it possible to see asynchronous operations that get stuck. diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 7fa2631b422c..7a28232d868b 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,8 +87,8 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned next = c->nbuckets * c->sb.bucket_size / 1024; - unsigned i; + unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned int i; int r; atomic_sub(sectors, &c->rescale); @@ -169,7 +169,7 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) #define bucket_prio(b) \ ({ \ - unsigned min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ \ (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ }) @@ -244,6 +244,7 @@ static void invalidate_buckets_random(struct cache *ca) while (!fifo_full(&ca->free_inc)) { size_t n; + get_random_bytes(&n, sizeof(n)); n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); @@ -301,7 +302,7 @@ do { \ static int bch_allocator_push(struct cache *ca, long bucket) { - unsigned i; + unsigned int i; /* Prios/gens are actually the most important reserve */ if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) @@ -385,7 +386,7 @@ out: /* Allocation */ -long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) { DEFINE_WAIT(w); struct bucket *b; @@ -421,7 +422,7 @@ out: if (expensive_debug_checks(ca->set)) { size_t iter; long i; - unsigned j; + unsigned int j; for (iter = 0; iter < prio_buckets(ca) * 2; iter++) BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); @@ -470,14 +471,14 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) void bch_bucket_free(struct cache_set *c, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) __bch_bucket_free(PTR_CACHE(c, k, i), PTR_BUCKET(c, k, i)); } -int __bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, int n, bool wait) { int i; @@ -510,10 +511,11 @@ err: return -1; } -int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, struct bkey *k, int n, bool wait) { int ret; + mutex_lock(&c->bucket_lock); ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); mutex_unlock(&c->bucket_lock); @@ -524,8 +526,8 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned reserve, struct open_bucket { struct list_head list; - unsigned last_write_point; - unsigned sectors_free; + unsigned int last_write_point; + unsigned int sectors_free; BKEY_PADDED(key); }; @@ -556,7 +558,7 @@ struct open_bucket { */ static struct open_bucket *pick_data_bucket(struct cache_set *c, const struct bkey *search, - unsigned write_point, + unsigned int write_point, struct bkey *alloc) { struct open_bucket *ret, *ret_task = NULL; @@ -595,12 +597,16 @@ found: * * If s->writeback is true, will not fail. */ -bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, - unsigned write_point, unsigned write_prio, bool wait) +bool bch_alloc_sectors(struct cache_set *c, + struct bkey *k, + unsigned int sectors, + unsigned int write_point, + unsigned int write_prio, + bool wait) { struct open_bucket *b; BKEY_PADDED(key) alloc; - unsigned i; + unsigned int i; /* * We might have to allocate a new bucket, which we can't do with a @@ -613,7 +619,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, spin_lock(&c->data_bucket_lock); while (!(b = pick_data_bucket(c, k, write_point, &alloc.key))) { - unsigned watermark = write_prio + unsigned int watermark = write_prio ? RESERVE_MOVINGGC : RESERVE_NONE; @@ -702,6 +708,7 @@ int bch_open_buckets_alloc(struct cache_set *c) for (i = 0; i < MAX_OPEN_BUCKETS; i++) { struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL); + if (!b) return -ENOMEM; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 05f82ff6f016..83504dd8100a 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -252,7 +252,7 @@ struct bcache_device { struct kobject kobj; struct cache_set *c; - unsigned id; + unsigned int id; #define BCACHEDEVNAME_SIZE 12 char name[BCACHEDEVNAME_SIZE]; @@ -264,18 +264,19 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned nr_stripes; - unsigned stripe_size; + unsigned int nr_stripes; + unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; struct bio_set bio_split; - unsigned data_csum:1; + unsigned int data_csum:1; - int (*cache_miss)(struct btree *, struct search *, - struct bio *, unsigned); - int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long); + int (*cache_miss)(struct btree *b, struct search *s, + struct bio *bio, unsigned int sectors); + int (*ioctl)(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg); }; struct io { @@ -284,7 +285,7 @@ struct io { struct list_head lru; unsigned long jiffies; - unsigned sequential; + unsigned int sequential; sector_t last; }; @@ -358,18 +359,18 @@ struct cached_dev { struct cache_accounting accounting; /* The rest of this all shows up in sysfs */ - unsigned sequential_cutoff; - unsigned readahead; + unsigned int sequential_cutoff; + unsigned int readahead; - unsigned io_disable:1; - unsigned verify:1; - unsigned bypass_torture_test:1; + unsigned int io_disable:1; + unsigned int verify:1; + unsigned int bypass_torture_test:1; - unsigned partial_stripes_expensive:1; - unsigned writeback_metadata:1; - unsigned writeback_running:1; + unsigned int partial_stripes_expensive:1; + unsigned int writeback_metadata:1; + unsigned int writeback_running:1; unsigned char writeback_percent; - unsigned writeback_delay; + unsigned int writeback_delay; uint64_t writeback_rate_target; int64_t writeback_rate_proportional; @@ -377,16 +378,16 @@ struct cached_dev { int64_t writeback_rate_integral_scaled; int32_t writeback_rate_change; - unsigned writeback_rate_update_seconds; - unsigned writeback_rate_i_term_inverse; - unsigned writeback_rate_p_term_inverse; - unsigned writeback_rate_minimum; + unsigned int writeback_rate_update_seconds; + unsigned int writeback_rate_i_term_inverse; + unsigned int writeback_rate_p_term_inverse; + unsigned int writeback_rate_minimum; enum stop_on_failure stop_when_cache_set_failed; #define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 atomic_t io_errors; - unsigned error_limit; - unsigned offline_seconds; + unsigned int error_limit; + unsigned int offline_seconds; char backing_dev_name[BDEVNAME_SIZE]; }; @@ -447,7 +448,7 @@ struct cache { * until a gc finishes - otherwise we could pointlessly burn a ton of * cpu */ - unsigned invalidate_needs_gc; + unsigned int invalidate_needs_gc; bool discard; /* Get rid of? */ @@ -472,7 +473,7 @@ struct gc_stat { size_t nkeys; uint64_t data; /* sectors */ - unsigned in_use; /* percent */ + unsigned int in_use; /* percent */ }; /* @@ -518,7 +519,7 @@ struct cache_set { int caches_loaded; struct bcache_device **devices; - unsigned devices_max_used; + unsigned int devices_max_used; atomic_t attached_dev_nr; struct list_head cached_devs; uint64_t cached_dev_sectors; @@ -548,7 +549,7 @@ struct cache_set { * Default number of pages for a new btree node - may be less than a * full bucket */ - unsigned btree_pages; + unsigned int btree_pages; /* * Lists of struct btrees; lru is the list for structs that have memory @@ -571,7 +572,7 @@ struct cache_set { struct list_head btree_cache_freed; /* Number of elements in btree_cache + btree_cache_freeable lists */ - unsigned btree_cache_used; + unsigned int btree_cache_used; /* * If we need to allocate memory for a new btree node and that @@ -613,8 +614,8 @@ struct cache_set { uint16_t min_prio; /* - * max(gen - last_gc) for all buckets. When it gets too big we have to gc - * to keep gens from wrapping around. + * max(gen - last_gc) for all buckets. When it gets too big we have to + * gc to keep gens from wrapping around. */ uint8_t need_gc; struct gc_stat gc_stats; @@ -649,7 +650,7 @@ struct cache_set { struct mutex verify_lock; #endif - unsigned nr_uuids; + unsigned int nr_uuids; struct uuid_entry *uuids; BKEY_PADDED(uuid_bucket); struct closure uuid_write; @@ -670,12 +671,12 @@ struct cache_set { struct journal journal; #define CONGESTED_MAX 1024 - unsigned congested_last_us; + unsigned int congested_last_us; atomic_t congested; /* The rest of this all shows up in sysfs */ - unsigned congested_read_threshold_us; - unsigned congested_write_threshold_us; + unsigned int congested_read_threshold_us; + unsigned int congested_write_threshold_us; struct time_stats btree_gc_time; struct time_stats btree_split_time; @@ -694,16 +695,16 @@ struct cache_set { ON_ERROR_PANIC, } on_error; #define DEFAULT_IO_ERROR_LIMIT 8 - unsigned error_limit; - unsigned error_decay; + unsigned int error_limit; + unsigned int error_decay; unsigned short journal_delay_ms; bool expensive_debug_checks; - unsigned verify:1; - unsigned key_merging_disabled:1; - unsigned gc_always_rewrite:1; - unsigned shrinker_disabled:1; - unsigned copy_gc_enabled:1; + unsigned int verify:1; + unsigned int key_merging_disabled:1; + unsigned int gc_always_rewrite:1; + unsigned int shrinker_disabled:1; + unsigned int copy_gc_enabled:1; #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; @@ -712,7 +713,7 @@ struct cache_set { }; struct bbio { - unsigned submit_time_us; + unsigned int submit_time_us; union { struct bkey key; uint64_t _pad[3]; @@ -729,10 +730,10 @@ struct bbio { #define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) #define btree_blocks(b) \ - ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + ((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) #define btree_default_blocks(c) \ - ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + ((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) #define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) #define bucket_bytes(c) ((c)->sb.bucket_size << 9) @@ -761,21 +762,21 @@ static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) static inline struct cache *PTR_CACHE(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return c->cache[PTR_DEV(k, ptr)]; } static inline size_t PTR_BUCKET_NR(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return sector_to_bucket(c, PTR_OFFSET(k, ptr)); } static inline struct bucket *PTR_BUCKET(struct cache_set *c, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); } @@ -783,17 +784,18 @@ static inline struct bucket *PTR_BUCKET(struct cache_set *c, static inline uint8_t gen_after(uint8_t a, uint8_t b) { uint8_t r = a - b; + return r > 128U ? 0 : r; } static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k, - unsigned i) + unsigned int i) { return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i)); } static inline bool ptr_available(struct cache_set *c, const struct bkey *k, - unsigned i) + unsigned int i) { return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i); } @@ -879,16 +881,16 @@ static inline uint8_t bucket_gc_gen(struct bucket *b) #define BUCKET_GC_GEN_MAX 96U #define kobj_attribute_write(n, fn) \ - static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) + static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn) #define kobj_attribute_rw(n, show, store) \ static struct kobj_attribute ksysfs_##n = \ - __ATTR(n, S_IWUSR|S_IRUSR, show, store) + __ATTR(n, 0600, show, store) static inline void wake_up_allocators(struct cache_set *c) { struct cache *ca; - unsigned i; + unsigned int i; for_each_cache(ca, c, i) wake_up_process(ca->alloc_thread); @@ -924,40 +926,43 @@ static inline void wait_for_kthread_stop(void) /* Forward declarations */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); -void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); -void bch_bbio_count_io_errors(struct cache_set *, struct bio *, - blk_status_t, const char *); -void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t, - const char *); -void bch_bbio_free(struct bio *, struct cache_set *); -struct bio *bch_bbio_alloc(struct cache_set *); - -void __bch_submit_bbio(struct bio *, struct cache_set *); -void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); - -uint8_t bch_inc_gen(struct cache *, struct bucket *); -void bch_rescale_priorities(struct cache_set *, int); - -bool bch_can_invalidate_bucket(struct cache *, struct bucket *); -void __bch_invalidate_one_bucket(struct cache *, struct bucket *); - -void __bch_bucket_free(struct cache *, struct bucket *); -void bch_bucket_free(struct cache_set *, struct bkey *); - -long bch_bucket_alloc(struct cache *, unsigned, bool); -int __bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); -int bch_bucket_alloc_set(struct cache_set *, unsigned, - struct bkey *, int, bool); -bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, - unsigned, unsigned, bool); +void bch_count_io_errors(struct cache *ca, blk_status_t error, + int is_read, const char *m); +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + blk_status_t error, const char *m); +void bch_bbio_free(struct bio *bio, struct cache_set *c); +struct bio *bch_bbio_alloc(struct cache_set *c); + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c); +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned int ptr); + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b); +void bch_rescale_priorities(struct cache_set *c, int sectors); + +bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b); +void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b); + +void __bch_bucket_free(struct cache *ca, struct bucket *b); +void bch_bucket_free(struct cache_set *c, struct bkey *k); + +long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); +int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, int n, bool wait); +int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, + struct bkey *k, int n, bool wait); +bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, + unsigned int sectors, unsigned int write_point, + unsigned int write_prio, bool wait); bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) -bool bch_cache_set_error(struct cache_set *, const char *, ...); +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *); -void bch_write_bdev_super(struct cached_dev *, struct closure *); +void bch_prio_write(struct cache *ca); +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; extern struct mutex bch_register_lock; @@ -969,30 +974,31 @@ extern struct kobj_type bch_cache_set_ktype; extern struct kobj_type bch_cache_set_internal_ktype; extern struct kobj_type bch_cache_ktype; -void bch_cached_dev_release(struct kobject *); -void bch_flash_dev_release(struct kobject *); -void bch_cache_set_release(struct kobject *); -void bch_cache_release(struct kobject *); +void bch_cached_dev_release(struct kobject *kobj); +void bch_flash_dev_release(struct kobject *kobj); +void bch_cache_set_release(struct kobject *kobj); +void bch_cache_release(struct kobject *kobj); -int bch_uuid_write(struct cache_set *); -void bcache_write_super(struct cache_set *); +int bch_uuid_write(struct cache_set *c); +void bcache_write_super(struct cache_set *c); int bch_flash_dev_create(struct cache_set *c, uint64_t size); -int bch_cached_dev_attach(struct cached_dev *, struct cache_set *, uint8_t *); -void bch_cached_dev_detach(struct cached_dev *); -void bch_cached_dev_run(struct cached_dev *); -void bcache_device_stop(struct bcache_device *); - -void bch_cache_set_unregister(struct cache_set *); -void bch_cache_set_stop(struct cache_set *); - -struct cache_set *bch_cache_set_alloc(struct cache_sb *); -void bch_btree_cache_free(struct cache_set *); -int bch_btree_cache_alloc(struct cache_set *); -void bch_moving_init_cache_set(struct cache_set *); -int bch_open_buckets_alloc(struct cache_set *); -void bch_open_buckets_free(struct cache_set *); +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, + uint8_t *set_uuid); +void bch_cached_dev_detach(struct cached_dev *dc); +void bch_cached_dev_run(struct cached_dev *dc); +void bcache_device_stop(struct bcache_device *d); + +void bch_cache_set_unregister(struct cache_set *c); +void bch_cache_set_stop(struct cache_set *c); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb); +void bch_btree_cache_free(struct cache_set *c); +int bch_btree_cache_alloc(struct cache_set *c); +void bch_moving_init_cache_set(struct cache_set *c); +int bch_open_buckets_alloc(struct cache_set *c); +void bch_open_buckets_free(struct cache_set *c); int bch_cache_allocator_start(struct cache *ca); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 596c93b44e9b..8f07fa6e1739 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -18,31 +18,31 @@ #ifdef CONFIG_BCACHE_DEBUG -void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned set) +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set) { struct bkey *k, *next; for (k = i->start; k < bset_bkey_last(i); k = next) { next = bkey_next(k); - printk(KERN_ERR "block %u key %u/%u: ", set, - (unsigned) ((u64 *) k - i->d), i->keys); + pr_err("block %u key %u/%u: ", set, + (unsigned int) ((u64 *) k - i->d), i->keys); if (b->ops->key_dump) b->ops->key_dump(b, k); else - printk("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); + pr_err("%llu:%llu\n", KEY_INODE(k), KEY_OFFSET(k)); if (next < bset_bkey_last(i) && bkey_cmp(k, b->ops->is_extents ? &START_KEY(next) : next) > 0) - printk(KERN_ERR "Key skipped backwards\n"); + pr_err("Key skipped backwards\n"); } } void bch_dump_bucket(struct btree_keys *b) { - unsigned i; + unsigned int i; console_lock(); for (i = 0; i <= b->nsets; i++) @@ -53,7 +53,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { - unsigned ret = 0; + unsigned int ret = 0; struct btree_iter iter; struct bkey *k; @@ -128,7 +128,7 @@ static inline void bch_btree_iter_next_check(struct btree_iter *iter) {} /* Keylists */ -int __bch_keylist_realloc(struct keylist *l, unsigned u64s) +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s) { size_t oldsize = bch_keylist_nkeys(l); size_t newsize = oldsize + u64s; @@ -180,7 +180,7 @@ void bch_keylist_pop_front(struct keylist *l) /* Key/pointer manipulation */ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, - unsigned i) + unsigned int i) { BUG_ON(i > KEY_PTRS(src)); @@ -194,7 +194,7 @@ void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, bool __bch_cut_front(const struct bkey *where, struct bkey *k) { - unsigned i, len = 0; + unsigned int i, len = 0; if (bkey_cmp(where, &START_KEY(k)) <= 0) return false; @@ -214,7 +214,7 @@ bool __bch_cut_front(const struct bkey *where, struct bkey *k) bool __bch_cut_back(const struct bkey *where, struct bkey *k) { - unsigned len = 0; + unsigned int len = 0; if (bkey_cmp(where, k) >= 0) return false; @@ -240,9 +240,9 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) #define BKEY_MANTISSA_MASK ((1 << BKEY_MANTISSA_BITS) - 1) struct bkey_float { - unsigned exponent:BKEY_EXPONENT_BITS; - unsigned m:BKEY_MID_BITS; - unsigned mantissa:BKEY_MANTISSA_BITS; + unsigned int exponent:BKEY_EXPONENT_BITS; + unsigned int m:BKEY_MID_BITS; + unsigned int mantissa:BKEY_MANTISSA_BITS; } __packed; /* @@ -311,7 +311,9 @@ void bch_btree_keys_free(struct btree_keys *b) } EXPORT_SYMBOL(bch_btree_keys_free); -int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) +int bch_btree_keys_alloc(struct btree_keys *b, + unsigned int page_order, + gfp_t gfp) { struct bset_tree *t = b->set; @@ -345,7 +347,7 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned i; + unsigned int i; b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; @@ -370,7 +372,7 @@ EXPORT_SYMBOL(bch_btree_keys_init); * return array index next to j when does in-order traverse * of a binary tree which is stored in a linear array */ -static unsigned inorder_next(unsigned j, unsigned size) +static unsigned int inorder_next(unsigned int j, unsigned int size) { if (j * 2 + 1 < size) { j = j * 2 + 1; @@ -387,7 +389,7 @@ static unsigned inorder_next(unsigned j, unsigned size) * return array index previous to j when does in-order traverse * of a binary tree which is stored in a linear array */ -static unsigned inorder_prev(unsigned j, unsigned size) +static unsigned int inorder_prev(unsigned int j, unsigned int size) { if (j * 2 < size) { j = j * 2; @@ -400,7 +402,8 @@ static unsigned inorder_prev(unsigned j, unsigned size) return j; } -/* I have no idea why this code works... and I'm the one who wrote it +/* + * I have no idea why this code works... and I'm the one who wrote it * * However, I do know what it does: * Given a binary tree constructed in an array (i.e. how you normally implement @@ -413,10 +416,12 @@ static unsigned inorder_prev(unsigned j, unsigned size) * extra is a function of size: * extra = (size - rounddown_pow_of_two(size - 1)) << 1; */ -static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) +static unsigned int __to_inorder(unsigned int j, + unsigned int size, + unsigned int extra) { - unsigned b = fls(j); - unsigned shift = fls(size - 1) - b; + unsigned int b = fls(j); + unsigned int shift = fls(size - 1) - b; j ^= 1U << (b - 1); j <<= 1; @@ -433,14 +438,16 @@ static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra) * Return the cacheline index in bset_tree->data, where j is index * from a linear array which stores the auxiliar binary tree */ -static unsigned to_inorder(unsigned j, struct bset_tree *t) +static unsigned int to_inorder(unsigned int j, struct bset_tree *t) { return __to_inorder(j, t->size, t->extra); } -static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) +static unsigned int __inorder_to_tree(unsigned int j, + unsigned int size, + unsigned int extra) { - unsigned shift; + unsigned int shift; if (j > extra) j += j - extra; @@ -457,7 +464,7 @@ static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra) * Return an index from a linear array which stores the auxiliar binary * tree, j is the cacheline index of t->data. */ -static unsigned inorder_to_tree(unsigned j, struct bset_tree *t) +static unsigned int inorder_to_tree(unsigned int j, struct bset_tree *t) { return __inorder_to_tree(j, t->size, t->extra); } @@ -468,14 +475,15 @@ void inorder_test(void) unsigned long done = 0; ktime_t start = ktime_get(); - for (unsigned size = 2; + for (unsigned int size = 2; size < 65536000; size++) { - unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1; - unsigned i = 1, j = rounddown_pow_of_two(size - 1); + unsigned int extra = + (size - rounddown_pow_of_two(size - 1)) << 1; + unsigned int i = 1, j = rounddown_pow_of_two(size - 1); if (!(size % 4096)) - printk(KERN_NOTICE "loop %u, %llu per us\n", size, + pr_notice("loop %u, %llu per us\n", size, done / ktime_us_delta(ktime_get(), start)); while (1) { @@ -518,30 +526,31 @@ void inorder_test(void) * of the previous key so we can walk backwards to it from t->tree[j]'s key. */ -static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline, - unsigned offset) +static struct bkey *cacheline_to_bkey(struct bset_tree *t, + unsigned int cacheline, + unsigned int offset) { return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8; } -static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k) +static unsigned int bkey_to_cacheline(struct bset_tree *t, struct bkey *k) { return ((void *) k - (void *) t->data) / BSET_CACHELINE; } -static unsigned bkey_to_cacheline_offset(struct bset_tree *t, - unsigned cacheline, +static unsigned int bkey_to_cacheline_offset(struct bset_tree *t, + unsigned int cacheline, struct bkey *k) { return (u64 *) k - (u64 *) cacheline_to_bkey(t, cacheline, 0); } -static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j) +static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned int j) { return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m); } -static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) +static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned int j) { return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]); } @@ -550,7 +559,7 @@ static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j) * For the write set - the one we're currently inserting keys into - we don't * maintain a full search tree, we just keep a simple lookup table in t->prev. */ -static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned int cacheline) { return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); } @@ -576,14 +585,15 @@ static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) * See make_bfloat() to check when most significant bit of f->exponent * is set or not. */ -static inline unsigned bfloat_mantissa(const struct bkey *k, +static inline unsigned int bfloat_mantissa(const struct bkey *k, struct bkey_float *f) { const uint64_t *p = &k->low - (f->exponent >> 6); + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; } -static void make_bfloat(struct bset_tree *t, unsigned j) +static void make_bfloat(struct bset_tree *t, unsigned int j) { struct bkey_float *f = &t->tree[j]; struct bkey *m = tree_to_bkey(t, j); @@ -631,7 +641,7 @@ static void make_bfloat(struct bset_tree *t, unsigned j) static void bset_alloc_tree(struct btree_keys *b, struct bset_tree *t) { if (t != b->set) { - unsigned j = roundup(t[-1].size, + unsigned int j = roundup(t[-1].size, 64 / sizeof(struct bkey_float)); t->tree = t[-1].tree + j; @@ -686,13 +696,13 @@ void bch_bset_build_written_tree(struct btree_keys *b) { struct bset_tree *t = bset_tree_last(b); struct bkey *prev = NULL, *k = t->data->start; - unsigned j, cacheline = 1; + unsigned int j, cacheline = 1; b->last_set_unwritten = 0; bset_alloc_tree(b, t); - t->size = min_t(unsigned, + t->size = min_t(unsigned int, bkey_to_cacheline(t, bset_bkey_last(t->data)), b->set->tree + btree_keys_cachelines(b) - t->tree); @@ -732,7 +742,7 @@ EXPORT_SYMBOL(bch_bset_build_written_tree); void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k) { struct bset_tree *t; - unsigned inorder, j = 1; + unsigned int inorder, j = 1; for (t = b->set; t <= bset_tree_last(b); t++) if (k < bset_bkey_last(t->data)) @@ -779,14 +789,15 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, struct bset_tree *t, struct bkey *k) { - unsigned shift = bkey_u64s(k); - unsigned j = bkey_to_cacheline(t, k); + unsigned int shift = bkey_u64s(k); + unsigned int j = bkey_to_cacheline(t, k); /* We're getting called from btree_split() or btree_gc, just bail out */ if (!t->size) return; - /* k is the key we just inserted; we need to find the entry in the + /* + * k is the key we just inserted; we need to find the entry in the * lookup table for the first key that is strictly greater than k: * it's either k's cacheline or the next one */ @@ -794,7 +805,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, table_to_bkey(t, j) <= k) j++; - /* Adjust all the lookup table entries, and find a new key for any that + /* + * Adjust all the lookup table entries, and find a new key for any that * have gotten too big */ for (; j < t->size; j++) { @@ -819,7 +831,8 @@ static void bch_bset_fix_lookup_table(struct btree_keys *b, k != bset_bkey_last(t->data); k = bkey_next(k)) if (t->size == bkey_to_cacheline(t, k)) { - t->prev[t->size] = bkey_to_cacheline_offset(t, t->size, k); + t->prev[t->size] = + bkey_to_cacheline_offset(t, t->size, k); t->size++; } } @@ -867,10 +880,10 @@ void bch_bset_insert(struct btree_keys *b, struct bkey *where, } EXPORT_SYMBOL(bch_bset_insert); -unsigned bch_btree_insert_key(struct btree_keys *b, struct bkey *k, +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, struct bkey *replace_key) { - unsigned status = BTREE_INSERT_STATUS_NO_INSERT; + unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; struct btree_iter iter; @@ -922,10 +935,10 @@ struct bset_search_iter { static struct bset_search_iter bset_search_write_set(struct bset_tree *t, const struct bkey *search) { - unsigned li = 0, ri = t->size; + unsigned int li = 0, ri = t->size; while (li + 1 != ri) { - unsigned m = (li + ri) >> 1; + unsigned int m = (li + ri) >> 1; if (bkey_cmp(table_to_bkey(t, m), search) > 0) ri = m; @@ -944,7 +957,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, { struct bkey *l, *r; struct bkey_float *f; - unsigned inorder, j, n = 1; + unsigned int inorder, j, n = 1; do { /* @@ -958,7 +971,8 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, * p = 0; * but a branch instruction is avoided. */ - unsigned p = n << 4; + unsigned int p = n << 4; + p &= ((int) (p - t->size)) >> 31; prefetch(&t->tree[p]); @@ -978,7 +992,7 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, * to work - that's done in make_bfloat() */ if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned) + n = j * 2 + (((unsigned int) (f->mantissa - bfloat_mantissa(search, f))) >> 31); else @@ -1109,6 +1123,7 @@ static struct bkey *__bch_btree_iter_init(struct btree_keys *b, struct bset_tree *start) { struct bkey *ret = NULL; + iter->size = ARRAY_SIZE(iter->data); iter->used = 0; @@ -1184,7 +1199,8 @@ void bch_bset_sort_state_free(struct bset_sort_state *state) mempool_exit(&state->pool); } -int bch_bset_sort_state_init(struct bset_sort_state *state, unsigned page_order) +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order) { spin_lock_init(&state->time.lock); @@ -1237,7 +1253,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, } static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, - unsigned start, unsigned order, bool fixup, + unsigned int start, unsigned int order, bool fixup, struct bset_sort_state *state) { uint64_t start_time; @@ -1288,7 +1304,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter, bch_time_stats_update(&state->time, start_time); } -void bch_btree_sort_partial(struct btree_keys *b, unsigned start, +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; @@ -1298,7 +1314,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned start, __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); if (start) { - unsigned i; + unsigned int i; for (i = start; i <= b->nsets; i++) keys += b->set[i].data->keys; @@ -1323,8 +1339,8 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + bch_btree_iter_init(b, &iter, NULL); btree_mergesort(b, new->set->data, &iter, false, true); @@ -1338,7 +1354,7 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state) { - unsigned crit = SORT_CRIT; + unsigned int crit = SORT_CRIT; int i; /* Don't sort if nothing to do */ @@ -1367,7 +1383,7 @@ EXPORT_SYMBOL(bch_btree_sort_lazy); void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *stats) { - unsigned i; + unsigned int i; for (i = 0; i <= b->nsets; i++) { struct bset_tree *t = &b->set[i]; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index b867f2200495..bac76aabca6d 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -163,10 +163,10 @@ struct bset_tree { */ /* size of the binary tree and prev array */ - unsigned size; + unsigned int size; /* function of size - precalculated for to_inorder() */ - unsigned extra; + unsigned int extra; /* copy of the last key in the set */ struct bkey end; @@ -187,18 +187,25 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(struct btree_iter_set, - struct btree_iter_set); - struct bkey *(*sort_fixup)(struct btree_iter *, struct bkey *); - bool (*insert_fixup)(struct btree_keys *, struct bkey *, - struct btree_iter *, struct bkey *); - bool (*key_invalid)(struct btree_keys *, - const struct bkey *); - bool (*key_bad)(struct btree_keys *, const struct bkey *); - bool (*key_merge)(struct btree_keys *, - struct bkey *, struct bkey *); - void (*key_to_text)(char *, size_t, const struct bkey *); - void (*key_dump)(struct btree_keys *, const struct bkey *); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); + struct bkey *(*sort_fixup)(struct btree_iter *iter, + struct bkey *tmp); + bool (*insert_fixup)(struct btree_keys *b, + struct bkey *insert, + struct btree_iter *iter, + struct bkey *replace_key); + bool (*key_invalid)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_bad)(struct btree_keys *bk, + const struct bkey *k); + bool (*key_merge)(struct btree_keys *bk, + struct bkey *l, struct bkey *r); + void (*key_to_text)(char *buf, + size_t size, + const struct bkey *k); + void (*key_dump)(struct btree_keys *keys, + const struct bkey *k); /* * Only used for deciding whether to use START_KEY(k) or just the key @@ -211,7 +218,7 @@ struct btree_keys { const struct btree_keys_ops *ops; uint8_t page_order; uint8_t nsets; - unsigned last_set_unwritten:1; + unsigned int last_set_unwritten:1; bool *expensive_debug_checks; /* @@ -239,12 +246,14 @@ static inline bool bkey_written(struct btree_keys *b, struct bkey *k) return !b->last_set_unwritten || k < b->set[b->nsets].data->start; } -static inline unsigned bset_byte_offset(struct btree_keys *b, struct bset *i) +static inline unsigned int bset_byte_offset(struct btree_keys *b, + struct bset *i) { return ((size_t) i) - ((size_t) b->set->data); } -static inline unsigned bset_sector_offset(struct btree_keys *b, struct bset *i) +static inline unsigned int bset_sector_offset(struct btree_keys *b, + struct bset *i) { return bset_byte_offset(b, i) >> 9; } @@ -273,25 +282,27 @@ static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b) } static inline struct bset *bset_next_set(struct btree_keys *b, - unsigned block_bytes) + unsigned int block_bytes) { struct bset *i = bset_tree_last(b)->data; return ((void *) i) + roundup(set_bytes(i), block_bytes); } -void bch_btree_keys_free(struct btree_keys *); -int bch_btree_keys_alloc(struct btree_keys *, unsigned, gfp_t); -void bch_btree_keys_init(struct btree_keys *, const struct btree_keys_ops *, - bool *); - -void bch_bset_init_next(struct btree_keys *, struct bset *, uint64_t); -void bch_bset_build_written_tree(struct btree_keys *); -void bch_bset_fix_invalidated_key(struct btree_keys *, struct bkey *); -bool bch_bkey_try_merge(struct btree_keys *, struct bkey *, struct bkey *); -void bch_bset_insert(struct btree_keys *, struct bkey *, struct bkey *); -unsigned bch_btree_insert_key(struct btree_keys *, struct bkey *, - struct bkey *); +void bch_btree_keys_free(struct btree_keys *b); +int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order, + gfp_t gfp); +void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, + bool *expensive_debug_checks); + +void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic); +void bch_bset_build_written_tree(struct btree_keys *b); +void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k); +bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r); +void bch_bset_insert(struct btree_keys *b, struct bkey *where, + struct bkey *insert); +unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, + struct bkey *replace_key); enum { BTREE_INSERT_STATUS_NO_INSERT = 0, @@ -313,18 +324,21 @@ struct btree_iter { } data[MAX_BSETS]; }; -typedef bool (*ptr_filter_fn)(struct btree_keys *, const struct bkey *); +typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); -struct bkey *bch_btree_iter_next(struct btree_iter *); -struct bkey *bch_btree_iter_next_filter(struct btree_iter *, - struct btree_keys *, ptr_filter_fn); +struct bkey *bch_btree_iter_next(struct btree_iter *iter); +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree_keys *b, + ptr_filter_fn fn); -void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *); -struct bkey *bch_btree_iter_init(struct btree_keys *, struct btree_iter *, - struct bkey *); +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end); +struct bkey *bch_btree_iter_init(struct btree_keys *b, + struct btree_iter *iter, + struct bkey *search); -struct bkey *__bch_bset_search(struct btree_keys *, struct bset_tree *, - const struct bkey *); +struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, + const struct bkey *search); /* * Returns the first key that is strictly greater than search @@ -349,21 +363,23 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, struct bset_sort_state { mempool_t pool; - unsigned page_order; - unsigned crit_factor; + unsigned int page_order; + unsigned int crit_factor; struct time_stats time; }; -void bch_bset_sort_state_free(struct bset_sort_state *); -int bch_bset_sort_state_init(struct bset_sort_state *, unsigned); -void bch_btree_sort_lazy(struct btree_keys *, struct bset_sort_state *); -void bch_btree_sort_into(struct btree_keys *, struct btree_keys *, - struct bset_sort_state *); -void bch_btree_sort_and_fix_extents(struct btree_keys *, struct btree_iter *, - struct bset_sort_state *); -void bch_btree_sort_partial(struct btree_keys *, unsigned, - struct bset_sort_state *); +void bch_bset_sort_state_free(struct bset_sort_state *state); +int bch_bset_sort_state_init(struct bset_sort_state *state, + unsigned int page_order); +void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state); +void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, + struct bset_sort_state *state); +void bch_btree_sort_and_fix_extents(struct btree_keys *b, + struct btree_iter *iter, + struct bset_sort_state *state); +void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, + struct bset_sort_state *state); static inline void bch_btree_sort(struct btree_keys *b, struct bset_sort_state *state) @@ -377,13 +393,13 @@ struct bset_stats { size_t floats, failed; }; -void bch_btree_keys_stats(struct btree_keys *, struct bset_stats *); +void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); /* Bkey utility code */ #define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) -static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned idx) +static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) { return bkey_idx(i->start, idx); } @@ -401,10 +417,10 @@ static __always_inline int64_t bkey_cmp(const struct bkey *l, : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r); } -void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, - unsigned); -bool __bch_cut_front(const struct bkey *, struct bkey *); -bool __bch_cut_back(const struct bkey *, struct bkey *); +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned int i); +bool __bch_cut_front(const struct bkey *where, struct bkey *k); +bool __bch_cut_back(const struct bkey *where, struct bkey *k); static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) { @@ -522,18 +538,20 @@ static inline size_t bch_keylist_bytes(struct keylist *l) return bch_keylist_nkeys(l) * sizeof(uint64_t); } -struct bkey *bch_keylist_pop(struct keylist *); -void bch_keylist_pop_front(struct keylist *); -int __bch_keylist_realloc(struct keylist *, unsigned); +struct bkey *bch_keylist_pop(struct keylist *l); +void bch_keylist_pop_front(struct keylist *l); +int __bch_keylist_realloc(struct keylist *l, unsigned int u64s); /* Debug stuff */ #ifdef CONFIG_BCACHE_DEBUG -int __bch_count_data(struct btree_keys *); -void __printf(2, 3) __bch_check_keys(struct btree_keys *, const char *, ...); -void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); -void bch_dump_bucket(struct btree_keys *); +int __bch_count_data(struct btree_keys *b); +void __printf(2, 3) __bch_check_keys(struct btree_keys *b, + const char *fmt, + ...); +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); +void bch_dump_bucket(struct btree_keys *b); #else @@ -541,7 +559,7 @@ static inline int __bch_count_data(struct btree_keys *b) { return -1; } static inline void __printf(2, 3) __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {} static inline void bch_dump_bucket(struct btree_keys *b) {} -void bch_dump_bset(struct btree_keys *, struct bset *, unsigned); +void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set); #endif diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index c19f7716df88..e7d4817681f2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -183,7 +183,7 @@ static void bch_btree_init_next(struct btree *b) void bkey_put(struct cache_set *c, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) @@ -287,6 +287,7 @@ err: static void btree_node_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; + closure_put(cl); } @@ -435,7 +436,10 @@ static void do_btree_node_write(struct btree *b) continue_at(cl, btree_node_write_done, NULL); } else { - /* No problem for multipage bvec since the bio is just allocated */ + /* + * No problem for multipage bvec since the bio is + * just allocated + */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -479,7 +483,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent) void bch_btree_node_write(struct btree *b, struct closure *parent) { - unsigned nsets = b->keys.nsets; + unsigned int nsets = b->keys.nsets; lockdep_assert_held(&b->lock); @@ -581,7 +585,7 @@ static void mca_bucket_free(struct btree *b) list_move(&b->list, &b->c->btree_cache_freeable); } -static unsigned btree_order(struct bkey *k) +static unsigned int btree_order(struct bkey *k) { return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); } @@ -589,7 +593,7 @@ static unsigned btree_order(struct bkey *k) static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { if (!bch_btree_keys_alloc(&b->keys, - max_t(unsigned, + max_t(unsigned int, ilog2(b->c->btree_pages), btree_order(k)), gfp)) { @@ -604,6 +608,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) return NULL; @@ -620,7 +625,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, return b; } -static int mca_reap(struct btree *b, unsigned min_order, bool flush) +static int mca_reap(struct btree *b, unsigned int min_order, bool flush) { struct closure cl; @@ -746,6 +751,7 @@ void bch_btree_cache_free(struct cache_set *c) { struct btree *b; struct closure cl; + closure_init_stack(&cl); if (c->shrink.list.next) @@ -786,7 +792,7 @@ void bch_btree_cache_free(struct cache_set *c) int bch_btree_cache_alloc(struct cache_set *c) { - unsigned i; + unsigned int i; for (i = 0; i < mca_reserve(c); i++) if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) @@ -1124,6 +1130,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, struct btree_op *op) { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); + if (!IS_ERR_OR_NULL(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); @@ -1136,7 +1143,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, static void make_btree_freeing_key(struct btree *b, struct bkey *k) { - unsigned i; + unsigned int i; mutex_lock(&b->c->bucket_lock); @@ -1157,7 +1164,7 @@ static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; struct cache *ca; - unsigned i, reserve = (c->root->level - b->level) * 2 + 1; + unsigned int i, reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); @@ -1181,7 +1188,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) { uint8_t stale = 0; - unsigned i; + unsigned int i; struct bucket *g; /* @@ -1219,7 +1226,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ - SET_GC_SECTORS_USED(g, min_t(unsigned, + SET_GC_SECTORS_USED(g, min_t(unsigned int, GC_SECTORS_USED(g) + KEY_SIZE(k), MAX_GC_SECTORS_USED)); @@ -1233,7 +1240,7 @@ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i) && @@ -1259,7 +1266,7 @@ void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; - unsigned keys = 0, good_keys = 0; + unsigned int keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; @@ -1302,16 +1309,18 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) struct gc_merge_info { struct btree *b; - unsigned keys; + unsigned int keys; }; -static int bch_btree_insert_node(struct btree *, struct btree_op *, - struct keylist *, atomic_t *, struct bkey *); +static int bch_btree_insert_node(struct btree *b, struct btree_op *op, + struct keylist *insert_keys, + atomic_t *journal_ref, + struct bkey *replace_key); static int btree_gc_coalesce(struct btree *b, struct btree_op *op, struct gc_stat *gc, struct gc_merge_info *r) { - unsigned i, nodes = 0, keys = 0, blocks; + unsigned int i, nodes = 0, keys = 0, blocks; struct btree *new_nodes[GC_MERGE_NODES]; struct keylist keylist; struct closure cl; @@ -1511,11 +1520,11 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return -EINTR; } -static unsigned btree_gc_count_keys(struct btree *b) +static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; struct btree_iter iter; - unsigned ret = 0; + unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1678,7 +1687,7 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; if (!c->gc_mark_valid) return; @@ -1704,7 +1713,7 @@ static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; - unsigned i; + unsigned int i; mutex_lock(&c->bucket_lock); @@ -1722,7 +1731,7 @@ static void bch_btree_gc_finish(struct cache_set *c) struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; - unsigned j; + unsigned int j; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; @@ -1814,7 +1823,7 @@ static void bch_btree_gc(struct cache_set *c) static bool gc_should_run(struct cache_set *c) { struct cache *ca; - unsigned i; + unsigned int i; for_each_cache(ca, c, i) if (ca->invalidate_needs_gc) @@ -1905,7 +1914,7 @@ void bch_initial_gc_finish(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; bch_btree_gc_finish(c); @@ -1945,7 +1954,7 @@ void bch_initial_gc_finish(struct cache_set *c) static bool btree_insert_key(struct btree *b, struct bkey *k, struct bkey *replace_key) { - unsigned status; + unsigned int status; BUG_ON(bkey_cmp(k, &b->key) > 0); @@ -2044,7 +2053,7 @@ static int btree_split(struct btree *b, struct btree_op *op, block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; if (split) { - unsigned keys = 0; + unsigned int keys = 0; trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); @@ -2222,10 +2231,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) { + b->seq != seq + 1) { op->lock = b->level; goto out; - } + } } SET_KEY_PTRS(check_key, 1); @@ -2300,7 +2309,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys, void bch_btree_set_root(struct btree *b) { - unsigned i; + unsigned int i; struct closure cl; closure_init_stack(&cl); @@ -2412,7 +2421,7 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct refill { struct btree_op op; - unsigned nr_found; + unsigned int nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; @@ -2488,6 +2497,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, if (!RB_EMPTY_ROOT(&buf->keys)) { struct keybuf_key *w; + w = RB_FIRST(&buf->keys, struct keybuf_key, node); buf->start = START_KEY(&w->key); @@ -2519,6 +2529,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, { bool ret = false; struct keybuf_key *p, *w, s; + s.key = *start; if (bkey_cmp(end, &buf->start) <= 0 || @@ -2545,6 +2556,7 @@ bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, struct keybuf_key *bch_keybuf_next(struct keybuf *buf) { struct keybuf_key *w; + spin_lock(&buf->lock); w = RB_FIRST(&buf->keys, struct keybuf_key, node); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 68e9d926134d..a68d6c55783b 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -184,7 +184,7 @@ static inline struct bset *btree_bset_last(struct btree *b) return bset_tree_last(&b->keys)->data; } -static inline unsigned bset_block_offset(struct btree *b, struct bset *i) +static inline unsigned int bset_block_offset(struct btree *b, struct bset *i) { return bset_sector_offset(&b->keys, i) >> b->c->block_bits; } @@ -213,7 +213,7 @@ struct btree_op { /* Btree level at which we start taking write locks */ short lock; - unsigned insert_collision:1; + unsigned int insert_collision:1; }; static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level) @@ -238,26 +238,28 @@ static inline void rw_unlock(bool w, struct btree *b) (w ? up_write : up_read)(&b->lock); } -void bch_btree_node_read_done(struct btree *); -void __bch_btree_node_write(struct btree *, struct closure *); -void bch_btree_node_write(struct btree *, struct closure *); - -void bch_btree_set_root(struct btree *); -struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, - int, bool, struct btree *); -struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, - struct bkey *, int, bool, struct btree *); - -int bch_btree_insert_check_key(struct btree *, struct btree_op *, - struct bkey *); -int bch_btree_insert(struct cache_set *, struct keylist *, - atomic_t *, struct bkey *); - -int bch_gc_thread_start(struct cache_set *); -void bch_initial_gc_finish(struct cache_set *); -void bch_moving_gc(struct cache_set *); -int bch_btree_check(struct cache_set *); -void bch_initial_mark_key(struct cache_set *, int, struct bkey *); +void bch_btree_node_read_done(struct btree *b); +void __bch_btree_node_write(struct btree *b, struct closure *parent); +void bch_btree_node_write(struct btree *b, struct closure *parent); + +void bch_btree_set_root(struct btree *b); +struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, + int level, bool wait, + struct btree *parent); +struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, + struct bkey *k, int level, bool write, + struct btree *parent); + +int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bkey *check_key); +int bch_btree_insert(struct cache_set *c, struct keylist *keys, + atomic_t *journal_ref, struct bkey *replace_key); + +int bch_gc_thread_start(struct cache_set *c); +void bch_initial_gc_finish(struct cache_set *c); +void bch_moving_gc(struct cache_set *c); +int bch_btree_check(struct cache_set *c); +void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k); static inline void wake_up_gc(struct cache_set *c) { @@ -272,9 +274,9 @@ static inline void wake_up_gc(struct cache_set *c) #define MAP_END_KEY 1 -typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); -int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, - struct bkey *, btree_map_nodes_fn *, int); +typedef int (btree_map_nodes_fn)(struct btree_op *b_op, struct btree *b); +int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_nodes_fn *fn, int flags); static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn) @@ -290,21 +292,23 @@ static inline int bch_btree_map_leaf_nodes(struct btree_op *op, return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES); } -typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, - struct bkey *); -int bch_btree_map_keys(struct btree_op *, struct cache_set *, - struct bkey *, btree_map_keys_fn *, int); - -typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); - -void bch_keybuf_init(struct keybuf *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, - struct bkey *, keybuf_pred_fn *); -bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, - struct bkey *); -void bch_keybuf_del(struct keybuf *, struct keybuf_key *); -struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, - struct bkey *, keybuf_pred_fn *); +typedef int (btree_map_keys_fn)(struct btree_op *op, struct btree *b, + struct bkey *k); +int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, + struct bkey *from, btree_map_keys_fn *fn, int flags); + +typedef bool (keybuf_pred_fn)(struct keybuf *buf, struct bkey *k); + +void bch_keybuf_init(struct keybuf *buf); +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end, keybuf_pred_fn *pred); +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end); +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w); +struct keybuf_key *bch_keybuf_next(struct keybuf *buf); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end, + keybuf_pred_fn *pred); void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 618253683d40..73f5319295bc 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Asynchronous refcounty things * @@ -162,12 +163,13 @@ static struct dentry *closure_debug; static int debug_seq_show(struct seq_file *f, void *data) { struct closure *cl; + spin_lock_irq(&closure_list_lock); list_for_each_entry(cl, &closure_list, all) { int r = atomic_read(&cl->remaining); - seq_printf(f, "%p: %pF -> %pf p %p r %i ", + seq_printf(f, "%p: %pS -> %pS p %p r %i ", cl, (void *) cl->ip, cl->fn, cl->parent, r & CLOSURE_REMAINING_MASK); @@ -177,7 +179,7 @@ static int debug_seq_show(struct seq_file *f, void *data) r & CLOSURE_RUNNING ? "R" : ""); if (r & CLOSURE_WAITING) - seq_printf(f, " W %pF\n", + seq_printf(f, " W %pS\n", (void *) cl->waiting_on); seq_printf(f, "\n"); diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 7c2c5bc7c88b..eca0d496b686 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -159,7 +159,7 @@ struct closure { #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e - unsigned magic; + unsigned int magic; struct list_head all; unsigned long ip; unsigned long waiting_on; @@ -289,10 +289,12 @@ static inline void closure_init_stack(struct closure *cl) } /** - * closure_wake_up - wake up all closures on a wait list. + * closure_wake_up - wake up all closures on a wait list, + * with memory barrier */ static inline void closure_wake_up(struct closure_waitlist *list) { + /* Memory barrier for the wait list */ smp_mb(); __closure_wake_up(list); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 12034c07257b..06da66b2488a 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -67,34 +67,35 @@ void bch_btree_verify(struct btree *b) if (inmemory->keys != sorted->keys || memcmp(inmemory->start, sorted->start, - (void *) bset_bkey_last(inmemory) - (void *) inmemory->start)) { + (void *) bset_bkey_last(inmemory) - + (void *) inmemory->start)) { struct bset *i; - unsigned j; + unsigned int j; console_lock(); - printk(KERN_ERR "*** in memory:\n"); + pr_err("*** in memory:\n"); bch_dump_bset(&b->keys, inmemory, 0); - printk(KERN_ERR "*** read back in:\n"); + pr_err("*** read back in:\n"); bch_dump_bset(&v->keys, sorted, 0); for_each_written_bset(b, ondisk, i) { - unsigned block = ((void *) i - (void *) ondisk) / + unsigned int block = ((void *) i - (void *) ondisk) / block_bytes(b->c); - printk(KERN_ERR "*** on disk block %u:\n", block); + pr_err("*** on disk block %u:\n", block); bch_dump_bset(&b->keys, i, block); } - printk(KERN_ERR "*** block %zu not written\n", + pr_err("*** block %zu not written\n", ((void *) i - (void *) ondisk) / block_bytes(b->c)); for (j = 0; j < inmemory->keys; j++) if (inmemory->d[j] != sorted->d[j]) break; - printk(KERN_ERR "b->written %u\n", b->written); + pr_err("b->written %u\n", b->written); console_unlock(); panic("verify failed at %u\n", j); @@ -176,9 +177,9 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, while (size) { struct keybuf_key *w; - unsigned bytes = min(i->bytes, size); - + unsigned int bytes = min(i->bytes, size); int err = copy_to_user(buf, i->buf, bytes); + if (err) return err; @@ -237,8 +238,8 @@ void bch_debug_init_cache_set(struct cache_set *c) { if (!IS_ERR_OR_NULL(bcache_debug)) { char name[50]; - snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); c->debug = debugfs_create_file(name, 0400, bcache_debug, c, &cache_set_debug_ops); } diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index acc48d3fa274..fb3d4dff4b26 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -8,8 +8,8 @@ struct cache_set; #ifdef CONFIG_BCACHE_DEBUG -void bch_btree_verify(struct btree *); -void bch_data_verify(struct cached_dev *, struct bio *); +void bch_btree_verify(struct btree *b); +void bch_data_verify(struct cached_dev *dc, struct bio *bio); #define expensive_debug_checks(c) ((c)->expensive_debug_checks) #define key_merging_disabled(c) ((c)->key_merging_disabled) @@ -27,7 +27,7 @@ static inline void bch_data_verify(struct cached_dev *dc, struct bio *bio) {} #endif #ifdef CONFIG_DEBUG_FS -void bch_debug_init_cache_set(struct cache_set *); +void bch_debug_init_cache_set(struct cache_set *c); #else static inline void bch_debug_init_cache_set(struct cache_set *c) {} #endif diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 1d096742eb41..c809724e6571 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -46,7 +46,7 @@ static bool bch_key_sort_cmp(struct btree_iter_set l, static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -67,7 +67,7 @@ static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) { - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) { @@ -96,7 +96,7 @@ static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) { - unsigned i = 0; + unsigned int i = 0; char *out = buf, *end = buf + size; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) @@ -126,22 +126,22 @@ void bch_extent_to_text(char *buf, size_t size, const struct bkey *k) static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k) { struct btree *b = container_of(keys, struct btree, keys); - unsigned j; + unsigned int j; char buf[80]; bch_extent_to_text(buf, sizeof(buf), k); - printk(" %s", buf); + pr_err(" %s", buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); - printk(" bucket %zu", n); + pr_err(" bucket %zu", n); if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) - printk(" prio %i", + pr_err(" prio %i", PTR_BUCKET(b->c, k, j)->prio); } - printk(" %s\n", bch_ptr_status(b->c, k)); + pr_err(" %s\n", bch_ptr_status(b->c, k)); } /* Btree ptrs */ @@ -166,12 +166,13 @@ bad: static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); + return __bch_btree_ptr_invalid(b->c, k); } static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k) { - unsigned i; + unsigned int i; char buf[80]; struct bucket *g; @@ -204,7 +205,7 @@ err: static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i; + unsigned int i; if (!bkey_cmp(k, &ZERO_KEY) || !KEY_PTRS(k) || @@ -327,13 +328,14 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, struct cache_set *c = container_of(b, struct btree, keys)->c; uint64_t old_offset; - unsigned old_size, sectors_found = 0; + unsigned int old_size, sectors_found = 0; BUG_ON(!KEY_OFFSET(insert)); BUG_ON(!KEY_SIZE(insert)); while (1) { struct bkey *k = bch_btree_iter_next(iter); + if (!k) break; @@ -363,7 +365,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, * k might have been split since we inserted/found the * key we're replacing */ - unsigned i; + unsigned int i; uint64_t offset = KEY_START(k) - KEY_START(replace_key); @@ -498,11 +500,12 @@ bad: static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); + return __bch_extent_invalid(b->c, k); } static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k, - unsigned ptr) + unsigned int ptr) { struct bucket *g = PTR_BUCKET(b->c, k, ptr); char buf[80]; @@ -534,7 +537,7 @@ err: static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i, stale; + unsigned int i, stale; if (!KEY_PTRS(k) || bch_extent_invalid(bk, k)) @@ -574,10 +577,12 @@ static uint64_t merge_chksums(struct bkey *l, struct bkey *r) ~((uint64_t)1 << 63); } -static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r) +static bool bch_extent_merge(struct btree_keys *bk, + struct bkey *l, + struct bkey *r) { struct btree *b = container_of(bk, struct btree, keys); - unsigned i; + unsigned int i; if (key_merging_disabled(b->c)) return false; diff --git a/drivers/md/bcache/extents.h b/drivers/md/bcache/extents.h index 0cd3575afa1d..4d667e05bb73 100644 --- a/drivers/md/bcache/extents.h +++ b/drivers/md/bcache/extents.h @@ -8,8 +8,8 @@ extern const struct btree_keys_ops bch_extent_keys_ops; struct bkey; struct cache_set; -void bch_extent_to_text(char *, size_t, const struct bkey *); -bool __bch_btree_ptr_invalid(struct cache_set *, const struct bkey *); -bool __bch_extent_invalid(struct cache_set *, const struct bkey *); +void bch_extent_to_text(char *buf, size_t size, const struct bkey *k); +bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k); +bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k); #endif /* _BCACHE_EXTENTS_H */ diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 9612873afee2..c25097968319 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -17,6 +17,7 @@ void bch_bbio_free(struct bio *bio, struct cache_set *c) { struct bbio *b = container_of(bio, struct bbio, bio); + mempool_free(b, &c->bio_meta); } @@ -42,9 +43,10 @@ void __bch_submit_bbio(struct bio *bio, struct cache_set *c) } void bch_submit_bbio(struct bio *bio, struct cache_set *c, - struct bkey *k, unsigned ptr) + struct bkey *k, unsigned int ptr) { struct bbio *b = container_of(bio, struct bbio, bio); + bch_bkey_copy_single_ptr(&b->key, k, ptr); __bch_submit_bbio(bio, c); } @@ -52,7 +54,7 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, /* IO errors */ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) { - unsigned errors; + unsigned int errors; WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); @@ -75,16 +77,16 @@ void bch_count_io_errors(struct cache *ca, */ if (ca->set->error_decay) { - unsigned count = atomic_inc_return(&ca->io_count); + unsigned int count = atomic_inc_return(&ca->io_count); while (count > ca->set->error_decay) { - unsigned errors; - unsigned old = count; - unsigned new = count - ca->set->error_decay; + unsigned int errors; + unsigned int old = count; + unsigned int new = count - ca->set->error_decay; /* * First we subtract refresh from count; each time we - * succesfully do so, we rescale the errors once: + * successfully do so, we rescale the errors once: */ count = atomic_cmpxchg(&ca->io_count, old, new); @@ -104,7 +106,7 @@ void bch_count_io_errors(struct cache *ca, } if (error) { - unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, + unsigned int errors = atomic_add_return(1 << IO_ERROR_SHIFT, &ca->io_errors); errors >>= IO_ERROR_SHIFT; @@ -126,18 +128,18 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, struct cache *ca = PTR_CACHE(c, &b->key, 0); int is_read = (bio_data_dir(bio) == READ ? 1 : 0); - unsigned threshold = op_is_write(bio_op(bio)) + unsigned int threshold = op_is_write(bio_op(bio)) ? c->congested_write_threshold_us : c->congested_read_threshold_us; if (threshold) { - unsigned t = local_clock_us(); - + unsigned int t = local_clock_us(); int us = t - b->submit_time_us; int congested = atomic_read(&c->congested); if (us > (int) threshold) { int ms = us / 1024; + c->congested_last_us = t; ms = min(ms, CONGESTED_MAX + congested); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 10748c626a1d..6116bbf870d8 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -28,11 +28,12 @@ static void journal_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; + closure_put(cl); } static int journal_read_bucket(struct cache *ca, struct list_head *list, - unsigned bucket_index) + unsigned int bucket_index) { struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio; @@ -40,7 +41,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, struct journal_replay *i; struct jset *j, *data = ca->set->journal.w[0].data; struct closure cl; - unsigned len, left, offset = 0; + unsigned int len, left, offset = 0; int ret = 0; sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); @@ -50,7 +51,7 @@ static int journal_read_bucket(struct cache *ca, struct list_head *list, while (offset < ca->sb.bucket_size) { reread: left = ca->sb.bucket_size - offset; - len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS); + len = min_t(unsigned int, left, PAGE_SECTORS << JSET_BITS); bio_reset(bio); bio->bi_iter.bi_sector = bucket + offset; @@ -154,12 +155,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) }) struct cache *ca; - unsigned iter; + unsigned int iter; for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS); - unsigned i, l, r, m; + unsigned int i, l, r, m; uint64_t seq; bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); @@ -192,7 +193,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list) for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets); l < ca->sb.njournal_buckets; - l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1)) + l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, + l + 1)) if (read_bucket(l)) goto bsearch; @@ -304,7 +306,7 @@ void bch_journal_mark(struct cache_set *c, struct list_head *list) k < bset_bkey_last(&i->j); k = bkey_next(k)) if (!__bch_extent_invalid(c, k)) { - unsigned j; + unsigned int j; for (j = 0; j < KEY_PTRS(k); j++) if (ptr_available(c, k, j)) @@ -492,7 +494,7 @@ static void journal_reclaim(struct cache_set *c) struct bkey *k = &c->journal.key; struct cache *ca; uint64_t last_seq; - unsigned iter, n = 0; + unsigned int iter, n = 0; atomic_t p __maybe_unused; atomic_long_inc(&c->reclaim); @@ -526,7 +528,7 @@ static void journal_reclaim(struct cache_set *c) for_each_cache(ca, c, iter) { struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + unsigned int next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; /* No space available on this device */ if (next == ja->discard_idx) @@ -580,7 +582,7 @@ static void journal_write_endio(struct bio *bio) closure_put(&w->c->journal.io); } -static void journal_write(struct closure *); +static void journal_write(struct closure *cl); static void journal_write_done(struct closure *cl) { @@ -609,11 +611,12 @@ static void journal_write_unlocked(struct closure *cl) struct cache *ca; struct journal_write *w = c->journal.cur; struct bkey *k = &c->journal.key; - unsigned i, sectors = set_blocks(w->data, block_bytes(c)) * + unsigned int i, sectors = set_blocks(w->data, block_bytes(c)) * c->sb.block_size; struct bio *bio; struct bio_list list; + bio_list_init(&list); if (!w->need_write) { @@ -705,7 +708,7 @@ static void journal_try_write(struct cache_set *c) } static struct journal_write *journal_wait_for_write(struct cache_set *c, - unsigned nkeys) + unsigned int nkeys) __acquires(&c->journal.lock) { size_t sectors; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index b5788199188f..66f0facff84b 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -110,7 +110,7 @@ struct journal { struct delayed_work work; /* Number of blocks free in the bucket(s) we're currently writing to */ - unsigned blocks_free; + unsigned int blocks_free; uint64_t seq; DECLARE_FIFO(atomic_t, pin); @@ -131,13 +131,13 @@ struct journal_device { uint64_t seq[SB_JOURNAL_BUCKETS]; /* Journal bucket we're currently writing to */ - unsigned cur_idx; + unsigned int cur_idx; /* Last journal bucket that still contains an open journal entry */ - unsigned last_idx; + unsigned int last_idx; /* Next journal bucket to be discarded */ - unsigned discard_idx; + unsigned int discard_idx; #define DISCARD_READY 0 #define DISCARD_IN_FLIGHT 1 @@ -167,14 +167,16 @@ struct cache_set; struct btree_op; struct keylist; -atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); -void bch_journal_next(struct journal *); -void bch_journal_mark(struct cache_set *, struct list_head *); -void bch_journal_meta(struct cache_set *, struct closure *); -int bch_journal_read(struct cache_set *, struct list_head *); -int bch_journal_replay(struct cache_set *, struct list_head *); - -void bch_journal_free(struct cache_set *); -int bch_journal_alloc(struct cache_set *); +atomic_t *bch_journal(struct cache_set *c, + struct keylist *keys, + struct closure *parent); +void bch_journal_next(struct journal *j); +void bch_journal_mark(struct cache_set *c, struct list_head *list); +void bch_journal_meta(struct cache_set *c, struct closure *cl); +int bch_journal_read(struct cache_set *c, struct list_head *list); +int bch_journal_replay(struct cache_set *c, struct list_head *list); + +void bch_journal_free(struct cache_set *c); +int bch_journal_alloc(struct cache_set *c); #endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index a24c3a95b2c0..7891fb512736 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -23,7 +23,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) { struct cache_set *c = container_of(buf, struct cache_set, moving_gc_keys); - unsigned i; + unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i) && @@ -38,6 +38,7 @@ static bool moving_pred(struct keybuf *buf, struct bkey *k) static void moving_io_destructor(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); + kfree(io); } @@ -186,9 +187,10 @@ static bool bucket_cmp(struct bucket *l, struct bucket *r) return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } -static unsigned bucket_heap_top(struct cache *ca) +static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } @@ -196,7 +198,7 @@ void bch_moving_gc(struct cache_set *c) { struct cache *ca; struct bucket *b; - unsigned i; + unsigned int i; if (!c->copy_gc_enabled) return; @@ -204,9 +206,9 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned sectors_to_move = 0; - unsigned reserve_sectors = ca->sb.bucket_size * - fifo_used(&ca->free[RESERVE_MOVINGGC]); + unsigned int sectors_to_move = 0; + unsigned int reserve_sectors = ca->sb.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7dbe8b6316a0..51be355a3309 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -25,9 +25,9 @@ struct kmem_cache *bch_search_cache; -static void bch_data_insert_start(struct closure *); +static void bch_data_insert_start(struct closure *cl); -static unsigned cache_mode(struct cached_dev *dc) +static unsigned int cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } @@ -45,6 +45,7 @@ static void bio_csum(struct bio *bio, struct bkey *k) bio_for_each_segment(bv, bio, iter) { void *d = kmap(bv.bv_page) + bv.bv_offset; + csum = bch_crc64_update(csum, d, bv.bv_len); kunmap(bv.bv_page); } @@ -98,7 +99,7 @@ static void bch_data_insert_keys(struct closure *cl) closure_return(cl); } -static int bch_keylist_realloc(struct keylist *l, unsigned u64s, +static int bch_keylist_realloc(struct keylist *l, unsigned int u64s, struct cache_set *c) { size_t oldsize = bch_keylist_nkeys(l); @@ -125,7 +126,7 @@ static void bch_data_invalidate(struct closure *cl) bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector); while (bio_sectors(bio)) { - unsigned sectors = min(bio_sectors(bio), + unsigned int sectors = min(bio_sectors(bio), 1U << (KEY_SIZE_BITS - 1)); if (bch_keylist_realloc(&op->insert_keys, 2, op->c)) @@ -135,7 +136,9 @@ static void bch_data_invalidate(struct closure *cl) bio->bi_iter.bi_size -= sectors << 9; bch_keylist_add(&op->insert_keys, - &KEY(op->inode, bio->bi_iter.bi_sector, sectors)); + &KEY(op->inode, + bio->bi_iter.bi_sector, + sectors)); } op->insert_data_done = true; @@ -151,7 +154,7 @@ static void bch_data_insert_error(struct closure *cl) /* * Our data write just errored, which means we've got a bunch of keys to - * insert that point to data that wasn't succesfully written. + * insert that point to data that wasn't successfully written. * * We don't have to insert those keys but we still have to invalidate * that region of the cache - so, if we just strip off all the pointers @@ -211,7 +214,7 @@ static void bch_data_insert_start(struct closure *cl) bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA); do { - unsigned i; + unsigned int i; struct bkey *k; struct bio_set *split = &op->c->bio_split; @@ -328,7 +331,7 @@ void bch_data_insert(struct closure *cl) /* Congested? */ -unsigned bch_get_congested(struct cache_set *c) +unsigned int bch_get_congested(struct cache_set *c) { int i; long rand; @@ -372,8 +375,8 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc); - unsigned sectors, congested = bch_get_congested(c); + unsigned int mode = cache_mode(dc); + unsigned int sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -469,11 +472,11 @@ struct search { struct bio *cache_miss; struct bcache_device *d; - unsigned insert_bio_sectors; - unsigned recoverable:1; - unsigned write:1; - unsigned read_dirty_data:1; - unsigned cache_missed:1; + unsigned int insert_bio_sectors; + unsigned int recoverable:1; + unsigned int write:1; + unsigned int read_dirty_data:1; + unsigned int cache_missed:1; unsigned long start_time; @@ -514,20 +517,20 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) struct search *s = container_of(op, struct search, op); struct bio *n, *bio = &s->bio.bio; struct bkey *bio_key; - unsigned ptr; + unsigned int ptr; if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE; if (KEY_INODE(k) != s->iop.inode || KEY_START(k) > bio->bi_iter.bi_sector) { - unsigned bio_sectors = bio_sectors(bio); - unsigned sectors = KEY_INODE(k) == s->iop.inode + unsigned int bio_sectors = bio_sectors(bio); + unsigned int sectors = KEY_INODE(k) == s->iop.inode ? min_t(uint64_t, INT_MAX, KEY_START(k) - bio->bi_iter.bi_sector) : INT_MAX; - int ret = s->d->cache_miss(b, s, bio, sectors); + if (ret != MAP_CONTINUE) return ret; @@ -623,6 +626,7 @@ static void request_endio(struct bio *bio) if (bio->bi_status) { struct search *s = container_of(cl, struct search, cl); + s->iop.status = bio->bi_status; /* Only cache read errors are recoverable */ s->recoverable = false; @@ -813,7 +817,8 @@ static void cached_dev_read_done(struct closure *cl) if (s->iop.bio) { bio_reset(s->iop.bio); - s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector; + s->iop.bio->bi_iter.bi_sector = + s->cache_miss->bi_iter.bi_sector; bio_copy_dev(s->iop.bio, s->cache_miss); s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9; bch_bio_map(s->iop.bio, NULL); @@ -856,10 +861,10 @@ static void cached_dev_read_done_bh(struct closure *cl) } static int cached_dev_cache_miss(struct btree *b, struct search *s, - struct bio *bio, unsigned sectors) + struct bio *bio, unsigned int sectors) { int ret = MAP_CONTINUE; - unsigned reada = 0; + unsigned int reada = 0; struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; @@ -1212,6 +1217,7 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, unsigned int cmd, unsigned long arg) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); } @@ -1226,7 +1232,7 @@ static int cached_dev_congested(void *data, int bits) return 1; if (cached_dev_get(dc)) { - unsigned i; + unsigned int i; struct cache *ca; for_each_cache(ca, d->c, i) { @@ -1253,9 +1259,9 @@ void bch_cached_dev_request_init(struct cached_dev *dc) /* Flash backed devices */ static int flash_dev_cache_miss(struct btree *b, struct search *s, - struct bio *bio, unsigned sectors) + struct bio *bio, unsigned int sectors) { - unsigned bytes = min(sectors, bio_sectors(bio)) << 9; + unsigned int bytes = min(sectors, bio_sectors(bio)) << 9; swap(bio->bi_iter.bi_size, bytes); zero_fill_bio(bio); @@ -1338,7 +1344,7 @@ static int flash_dev_congested(void *data, int bits) struct bcache_device *d = data; struct request_queue *q; struct cache *ca; - unsigned i; + unsigned int i; int ret = 0; for_each_cache(ca, d->c, i) { @@ -1361,8 +1367,7 @@ void bch_flash_dev_request_init(struct bcache_device *d) void bch_request_exit(void) { - if (bch_search_cache) - kmem_cache_destroy(bch_search_cache); + kmem_cache_destroy(bch_search_cache); } int __init bch_request_init(void) diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index dea0886b81c1..aa055cfeb099 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -8,7 +8,7 @@ struct data_insert_op { struct bio *bio; struct workqueue_struct *wq; - unsigned inode; + unsigned int inode; uint16_t write_point; uint16_t write_prio; blk_status_t status; @@ -17,15 +17,15 @@ struct data_insert_op { uint16_t flags; struct { - unsigned bypass:1; - unsigned writeback:1; - unsigned flush_journal:1; - unsigned csum:1; + unsigned int bypass:1; + unsigned int writeback:1; + unsigned int flush_journal:1; + unsigned int csum:1; - unsigned replace:1; - unsigned replace_collision:1; + unsigned int replace:1; + unsigned int replace_collision:1; - unsigned insert_data_done:1; + unsigned int insert_data_done:1; }; }; @@ -33,7 +33,7 @@ struct data_insert_op { BKEY_PADDED(replace_key); }; -unsigned bch_get_congested(struct cache_set *); +unsigned int bch_get_congested(struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index be119326297b..894410f3f829 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -33,11 +33,11 @@ * stored left shifted by 16, and scaled back in the sysfs show() function. */ -static const unsigned DAY_RESCALE = 288; -static const unsigned HOUR_RESCALE = 12; -static const unsigned FIVE_MINUTE_RESCALE = 1; -static const unsigned accounting_delay = (HZ * 300) / 22; -static const unsigned accounting_weight = 32; +static const unsigned int DAY_RESCALE = 288; +static const unsigned int HOUR_RESCALE = 12; +static const unsigned int FIVE_MINUTE_RESCALE = 1; +static const unsigned int accounting_delay = (HZ * 300) / 22; +static const unsigned int accounting_weight = 32; /* sysfs reading/writing */ @@ -152,7 +152,7 @@ static void scale_accounting(struct timer_list *t) struct cache_accounting *acc = from_timer(acc, t, timer); #define move_stat(name) do { \ - unsigned t = atomic_xchg(&acc->collector.name, 0); \ + unsigned int t = atomic_xchg(&acc->collector.name, 0); \ t <<= 16; \ acc->five_minute.name += t; \ acc->hour.name += t; \ @@ -200,6 +200,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, bool hit, bool bypass) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + mark_cache_stats(&dc->accounting.collector, hit, bypass); mark_cache_stats(&c->accounting.collector, hit, bypass); } @@ -207,6 +208,7 @@ void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_readaheads); atomic_inc(&c->accounting.collector.cache_readaheads); } @@ -214,6 +216,7 @@ void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d) void bch_mark_cache_miss_collision(struct cache_set *c, struct bcache_device *d) { struct cached_dev *dc = container_of(d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_miss_collisions); atomic_inc(&c->accounting.collector.cache_miss_collisions); } diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index 0b70f9de0c03..abfaabf7e7fc 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -23,7 +23,7 @@ struct cache_stats { unsigned long cache_miss_collisions; unsigned long sectors_bypassed; - unsigned rescale; + unsigned int rescale; }; struct cache_accounting { @@ -53,10 +53,13 @@ void bch_cache_accounting_clear(struct cache_accounting *acc); void bch_cache_accounting_destroy(struct cache_accounting *acc); -void bch_mark_cache_accounting(struct cache_set *, struct bcache_device *, - bool, bool); -void bch_mark_cache_readahead(struct cache_set *, struct bcache_device *); -void bch_mark_cache_miss_collision(struct cache_set *, struct bcache_device *); -void bch_mark_sectors_bypassed(struct cache_set *, struct cached_dev *, int); +void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, + bool hit, bool bypass); +void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d); +void bch_mark_cache_miss_collision(struct cache_set *c, + struct bcache_device *d); +void bch_mark_sectors_bypassed(struct cache_set *c, + struct cached_dev *dc, + int sectors); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 55a37641aa95..94c756c66bd7 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * bcache setup/teardown code, and some metadata io - read a superblock and * figure out what to do with it. @@ -61,7 +62,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, const char *err; struct cache_sb *s; struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); - unsigned i; + unsigned int i; if (!bh) return "IO error"; @@ -149,7 +150,8 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; err = "Invalid superblock: device too small"; - if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + if (get_capacity(bdev->bd_disk) < + sb->bucket_size * sb->nbuckets) goto err; err = "Bad UUID"; @@ -202,7 +204,7 @@ static void write_bdev_super_endio(struct bio *bio) static void __write_super(struct cache_sb *sb, struct bio *bio) { struct cache_sb *out = page_address(bio_first_page_all(bio)); - unsigned i; + unsigned int i; bio->bi_iter.bi_sector = SB_SECTOR; bio->bi_iter.bi_size = SB_SIZE; @@ -282,7 +284,7 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned i; + unsigned int i; down(&c->sb_write_mutex); closure_init(cl, &c->cl); @@ -334,7 +336,7 @@ static void uuid_io(struct cache_set *c, int op, unsigned long op_flags, { struct closure *cl = &c->uuid_write; struct uuid_entry *u; - unsigned i; + unsigned int i; char buf[80]; BUG_ON(!parent); @@ -415,8 +417,8 @@ static int __uuid_write(struct cache_set *c) { BKEY_PADDED(key) k; struct closure cl; - closure_init_stack(&cl); + closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) @@ -456,6 +458,7 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + return uuid_find(c, zero_uuid); } @@ -463,8 +466,8 @@ static struct uuid_entry *uuid_find_empty(struct cache_set *c) * Bucket priorities/gens: * * For each bucket, we store on disk its - * 8 bit gen - * 16 bit priority + * 8 bit gen + * 16 bit priority * * See alloc.c for an explanation of the gen. The priority is used to implement * lru (and in the future other) cache replacement policies; for most purposes @@ -587,7 +590,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) struct prio_set *p = ca->disk_buckets; struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; struct bucket *b; - unsigned bucket_nr = 0; + unsigned int bucket_nr = 0; for (b = ca->buckets; b < ca->buckets + ca->sb.nbuckets; @@ -599,7 +602,8 @@ static void prio_read(struct cache *ca, uint64_t bucket) prio_io(ca, bucket, REQ_OP_READ, 0); - if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8)) + if (p->csum != + bch_crc64(&p->magic, bucket_bytes(ca) - 8)) pr_warn("bad csum reading priorities"); if (p->magic != pset_magic(&ca->sb)) @@ -619,6 +623,7 @@ static void prio_read(struct cache *ca, uint64_t bucket) static int open_dev(struct block_device *b, fmode_t mode) { struct bcache_device *d = b->bd_disk->private_data; + if (test_bit(BCACHE_DEV_CLOSING, &d->flags)) return -ENXIO; @@ -629,6 +634,7 @@ static int open_dev(struct block_device *b, fmode_t mode) static void release_dev(struct gendisk *b, fmode_t mode) { struct bcache_device *d = b->private_data; + closure_put(&d->cl); } @@ -662,7 +668,7 @@ static void bcache_device_unlink(struct bcache_device *d) lockdep_assert_held(&bch_register_lock); if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) { - unsigned i; + unsigned int i; struct cache *ca; sysfs_remove_link(&d->c->kobj, d->name); @@ -676,7 +682,7 @@ static void bcache_device_unlink(struct bcache_device *d) static void bcache_device_link(struct bcache_device *d, struct cache_set *c, const char *name) { - unsigned i; + unsigned int i; struct cache *ca; for_each_cache(ca, d->c, i) @@ -715,7 +721,7 @@ static void bcache_device_detach(struct bcache_device *d) } static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, - unsigned id) + unsigned int id) { d->id = id; d->c = c; @@ -762,7 +768,7 @@ static void bcache_device_free(struct bcache_device *d) closure_debug_destroy(&d->cl); } -static int bcache_device_init(struct bcache_device *d, unsigned block_size, +static int bcache_device_init(struct bcache_device *d, unsigned int block_size, sector_t sectors) { struct request_queue *q; @@ -778,7 +784,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->nr_stripes || d->nr_stripes > max_stripes) { pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)", - (unsigned)d->nr_stripes); + (unsigned int)d->nr_stripes); return -ENOMEM; } @@ -919,6 +925,7 @@ void bch_cached_dev_run(struct cached_dev *dc) if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { struct closure cl; + closure_init_stack(&cl); SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); @@ -928,8 +935,10 @@ void bch_cached_dev_run(struct cached_dev *dc) add_disk(d->disk); bd_link_disk_holder(dc->bdev, dc->disk.disk); - /* won't show up in the uevent file, use udevadm monitor -e instead - * only class / kset properties are persistent */ + /* + * won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent + */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); kfree(env[2]); @@ -976,6 +985,7 @@ static void cached_dev_detach_finish(struct work_struct *w) { struct cached_dev *dc = container_of(w, struct cached_dev, detach); struct closure cl; + closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); @@ -1097,12 +1107,14 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, } } - /* Deadlocks since we're called via sysfs... - sysfs_remove_file(&dc->kobj, &sysfs_attach); + /* + * Deadlocks since we're called via sysfs... + * sysfs_remove_file(&dc->kobj, &sysfs_attach); */ if (bch_is_zero(u->uuid, 16)) { struct closure cl; + closure_init_stack(&cl); memcpy(u->uuid, dc->sb.uuid, 16); @@ -1124,11 +1136,11 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, list_move(&dc->list, &c->cached_devs); calc_cached_dev_sectors(c); - smp_wmb(); /* * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ + smp_wmb(); refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ @@ -1212,7 +1224,7 @@ static void cached_dev_flush(struct closure *cl) continue_at(cl, cached_dev_free, system_wq); } -static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) { int ret; struct io *io; @@ -1320,6 +1332,7 @@ void bch_flash_dev_release(struct kobject *kobj) static void flash_dev_free(struct closure *cl) { struct bcache_device *d = container_of(cl, struct bcache_device, cl); + mutex_lock(&bch_register_lock); atomic_long_sub(bcache_dev_sectors_dirty(d), &d->c->flash_dev_dirty_sectors); @@ -1459,17 +1472,18 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) pr_info("CACHE_SET_IO_DISABLE already set"); - /* XXX: we can be called from atomic context - acquire_console_sem(); - */ + /* + * XXX: we can be called from atomic context + * acquire_console_sem(); + */ - printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); + pr_err("bcache: error on %pU: ", c->sb.set_uuid); va_start(args, fmt); vprintk(fmt, args); va_end(args); - printk(", disabling caching\n"); + pr_err(", disabling caching\n"); if (c->on_error == ON_ERROR_PANIC) panic("panic forced after error\n"); @@ -1481,6 +1495,7 @@ bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) void bch_cache_set_release(struct kobject *kobj) { struct cache_set *c = container_of(kobj, struct cache_set, kobj); + kfree(c); module_put(THIS_MODULE); } @@ -1489,7 +1504,7 @@ static void cache_set_free(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, cl); struct cache *ca; - unsigned i; + unsigned int i; if (!IS_ERR_OR_NULL(c->debug)) debugfs_remove(c->debug); @@ -1532,7 +1547,7 @@ static void cache_set_flush(struct closure *cl) struct cache_set *c = container_of(cl, struct cache_set, caching); struct cache *ca; struct btree *b; - unsigned i; + unsigned int i; bch_cache_accounting_destroy(&c->accounting); @@ -1671,6 +1686,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { int iter_size; struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + if (!c) return NULL; @@ -1731,8 +1747,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) || mempool_init_slab_pool(&c->search, 32, bch_search_cache) || mempool_init_kmalloc_pool(&c->bio_meta, 2, - sizeof(struct bbio) + sizeof(struct bio_vec) * - bucket_pages(c)) || + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) || @@ -1762,7 +1778,7 @@ static void run_cache_set(struct cache_set *c) struct cached_dev *dc, *t; struct cache *ca; struct closure cl; - unsigned i; + unsigned int i; closure_init_stack(&cl); @@ -1804,7 +1820,9 @@ static void run_cache_set(struct cache_set *c) goto err; err = "error reading btree root"; - c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); + c->root = bch_btree_node_get(c, NULL, k, + j->btree_level, + true, NULL); if (IS_ERR_OR_NULL(c->root)) goto err; @@ -1853,7 +1871,7 @@ static void run_cache_set(struct cache_set *c) pr_notice("invalidating existing data"); for_each_cache(ca, c, i) { - unsigned j; + unsigned int j; ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7, 2, SB_JOURNAL_BUCKETS); @@ -1998,7 +2016,7 @@ err: void bch_cache_release(struct kobject *kobj) { struct cache *ca = container_of(kobj, struct cache, kobj); - unsigned i; + unsigned int i; if (ca->set) { BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca); @@ -2098,7 +2116,9 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page, goto err; } - if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) { + if (kobject_add(&ca->kobj, + &part_to_dev(bdev->bd_part)->kobj, + "bcache")) { err = "error calling kobject_add"; ret = -ENOMEM; goto out; @@ -2127,13 +2147,14 @@ err: /* Global interfaces/init */ -static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, - const char *, size_t); +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size); kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); -static bool bch_is_open_backing(struct block_device *bdev) { +static bool bch_is_open_backing(struct block_device *bdev) +{ struct cache_set *c, *tc; struct cached_dev *dc, *t; @@ -2147,10 +2168,11 @@ static bool bch_is_open_backing(struct block_device *bdev) { return false; } -static bool bch_is_open_cache(struct block_device *bdev) { +static bool bch_is_open_cache(struct block_device *bdev) +{ struct cache_set *c, *tc; struct cache *ca; - unsigned i; + unsigned int i; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) for_each_cache(ca, c, i) @@ -2159,7 +2181,8 @@ static bool bch_is_open_cache(struct block_device *bdev) { return false; } -static bool bch_is_open(struct block_device *bdev) { +static bool bch_is_open(struct block_device *bdev) +{ return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); } @@ -2216,6 +2239,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + if (!dc) goto err_close; @@ -2224,6 +2248,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, mutex_unlock(&bch_register_lock); } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) goto err_close; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 81d3520b0702..150cf4f4cf74 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -130,8 +130,10 @@ rw_attribute(btree_shrinker_disabled); rw_attribute(copy_gc_enabled); rw_attribute(size); -static ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], - size_t selected) +static ssize_t bch_snprint_string_list(char *buf, + size_t size, + const char * const list[], + size_t selected) { char *out = buf; size_t i; @@ -148,7 +150,7 @@ SHOW(__bch_cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + char const *states[] = { "no cache", "clean", "dirty", "inconsistent" }; int wb = dc->writeback_running; #define var(stat) (dc->stat) @@ -307,7 +309,7 @@ STORE(__cached_dev) if (v < 0) return v; - if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) { + if ((unsigned int) v != BDEV_CACHE_MODE(&dc->sb)) { SET_BDEV_CACHE_MODE(&dc->sb, v); bch_write_bdev_super(dc, NULL); } @@ -341,8 +343,9 @@ STORE(__cached_dev) add_uevent_var(env, "DRIVER=bcache"); add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), add_uevent_var(env, "CACHED_LABEL=%s", buf); - kobject_uevent_env( - &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kobject_uevent_env(&disk_to_dev(dc->disk.disk)->kobj, + KOBJ_CHANGE, + env->envp); kfree(env); } @@ -459,6 +462,7 @@ STORE(__bch_flash_dev) if (attr == &sysfs_size) { uint64_t v; + strtoi_h_or_return(buf, v); u->sectors = v >> 9; @@ -533,9 +537,9 @@ static int bch_bset_print_stats(struct cache_set *c, char *buf) op.stats.floats, op.stats.failed); } -static unsigned bch_root_usage(struct cache_set *c) +static unsigned int bch_root_usage(struct cache_set *c) { - unsigned bytes = 0; + unsigned int bytes = 0; struct bkey *k; struct btree *b; struct btree_iter iter; @@ -570,9 +574,9 @@ static size_t bch_cache_size(struct cache_set *c) return ret; } -static unsigned bch_cache_max_chain(struct cache_set *c) +static unsigned int bch_cache_max_chain(struct cache_set *c) { - unsigned ret = 0; + unsigned int ret = 0; struct hlist_head *h; mutex_lock(&c->bucket_lock); @@ -580,7 +584,7 @@ static unsigned bch_cache_max_chain(struct cache_set *c) for (h = c->bucket_hash; h < c->bucket_hash + (1 << BUCKET_HASH_BITS); h++) { - unsigned i = 0; + unsigned int i = 0; struct hlist_node *p; hlist_for_each(p, h) @@ -593,13 +597,13 @@ static unsigned bch_cache_max_chain(struct cache_set *c) return ret; } -static unsigned bch_btree_used(struct cache_set *c) +static unsigned int bch_btree_used(struct cache_set *c) { return div64_u64(c->gc_stats.key_bytes * 100, (c->gc_stats.nodes ?: 1) * btree_bytes(c)); } -static unsigned bch_average_key_size(struct cache_set *c) +static unsigned int bch_average_key_size(struct cache_set *c) { return c->gc_stats.nkeys ? div64_u64(c->gc_stats.data, c->gc_stats.nkeys) @@ -703,6 +707,7 @@ STORE(__bch_cache_set) if (attr == &sysfs_flash_vol_create) { int r; uint64_t v; + strtoi_h_or_return(buf, v); r = bch_flash_dev_create(c, v); @@ -736,6 +741,7 @@ STORE(__bch_cache_set) if (attr == &sysfs_prune_cache) { struct shrink_control sc; + sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); c->shrink.scan_objects(&c->shrink, &sc); @@ -789,12 +795,14 @@ STORE_LOCKED(bch_cache_set) SHOW(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_show(&c->kobj, attr, buf); } STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_store(&c->kobj, attr, buf, size); } @@ -996,7 +1004,7 @@ STORE(__bch_cache) if (v < 0) return v; - if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { + if ((unsigned int) v != CACHE_REPLACEMENT(&ca->sb)) { mutex_lock(&ca->set->bucket_lock); SET_CACHE_REPLACEMENT(&ca->sb, v); mutex_unlock(&ca->set->bucket_lock); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h index b54fe9602529..3fe82425859c 100644 --- a/drivers/md/bcache/sysfs.h +++ b/drivers/md/bcache/sysfs.h @@ -44,9 +44,9 @@ STORE(fn) \ static struct attribute sysfs_##_name = \ { .name = #_name, .mode = _mode } -#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) #define sysfs_printf(file, fmt, ...) \ do { \ diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index b15256bcf0e7..20eddeac1531 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * random utiility code, for bcache but in theory not specific to bcache * @@ -133,6 +134,7 @@ bool bch_is_zero(const char *p, size_t n) int bch_parse_uuid(const char *s, char *uuid) { size_t i, j, x; + memset(uuid, 0, 16); for (i = 0, j = 0; @@ -279,134 +281,3 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask) return 0; } - -/* - * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any - * use permitted, subject to terms of PostgreSQL license; see.) - - * If we have a 64-bit integer type, then a 64-bit CRC looks just like the - * usual sort of implementation. (See Ross Williams' excellent introduction - * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from - * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) - * If we have no working 64-bit type, then fake it with two 32-bit registers. - * - * The present implementation is a normal (not "reflected", in Williams' - * terms) 64-bit CRC, using initial all-ones register contents and a final - * bit inversion. The chosen polynomial is borrowed from the DLT1 spec - * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): - * - * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + - * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + - * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + - * x^7 + x^4 + x + 1 -*/ - -static const uint64_t crc_table[256] = { - 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, - 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, - 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, - 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, - 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, - 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, - 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, - 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, - 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, - 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, - 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, - 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, - 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, - 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, - 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, - 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, - 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, - 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, - 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, - 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, - 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, - 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, - 0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, - 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, - 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, - 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, - 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, - 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, - 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, - 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, - 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, - 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, - 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, - 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, - 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, - 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, - 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, - 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, - 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, - 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, - 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, - 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, - 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, - 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, - 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, - 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, - 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, - 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, - 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, - 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, - 0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, - 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, - 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, - 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, - 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, - 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, - 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, - 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, - 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, - 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, - 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, - 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, - 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, - 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, - 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, - 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, - 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, - 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, - 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, - 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, - 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, - 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, - 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, - 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, - 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, - 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, - 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, - 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, - 0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, - 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, - 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, - 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, - 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, - 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, - 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, - 0x9AFCE626CE85B507ULL, -}; - -uint64_t bch_crc64_update(uint64_t crc, const void *_data, size_t len) -{ - const unsigned char *data = _data; - - while (len--) { - int i = ((int) (crc >> 56) ^ *data++) & 0xFF; - crc = crc_table[i] ^ (crc << 8); - } - - return crc; -} - -uint64_t bch_crc64(const void *data, size_t len) -{ - uint64_t crc = 0xffffffffffffffffULL; - - crc = bch_crc64_update(crc, data, len); - - return crc ^ 0xffffffffffffffffULL; -} diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index f7b0133c9d2f..00aab6abcfe4 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/crc64.h> #include "closure.h" @@ -288,10 +289,10 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -int bch_strtoint_h(const char *, int *); -int bch_strtouint_h(const char *, unsigned int *); -int bch_strtoll_h(const char *, long long *); -int bch_strtoull_h(const char *, unsigned long long *); +int bch_strtoint_h(const char *cp, int *res); +int bch_strtouint_h(const char *cp, unsigned int *res); +int bch_strtoll_h(const char *cp, long long *res); +int bch_strtoull_h(const char *cp, unsigned long long *res); static inline int bch_strtol_h(const char *cp, long *res) { @@ -347,7 +348,7 @@ static inline int bch_strtoul_h(const char *cp, long *res) snprintf(buf, size, \ __builtin_types_compatible_p(typeof(var), int) \ ? "%i\n" : \ - __builtin_types_compatible_p(typeof(var), unsigned) \ + __builtin_types_compatible_p(typeof(var), unsigned int) \ ? "%u\n" : \ __builtin_types_compatible_p(typeof(var), long) \ ? "%li\n" : \ @@ -379,7 +380,7 @@ struct time_stats { void bch_time_stats_update(struct time_stats *stats, uint64_t time); -static inline unsigned local_clock_us(void) +static inline unsigned int local_clock_us(void) { return local_clock() >> 10; } @@ -402,7 +403,8 @@ do { \ __print_time_stat(stats, name, \ average_duration, duration_units); \ sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - div_u64((stats)->max_duration, NSEC_PER_ ## duration_units));\ + div_u64((stats)->max_duration, \ + NSEC_PER_ ## duration_units)); \ \ sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ ? div_s64(local_clock() - (stats)->last, \ @@ -542,10 +544,27 @@ dup: \ #define RB_PREV(ptr, member) \ container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) +static inline uint64_t bch_crc64(const void *p, size_t len) +{ + uint64_t crc = 0xffffffffffffffffULL; + + crc = crc64_be(crc, p, len); + return crc ^ 0xffffffffffffffffULL; +} + +static inline uint64_t bch_crc64_update(uint64_t crc, + const void *p, + size_t len) +{ + crc = crc64_be(crc, p, len); + return crc; +} + /* Does linear interpolation between powers of two */ -static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +static inline unsigned int fract_exp_two(unsigned int x, + unsigned int fract_bits) { - unsigned fract = x & ~(~0 << fract_bits); + unsigned int fract = x & ~(~0 << fract_bits); x >>= fract_bits; x = 1 << x; @@ -561,8 +580,4 @@ static inline sector_t bdev_sectors(struct block_device *bdev) { return bdev->bd_inode->i_size >> 9; } - -uint64_t bch_crc64_update(uint64_t, const void *, size_t); -uint64_t bch_crc64(const void *, size_t); - #endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 481d4cf38ac0..6be05bd7ca67 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -215,7 +215,8 @@ static void update_writeback_rate(struct work_struct *work) smp_mb(); } -static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +static unsigned int writeback_delay(struct cached_dev *dc, + unsigned int sectors) { if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || !dc->writeback_percent) @@ -249,6 +250,7 @@ static void dirty_init(struct keybuf_key *w) static void dirty_io_destructor(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); + kfree(io); } @@ -263,7 +265,7 @@ static void write_dirty_finish(struct closure *cl) /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { int ret; - unsigned i; + unsigned int i; struct keylist keys; bch_keylist_init(&keys); @@ -377,7 +379,7 @@ static void read_dirty_submit(struct closure *cl) static void read_dirty(struct cached_dev *dc) { - unsigned delay = 0; + unsigned int delay = 0; struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w; size_t size; int nk, i; @@ -442,7 +444,8 @@ static void read_dirty(struct cached_dev *dc) io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + DIV_ROUND_UP(KEY_SIZE(&w->key), + PAGE_SECTORS), GFP_KERNEL); if (!io) goto err; @@ -465,7 +468,8 @@ static void read_dirty(struct cached_dev *dc) down(&dc->in_flight); - /* We've acquired a semaphore for the maximum + /* + * We've acquired a semaphore for the maximum * simultaneous number of writebacks; from here * everything happens asynchronously. */ @@ -498,11 +502,11 @@ err: /* Scan for dirty data */ -void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, stripe, sectors_dirty; if (!d) return; @@ -514,7 +518,7 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { - int s = min_t(unsigned, abs(nr_sectors), + int s = min_t(unsigned int, abs(nr_sectors), d->stripe_size - stripe_offset); if (nr_sectors < 0) @@ -538,7 +542,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { - struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + struct cached_dev *dc = container_of(buf, + struct cached_dev, + writeback_keys); BUG_ON(KEY_INODE(k) != dc->disk.id); @@ -548,7 +554,7 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned start_stripe, stripe, next_stripe; + unsigned int start_stripe, stripe, next_stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); @@ -688,7 +694,7 @@ static int bch_writeback_thread(void *arg) read_dirty(dc); if (searched_full_index) { - unsigned delay = dc->writeback_delay * HZ; + unsigned int delay = dc->writeback_delay * HZ; while (delay && !kthread_should_stop() && @@ -712,7 +718,7 @@ static int bch_writeback_thread(void *arg) struct sectors_dirty_init { struct btree_op op; - unsigned inode; + unsigned int inode; size_t count; struct bkey start; }; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 3745d7004c47..d2b9fdbc8994 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -28,7 +28,7 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline unsigned offset_to_stripe(struct bcache_device *d, +static inline unsigned int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); @@ -37,9 +37,9 @@ static inline unsigned offset_to_stripe(struct bcache_device *d, static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, - unsigned nr_sectors) + unsigned int nr_sectors) { - unsigned stripe = offset_to_stripe(&dc->disk, offset); + unsigned int stripe = offset_to_stripe(&dc->disk, offset); while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) @@ -54,9 +54,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, } static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, - unsigned cache_mode, bool would_skip) + unsigned int cache_mode, bool would_skip) { - unsigned in_use = dc->disk.c->gc_stats.in_use; + unsigned int in_use = dc->disk.c->gc_stats.in_use; if (cache_mode != CACHE_MODE_WRITEBACK || test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) || @@ -96,10 +96,11 @@ static inline void bch_writeback_add(struct cached_dev *dc) } } -void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, + uint64_t offset, int nr_sectors); -void bch_sectors_dirty_init(struct bcache_device *); -void bch_cached_dev_writeback_init(struct cached_dev *); -int bch_cached_dev_writeback_start(struct cached_dev *); +void bch_sectors_dirty_init(struct bcache_device *d); +void bch_cached_dev_writeback_init(struct cached_dev *dc); +int bch_cached_dev_writeback_start(struct cached_dev *dc); #endif diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c index 63d6246d6dff..6369aeaa7056 100644 --- a/drivers/misc/mic/scif/scif_dma.c +++ b/drivers/misc/mic/scif/scif_dma.c @@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn, schedule_work(&scif_info.misc_work); } -static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct scif_mmu_notif *mmn; mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); scif_rma_destroy_tcw(mmn, start, end - start); + + return 0; } static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index a3454eb56fbf..be28f05bfafa 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru) /* * MMUOPS notifier callout functions */ -static void gru_invalidate_range_start(struct mmu_notifier *mn, +static int gru_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, ms_notifier); @@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn, gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms, start, end, atomic_read(&gms->ms_range_active)); gru_flush_tlb_range(gms, start, end - start); + + return 0; } static void gru_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 738e3546abb1..c2ab57705043 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -615,13 +615,11 @@ static bool acpi_pci_need_resume(struct pci_dev *dev) /* * In some cases (eg. Samsung 305V4A) leaving a bridge in suspend over * system-wide suspend/resume confuses the platform firmware, so avoid - * doing that, unless the bridge has a driver that should take care of - * the PM handling. According to Section 16.1.6 of ACPI 6.2, endpoint + * doing that. According to Section 16.1.6 of ACPI 6.2, endpoint * devices are expected to be in D3 before invoking the S3 entry path * from the firmware, so they should not be affected by this issue. */ - if (pci_is_bridge(dev) && !dev->driver && - acpi_target_system_state() != ACPI_STATE_S0) + if (pci_is_bridge(dev) && acpi_target_system_state() != ACPI_STATE_S0) return true; if (!adev || !acpi_device_power_manageable(adev)) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 46f58a9771d7..ef7143a274e0 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -66,9 +66,15 @@ static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, f->vendor == (u16) PCI_ANY_ID) && (f->device == dev->device || f->device == (u16) PCI_ANY_ID)) { - calltime = fixup_debug_start(dev, f->hook); - f->hook(dev); - fixup_debug_report(dev, calltime, f->hook); + void (*hook)(struct pci_dev *dev); +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + hook = offset_to_ptr(&f->hook_offset); +#else + hook = f->hook; +#endif + calltime = fixup_debug_start(dev, hook); + hook(dev); + fixup_debug_report(dev, calltime, hook); } } diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index a8cb8d2f2abb..cbe467ff1aba 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -1006,7 +1006,6 @@ out_free: static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) { struct mport_cdev_priv *priv; - struct mport_dev *md; struct rio_async_tx_wait w_param; struct mport_dma_req *req; dma_cookie_t cookie; @@ -1016,7 +1015,6 @@ static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg) int ret; priv = (struct mport_cdev_priv *)filp->private_data; - md = priv->md; if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param)))) return -EFAULT; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index c866a62f766d..57390c7666e5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static bool in_range(struct gntdev_grant_map *map, + unsigned long start, unsigned long end) +{ + if (!map->vma) + return false; + if (map->vma->vm_start >= end) + return false; + if (map->vma->vm_end <= start) + return false; + + return true; +} + static void unmap_if_in_range(struct gntdev_grant_map *map, unsigned long start, unsigned long end) { unsigned long mstart, mend; int err; - if (!map->vma) - return; - if (map->vma->vm_start >= end) - return; - if (map->vma->vm_end <= start) - return; mstart = max(start, map->vma->vm_start); mend = min(end, map->vma->vm_end); pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", @@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map, WARN_ON(err); } -static void mn_invl_range_start(struct mmu_notifier *mn, +static int mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct gntdev_grant_map *map; + int ret = 0; + + /* TODO do we really need a mutex here? */ + if (blockable) + mutex_lock(&priv->lock); + else if (!mutex_trylock(&priv->lock)) + return -EAGAIN; - mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } list_for_each_entry(map, &priv->freeable_maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } + +out_unlock: mutex_unlock(&priv->lock); + + return ret; } static void mn_release(struct mmu_notifier *mn, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e91028d4340a..66621e96f9af 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -167,7 +167,7 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode) * of time to convert from RISC OS epoch to Unix epoch. */ static void -adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) +adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) { unsigned int high, low; /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since @@ -195,11 +195,11 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) /* convert from RISC OS to Unix epoch */ nsec -= nsec_unix_epoch_diff_risc_os_epoch; - *tv = ns_to_timespec(nsec); + *tv = ns_to_timespec64(nsec); return; cur_time: - *tv = timespec64_to_timespec(current_time(inode)); + *tv = current_time(inode); return; too_early: @@ -242,7 +242,6 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs) struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { - struct timespec ts; struct inode *inode; inode = new_inode(sb); @@ -271,9 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); inode->i_mode = adfs_atts2mode(sb, inode); - ts = timespec64_to_timespec(inode->i_mtime); - adfs_adfs2unix_time(&ts, inode); - inode->i_mtime = timespec_to_timespec64(ts); + adfs_adfs2unix_time(&inode->i_mtime, inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 502812289850..9f9cadbfbd7a 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -27,6 +27,7 @@ #include <linux/list.h> #include <linux/completion.h> #include <linux/file.h> +#include <linux/magic.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -125,7 +126,8 @@ struct autofs_sb_info { static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { - return (struct autofs_sb_info *)(sb->s_fs_info); + return sb->s_magic != AUTOFS_SUPER_MAGIC ? + NULL : (struct autofs_sb_info *)(sb->s_fs_info); } static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) @@ -152,15 +154,9 @@ int autofs_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); + struct autofs_sb_info *sbi, unsigned int how); int autofs_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); /* Device node initialization */ diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index b332d3f6e730..d441244b79df 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -10,11 +10,9 @@ #include "autofs_i.h" -static unsigned long now; - /* Check if a dentry can be expired */ static inline int autofs_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) + unsigned long timeout, unsigned int how) { struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -22,16 +20,17 @@ static inline int autofs_can_expire(struct dentry *dentry, if (ino == NULL) return 0; - if (!do_now) { + if (!(how & AUTOFS_EXP_IMMEDIATE)) { /* Too young to die */ - if (!timeout || time_after(ino->last_used + timeout, now)) + if (!timeout || time_after(ino->last_used + timeout, jiffies)) return 0; } return 1; } /* Check a mount point for busyness */ -static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, + struct dentry *dentry, unsigned int how) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -52,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) goto done; } + /* Not a submount, has a forced expire been requested */ + if (how & AUTOFS_EXP_FORCED) { + status = 0; + goto done; + } + /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; @@ -187,10 +192,14 @@ again: static int autofs_direct_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, - int do_now) + unsigned int how) { pr_debug("top %p %pd\n", top, top); + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino; @@ -202,7 +211,7 @@ static int autofs_direct_busy(struct vfsmount *mnt, } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, how)) return 1; return 0; @@ -215,7 +224,7 @@ static int autofs_direct_busy(struct vfsmount *mnt, static int autofs_tree_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, - int do_now) + unsigned int how) { struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; @@ -237,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. */ if (d_mountpoint(p)) { - if (autofs_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p, how)) { top_ino->last_used = jiffies; dput(p); return 1; @@ -260,8 +269,12 @@ static int autofs_tree_busy(struct vfsmount *mnt, } } + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, how)) return 1; return 0; @@ -270,7 +283,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, static struct dentry *autofs_check_leaves(struct vfsmount *mnt, struct dentry *parent, unsigned long timeout, - int do_now) + unsigned int how) { struct dentry *p; @@ -282,11 +295,17 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p, how)) continue; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return p; + /* Can we expire this guy */ - if (autofs_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, how)) return p; } } @@ -294,23 +313,21 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +static struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + unsigned int how) { - unsigned long timeout; struct dentry *root = dget(sb->s_root); - int do_now = how & AUTOFS_EXP_IMMEDIATE; struct autofs_info *ino; + unsigned long timeout; if (!root) return NULL; - now = jiffies; timeout = sbi->exp_timeout; - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, how)) { spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ @@ -321,7 +338,7 @@ struct dentry *autofs_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, how)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -346,10 +363,8 @@ out: static struct dentry *should_expire(struct dentry *dentry, struct vfsmount *mnt, unsigned long timeout, - int how) + unsigned int how) { - int do_now = how & AUTOFS_EXP_IMMEDIATE; - int exp_leaves = how & AUTOFS_EXP_LEAVES; struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; @@ -367,22 +382,33 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry, how)) return NULL; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* Can we expire this guy */ - if (autofs_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); + + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. */ - if (autofs_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } @@ -391,27 +417,33 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ - if (!exp_leaves) { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + if (!(how & AUTOFS_EXP_LEAVES)) { + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } - if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, how)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves * (autofs-4.1). */ } else { - /* Path walk currently on this dentry? */ struct dentry *expired; - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } - expired = autofs_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, how); if (expired) { if (expired == dentry) dput(dentry); @@ -427,10 +459,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +static struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + unsigned int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -442,13 +474,10 @@ struct dentry *autofs_expire_indirect(struct super_block *sb, if (!root) return NULL; - now = jiffies; timeout = sbi->exp_timeout; dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { - int flags = how; - spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { @@ -457,7 +486,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb, } spin_unlock(&sbi->fs_lock); - expired = should_expire(dentry, mnt, timeout, flags); + expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; @@ -470,7 +499,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb, /* Make sure a reference is not taken on found if * things have changed. */ - flags &= ~AUTOFS_EXP_LEAVES; + how &= ~AUTOFS_EXP_LEAVES; found = should_expire(expired, mnt, timeout, how); if (!found || found != expired) /* Something has changed, continue */ @@ -575,7 +604,7 @@ int autofs_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; + ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -584,15 +613,15 @@ int autofs_expire_run(struct super_block *sb, } int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) + struct autofs_sb_info *sbi, unsigned int how) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs_expire_direct(sb, mnt, sbi, when); + dentry = autofs_expire_direct(sb, mnt, sbi, how); else - dentry = autofs_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, how); if (dentry) { struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -605,7 +634,7 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; + ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -622,10 +651,10 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { - int do_now = 0; + unsigned int how = 0; - if (arg && get_user(do_now, arg)) + if (arg && get_user(how, arg)) return -EFAULT; - return autofs_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, how); } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index b51980fc274e..846c052569dd 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -10,7 +10,6 @@ #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/parser.h> -#include <linux/magic.h> #include "autofs_i.h" diff --git a/fs/autofs/root.c b/fs/autofs/root.c index a3d414150578..782e57b911ab 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -559,6 +559,13 @@ static int autofs_dir_symlink(struct inode *dir, if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + BUG_ON(!ino); autofs_clean_ino(ino); @@ -612,9 +619,15 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - /* This allows root to remove symlinks */ - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!autofs_oz_mode(sbi)) + return -EACCES; + + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; if (atomic_dec_and_test(&ino->count)) { p_ino = autofs_dentry_ino(dentry->d_parent); @@ -697,6 +710,13 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + spin_lock(&sbi->lookup_lock); if (!simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); @@ -735,6 +755,13 @@ static int autofs_dir_mkdir(struct inode *dir, if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 67db22fe99c5..42bbe6824b4b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -50,10 +50,10 @@ * * 1) epmutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (spinlock) + * 3) ep->wq.lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->wq.lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -85,7 +85,7 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->lock") to have it working, + * mutex "epmutex" (together with "ep->wq.lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee @@ -182,11 +182,10 @@ struct epitem { * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. + * + * Access to it is protected by the lock inside wq. */ struct eventpoll { - /* Protect the access to this structure */ - spinlock_t lock; - /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event @@ -210,7 +209,7 @@ struct eventpoll { /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out - * holding ->lock. + * holding ->wq.lock. */ struct epitem *ovflist; @@ -337,9 +336,9 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1, } /* Tells us if the item is currently linked */ -static inline int ep_is_linked(struct list_head *p) +static inline int ep_is_linked(struct epitem *epi) { - return !list_empty(p); + return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) @@ -392,7 +391,6 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) return ep_events_available(ep) || busy_loop_timeout(start_time); } -#endif /* CONFIG_NET_RX_BUSY_POLL */ /* * Busy poll if globally on and supporting sockets found && no events, @@ -402,20 +400,16 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) */ static void ep_busy_loop(struct eventpoll *ep, int nonblock) { -#ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); -#endif } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) { -#ifdef CONFIG_NET_RX_BUSY_POLL if (ep->napi_id) ep->napi_id = 0; -#endif } /* @@ -423,7 +417,6 @@ static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { -#ifdef CONFIG_NET_RX_BUSY_POLL struct eventpoll *ep; unsigned int napi_id; struct socket *sock; @@ -453,9 +446,24 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; -#endif } +#else + +static inline void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +} + +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +} + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -668,10 +676,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, { __poll_t res; int pwake = 0; - unsigned long flags; struct epitem *epi, *nepi; LIST_HEAD(txlist); + lockdep_assert_irqs_enabled(); + /* * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). @@ -688,17 +697,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); list_splice_init(&ep->rdllist, &txlist); ep->ovflist = NULL; - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -712,7 +721,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ - if (!ep_is_linked(&epi->rdllink)) { + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } @@ -740,7 +749,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); if (!ep_locked) mutex_unlock(&ep->mtx); @@ -764,16 +773,12 @@ static void epi_rcu_free(struct rcu_head *head) */ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { - unsigned long flags; struct file *file = epi->ffd.file; + lockdep_assert_irqs_enabled(); + /* - * Removes poll wait queue hooks. We _have_ to do this without holding - * the "ep->lock" otherwise a deadlock might occur. This because of the - * sequence of the lock acquisition. Here we do "ep->lock" then the wait - * queue head lock when unregistering the wait queue. The wakeup callback - * will run by holding the wait queue head lock and will call our callback - * that will try to get "ep->lock". + * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); @@ -784,10 +789,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irqsave(&ep->lock, flags); - if (ep_is_linked(&epi->rdllink)) + spin_lock_irq(&ep->wq.lock); + if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -837,7 +842,7 @@ static void ep_free(struct eventpoll *ep) * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->lock". + * us during this operation. So we can avoid the lock on "ep->wq.lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ @@ -1017,7 +1022,6 @@ static int ep_alloc(struct eventpoll **pep) if (unlikely(!ep)) goto free_uid; - spin_lock_init(&ep->lock); mutex_init(&ep->mtx); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); @@ -1122,7 +1126,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v __poll_t pollflags = key_to_poll(key); int ewake = 0; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1167,7 +1171,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(&epi->rdllink)) { + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); } @@ -1199,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1417,11 +1421,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - unsigned long flags; long user_watches; struct epitem *epi; struct ep_pqueue epq; + lockdep_assert_irqs_enabled(); + user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; @@ -1484,13 +1489,13 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ - if (revents && !ep_is_linked(&epi->rdllink)) { + if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1501,7 +1506,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); atomic_long_inc(&ep->user->epoll_watches); @@ -1527,10 +1532,10 @@ error_unregister: * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - spin_lock_irqsave(&ep->lock, flags); - if (ep_is_linked(&epi->rdllink)) + spin_lock_irq(&ep->wq.lock); + if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); wakeup_source_unregister(ep_wakeup_source(epi)); @@ -1550,6 +1555,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, int pwake = 0; poll_table pt; + lockdep_assert_irqs_enabled(); + init_poll_funcptr(&pt, NULL); /* @@ -1572,9 +1579,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). - * We need this because we did not take ep->lock while + * We need this because we did not take ep->wq.lock while * changing epi above (but ep_poll_callback does take - * ep->lock). + * ep->wq.lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also @@ -1593,8 +1600,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - spin_lock_irq(&ep->lock); - if (!ep_is_linked(&epi->rdllink)) { + spin_lock_irq(&ep->wq.lock); + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1604,7 +1611,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->wq.lock); } /* We have to call this outside the lock */ @@ -1739,11 +1746,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { int res = 0, eavail, timed_out = 0; - unsigned long flags; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + lockdep_assert_irqs_enabled(); + if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1756,7 +1764,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * caller specified a non blocking operation. */ timed_out = 1; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); goto check_events; } @@ -1765,7 +1773,7 @@ fetch_events: if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); if (!ep_events_available(ep)) { /* @@ -1807,11 +1815,11 @@ fetch_events: break; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); } __remove_wait_queue(&ep->wq, &wait); @@ -1821,7 +1829,7 @@ check_events: /* Is it worth to try to dig for events ? */ eavail = ep_events_available(ep); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); /* * Try to transfer events to user space. In case we get 0 events and diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 9f1c96caebda..e8b6b89bddb8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -28,6 +28,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) { + f2fs_build_fault_attr(sbi, 0, 0); set_ckpt_flags(sbi, CP_ERROR_FLAG); if (!end_io) f2fs_flush_merged_writes(sbi); @@ -70,6 +71,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .encrypted_page = NULL, .is_meta = is_meta, }; + int err; if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; @@ -84,9 +86,10 @@ repeat: fio.page = page; - if (f2fs_submit_page_bio(&fio)) { + err = f2fs_submit_page_bio(&fio); + if (err) { f2fs_put_page(page, 1); - goto repeat; + return ERR_PTR(err); } lock_page(page); @@ -95,14 +98,9 @@ repeat: goto repeat; } - /* - * if there is any IO error when accessing device, make our filesystem - * readonly and make sure do not write checkpoint with non-uptodate - * meta page. - */ if (unlikely(!PageUptodate(page))) { - memset(page_address(page), 0, PAGE_SIZE); - f2fs_stop_checkpoint(sbi, false); + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); } out: return page; @@ -113,13 +111,32 @@ struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) return __get_meta_page(sbi, index, true); } +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index) +{ + struct page *page; + int count = 0; + +retry: + page = __get_meta_page(sbi, index, true); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EIO && + ++count <= DEFAULT_RETRY_IO_COUNT) + goto retry; + + f2fs_stop_checkpoint(sbi, false); + f2fs_bug_on(sbi, 1); + } + + return page; +} + /* for POR only */ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { @@ -140,8 +157,20 @@ bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, return false; break; case META_POR: + case DATA_GENERIC: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || - blkaddr < MAIN_BLKADDR(sbi))) + blkaddr < MAIN_BLKADDR(sbi))) { + if (type == DATA_GENERIC) { + f2fs_msg(sbi->sb, KERN_WARNING, + "access invalid blkaddr:%u", blkaddr); + WARN_ON(1); + } + return false; + } + break; + case META_GENERIC: + if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || + blkaddr >= MAIN_BLKADDR(sbi))) return false; break; default: @@ -176,7 +205,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { - if (!f2fs_is_valid_meta_blkaddr(sbi, blkno, type)) + if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) goto out; switch (type) { @@ -242,11 +271,8 @@ static int __f2fs_write_meta_page(struct page *page, trace_f2fs_writepage(page, META); - if (unlikely(f2fs_cp_error(sbi))) { - dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); - return 0; - } + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) @@ -529,13 +555,12 @@ int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) spin_lock(&im->ino_lock); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } -#endif + if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -572,12 +597,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; - int err = f2fs_acquire_orphan_inode(sbi); - - if (err) - goto err_out; - - __add_ino_entry(sbi, ino, 0, ORPHAN_INO); + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { @@ -600,14 +620,15 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); - f2fs_get_node_info(sbi, ino, &ni); + err = f2fs_get_node_info(sbi, ino, &ni); + if (err) + goto err_out; /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { err = -EIO; goto err_out; } - __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; err_out: @@ -639,7 +660,10 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) /* Needed for iput() to work correctly and not trash data */ sbi->sb->s_flags |= SB_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -649,9 +673,15 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, start_blk + i); + struct page *page; struct f2fs_orphan_block *orphan_blk; + page = f2fs_get_meta_page(sbi, start_blk + i); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); @@ -742,10 +772,14 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, __u32 crc = 0; *cp_page = f2fs_get_meta_page(sbi, cp_addr); + if (IS_ERR(*cp_page)) + return PTR_ERR(*cp_page); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); if (crc_offset > (blk_size - sizeof(__le32))) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -753,6 +787,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_put_page(*cp_page, 1); f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; } @@ -772,14 +807,22 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_1, version); if (err) - goto invalid_cp1; + return NULL; + + if (le32_to_cpu(cp_block->cp_pack_total_block_count) > + sbi->blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid cp_pack_total_block_count:%u", + le32_to_cpu(cp_block->cp_pack_total_block_count)); + goto invalid_cp; + } pre_version = *version; cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) - goto invalid_cp2; + goto invalid_cp; cur_version = *version; if (cur_version == pre_version) { @@ -787,9 +830,8 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, f2fs_put_page(cp_page_2, 1); return cp_page_1; } -invalid_cp2: f2fs_put_page(cp_page_2, 1); -invalid_cp1: +invalid_cp: f2fs_put_page(cp_page_1, 1); return NULL; } @@ -838,15 +880,15 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); - /* Sanity checking of checkpoint */ - if (f2fs_sanity_check_ckpt(sbi)) - goto free_fail_no_cp; - if (cur_page == cp1) sbi->cur_cp_pack = 1; else sbi->cur_cp_pack = 2; + /* Sanity checking of checkpoint */ + if (f2fs_sanity_check_ckpt(sbi)) + goto free_fail_no_cp; + if (cp_blks <= 1) goto done; @@ -859,6 +901,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned char *ckpt = (unsigned char *)sbi->ckpt; cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); + if (IS_ERR(cur_page)) + goto free_fail_no_cp; sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); @@ -980,12 +1024,10 @@ retry: iput(inode); /* We need to give cpu to another writers. */ - if (ino == cur_ino) { - congestion_wait(BLK_RW_ASYNC, HZ/50); + if (ino == cur_ino) cond_resched(); - } else { + else ino = cur_ino; - } } else { /* * We should submit bio, since it exists several @@ -1119,7 +1161,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) f2fs_unlock_all(sbi); } -static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) { DEFINE_WAIT(wait); @@ -1129,6 +1171,9 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; + if (unlikely(f2fs_cp_error(sbi))) + break; + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); @@ -1202,8 +1247,12 @@ static void commit_checkpoint(struct f2fs_sb_info *sbi, /* writeout cp pack 2 page */ err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); - f2fs_bug_on(sbi, err); + if (unlikely(err && f2fs_cp_error(sbi))) { + f2fs_put_page(page, 1); + return; + } + f2fs_bug_on(sbi, err); f2fs_put_page(page, 0); /* submit checkpoint (with barrier if NOBARRIER is not set) */ @@ -1229,7 +1278,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } /* @@ -1309,7 +1358,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + break; } } @@ -1348,10 +1397,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ - wait_on_all_pages_writeback(sbi); - - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + f2fs_wait_on_all_pages_writeback(sbi); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); @@ -1360,12 +1406,19 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); - wait_on_all_pages_writeback(sbi); + f2fs_wait_on_all_pages_writeback(sbi); + + /* + * invalidate intermediate page cache borrowed from meta inode + * which are used for migration of encrypted inode's blocks. + */ + if (f2fs_sb_has_encrypt(sbi->sb)) + invalidate_mapping_pages(META_MAPPING(sbi), + MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); f2fs_release_ino_entry(sbi, false); - if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + f2fs_reset_fsync_node_info(sbi); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); @@ -1381,7 +1434,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); - return 0; + return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b7c9b58acf3e..382c1ef9a9e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -126,12 +126,10 @@ static bool f2fs_bio_post_read_required(struct bio *bio) static void f2fs_read_end_io(struct bio *bio) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_P_SB(bio_first_page_all(bio)), FAULT_IO)) { f2fs_show_injection_info(FAULT_IO); bio->bi_status = BLK_STS_IOERR; } -#endif if (f2fs_bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -177,6 +175,8 @@ static void f2fs_write_end_io(struct bio *bio) page->index != nid_of_node(page)); dec_page_count(sbi, type); + if (f2fs_in_warm_node_list(sbi, page)) + f2fs_del_fsync_node_entry(sbi, page); clear_cold_data(page); end_page_writeback(page); } @@ -264,7 +264,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, if (type != DATA && type != NODE) goto submit_io; - if (f2fs_sb_has_blkzoned(sbi->sb) && current->plug) + if (test_opt(sbi, LFS) && current->plug) blk_finish_plug(current->plug); start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; @@ -441,7 +441,10 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; - verify_block_addr(fio, fio->new_blkaddr); + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, + __is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) + return -EFAULT; + trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); @@ -485,7 +488,7 @@ next: spin_unlock(&io->io_lock); } - if (is_valid_blkaddr(fio->old_blkaddr)) + if (__is_valid_data_blkaddr(fio->old_blkaddr)) verify_block_addr(fio, fio->old_blkaddr); verify_block_addr(fio, fio->new_blkaddr); @@ -534,19 +537,22 @@ out: } static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) + unsigned nr_pages, unsigned op_flag) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; struct bio_post_read_ctx *ctx; unsigned int post_read_steps = 0; + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + return ERR_PTR(-EFAULT); + bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false); if (!bio) return ERR_PTR(-ENOMEM); f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio_set_op_attrs(bio, REQ_OP_READ, op_flag); if (f2fs_encrypted_file(inode)) post_read_steps |= 1 << STEP_DECRYPT; @@ -571,7 +577,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, static int f2fs_submit_page_read(struct inode *inode, struct page *page, block_t blkaddr) { - struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -869,6 +875,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; + block_t old_blkaddr; pgoff_t fofs; blkcnt_t count = 1; int err; @@ -876,6 +883,10 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; + dn->data_blkaddr = datablock_addr(dn->inode, dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) @@ -885,11 +896,13 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) return err; alloc: - f2fs_get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - - f2fs_allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + old_blkaddr = dn->data_blkaddr; + f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr, &sum, seg_type, NULL, false); + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); f2fs_set_data_blkaddr(dn); /* update i_size */ @@ -1045,7 +1058,13 @@ next_dnode: next_block: blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (!is_valid_blkaddr(blkaddr)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) { + err = -EFAULT; + goto sync_out; + } + + if (!is_valid_data_blkaddr(sbi, blkaddr)) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1282,7 +1301,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); offset = offsetof(struct f2fs_inode, i_addr) + @@ -1309,7 +1332,11 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (!page) return -ENOMEM; - f2fs_get_node_info(sbi, xnid, &ni); + err = f2fs_get_node_info(sbi, xnid, &ni); + if (err) { + f2fs_put_page(page, 1); + return err; + } phys = (__u64)blk_to_logical(inode, ni.blk_addr); len = inode->i_sb->s_blocksize; @@ -1425,11 +1452,11 @@ out: * Note that the aops->readpages() function is ONLY used for read-ahead. If * this function ever deviates from doing just read-ahead, it should either * use ->readpage() or do the necessary surgery to decouple ->readpages() - * readom read-ahead. + * from read-ahead. */ static int f2fs_mpage_readpages(struct address_space *mapping, struct list_head *pages, struct page *page, - unsigned nr_pages) + unsigned nr_pages, bool is_readahead) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -1500,6 +1527,10 @@ got_it: SetPageUptodate(page); goto confused; } + + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr, + DATA_GENERIC)) + goto set_error_page; } else { zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) @@ -1519,7 +1550,8 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, + is_readahead ? REQ_RAHEAD : 0); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; @@ -1563,7 +1595,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1); + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1, false); return ret; } @@ -1580,12 +1612,13 @@ static int f2fs_read_data_pages(struct file *file, if (f2fs_has_inline_data(inode)) return 0; - return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true); } static int encrypt_one_page(struct f2fs_io_info *fio) { struct inode *inode = fio->page->mapping->host; + struct page *mpage; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) @@ -1597,17 +1630,25 @@ static int encrypt_one_page(struct f2fs_io_info *fio) retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, PAGE_SIZE, 0, fio->page->index, gfp_flags); - if (!IS_ERR(fio->encrypted_page)) - return 0; + if (IS_ERR(fio->encrypted_page)) { + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); + } - /* flush pending IOs and wait for a while in the ENOMEM case */ - if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { - f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - goto retry_encrypt; + mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (mpage) { + if (PageUptodate(mpage)) + memcpy(page_address(mpage), + page_address(fio->encrypted_page), PAGE_SIZE); + f2fs_put_page(mpage, 1); } - return PTR_ERR(fio->encrypted_page); + return 0; } static inline bool check_inplace_update_policy(struct inode *inode, @@ -1691,6 +1732,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct inode *inode = page->mapping->host; struct dnode_of_data dn; struct extent_info ei = {0,0,0}; + struct node_info ni; bool ipu_force = false; int err = 0; @@ -1699,11 +1741,13 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) f2fs_lookup_extent_cache(inode, page->index, &ei)) { fio->old_blkaddr = ei.blk + page->index - ei.fofs; - if (is_valid_blkaddr(fio->old_blkaddr)) { - ipu_force = true; - fio->need_lock = LOCK_DONE; - goto got_it; - } + if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) + return -EFAULT; + + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; } /* Deadlock due to between page->lock and f2fs_lock_op */ @@ -1722,11 +1766,17 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) goto out_writepage; } got_it: + if (__is_valid_data_blkaddr(fio->old_blkaddr) && + !f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr, + DATA_GENERIC)) { + err = -EFAULT; + goto out_writepage; + } /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (ipu_force || (is_valid_blkaddr(fio->old_blkaddr) && + if (ipu_force || (is_valid_data_blkaddr(fio->sbi, fio->old_blkaddr) && need_inplace_update(fio))) { err = encrypt_one_page(fio); if (err) @@ -1751,6 +1801,12 @@ got_it: fio->need_lock = LOCK_REQ; } + err = f2fs_get_node_info(fio->sbi, dn.nid, &ni); + if (err) + goto out_writepage; + + fio->version = ni.version; + err = encrypt_one_page(fio); if (err) goto out_writepage; @@ -2079,6 +2135,18 @@ continue_unlock: return ret; } +static inline bool __should_serialize_io(struct inode *inode, + struct writeback_control *wbc) +{ + if (!S_ISREG(inode->i_mode)) + return false; + if (wbc->sync_mode != WB_SYNC_ALL) + return true; + if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks) + return true; + return false; +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -2087,6 +2155,7 @@ static int __f2fs_write_data_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct blk_plug plug; int ret; + bool locked = false; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -2117,10 +2186,18 @@ static int __f2fs_write_data_pages(struct address_space *mapping, else if (atomic_read(&sbi->wb_sync_req[DATA])) goto skip_write; + if (__should_serialize_io(inode, wbc)) { + mutex_lock(&sbi->writepages); + locked = true; + } + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + if (locked) + mutex_unlock(&sbi->writepages); + if (wbc->sync_mode == WB_SYNC_ALL) atomic_dec(&sbi->wb_sync_req[DATA]); /* @@ -2153,10 +2230,14 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -2251,8 +2332,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, trace_f2fs_write_begin(inode, pos, len, flags); - if (f2fs_is_atomic_file(inode) && - !f2fs_available_free_memory(sbi, INMEM_PAGES)) { + if ((f2fs_is_atomic_file(inode) && + !f2fs_available_free_memory(sbi, INMEM_PAGES)) || + is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) { err = -ENOMEM; drop_atomic = true; goto fail; @@ -2376,14 +2458,20 @@ unlock_out: static int check_direct_IO(struct inode *inode, struct iov_iter *iter, loff_t offset) { - unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - - if (offset & blocksize_mask) - return -EINVAL; - - if (iov_iter_alignment(iter) & blocksize_mask) - return -EINVAL; - + unsigned i_blkbits = READ_ONCE(inode->i_blkbits); + unsigned blkbits = i_blkbits; + unsigned blocksize_mask = (1 << blkbits) - 1; + unsigned long align = offset | iov_iter_alignment(iter); + struct block_device *bdev = inode->i_sb->s_bdev; + + if (align & blocksize_mask) { + if (bdev) + blkbits = blksize_bits(bdev_logical_block_size(bdev)); + blocksize_mask = (1 << blkbits) - 1; + if (align & blocksize_mask) + return -EINVAL; + return 1; + } return 0; } @@ -2401,7 +2489,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) err = check_direct_IO(inode, iter, offset); if (err) - return err; + return err < 0 ? err : 0; if (f2fs_force_buffered_io(inode, rw)) return 0; @@ -2495,6 +2583,10 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); + /* don't remain PG_checked flag which was set during GC */ + if (is_cold_data(page)) + clear_cold_data(page); + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { f2fs_register_inmem_page(inode, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 2d65e77ae5cf..214a968962a1 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -215,7 +215,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); - si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks * + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK); si->base_mem += NM_I(sbi)->nat_blocks / 8; si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7f955c4e86a4..ecc3a4e2be96 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -517,12 +517,11 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, } start: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; } -#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6799c3fc44e3..abf925664d9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -41,7 +41,6 @@ } while (0) #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION enum { FAULT_KMALLOC, FAULT_KVMALLOC, @@ -56,16 +55,20 @@ enum { FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, + FAULT_DISCARD, FAULT_MAX, }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) + struct f2fs_fault_info { atomic_t inject_ops; unsigned int inject_rate; unsigned int inject_type; }; -extern char *fault_name[FAULT_MAX]; +extern char *f2fs_fault_name[FAULT_MAX]; #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif @@ -178,7 +181,6 @@ enum { #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DEF_MAX_DISCARD_REQUEST 8 /* issue 8 discards per round */ -#define DEF_MAX_DISCARD_LEN 512 /* Max. 2MB per discard */ #define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ #define DEF_MID_DISCARD_ISSUE_TIME 500 /* 500 ms, if device busy */ #define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ @@ -194,7 +196,7 @@ struct cp_control { }; /* - * For CP/NAT/SIT/SSA readahead + * indicate meta/data type */ enum { META_CP, @@ -202,6 +204,8 @@ enum { META_SIT, META_SSA, META_POR, + DATA_GENERIC, + META_GENERIC, }; /* for the list of ino */ @@ -226,6 +230,12 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; +struct fsync_node_entry { + struct list_head list; /* list head */ + struct page *page; /* warm node page pointer */ + unsigned int seq_id; /* sequence id */ +}; + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -242,9 +252,10 @@ struct discard_entry { (MAX_PLIST_NUM - 1) : (blk_num - 1)) enum { - D_PREP, - D_SUBMIT, - D_DONE, + D_PREP, /* initial */ + D_PARTIAL, /* partially submitted */ + D_SUBMIT, /* all submitted */ + D_DONE, /* finished */ }; struct discard_info { @@ -269,7 +280,10 @@ struct discard_cmd { struct block_device *bdev; /* bdev */ unsigned short ref; /* reference count */ unsigned char state; /* state */ + unsigned char issuing; /* issuing discard */ int error; /* bio error */ + spinlock_t lock; /* for state/bio_ref updating */ + unsigned short bio_ref; /* bio reference count */ }; enum { @@ -289,6 +303,7 @@ struct discard_policy { unsigned int io_aware_gran; /* minimum granularity discard not be aware of I/O */ bool io_aware; /* issue discard in idle time */ bool sync; /* submit discard with REQ_SYNC flag */ + bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ }; @@ -305,10 +320,12 @@ struct discard_cmd_control { unsigned int max_discards; /* max. discards to be issued */ unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ + unsigned int next_pos; /* next discard position */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ atomic_t discard_cmd_cnt; /* # of cached cmd count */ struct rb_root root; /* root of discard rb-tree */ + bool rbtree_check; /* config for consistence check */ }; /* for the list of fsync inodes, used only during recovery */ @@ -508,13 +525,12 @@ enum { */ }; +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ + #define F2FS_LINK_MAX 0xffffffff /* maximum link count per file */ #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ -/* vector size for gang look-up from extent cache that consists of radix tree */ -#define EXT_TREE_VEC_SIZE 64 - /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ @@ -600,6 +616,8 @@ enum { #define FADVISE_HOT_BIT 0x20 #define FADVISE_VERITY_BIT 0x40 /* reserved */ +#define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) + #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) @@ -669,8 +687,8 @@ struct f2fs_inode_info { int i_extra_isize; /* size of extra space located in i_addr */ kprojid_t i_projid; /* id for project quota */ int i_inline_xattr_size; /* inline xattr size */ - struct timespec i_crtime; /* inode creation time */ - struct timespec i_disk_time[4]; /* inode disk times */ + struct timespec64 i_crtime; /* inode creation time */ + struct timespec64 i_disk_time[4];/* inode disk times */ }; static inline void get_extent_info(struct extent_info *ext, @@ -698,22 +716,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, } static inline bool __is_discard_mergeable(struct discard_info *back, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { return (back->lstart + back->len == front->lstart) && - (back->len + front->len < DEF_MAX_DISCARD_LEN); + (back->len + front->len <= max_len); } static inline bool __is_discard_back_mergeable(struct discard_info *cur, - struct discard_info *back) + struct discard_info *back, unsigned int max_len) { - return __is_discard_mergeable(back, cur); + return __is_discard_mergeable(back, cur, max_len); } static inline bool __is_discard_front_mergeable(struct discard_info *cur, - struct discard_info *front) + struct discard_info *front, unsigned int max_len) { - return __is_discard_mergeable(cur, front); + return __is_discard_mergeable(cur, front, max_len); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -768,6 +786,7 @@ struct f2fs_nm_info { struct radix_tree_root nat_set_root;/* root of the nat set cache */ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ struct list_head nat_entries; /* cached nat entry list (clean) */ + spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ unsigned int nat_blocks; /* # of nat blocks */ @@ -894,6 +913,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_seq_blocks; /* threshold for sequential blocks */ unsigned int min_hot_blocks; /* threshold for hot block allocation */ unsigned int min_ssr_sections; /* threshold to trigger SSR allocation */ @@ -1015,6 +1035,7 @@ struct f2fs_io_info { bool retry; /* need to reallocate block address */ enum iostat_type io_type; /* io type */ struct writeback_control *io_wbc; /* writeback control */ + unsigned char version; /* version of the node */ }; #define is_read_io(rw) ((rw) == READ) @@ -1066,6 +1087,7 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ SBI_NEED_SB_WRITE, /* need to recover superblock */ SBI_NEED_CP, /* need to checkpoint */ + SBI_IS_SHUTDOWN, /* shutdown by ioctl */ }; enum { @@ -1112,6 +1134,7 @@ struct f2fs_sb_info { struct rw_semaphore sb_lock; /* lock for raw super block */ int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ + struct mutex writepages; /* mutex for writepages() */ #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ @@ -1148,6 +1171,11 @@ struct f2fs_sb_info { struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ + spinlock_t fsync_node_lock; /* for node entry lock */ + struct list_head fsync_node_list; /* node list head */ + unsigned int fsync_seg_id; /* sequence id */ + unsigned int fsync_node_num; /* number of node entries */ + /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ @@ -1215,6 +1243,7 @@ struct f2fs_sb_info { unsigned int gc_mode; /* current GC state */ /* for skip statistic */ unsigned long long skipped_atomic_files[2]; /* FG_GC and BG_GC */ + unsigned long long skipped_gc_rwsem; /* FG_GC only */ /* threshold for gc trials on pinned files */ u64 gc_pin_file_threshold; @@ -1279,7 +1308,7 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FAULT_INJECTION #define f2fs_show_injection_info(type) \ printk("%sF2FS-fs : inject %s in %s of %pF\n", \ - KERN_INFO, fault_name[type], \ + KERN_INFO, f2fs_fault_name[type], \ __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { @@ -1298,6 +1327,12 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) } return false; } +#else +#define f2fs_show_injection_info(type) do { } while (0) +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + return false; +} #endif /* For write statistics. Suppose sector size is 512 bytes, @@ -1326,7 +1361,7 @@ static inline bool is_idle(struct f2fs_sb_info *sbi) struct request_list *rl = &q->root_rl; if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) - return 0; + return false; return f2fs_time_over(sbi, REQ_TIME); } @@ -1650,13 +1685,12 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, if (ret) return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); release = *count; goto enospc; } -#endif + /* * let's increase this in prior to actual block count change in order * for f2fs_sync_file to avoid data races when deciding checkpoint. @@ -1680,18 +1714,20 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, sbi->total_valid_block_count -= diff; if (!*count) { spin_unlock(&sbi->stat_lock); - percpu_counter_sub(&sbi->alloc_valid_block_count, diff); goto enospc; } } spin_unlock(&sbi->stat_lock); - if (unlikely(release)) + if (unlikely(release)) { + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); + } f2fs_i_blocks_write(inode, *count, true, true); return 0; enospc: + percpu_counter_sub(&sbi->alloc_valid_block_count, release); dquot_release_reservation_block(inode, release); return -ENOSPC; } @@ -1863,12 +1899,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, return ret; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_BLOCK)) { f2fs_show_injection_info(FAULT_BLOCK); goto enospc; } -#endif spin_lock(&sbi->stat_lock); @@ -1953,17 +1987,23 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { -#ifdef CONFIG_F2FS_FAULT_INJECTION - struct page *page = find_lock_page(mapping, index); + struct page *page; - if (page) - return page; + if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + if (!for_write) + page = find_get_page_flags(mapping, index, + FGP_LOCK | FGP_ACCESSED); + else + page = find_lock_page(mapping, index); + if (page) + return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { - f2fs_show_injection_info(FAULT_PAGE_ALLOC); - return NULL; + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); + return NULL; + } } -#endif + if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1973,12 +2013,11 @@ static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp_mask) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) { f2fs_show_injection_info(FAULT_PAGE_GET); return NULL; } -#endif + return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } @@ -2043,12 +2082,11 @@ static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages); return bio; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_BIO)) { f2fs_show_injection_info(FAULT_ALLOC_BIO); return NULL; } -#endif + return bio_alloc(GFP_KERNEL, npages); } @@ -2518,7 +2556,6 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { - struct timespec ts; bool ret; if (dsync) { @@ -2534,16 +2571,13 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) i_size_read(inode) & ~PAGE_MASK) return false; - ts = timespec64_to_timespec(inode->i_atime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - ts = timespec64_to_timespec(inode->i_ctime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) return false; - ts = timespec64_to_timespec(inode->i_mtime); - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; - if (!timespec_equal(F2FS_I(inode)->i_disk_time + 3, + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3, &F2FS_I(inode)->i_crtime)) return false; @@ -2587,12 +2621,11 @@ static inline bool f2fs_may_extent_tree(struct inode *inode) static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KMALLOC)) { f2fs_show_injection_info(FAULT_KMALLOC); return NULL; } -#endif + return kmalloc(size, flags); } @@ -2605,12 +2638,11 @@ static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi, static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_KVMALLOC)) { f2fs_show_injection_info(FAULT_KVMALLOC); return NULL; } -#endif + return kvmalloc(size, flags); } @@ -2669,13 +2701,39 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, spin_unlock(&sbi->iostat_lock); } -static inline bool is_valid_blkaddr(block_t blkaddr) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ + (!is_read_io(fio->op) || fio->is_meta)) + +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); +static inline void verify_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type) +{ + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) { + f2fs_msg(sbi->sb, KERN_ERR, + "invalid blkaddr: %u, type: %d, run fsck to fix.", + blkaddr, type); + f2fs_bug_on(sbi, 1); + } +} + +static inline bool __is_valid_data_blkaddr(block_t blkaddr) { if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return false; return true; } +static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + if (!__is_valid_data_blkaddr(blkaddr)) + return false; + verify_blkaddr(sbi, blkaddr, DATA_GENERIC); + return true; +} + /* * file.c */ @@ -2790,16 +2848,21 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); -void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); pgoff_t f2fs_get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -2808,11 +2871,12 @@ struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); void f2fs_move_node_page(struct page *node_page, int gc_type); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic); + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id); int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type); -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); @@ -2820,7 +2884,7 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); void f2fs_recover_inline_xattr(struct inode *inode, struct page *page); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); @@ -2898,9 +2962,10 @@ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *f2fs_get_meta_page_nofail(struct f2fs_sb_info *sbi, pgoff_t index); struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); -bool f2fs_is_valid_meta_blkaddr(struct f2fs_sb_info *sbi, - block_t blkaddr, int type); +bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, + block_t blkaddr, int type); int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); @@ -2924,6 +2989,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi); void f2fs_update_dirty_page(struct inode *inode, struct page *page); void f2fs_remove_dirty_inode(struct inode *inode); int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi); int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi); int __init f2fs_create_checkpoint_caches(void); @@ -3362,7 +3428,7 @@ static inline bool f2fs_may_encrypt(struct inode *inode) return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else - return 0; + return false; #endif } @@ -3373,4 +3439,11 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, int rw) F2FS_I_SB(inode)->s_ndevs); } +#ifdef CONFIG_F2FS_FAULT_INJECTION +extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type); +#else +#define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) +#endif + #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6880c6f78d58..5474aaa274b9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -213,6 +213,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; + unsigned int seq_id = 0; if (unlikely(f2fs_readonly(inode->i_sb))) return 0; @@ -275,7 +276,7 @@ go_write: } sync_nodes: atomic_inc(&sbi->wb_sync_req[NODE]); - ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic); + ret = f2fs_fsync_node_pages(sbi, inode, &wbc, atomic, &seq_id); atomic_dec(&sbi->wb_sync_req[NODE]); if (ret) goto out; @@ -301,7 +302,7 @@ sync_nodes: * given fsync mark. */ if (!atomic) { - ret = f2fs_wait_on_node_pages_writeback(sbi, ino); + ret = f2fs_wait_on_node_pages_writeback(sbi, seq_id); if (ret) goto out; } @@ -350,13 +351,13 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, return pgofs; } -static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs, - int whence) +static bool __found_offset(struct f2fs_sb_info *sbi, block_t blkaddr, + pgoff_t dirty, pgoff_t pgofs, int whence) { switch (whence) { case SEEK_DATA: if ((blkaddr == NEW_ADDR && dirty == pgofs) || - is_valid_blkaddr(blkaddr)) + is_valid_data_blkaddr(sbi, blkaddr)) return true; break; case SEEK_HOLE: @@ -420,7 +421,15 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); - if (__found_offset(blkaddr, dirty, pgofs, whence)) { + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(F2FS_I_SB(inode), + blkaddr, DATA_GENERIC)) { + f2fs_put_dnode(&dn); + goto fail; + } + + if (__found_offset(F2FS_I_SB(inode), blkaddr, dirty, + pgofs, whence)) { f2fs_put_dnode(&dn); goto found; } @@ -513,6 +522,11 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) dn->data_blkaddr = NULL_ADDR; f2fs_set_data_blkaddr(dn); + + if (__is_valid_data_blkaddr(blkaddr) && + !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC)) + continue; + f2fs_invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); @@ -654,12 +668,11 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { f2fs_show_injection_info(FAULT_TRUNCATE); return -EIO; } -#endif + /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -782,22 +795,26 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size <= i_size_read(inode)) { - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); + bool to_smaller = (attr->ia_size <= i_size_read(inode)); + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_setsize(inode, attr->ia_size); + + if (to_smaller) err = f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); - if (err) - return err; - } else { - /* - * do not trim all blocks after i_size if target size is - * larger than i_size. - */ - down_write(&F2FS_I(inode)->i_mmap_sem); - truncate_setsize(inode, attr->ia_size); - up_write(&F2FS_I(inode)->i_mmap_sem); + /* + * do not trim all blocks after i_size if target size is + * larger than i_size. + */ + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + + if (err) + return err; + if (!to_smaller) { /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -944,14 +961,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1054,7 +1076,12 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, if (ret) return ret; - f2fs_get_node_info(sbi, dn.nid, &ni); + ret = f2fs_get_node_info(sbi, dn.nid, &ni); + if (ret) { + f2fs_put_dnode(&dn); + return ret; + } + ilen = min((pgoff_t) ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); @@ -1161,25 +1188,33 @@ roll_back: return ret; } -static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; int ret; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); - f2fs_drop_extent_tree(inode); + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + f2fs_lock_op(sbi); + f2fs_drop_extent_tree(inode); + truncate_pagecache(inode, offset); ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); + + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) { - pgoff_t pg_start, pg_end; loff_t new_size; int ret; @@ -1194,25 +1229,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - pg_start = offset >> PAGE_SHIFT; - pg_end = (offset + len) >> PAGE_SHIFT; - - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - - down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out_unlock; - - truncate_pagecache(inode, offset); + return ret; - ret = f2fs_do_collapse(inode, pg_start, pg_end); + ret = f2fs_do_collapse(inode, offset, len); if (ret) - goto out_unlock; + return ret; /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); @@ -1220,11 +1247,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, new_size); ret = f2fs_truncate_blocks(inode, new_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out_unlock: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1290,12 +1315,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - goto out_sem; - - truncate_pagecache_range(inode, offset, offset + len - 1); + return ret; pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1307,7 +1329,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, offset + len); } else { @@ -1315,7 +1337,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - goto out_sem; + return ret; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1326,12 +1348,21 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, unsigned int end_offset; pgoff_t end; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + + truncate_pagecache_range(inode, + (loff_t)index << PAGE_SHIFT, + ((loff_t)pg_end << PAGE_SHIFT) - 1); + f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1340,7 +1371,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1368,9 +1402,6 @@ out: else f2fs_i_size_write(inode, new_size); } -out_sem: - up_write(&F2FS_I(inode)->i_mmap_sem); - return ret; } @@ -1399,26 +1430,27 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - /* avoid gc operation during block exchange */ - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); + up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) - goto out; + return ret; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - goto out; - - truncate_pagecache(inode, offset); + return ret; pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + /* avoid gc operation during block exchange */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); + truncate_pagecache(inode, offset); + while (!ret && idx > pg_start) { nr = idx - pg_start; if (nr > delta) @@ -1432,16 +1464,17 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ + down_write(&F2FS_I(inode)->i_mmap_sem); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); + up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); -out: - up_write(&F2FS_I(inode)->i_mmap_sem); - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1597,7 +1630,7 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags = fi->i_flags; - if (file_is_encrypt(inode)) + if (f2fs_encrypted_inode(inode)) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; @@ -1688,15 +1721,18 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) inode_lock(inode); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) + ret = -EINVAL; goto out; + } ret = f2fs_convert_inline_inode(inode); if (ret) goto out; + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (!get_dirty_pages(inode)) goto skip_flush; @@ -1704,18 +1740,20 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; + } skip_flush: set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1733,9 +1771,9 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - inode_lock(inode); + f2fs_balance_fs(F2FS_I_SB(inode), true); - down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + inode_lock(inode); if (f2fs_is_volatile_file(inode)) { ret = -EINVAL; @@ -1761,7 +1799,6 @@ err_out: clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); ret = -EINVAL; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1853,6 +1890,8 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } + clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); + inode_unlock(inode); mnt_drop_write_file(filp); @@ -1866,7 +1905,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; - int ret; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1889,6 +1928,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) } if (sb) { f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); thaw_bdev(sb->s_bdev, sb); } break; @@ -1898,13 +1938,16 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (ret) goto out; f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_NOSYNC: f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; case F2FS_GOING_DOWN_METAFLUSH: f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); + set_sbi_flag(sbi, SBI_IS_SHUTDOWN); break; default: ret = -EINVAL; @@ -2107,7 +2150,7 @@ out: return ret; } -static int f2fs_ioc_f2fs_write_checkpoint(struct file *filp, unsigned long arg) +static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -2351,15 +2394,10 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, } inode_lock(src); - down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); if (src != dst) { ret = -EBUSY; if (!inode_trylock(dst)) goto out; - if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) { - inode_unlock(dst); - goto out; - } } ret = -EINVAL; @@ -2404,6 +2442,14 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, goto out_unlock; f2fs_balance_fs(sbi, true); + + down_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); + if (src != dst) { + ret = -EBUSY; + if (!down_write_trylock(&F2FS_I(dst)->i_gc_rwsem[WRITE])) + goto out_src; + } + f2fs_lock_op(sbi); ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, pos_out >> F2FS_BLKSIZE_BITS, @@ -2416,13 +2462,15 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, f2fs_i_size_write(dst, dst_osize); } f2fs_unlock_op(sbi); -out_unlock: - if (src != dst) { + + if (src != dst) up_write(&F2FS_I(dst)->i_gc_rwsem[WRITE]); +out_src: + up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); +out_unlock: + if (src != dst) inode_unlock(dst); - } out: - up_write(&F2FS_I(src)->i_gc_rwsem[WRITE]); inode_unlock(src); return ret; } @@ -2782,7 +2830,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) if (!pin) { clear_inode_flag(inode, FI_PIN_FILE); - F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN] = 1; + f2fs_i_gc_failures_write(inode, 0); goto done; } @@ -2888,7 +2936,7 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT_RANGE: return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: - return f2fs_ioc_f2fs_write_checkpoint(filp, arg); + return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 9093be6e7a7d..5c8d00422237 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -53,12 +53,10 @@ static int gc_thread_func(void *data) continue; } -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif if (!sb_start_write_trylock(sbi->sb)) continue; @@ -517,7 +515,11 @@ next_step: continue; } - f2fs_get_node_info(sbi, nid, &ni); + if (f2fs_get_node_info(sbi, nid, &ni)) { + f2fs_put_page(node_page, 1); + continue; + } + if (ni.blk_addr != start_addr + off) { f2fs_put_page(node_page, 1); continue; @@ -576,7 +578,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (IS_ERR(node_page)) return false; - f2fs_get_node_info(sbi, nid, dni); + if (f2fs_get_node_info(sbi, nid, dni)) { + f2fs_put_page(node_page, 1); + return false; + } if (sum->version != dni->version) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -594,6 +599,72 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } +static int ra_data_block(struct inode *inode, pgoff_t index) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct dnode_of_data dn; + struct page *page; + struct extent_info ei = {0, 0, 0}; + struct f2fs_io_info fio = { + .sbi = sbi, + .ino = inode->i_ino, + .type = DATA, + .temp = COLD, + .op = REQ_OP_READ, + .op_flags = 0, + .encrypted_page = NULL, + .in_list = false, + .retry = false, + }; + int err; + + page = f2fs_grab_cache_page(mapping, index, true); + if (!page) + return -ENOMEM; + + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err) + goto put_page; + f2fs_put_dnode(&dn); + + if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, + DATA_GENERIC))) { + err = -EFAULT; + goto put_page; + } +got_it: + /* read page */ + fio.page = page; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; + + fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), + dn.data_blkaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto put_page; + } + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_encrypted_page; + f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(page, 1); + return 0; +put_encrypted_page: + f2fs_put_page(fio.encrypted_page, 1); +put_page: + f2fs_put_page(page, 1); + return err; +} + /* * Move data block via META_MAPPING while keeping locked data page. * This can be used to move blocks, aka LBAs, directly on disk. @@ -615,7 +686,7 @@ static void move_data_block(struct inode *inode, block_t bidx, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - struct page *page; + struct page *page, *mpage; block_t newaddr; int err; bool lfs_mode = test_opt(fio.sbi, LFS); @@ -655,7 +726,10 @@ static void move_data_block(struct inode *inode, block_t bidx, */ f2fs_wait_on_page_writeback(page, DATA, true); - f2fs_get_node_info(fio.sbi, dn.nid, &ni); + err = f2fs_get_node_info(fio.sbi, dn.nid, &ni); + if (err) + goto put_out; + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ @@ -675,6 +749,23 @@ static void move_data_block(struct inode *inode, block_t bidx, goto recover_block; } + mpage = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), + fio.old_blkaddr, FGP_LOCK, GFP_NOFS); + if (mpage) { + bool updated = false; + + if (PageUptodate(mpage)) { + memcpy(page_address(fio.encrypted_page), + page_address(mpage), PAGE_SIZE); + updated = true; + } + f2fs_put_page(mpage, 1); + invalidate_mapping_pages(META_MAPPING(fio.sbi), + fio.old_blkaddr, fio.old_blkaddr); + if (updated) + goto write_page; + } + err = f2fs_submit_page_bio(&fio); if (err) goto put_page_out; @@ -691,6 +782,7 @@ static void move_data_block(struct inode *inode, block_t bidx, goto put_page_out; } +write_page: set_page_dirty(fio.encrypted_page); f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) @@ -865,22 +957,30 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - /* if inode uses special I/O path, let's go phase 3 */ - if (f2fs_post_read_required(inode)) { - add_gc_inode(gc_list, inode); - continue; - } - if (!down_write_trylock( &F2FS_I(inode)->i_gc_rwsem[WRITE])) { iput(inode); + sbi->skipped_gc_rwsem++; + continue; + } + + start_bidx = f2fs_start_bidx_of_node(nofs, inode) + + ofs_in_node; + + if (f2fs_post_read_required(inode)) { + int err = ra_data_block(inode, start_bidx); + + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + if (err) { + iput(inode); + continue; + } + add_gc_inode(gc_list, inode); continue; } - start_bidx = f2fs_start_bidx_of_node(nofs, inode); data_page = f2fs_get_read_data_page(inode, - start_bidx + ofs_in_node, REQ_RAHEAD, - true); + start_bidx, REQ_RAHEAD, true); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (IS_ERR(data_page)) { iput(inode); @@ -903,6 +1003,7 @@ next_step: continue; if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { + sbi->skipped_gc_rwsem++; up_write(&fi->i_gc_rwsem[READ]); continue; } @@ -986,7 +1087,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, goto next; sum = page_address(sum_page); - f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent segment (%u) " + "type [%d, %d] in SSA and SIT", + segno, type, GET_SUM_TYPE((&sum->footer))); + set_sbi_flag(sbi, SBI_NEED_FSCK); + goto next; + } /* * this is to avoid deadlock: @@ -1034,6 +1141,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC]; + unsigned long long first_skipped; unsigned int skipped_round = 0, round = 0; trace_f2fs_gc_begin(sbi->sb, sync, background, @@ -1046,6 +1154,8 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, prefree_segments(sbi)); cpc.reason = __get_cp_reason(sbi); + sbi->skipped_gc_rwsem = 0; + first_skipped = last_skipped; gc_more: if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; @@ -1087,7 +1197,8 @@ gc_more: total_freed += seg_freed; if (gc_type == FG_GC) { - if (sbi->skipped_atomic_files[FG_GC] > last_skipped) + if (sbi->skipped_atomic_files[FG_GC] > last_skipped || + sbi->skipped_gc_rwsem) skipped_round++; last_skipped = sbi->skipped_atomic_files[FG_GC]; round++; @@ -1096,15 +1207,23 @@ gc_more: if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; - if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) { - if (skipped_round > MAX_SKIP_ATOMIC_COUNT && - skipped_round * 2 >= round) - f2fs_drop_inmem_pages_all(sbi, true); + if (sync) + goto stop; + + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + if (skipped_round <= MAX_SKIP_GC_COUNT || + skipped_round * 2 < round) { segno = NULL_SEGNO; goto gc_more; } + if (first_skipped < last_skipped && + (last_skipped - first_skipped) > + sbi->skipped_gc_rwsem) { + f2fs_drop_inmem_pages_all(sbi, true); + segno = NULL_SEGNO; + goto gc_more; + } if (gc_type == FG_GC) ret = f2fs_write_checkpoint(sbi, &cpc); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 043830be5662..115dc219344b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -121,6 +121,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .encrypted_page = NULL, .io_type = FS_DATA_IO, }; + struct node_info ni; int dirty, err; if (!f2fs_exist_data(dn->inode)) @@ -130,6 +131,24 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; + err = f2fs_get_node_info(fio.sbi, dn->nid, &ni); + if (err) { + f2fs_put_dnode(dn); + return err; + } + + fio.version = ni.version; + + if (unlikely(dn->data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(dn); + set_sbi_flag(fio.sbi, SBI_NEED_FSCK); + f2fs_msg(fio.sbi->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dn->inode->i_ino, dn->data_blkaddr); + return -EINVAL; + } + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); f2fs_do_read_inline_data(page, dn->inode_page); @@ -363,6 +382,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, if (err) goto out; + if (unlikely(dn.data_blkaddr != NEW_ADDR)) { + f2fs_put_dnode(&dn); + set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + f2fs_msg(F2FS_P_SB(page)->sb, KERN_WARNING, + "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, " + "run fsck to fix.", + __func__, dir->i_ino, dn.data_blkaddr); + err = -EINVAL; + goto out; + } + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); @@ -477,6 +507,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); @@ -668,7 +699,10 @@ int f2fs_inline_data_fiemap(struct inode *inode, ilen = start + len; ilen -= start; - f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + err = f2fs_get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); + if (err) + goto out; + byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; byteaddr += (char *)inline_data_addr(inode, ipage) - (char *)F2FS_INODE(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index f121c864f4c0..959df2249875 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -68,13 +68,16 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } -static bool __written_first_block(struct f2fs_inode *ri) +static int __written_first_block(struct f2fs_sb_info *sbi, + struct f2fs_inode *ri) { block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); - if (is_valid_blkaddr(addr)) - return true; - return false; + if (!__is_valid_data_blkaddr(addr)) + return 1; + if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC)) + return -EFAULT; + return 0; } static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -121,7 +124,7 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page if (!f2fs_sb_has_inode_chksum(sbi->sb)) return false; - if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR)) return false; if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize), @@ -159,8 +162,15 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *ri; __u32 provided, calculated; + if (unlikely(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN))) + return true; + +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_enable_inode_chksum(sbi, page)) +#else if (!f2fs_enable_inode_chksum(sbi, page) || PageDirty(page) || PageWriteback(page)) +#endif return true; ri = &F2FS_NODE(page)->i; @@ -185,9 +195,31 @@ void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); } -static bool sanity_check_inode(struct inode *inode) +static bool sanity_check_inode(struct inode *inode, struct page *node_page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned long long iblocks; + + iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks); + if (!iblocks) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, " + "run fsck to fix.", + __func__, inode->i_ino, iblocks); + return false; + } + + if (ino_of_node(node_page) != nid_of_node(node_page)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: corrupted inode footer i_ino=%lx, ino,nid: " + "[%u, %u] run fsck to fix.", + __func__, inode->i_ino, + ino_of_node(node_page), nid_of_node(node_page)); + return false; + } if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) && !f2fs_has_extra_attr(inode)) { @@ -197,6 +229,64 @@ static bool sanity_check_inode(struct inode *inode) __func__, inode->i_ino); return false; } + + if (f2fs_has_extra_attr(inode) && + !f2fs_sb_has_extra_attr(sbi->sb)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) is with extra_attr, " + "but extra_attr feature is off", + __func__, inode->i_ino); + return false; + } + + if (fi->i_extra_isize > F2FS_TOTAL_EXTRA_ATTR_SIZE || + fi->i_extra_isize % sizeof(__le32)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted i_extra_isize: %d, " + "max: %zu", + __func__, inode->i_ino, fi->i_extra_isize, + F2FS_TOTAL_EXTRA_ATTR_SIZE); + return false; + } + + if (F2FS_I(inode)->extent_tree) { + struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; + + if (ei->len && + (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC) || + !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1, + DATA_GENERIC))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) extent info [%u, %u, %u] " + "is incorrect, run fsck to fix", + __func__, inode->i_ino, + ei->blk, ei->fofs, ei->len); + return false; + } + } + + if (f2fs_has_inline_data(inode) && + (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_data, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + + if (f2fs_has_inline_dentry(inode) && !S_ISDIR(inode->i_mode)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx, mode=%u) should not have " + "inline_dentry, run fsck to fix", + __func__, inode->i_ino, inode->i_mode); + return false; + } + return true; } @@ -207,6 +297,7 @@ static int do_read_inode(struct inode *inode) struct page *node_page; struct f2fs_inode *ri; projid_t i_projid; + int err; /* Check if ino is within scope */ if (f2fs_check_nid_range(sbi, inode->i_ino)) @@ -268,6 +359,11 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } + if (!sanity_check_inode(inode, node_page)) { + f2fs_put_page(node_page, 1); + return -EINVAL; + } + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -275,8 +371,15 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); - if (__written_first_block(ri)) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + if (S_ISREG(inode->i_mode)) { + err = __written_first_block(sbi, ri); + if (err < 0) { + f2fs_put_page(node_page, 1); + return err; + } + if (!err) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + } if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; @@ -297,9 +400,9 @@ static int do_read_inode(struct inode *inode) fi->i_crtime.tv_nsec = le32_to_cpu(ri->i_crtime_nsec); } - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; f2fs_put_page(node_page, 1); @@ -330,10 +433,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = do_read_inode(inode); if (ret) goto bad_inode; - if (!sanity_check_inode(inode)) { - ret = -EINVAL; - goto bad_inode; - } make_now: if (ino == F2FS_NODE_INO(sbi)) { inode->i_mapping->a_ops = &f2fs_node_aops; @@ -470,10 +569,14 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) if (inode->i_nlink == 0) clear_inline_node(node_page); - F2FS_I(inode)->i_disk_time[0] = timespec64_to_timespec(inode->i_atime); - F2FS_I(inode)->i_disk_time[1] = timespec64_to_timespec(inode->i_ctime); - F2FS_I(inode)->i_disk_time[2] = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_disk_time[0] = inode->i_atime; + F2FS_I(inode)->i_disk_time[1] = inode->i_ctime; + F2FS_I(inode)->i_disk_time[2] = inode->i_mtime; F2FS_I(inode)->i_disk_time[3] = F2FS_I(inode)->i_crtime; + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); +#endif } void f2fs_update_inode_page(struct inode *inode) @@ -558,12 +661,11 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_EVICT_INODE)) { f2fs_show_injection_info(FAULT_EVICT_INODE); err = -EIO; } -#endif + if (!err) { f2fs_lock_op(sbi); err = f2fs_remove_inode_page(inode); @@ -626,6 +728,7 @@ void f2fs_handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + int err; /* * clear nlink of inode in order to release resource of inode @@ -648,10 +751,16 @@ void f2fs_handle_failed_inode(struct inode *inode) * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - f2fs_get_node_info(sbi, inode->i_ino, &ni); + err = f2fs_get_node_info(sbi, inode->i_ino, &ni); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "May loss orphan inode, run fsck to fix."); + goto out; + } if (ni.blk_addr != NULL_ADDR) { - int err = f2fs_acquire_orphan_inode(sbi); + err = f2fs_acquire_orphan_inode(sbi); if (err) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, @@ -664,6 +773,7 @@ void f2fs_handle_failed_inode(struct inode *inode) set_inode_flag(inode, FI_FREE_NID); } +out: f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 231b7f3ea7d3..1f67e389169f 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -51,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - F2FS_I(inode)->i_crtime = timespec64_to_timespec(inode->i_mtime); + F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = sbi->s_next_generation++; if (S_ISDIR(inode->i_mode)) @@ -246,7 +246,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, return -EINVAL; if (hot) { - strncpy(extlist[count], name, strlen(name)); + memcpy(extlist[count], name, strlen(name)); sbi->raw_super->hot_ext_count = hot_count + 1; } else { char buf[F2FS_MAX_EXTENSION][F2FS_EXTENSION_LEN]; @@ -254,7 +254,7 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name, memcpy(buf, &extlist[cold_count], F2FS_EXTENSION_LEN * hot_count); memset(extlist[cold_count], 0, F2FS_EXTENSION_LEN); - strncpy(extlist[cold_count], name, strlen(name)); + memcpy(extlist[cold_count], name, strlen(name)); memcpy(&extlist[cold_count + 1], buf, F2FS_EXTENSION_LEN * hot_count); sbi->raw_super->extension_count = cpu_to_le32(cold_count + 1); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 10643b11bd59..dd2e45a661aa 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -28,6 +28,7 @@ static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; +static struct kmem_cache *fsync_node_entry_slab; /* * Check whether the given nid is within node id range. @@ -112,25 +113,22 @@ static void clear_node_page_dirty(struct page *page) static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - pgoff_t index = current_nat_addr(sbi, nid); - return f2fs_get_meta_page(sbi, index); + return f2fs_get_meta_page_nofail(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { struct page *src_page; struct page *dst_page; - pgoff_t src_off; pgoff_t dst_off; void *src_addr; void *dst_addr; struct f2fs_nm_info *nm_i = NM_I(sbi); - src_off = current_nat_addr(sbi, nid); - dst_off = next_nat_addr(sbi, src_off); + dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); /* get current nat block page with lock */ - src_page = f2fs_get_meta_page(sbi, src_off); + src_page = get_current_nat_page(sbi, nid); dst_page = f2fs_grab_meta_page(sbi, dst_off); f2fs_bug_on(sbi, PageDirty(src_page)); @@ -176,14 +174,30 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i, if (raw_ne) node_info_from_raw_nat(&ne->ni, raw_ne); + + spin_lock(&nm_i->nat_list_lock); list_add_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + nm_i->nat_cnt++; return ne; } static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n) { - return radix_tree_lookup(&nm_i->nat_root, n); + struct nat_entry *ne; + + ne = radix_tree_lookup(&nm_i->nat_root, n); + + /* for recent accessed nat entry, move it to tail of lru list */ + if (ne && !get_nat_flag(ne, IS_DIRTY)) { + spin_lock(&nm_i->nat_list_lock); + if (!list_empty(&ne->list)) + list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + } + + return ne; } static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, @@ -194,7 +208,6 @@ static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i, static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) { - list_del(&e->list); radix_tree_delete(&nm_i->nat_root, nat_get_nid(e)); nm_i->nat_cnt--; __free_nat_entry(e); @@ -245,16 +258,21 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nm_i->dirty_nat_cnt++; set_nat_flag(ne, IS_DIRTY, true); refresh_list: + spin_lock(&nm_i->nat_list_lock); if (new_ne) list_del_init(&ne->list); else list_move_tail(&ne->list, &head->entry_list); + spin_unlock(&nm_i->nat_list_lock); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, struct nat_entry_set *set, struct nat_entry *ne) { + spin_lock(&nm_i->nat_list_lock); list_move_tail(&ne->list, &nm_i->nat_entries); + spin_unlock(&nm_i->nat_list_lock); + set_nat_flag(ne, IS_DIRTY, false); set->entry_cnt--; nm_i->dirty_nat_cnt--; @@ -267,6 +285,72 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +{ + return NODE_MAPPING(sbi) == page->mapping && + IS_DNODE(page) && is_cold_node(page); +} + +void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) +{ + spin_lock_init(&sbi->fsync_node_lock); + INIT_LIST_HEAD(&sbi->fsync_node_list); + sbi->fsync_seg_id = 0; + sbi->fsync_node_num = 0; +} + +static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, + struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + unsigned int seq_id; + + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + + get_page(page); + fn->page = page; + INIT_LIST_HEAD(&fn->list); + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_add_tail(&fn->list, &sbi->fsync_node_list); + fn->seq_id = sbi->fsync_seg_id++; + seq_id = fn->seq_id; + sbi->fsync_node_num++; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + + return seq_id; +} + +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +{ + struct fsync_node_entry *fn; + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + list_for_each_entry(fn, &sbi->fsync_node_list, list) { + if (fn->page == page) { + list_del(&fn->list); + sbi->fsync_node_num--; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + kmem_cache_free(fsync_node_entry_slab, fn); + put_page(page); + return; + } + } + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + f2fs_bug_on(sbi, 1); +} + +void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + sbi->fsync_seg_id = 0; + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); +} + int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -371,7 +455,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, new_blkaddr == NULL_ADDR); f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(sbi, is_valid_blkaddr(nat_get_blkaddr(e)) && + f2fs_bug_on(sbi, is_valid_data_blkaddr(sbi, nat_get_blkaddr(e)) && new_blkaddr == NEW_ADDR); /* increment version no as node is removed */ @@ -382,7 +466,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* change address */ nat_set_blkaddr(e, new_blkaddr); - if (!is_valid_blkaddr(new_blkaddr)) + if (!is_valid_data_blkaddr(sbi, new_blkaddr)) set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); @@ -405,13 +489,25 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) if (!down_write_trylock(&nm_i->nat_tree_lock)) return 0; - while (nr_shrink && !list_empty(&nm_i->nat_entries)) { + spin_lock(&nm_i->nat_list_lock); + while (nr_shrink) { struct nat_entry *ne; + + if (list_empty(&nm_i->nat_entries)) + break; + ne = list_first_entry(&nm_i->nat_entries, struct nat_entry, list); + list_del(&ne->list); + spin_unlock(&nm_i->nat_list_lock); + __del_from_nat_cache(nm_i, ne); nr_shrink--; + + spin_lock(&nm_i->nat_list_lock); } + spin_unlock(&nm_i->nat_list_lock); + up_write(&nm_i->nat_tree_lock); return nr - nr_shrink; } @@ -419,7 +515,7 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink) /* * This function always returns success */ -void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, +int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -443,7 +539,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); up_read(&nm_i->nat_tree_lock); - return; + return 0; } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); @@ -466,6 +562,9 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, up_read(&nm_i->nat_tree_lock); page = f2fs_get_meta_page(sbi, index); + if (IS_ERR(page)) + return PTR_ERR(page); + nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); @@ -473,6 +572,7 @@ void f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, cache: /* cache nat entry */ cache_nat_entry(sbi, nid, &ne); + return 0; } /* @@ -722,12 +822,15 @@ release_out: return err; } -static void truncate_node(struct dnode_of_data *dn) +static int truncate_node(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; + int err; - f2fs_get_node_info(sbi, dn->nid, &ni); + err = f2fs_get_node_info(sbi, dn->nid, &ni); + if (err) + return err; /* Deallocate node address */ f2fs_invalidate_blocks(sbi, ni.blk_addr); @@ -750,11 +853,14 @@ static void truncate_node(struct dnode_of_data *dn) dn->node_page = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); + + return 0; } static int truncate_dnode(struct dnode_of_data *dn) { struct page *page; + int err; if (dn->nid == 0) return 1; @@ -770,7 +876,10 @@ static int truncate_dnode(struct dnode_of_data *dn) dn->node_page = page; dn->ofs_in_node = 0; f2fs_truncate_data_blocks(dn); - truncate_node(dn); + err = truncate_node(dn); + if (err) + return err; + return 1; } @@ -835,7 +944,9 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (!ofs) { /* remove current indirect node */ dn->node_page = page; - truncate_node(dn); + ret = truncate_node(dn); + if (ret) + goto out_err; freed++; } else { f2fs_put_page(page, 1); @@ -893,7 +1004,9 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, if (offset[idx + 1] == 0) { dn->node_page = pages[idx]; dn->nid = nid[idx]; - truncate_node(dn); + err = truncate_node(dn); + if (err) + goto fail; } else { f2fs_put_page(pages[idx], 1); } @@ -1014,6 +1127,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; struct page *npage; + int err; if (!nid) return 0; @@ -1022,10 +1136,15 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (IS_ERR(npage)) return PTR_ERR(npage); + set_new_dnode(&dn, inode, NULL, npage, nid); + err = truncate_node(&dn); + if (err) { + f2fs_put_page(npage, 1); + return err; + } + f2fs_i_xnid_write(inode, 0); - set_new_dnode(&dn, inode, NULL, npage, nid); - truncate_node(&dn); return 0; } @@ -1055,11 +1174,19 @@ int f2fs_remove_inode_page(struct inode *inode) f2fs_truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ + if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { + f2fs_put_dnode(&dn); + return -EIO; + } f2fs_bug_on(F2FS_I_SB(inode), inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ - truncate_node(&dn); + err = truncate_node(&dn); + if (err) { + f2fs_put_dnode(&dn); + return err; + } return 0; } @@ -1092,7 +1219,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) goto fail; #ifdef CONFIG_F2FS_CHECK_FS - f2fs_get_node_info(sbi, dn->nid, &new_ni); + err = f2fs_get_node_info(sbi, dn->nid, &new_ni); + if (err) { + dec_valid_node_count(sbi, dn->inode, !ofs); + goto fail; + } f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); #endif new_ni.nid = dn->nid; @@ -1140,13 +1271,21 @@ static int read_node_page(struct page *page, int op_flags) .page = page, .encrypted_page = NULL, }; + int err; - if (PageUptodate(page)) + if (PageUptodate(page)) { +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_bug_on(sbi, !f2fs_inode_chksum_verify(sbi, page)); +#endif return LOCKED_PAGE; + } - f2fs_get_node_info(sbi, page->index, &ni); + err = f2fs_get_node_info(sbi, page->index, &ni); + if (err) + return err; - if (unlikely(ni.blk_addr == NULL_ADDR)) { + if (unlikely(ni.blk_addr == NULL_ADDR) || + is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; } @@ -1348,7 +1487,7 @@ continue_unlock: static int __write_node_page(struct page *page, bool atomic, bool *submitted, struct writeback_control *wbc, bool do_balance, - enum iostat_type io_type) + enum iostat_type io_type, unsigned int *seq_id) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1365,6 +1504,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .io_type = io_type, .io_wbc = wbc, }; + unsigned int seq; trace_f2fs_writepage(page, NODE); @@ -1374,10 +1514,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; + if (wbc->sync_mode == WB_SYNC_NONE && + IS_DNODE(page) && is_cold_node(page)) + goto redirty_out; + /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); + if (f2fs_get_node_info(sbi, nid, &ni)) + goto redirty_out; + if (wbc->for_reclaim) { if (!down_read_trylock(&sbi->node_write)) goto redirty_out; @@ -1385,8 +1532,6 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, down_read(&sbi->node_write); } - f2fs_get_node_info(sbi, nid, &ni); - /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { ClearPageUptodate(page); @@ -1396,11 +1541,22 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, return 0; } + if (__is_valid_data_blkaddr(ni.blk_addr) && + !f2fs_is_valid_blkaddr(sbi, ni.blk_addr, DATA_GENERIC)) + goto redirty_out; + if (atomic && !test_opt(sbi, NOBARRIER)) fio.op_flags |= REQ_PREFLUSH | REQ_FUA; set_page_writeback(page); ClearPageError(page); + + if (f2fs_in_warm_node_list(sbi, page)) { + seq = f2fs_add_fsync_node_entry(sbi, page); + if (seq_id) + *seq_id = seq; + } + fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); @@ -1448,7 +1604,7 @@ void f2fs_move_node_page(struct page *node_page, int gc_type) goto out_page; if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO)) + &wbc, false, FS_GC_NODE_IO, NULL)) unlock_page(node_page); goto release_page; } else { @@ -1465,11 +1621,13 @@ release_page: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); + return __write_node_page(page, false, NULL, wbc, false, + FS_NODE_IO, NULL); } int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, - struct writeback_control *wbc, bool atomic) + struct writeback_control *wbc, bool atomic, + unsigned int *seq_id) { pgoff_t index; pgoff_t last_idx = ULONG_MAX; @@ -1550,7 +1708,7 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, &submitted, wbc, true, - FS_NODE_IO); + FS_NODE_IO, seq_id); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1633,7 +1791,9 @@ next_step: !is_cold_node(page))) continue; lock_node: - if (!trylock_page(page)) + if (wbc->sync_mode == WB_SYNC_ALL) + lock_page(page); + else if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1665,7 +1825,7 @@ continue_unlock: set_dentry_mark(page, 0); ret = __write_node_page(page, false, &submitted, - wbc, do_balance, io_type); + wbc, do_balance, io_type, NULL); if (ret) unlock_page(page); else if (submitted) @@ -1684,10 +1844,12 @@ continue_unlock: } if (step < 2) { + if (wbc->sync_mode == WB_SYNC_NONE && step == 1) + goto out; step++; goto next_step; } - +out: if (nwritten) f2fs_submit_merged_write(sbi, NODE); @@ -1696,35 +1858,46 @@ continue_unlock: return ret; } -int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) +int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, + unsigned int seq_id) { - pgoff_t index = 0; - struct pagevec pvec; + struct fsync_node_entry *fn; + struct page *page; + struct list_head *head = &sbi->fsync_node_list; + unsigned long flags; + unsigned int cur_seq_id = 0; int ret2, ret = 0; - int nr_pages; - pagevec_init(&pvec); + while (seq_id && cur_seq_id < seq_id) { + spin_lock_irqsave(&sbi->fsync_node_lock, flags); + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + fn = list_first_entry(head, struct fsync_node_entry, list); + if (fn->seq_id > seq_id) { + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); + break; + } + cur_seq_id = fn->seq_id; + page = fn->page; + get_page(page); + spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK))) { - int i; + f2fs_wait_on_page_writeback(page, NODE, true); + if (TestClearPageError(page)) + ret = -EIO; - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; + put_page(page); - if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE, true); - if (TestClearPageError(page)) - ret = -EIO; - } - } - pagevec_release(&pvec); - cond_resched(); + if (ret) + break; } ret2 = filemap_check_errors(NODE_MAPPING(sbi)); if (!ret) ret = ret2; + return ret; } @@ -1774,6 +1947,10 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); +#ifdef CONFIG_F2FS_CHECK_FS + if (IS_INODE(page)) + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); @@ -1968,7 +2145,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } -static void scan_nat_page(struct f2fs_sb_info *sbi, +static int scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1986,7 +2163,10 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - f2fs_bug_on(sbi, blk_addr == NEW_ADDR); + + if (blk_addr == NEW_ADDR) + return -EINVAL; + if (blk_addr == NULL_ADDR) { add_free_nid(sbi, start_nid, true, true); } else { @@ -1995,6 +2175,8 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, spin_unlock(&NM_I(sbi)->nid_list_lock); } } + + return 0; } static void scan_curseg_cache(struct f2fs_sb_info *sbi) @@ -2050,11 +2232,11 @@ out: up_read(&nm_i->nat_tree_lock); } -static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, +static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int i = 0; + int i = 0, ret; nid_t nid = nm_i->next_scan_nid; if (unlikely(nid >= nm_i->max_nid)) @@ -2062,17 +2244,17 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, /* Enough entries */ if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; if (!sync && !f2fs_available_free_memory(sbi, FREE_NIDS)) - return; + return 0; if (!mount) { /* try to find free nids in free_nid_bitmap */ scan_free_nid_bits(sbi); if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK) - return; + return 0; } /* readahead nat pages to be scanned */ @@ -2086,8 +2268,16 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, nm_i->nat_block_bitmap)) { struct page *page = get_current_nat_page(sbi, nid); - scan_nat_page(sbi, page, nid); + ret = scan_nat_page(sbi, page, nid); f2fs_put_page(page, 1); + + if (ret) { + up_read(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, !mount); + f2fs_msg(sbi->sb, KERN_ERR, + "NAT is corrupt, run fsck to fix it"); + return -EINVAL; + } } nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK)); @@ -2108,13 +2298,19 @@ static void __f2fs_build_free_nids(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); + + return 0; } -void f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) +int f2fs_build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { + int ret; + mutex_lock(&NM_I(sbi)->build_lock); - __f2fs_build_free_nids(sbi, sync, mount); + ret = __f2fs_build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); + + return ret; } /* @@ -2127,12 +2323,11 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ALLOC_NID)) { f2fs_show_injection_info(FAULT_ALLOC_NID); return false; } -#endif + spin_lock(&nm_i->nid_list_lock); if (unlikely(nm_i->available_nids == 0)) { @@ -2277,12 +2472,16 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) struct dnode_of_data dn; struct node_info ni; struct page *xpage; + int err; if (!prev_xnid) goto recover_xnid; /* 1: invalidate the previous xattr nid */ - f2fs_get_node_info(sbi, prev_xnid, &ni); + err = f2fs_get_node_info(sbi, prev_xnid, &ni); + if (err) + return err; + f2fs_invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -2317,8 +2516,11 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; struct page *ipage; + int err; - f2fs_get_node_info(sbi, ino, &old_ni); + err = f2fs_get_node_info(sbi, ino, &old_ni); + if (err) + return err; if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; @@ -2372,7 +2574,7 @@ retry: return 0; } -void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, +int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum) { struct f2fs_node *rn; @@ -2394,6 +2596,9 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, for (idx = addr; idx < addr + nrpages; idx++) { struct page *page = f2fs_get_tmp_page(sbi, idx); + if (IS_ERR(page)) + return PTR_ERR(page); + rn = F2FS_NODE(page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; @@ -2405,6 +2610,7 @@ void f2fs_restore_node_summary(struct f2fs_sb_info *sbi, invalidate_mapping_pages(META_MAPPING(sbi), addr, addr + nrpages); } + return 0; } static void remove_nats_in_journal(struct f2fs_sb_info *sbi) @@ -2582,6 +2788,13 @@ void f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) nid_t set_idx = 0; LIST_HEAD(sets); + /* during unmount, let's flush nat_bits before checking dirty_nat_cnt */ + if (enabled_nat_bits(sbi, cpc)) { + down_write(&nm_i->nat_tree_lock); + remove_nats_in_journal(sbi); + up_write(&nm_i->nat_tree_lock); + } + if (!nm_i->dirty_nat_cnt) return; @@ -2634,7 +2847,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page = f2fs_get_meta_page(sbi, nat_bits_addr++); + struct page *page; + + page = f2fs_get_meta_page(sbi, nat_bits_addr++); + if (IS_ERR(page)) { + disable_nat_bits(sbi, true); + return PTR_ERR(page); + } memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), page_address(page), F2FS_BLKSIZE); @@ -2718,6 +2937,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); + spin_lock_init(&nm_i->nat_list_lock); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->nid_list_lock); @@ -2762,8 +2982,8 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) for (i = 0; i < nm_i->nat_blocks; i++) { nm_i->free_nid_bitmap[i] = f2fs_kvzalloc(sbi, - NAT_ENTRY_BITMAP_SIZE_ALIGNED, GFP_KERNEL); - if (!nm_i->free_nid_bitmap) + f2fs_bitmap_size(NAT_ENTRY_PER_BLOCK), GFP_KERNEL); + if (!nm_i->free_nid_bitmap[i]) return -ENOMEM; } @@ -2801,8 +3021,7 @@ int f2fs_build_node_manager(struct f2fs_sb_info *sbi) /* load free nid status from nat_bits table */ load_free_nid_bitmap(sbi); - f2fs_build_free_nids(sbi, true, true); - return 0; + return f2fs_build_free_nids(sbi, true, true); } void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) @@ -2837,8 +3056,13 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi) unsigned idx; nid = nat_get_nid(natvec[found - 1]) + 1; - for (idx = 0; idx < found; idx++) + for (idx = 0; idx < found; idx++) { + spin_lock(&nm_i->nat_list_lock); + list_del(&natvec[idx]->list); + spin_unlock(&nm_i->nat_list_lock); + __del_from_nat_cache(nm_i, natvec[idx]); + } } f2fs_bug_on(sbi, nm_i->nat_cnt); @@ -2893,8 +3117,15 @@ int __init f2fs_create_node_manager_caches(void) sizeof(struct nat_entry_set)); if (!nat_entry_set_slab) goto destroy_free_nid; + + fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry", + sizeof(struct fsync_node_entry)); + if (!fsync_node_entry_slab) + goto destroy_nat_entry_set; return 0; +destroy_nat_entry_set: + kmem_cache_destroy(nat_entry_set_slab); destroy_free_nid: kmem_cache_destroy(free_nid_slab); destroy_nat_entry: @@ -2905,6 +3136,7 @@ fail: void f2fs_destroy_node_manager_caches(void) { + kmem_cache_destroy(fsync_node_entry_slab); kmem_cache_destroy(nat_entry_set_slab); kmem_cache_destroy(free_nid_slab); kmem_cache_destroy(nat_entry_slab); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index b95e49e4a928..0f4db7a61254 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -135,6 +135,11 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; } +static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) +{ + return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -444,6 +449,10 @@ static inline void set_mark(struct page *page, int mark, int type) else flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); + +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_inode_chksum_set(F2FS_P_SB(page), page); +#endif } #define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) #define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 38f25f0b193a..95511ed11a22 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -241,8 +241,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, struct page *page = NULL; block_t blkaddr; unsigned int loop_cnt = 0; - unsigned int free_blocks = sbi->user_block_count - - valid_user_blocks(sbi); + unsigned int free_blocks = MAIN_SEGS(sbi) * sbi->blocks_per_seg - + valid_user_blocks(sbi); int err = 0; /* get node pages in the current segment */ @@ -252,10 +252,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) break; @@ -471,7 +475,10 @@ retry_dn: f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - f2fs_get_node_info(sbi, dn.nid, &ni); + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) + goto err; + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); @@ -507,14 +514,13 @@ retry_dn: } /* dest is valid block, try to recover from src to dest */ - if (f2fs_is_valid_meta_blkaddr(sbi, dest, META_POR)) { + if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = f2fs_reserve_new_block(&dn); -#ifdef CONFIG_F2FS_FAULT_INJECTION - while (err) + while (err && + IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) err = f2fs_reserve_new_block(&dn); -#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); if (err) @@ -568,12 +574,16 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; - if (!f2fs_is_valid_meta_blkaddr(sbi, blkaddr, META_POR)) + if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; f2fs_ra_meta_pages_cond(sbi, blkaddr); page = f2fs_get_tmp_page(sbi, blkaddr); + if (IS_ERR(page)) { + err = PTR_ERR(page); + break; + } if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); @@ -628,7 +638,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) #endif if (s_flags & SB_RDONLY) { - f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + f2fs_msg(sbi->sb, KERN_INFO, + "recover fsync data on readonly fs"); sbi->sb->s_flags &= ~SB_RDONLY; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9efce174c51a..30779aaa9dba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,7 +250,13 @@ retry: err = -EAGAIN; goto next; } - f2fs_get_node_info(sbi, dn.nid, &ni); + + err = f2fs_get_node_info(sbi, dn.nid, &ni); + if (err) { + f2fs_put_dnode(&dn); + return err; + } + if (cur->old_addr == NEW_ADDR) { f2fs_invalidate_blocks(sbi, dn.data_blkaddr); f2fs_update_data_blkaddr(&dn, NEW_ADDR); @@ -439,8 +445,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) int err; f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); + down_write(&fi->i_gc_rwsem[WRITE]); + + f2fs_lock_op(sbi); set_inode_flag(inode, FI_ATOMIC_COMMIT); mutex_lock(&fi->inmem_lock); @@ -455,6 +463,8 @@ int f2fs_commit_inmem_pages(struct inode *inode) clear_inode_flag(inode, FI_ATOMIC_COMMIT); f2fs_unlock_op(sbi); + up_write(&fi->i_gc_rwsem[WRITE]); + return err; } @@ -464,12 +474,10 @@ int f2fs_commit_inmem_pages(struct inode *inode) */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { -#ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_CHECKPOINT)) { f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); } -#endif /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) @@ -503,7 +511,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else f2fs_build_free_nids(sbi, false, false); - if (!is_idle(sbi) && !excess_dirty_nats(sbi)) + if (!is_idle(sbi) && + (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi))) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -511,6 +520,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) !f2fs_available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || + excess_dirty_nodes(sbi) || f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; @@ -831,9 +841,12 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, dc->len = len; dc->ref = 0; dc->state = D_PREP; + dc->issuing = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); + spin_lock_init(&dc->lock); + dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; @@ -860,7 +873,7 @@ static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) - atomic_dec(&dcc->issing_discard); + atomic_sub(dc->issuing, &dcc->issing_discard); list_del(&dc->list); rb_erase(&dc->rb_node, &dcc->root); @@ -875,9 +888,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len); + spin_lock_irqsave(&dc->lock, flags); + if (dc->bio_ref) { + spin_unlock_irqrestore(&dc->lock, flags); + return; + } + spin_unlock_irqrestore(&dc->lock, flags); + f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) @@ -893,10 +914,17 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi, static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + unsigned long flags; dc->error = blk_status_to_errno(bio->bi_status); - dc->state = D_DONE; - complete_all(&dc->wait); + + spin_lock_irqsave(&dc->lock, flags); + dc->bio_ref--; + if (!dc->bio_ref && dc->state == D_SUBMIT) { + dc->state = D_DONE; + complete_all(&dc->wait); + } + spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } @@ -934,6 +962,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; + dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; @@ -945,6 +974,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME; dpolicy->io_aware = true; dpolicy->sync = false; + dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -962,48 +992,115 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } } - +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len); /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static void __submit_discard_cmd(struct f2fs_sb_info *sbi, +static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, - struct discard_cmd *dc) + struct discard_cmd *dc, + unsigned int *issued) { + struct block_device *bdev = dc->bdev; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); - struct bio *bio = NULL; int flag = dpolicy->sync ? REQ_SYNC : 0; + block_t lstart, start, len, total_len; + int err = 0; if (dc->state != D_PREP) - return; + return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - return; + return 0; - trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + trace_f2fs_issue_discard(bdev, dc->start, dc->len); - dc->error = __blkdev_issue_discard(dc->bdev, - SECTOR_FROM_BLOCK(dc->start), - SECTOR_FROM_BLOCK(dc->len), - GFP_NOFS, 0, &bio); - if (!dc->error) { - /* should keep before submission to avoid D_DONE right away */ - dc->state = D_SUBMIT; - atomic_inc(&dcc->issued_discard); - atomic_inc(&dcc->issing_discard); - if (bio) { - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= flag; - submit_bio(bio); - list_move_tail(&dc->list, wait_list); - __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); - - f2fs_update_iostat(sbi, FS_DISCARD, 1); + lstart = dc->lstart; + start = dc->start; + len = dc->len; + total_len = len; + + dc->len = 0; + + while (total_len && *issued < dpolicy->max_requests && !err) { + struct bio *bio = NULL; + unsigned long flags; + bool last = true; + + if (len > max_discard_blocks) { + len = max_discard_blocks; + last = false; } - } else { - __remove_discard_cmd(sbi, dc); + + (*issued)++; + if (*issued == dpolicy->max_requests) + last = true; + + dc->len += len; + + if (time_to_inject(sbi, FAULT_DISCARD)) { + f2fs_show_injection_info(FAULT_DISCARD); + err = -EIO; + goto submit; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), + GFP_NOFS, 0, &bio); +submit: + if (err) { + spin_lock_irqsave(&dc->lock, flags); + if (dc->state == D_PARTIAL) + dc->state = D_SUBMIT; + spin_unlock_irqrestore(&dc->lock, flags); + + break; + } + + f2fs_bug_on(sbi, !bio); + + /* + * should keep before submission to avoid D_DONE + * right away + */ + spin_lock_irqsave(&dc->lock, flags); + if (last) + dc->state = D_SUBMIT; + else + dc->state = D_PARTIAL; + dc->bio_ref++; + spin_unlock_irqrestore(&dc->lock, flags); + + atomic_inc(&dcc->issing_discard); + dc->issuing++; + list_move_tail(&dc->list, wait_list); + + /* sanity check on discard range */ + __check_sit_bitmap(sbi, start, start + len); + + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= flag; + submit_bio(bio); + + atomic_inc(&dcc->issued_discard); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + + lstart += len; + start += len; + total_len -= len; + len = total_len; } + + if (!err && len) + __update_discard_tree_range(sbi, bdev, lstart, start, len); + return err; } static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, @@ -1084,10 +1181,11 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_discard_blocks = + SECTOR_TO_BLOCK(q->limits.max_discard_sectors); block_t end = lstart + len; - mutex_lock(&dcc->cmd_lock); - dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, lstart, (struct rb_entry **)&prev_dc, @@ -1127,7 +1225,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && - __is_discard_back_mergeable(&di, &prev_dc->di)) { + __is_discard_back_mergeable(&di, &prev_dc->di, + max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); @@ -1138,7 +1237,8 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && - __is_discard_front_mergeable(&di, &next_dc->di)) { + __is_discard_front_mergeable(&di, &next_dc->di, + max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; @@ -1161,8 +1261,6 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi, node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } - - mutex_unlock(&dcc->cmd_lock); } static int __queue_discard_cmd(struct f2fs_sb_info *sbi, @@ -1177,10 +1275,72 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, blkstart -= FDEV(devi).start_blk; } + mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); return 0; } +static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, + struct discard_policy *dpolicy) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + struct discard_cmd *dc; + struct blk_plug plug; + unsigned int pos = dcc->next_pos; + unsigned int issued = 0; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, + NULL, pos, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (!dc) + dc = next_dc; + + blk_start_plug(&plug); + + while (dc) { + struct rb_node *node; + int err = 0; + + if (dc->state != D_PREP) + goto next; + + if (dpolicy->io_aware && !is_idle(sbi)) { + io_interrupted = true; + break; + } + + dcc->next_pos = dc->lstart + dc->len; + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) + break; +next: + node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); + dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + blk_finish_plug(&plug); + + if (!dc) + dcc->next_pos = 0; + + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { @@ -1188,19 +1348,24 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0, issued = 0; + int i, issued = 0; bool io_interrupted = false; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (i + 1 < dpolicy->granularity) break; + + if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered) + return __issue_discard_cmd_orderly(sbi, dpolicy); + pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; - f2fs_bug_on(sbi, - !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); @@ -1208,20 +1373,19 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi)) { io_interrupted = true; - goto skip; + break; } - __submit_discard_cmd(sbi, dpolicy, dc); - issued++; -skip: - if (++iter >= dpolicy->max_requests) + __submit_discard_cmd(sbi, dpolicy, dc, &issued); + + if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); - if (iter >= dpolicy->max_requests) + if (issued >= dpolicy->max_requests || io_interrupted) break; } @@ -1319,21 +1483,22 @@ next: return trimmed; } -static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi, +static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; + unsigned int discard_blks; - if (dpolicy) { - __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); - return; - } + if (dpolicy) + return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, 1); - __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); + + return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ @@ -1386,6 +1551,8 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); + + f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return dropped; } @@ -1643,21 +1810,30 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; + + if (need_align && end != -1) + end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - for (i = start; i < end; i++) - clear_bit(i, prefree_map); + if (need_align) { + start = rounddown(start, sbi->segs_per_sec); + end = roundup(end, sbi->segs_per_sec); + } - dirty_i->nr_dirty[PRE] -= end - start; + for (i = start; i < end; i++) { + if (test_and_clear_bit(i, prefree_map)) + dirty_i->nr_dirty[PRE]--; + } if (!test_opt(sbi, DISCARD)) continue; @@ -1751,7 +1927,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->nr_discards = 0; dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; dcc->undiscard_blks = 0; + dcc->next_pos = 0; dcc->root = RB_ROOT; + dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -1901,6 +2079,8 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) if (addr == NEW_ADDR) return; + invalidate_mapping_pages(META_MAPPING(sbi), addr, addr); + /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); @@ -1919,7 +2099,7 @@ bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) struct seg_entry *se; bool is_cp = false; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return true; down_read(&sit_i->sentry_lock); @@ -1983,7 +2163,7 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) */ struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, @@ -2366,7 +2546,7 @@ bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, return has_candidate; } -static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, +static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { @@ -2376,12 +2556,15 @@ static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_cmd *dc; struct blk_plug plug; int issued; + unsigned int trimmed = 0; next: issued = 0; mutex_lock(&dcc->cmd_lock); - f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, &dcc->root)); + if (unlikely(dcc->rbtree_check)) + f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi, + &dcc->root)); dc = (struct discard_cmd *)f2fs_lookup_rb_tree_ret(&dcc->root, NULL, start, @@ -2395,6 +2578,7 @@ next: while (dc && dc->lstart <= end) { struct rb_node *node; + int err = 0; if (dc->len < dpolicy->granularity) goto skip; @@ -2404,19 +2588,24 @@ next: goto skip; } - __submit_discard_cmd(sbi, dpolicy, dc); + err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); - if (++issued >= dpolicy->max_requests) { + if (issued >= dpolicy->max_requests) { start = dc->lstart + dc->len; + if (err) + __remove_discard_cmd(sbi, dc); + blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); - __wait_all_discard_cmd(sbi, NULL); + trimmed += __wait_all_discard_cmd(sbi, NULL); congestion_wait(BLK_RW_ASYNC, HZ/50); goto next; } skip: node = rb_next(&dc->rb_node); + if (err) + __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) @@ -2425,6 +2614,8 @@ skip: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -2437,12 +2628,13 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; + bool need_align = test_opt(sbi, LFS) && sbi->segs_per_sec > 1; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; - if (end <= MAIN_BLKADDR(sbi)) - return -EINVAL; + if (end < MAIN_BLKADDR(sbi)) + goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_msg(sbi->sb, KERN_WARNING, @@ -2454,6 +2646,10 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); + if (need_align) { + start_segno = rounddown(start_segno, sbi->segs_per_sec); + end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1; + } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); @@ -2469,24 +2665,27 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (err) goto out; - start_block = START_BLOCK(sbi, start_segno); - end_block = START_BLOCK(sbi, end_segno + 1); - - __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); - __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ - if (!test_opt(sbi, DISCARD)) { - trimmed = __wait_discard_cmd_range(sbi, &dpolicy, + if (test_opt(sbi, DISCARD)) + goto out; + + start_block = START_BLOCK(sbi, start_segno); + end_block = START_BLOCK(sbi, end_segno + 1); + + __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); + trimmed = __issue_discard_cmd_range(sbi, &dpolicy, + start_block, end_block); + + trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); - range->len = F2FS_BLK_TO_BYTES(trimmed); - } out: + if (!err) + range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } @@ -2639,8 +2838,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) return CURSEG_COLD_DATA; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - is_inode_flag_set(inode, FI_ATOMIC_FILE) || - is_inode_flag_set(inode, FI_VOLATILE_FILE)) + f2fs_is_atomic_file(inode) || + f2fs_is_volatile_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(inode->i_write_hint); } else { @@ -2781,6 +2980,9 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) reallocate: f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio, true); + if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) + invalidate_mapping_pages(META_MAPPING(fio->sbi), + fio->old_blkaddr, fio->old_blkaddr); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); @@ -2836,11 +3038,9 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn, { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; - struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); - f2fs_get_node_info(sbi, dn->nid, &ni); - set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); + set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); @@ -2937,8 +3137,11 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); - if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) + if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { + invalidate_mapping_pages(META_MAPPING(sbi), + old_blkaddr, old_blkaddr); update_sit_entry(sbi, old_blkaddr, -1); + } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); @@ -2992,7 +3195,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; - if (!is_valid_blkaddr(blkaddr)) + if (!is_valid_data_blkaddr(sbi, blkaddr)) return; cpage = find_lock_page(META_MAPPING(sbi), blkaddr); @@ -3002,7 +3205,7 @@ void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void read_compacted_summaries(struct f2fs_sb_info *sbi) +static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; @@ -3014,6 +3217,8 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) start = start_sum_block(sbi); page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); /* Step 1: restore nat cache */ @@ -3054,11 +3259,14 @@ static void read_compacted_summaries(struct f2fs_sb_info *sbi) page = NULL; page = f2fs_get_meta_page(sbi, start++); + if (IS_ERR(page)) + return PTR_ERR(page); kaddr = (unsigned char *)page_address(page); offset = 0; } } f2fs_put_page(page, 1); + return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) @@ -3070,6 +3278,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; + int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { @@ -3093,6 +3302,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) } new = f2fs_get_meta_page(sbi, blk_addr); + if (IS_ERR(new)) + return PTR_ERR(new); sum = (struct f2fs_summary_block *)page_address(new); if (IS_NODESEG(type)) { @@ -3104,7 +3315,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) ns->ofs_in_node = 0; } } else { - f2fs_restore_node_summary(sbi, segno, sum); + err = f2fs_restore_node_summary(sbi, segno, sum); + if (err) + goto out; } } @@ -3124,8 +3337,9 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); +out: f2fs_put_page(new, 1); - return 0; + return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) @@ -3143,7 +3357,9 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) META_CP, true); /* restore for compacted data summary */ - read_compacted_summaries(sbi); + err = read_compacted_summaries(sbi); + if (err) + return err; type = CURSEG_HOT_NODE; } @@ -3274,7 +3490,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_page_nofail(sbi, current_sit_addr(sbi, segno)); } static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, @@ -3923,6 +4139,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index f18fc82fbe99..b3d9e317ff0c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,7 +85,7 @@ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - ((!is_valid_blkaddr(blk_addr)) ? \ + ((!is_valid_data_blkaddr(sbi, blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ @@ -215,7 +215,7 @@ struct segment_allocation { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) -#define MAX_SKIP_ATOMIC_COUNT 16 +#define MAX_SKIP_GC_COUNT 16 struct inmem_pages { struct list_head list; @@ -448,6 +448,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, if (test_and_clear_bit(segno, free_i->free_segmap)) { free_i->free_segments++; + if (IS_CURSEC(sbi, secno)) + goto skip_free; next = find_next_bit(free_i->free_segmap, start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { @@ -455,6 +457,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, free_i->free_sections++; } } +skip_free: spin_unlock(&free_i->segmap_lock); } @@ -645,13 +648,10 @@ static inline void verify_block_addr(struct f2fs_io_info *fio, block_t blk_addr) { struct f2fs_sb_info *sbi = fio->sbi; - if (PAGE_TYPE_OF_BIO(fio->type) == META && - (!is_read_io(fio->op) || fio->is_meta)) - BUG_ON(blk_addr < SEG0_BLKADDR(sbi) || - blk_addr >= MAIN_BLKADDR(sbi)); + if (__is_meta_io(fio)) + verify_blkaddr(sbi, blk_addr, META_GENERIC); else - BUG_ON(blk_addr < MAIN_BLKADDR(sbi) || - blk_addr >= MAX_BLKADDR(sbi)); + verify_blkaddr(sbi, blk_addr, DATA_GENERIC); } /* diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 17bcff789c08..896b885f504e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -41,7 +41,7 @@ static struct kmem_cache *f2fs_inode_cachep; #ifdef CONFIG_F2FS_FAULT_INJECTION -char *fault_name[FAULT_MAX] = { +char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KMALLOC] = "kmalloc", [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", @@ -55,20 +55,24 @@ char *fault_name[FAULT_MAX] = { [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", + [FAULT_DISCARD] = "discard error", }; -static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, - unsigned int rate) +void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, + unsigned int type) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; if (rate) { atomic_set(&ffi->inject_ops, 0); ffi->inject_rate = rate; - ffi->inject_type = (1 << FAULT_MAX) - 1; - } else { - memset(ffi, 0, sizeof(struct f2fs_fault_info)); } + + if (type) + ffi->inject_type = type; + + if (!rate && !type) + memset(ffi, 0, sizeof(struct f2fs_fault_info)); } #endif @@ -113,6 +117,7 @@ enum { Opt_mode, Opt_io_size_bits, Opt_fault_injection, + Opt_fault_type, Opt_lazytime, Opt_nolazytime, Opt_quota, @@ -170,6 +175,7 @@ static match_table_t f2fs_tokens = { {Opt_mode, "mode=%s"}, {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, + {Opt_fault_type, "fault_type=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, {Opt_quota, "quota"}, @@ -347,12 +353,6 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) "QUOTA feature is enabled, so ignore jquota_fmt"); F2FS_OPTION(sbi).s_jquota_fmt = 0; } - if (f2fs_sb_has_quota_ino(sbi->sb) && f2fs_readonly(sbi->sb)) { - f2fs_msg(sbi->sb, KERN_INFO, - "Filesystem with quota feature cannot be mounted RDWR " - "without CONFIG_QUOTA"); - return -1; - } return 0; } #endif @@ -606,7 +606,18 @@ static int parse_options(struct super_block *sb, char *options) if (args->from && match_int(args, &arg)) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(sbi, arg); + f2fs_build_fault_attr(sbi, arg, F2FS_ALL_FAULT_TYPE); + set_opt(sbi, FAULT_INJECTION); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_fault_type: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0, arg); set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, @@ -775,6 +786,19 @@ static int parse_options(struct super_block *sb, char *options) #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; +#else + if (f2fs_sb_has_quota_ino(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Filesystem with quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return -EINVAL; + } + if (f2fs_sb_has_project_quota(sbi->sb) && !f2fs_readonly(sbi->sb)) { + f2fs_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be " + "mounted RDWR without CONFIG_QUOTA"); + return -EINVAL; + } #endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { @@ -1030,6 +1054,10 @@ static void f2fs_put_super(struct super_block *sb) /* our cp_error case, we can wait for any writeback page */ f2fs_flush_merged_writes(sbi); + f2fs_wait_on_all_pages_writeback(sbi); + + f2fs_bug_on(sbi, sbi->fsync_node_num); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -1118,7 +1146,7 @@ static int f2fs_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? dquot->dq_dqb.dqb_bsoftlimit : @@ -1141,7 +1169,7 @@ static int f2fs_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } @@ -1310,9 +1338,12 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); #ifdef CONFIG_F2FS_FAULT_INJECTION - if (test_opt(sbi, FAULT_INJECTION)) + if (test_opt(sbi, FAULT_INJECTION)) { seq_printf(seq, ",fault_injection=%u", F2FS_OPTION(sbi).fault_info.inject_rate); + seq_printf(seq, ",fault_type=%u", + F2FS_OPTION(sbi).fault_info.inject_type); + } #endif #ifdef CONFIG_QUOTA if (test_opt(sbi, QUOTA)) @@ -1343,6 +1374,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",fsync_mode=%s", "posix"); else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) seq_printf(seq, ",fsync_mode=%s", "strict"); + else if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_NOBARRIER) + seq_printf(seq, ",fsync_mode=%s", "nobarrier"); return 0; } @@ -1355,7 +1388,8 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; F2FS_OPTION(sbi).test_dummy_encryption = false; - sbi->readdir_ra = 1; + F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); + F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); set_opt(sbi, BG_GC); set_opt(sbi, INLINE_XATTR); @@ -1365,12 +1399,12 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, NOHEAP); sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_has_blkzoned(sbi->sb)) { - set_opt_mode(sbi, F2FS_MOUNT_LFS); + if (blk_queue_discard(bdev_get_queue(sbi->sb->s_bdev))) set_opt(sbi, DISCARD); - } else { + if (f2fs_sb_has_blkzoned(sbi->sb)) + set_opt_mode(sbi, F2FS_MOUNT_LFS); + else set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); - } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -1379,9 +1413,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, POSIX_ACL); #endif -#ifdef CONFIG_F2FS_FAULT_INJECTION - f2fs_build_fault_attr(sbi, 0); -#endif + f2fs_build_fault_attr(sbi, 0, 0); } #ifdef CONFIG_QUOTA @@ -2229,9 +2261,9 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } - if (secs_per_zone > total_sections) { + if (secs_per_zone > total_sections || !secs_per_zone) { f2fs_msg(sb, KERN_INFO, - "Wrong secs_per_zone (%u > %u)", + "Wrong secs_per_zone / total_sections (%u, %u)", secs_per_zone, total_sections); return 1; } @@ -2282,12 +2314,20 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; unsigned int main_segs, blocks_per_seg; + unsigned int sit_segs, nat_segs; + unsigned int sit_bitmap_size, nat_bitmap_size; + unsigned int log_blocks_per_seg; + unsigned int segment_count_main; + unsigned int cp_pack_start_sum, cp_payload; + block_t user_block_count; int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); - fsmeta += le32_to_cpu(raw_super->segment_count_sit); - fsmeta += le32_to_cpu(raw_super->segment_count_nat); + sit_segs = le32_to_cpu(raw_super->segment_count_sit); + fsmeta += sit_segs; + nat_segs = le32_to_cpu(raw_super->segment_count_nat); + fsmeta += nat_segs; fsmeta += le32_to_cpu(ckpt->rsvd_segment_count); fsmeta += le32_to_cpu(raw_super->segment_count_ssa); @@ -2304,6 +2344,16 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + user_block_count = le64_to_cpu(ckpt->user_block_count); + segment_count_main = le32_to_cpu(raw_super->segment_count_main); + log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + if (!user_block_count || user_block_count >= + segment_count_main << log_blocks_per_seg) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong user_block_count: %u", user_block_count); + return 1; + } + main_segs = le32_to_cpu(raw_super->segment_count_main); blocks_per_seg = sbi->blocks_per_seg; @@ -2318,6 +2368,28 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + sit_bitmap_size = le32_to_cpu(ckpt->sit_ver_bitmap_bytesize); + nat_bitmap_size = le32_to_cpu(ckpt->nat_ver_bitmap_bytesize); + + if (sit_bitmap_size != ((sit_segs / 2) << log_blocks_per_seg) / 8 || + nat_bitmap_size != ((nat_segs / 2) << log_blocks_per_seg) / 8) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong bitmap size: sit: %u, nat:%u", + sit_bitmap_size, nat_bitmap_size); + return 1; + } + + cp_pack_start_sum = __start_sum_addr(sbi); + cp_payload = __cp_payload(sbi); + if (cp_pack_start_sum < cp_payload + 1 || + cp_pack_start_sum > blocks_per_seg - 1 - + NR_CURSEG_TYPE) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong cp_pack_start_sum: %u", + cp_pack_start_sum); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -2651,6 +2723,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } + + sbi->readdir_ra = 1; } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) @@ -2700,9 +2774,6 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; - F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); - F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); - /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sb)) sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, @@ -2771,6 +2842,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); + mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); init_rwsem(&sbi->node_change); @@ -2865,6 +2937,8 @@ try_onemore: f2fs_init_ino_entry_info(sbi); + f2fs_init_fsync_node_info(sbi); + /* setup f2fs internal modules */ err = f2fs_build_segment_manager(sbi); if (err) { @@ -2912,10 +2986,11 @@ try_onemore: err = PTR_ERR(root); goto free_stats; } - if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { + if (!S_ISDIR(root->i_mode) || !root->i_blocks || + !root->i_size || !root->i_nlink) { iput(root); err = -EINVAL; - goto free_node_inode; + goto free_stats; } sb->s_root = d_make_root(root); /* allocate root dentry */ @@ -2929,10 +3004,7 @@ try_onemore: goto free_root_inode; #ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ + /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sb) && !f2fs_readonly(sb)) { err = f2fs_enable_quotas(sb); if (err) { @@ -3090,9 +3162,19 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { if (sb->s_root) { - set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); - f2fs_stop_gc_thread(F2FS_SB(sb)); - f2fs_stop_discard_thread(F2FS_SB(sb)); + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_stop_gc_thread(sbi); + f2fs_stop_discard_thread(sbi); + + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + f2fs_write_checkpoint(sbi, &cpc); + } } kill_block_super(sb); } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 2e7e611deaef..81c0e5337443 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -9,6 +9,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ +#include <linux/compiler.h> #include <linux/proc_fs.h> #include <linux/f2fs_fs.h> #include <linux/seq_file.h> @@ -252,6 +253,7 @@ out: if (t >= 1) { sbi->gc_mode = GC_URGENT; if (sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; wake_up_interruptible_all( &sbi->gc_thread->gc_wait_queue_head); wake_up_discard_thread(sbi, true); @@ -286,8 +288,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") || a->struct_type == GC_THREAD); - if (gc_entry) - down_read(&sbi->sb->s_umount); + if (gc_entry) { + if (!down_read_trylock(&sbi->sb->s_umount)) + return -EAGAIN; + } ret = __sbi_store(a, sbi, buf, count); if (gc_entry) up_read(&sbi->sb->s_umount); @@ -393,6 +397,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_seq_blocks, min_seq_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); @@ -445,6 +450,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_seq_blocks), ATTR_LIST(min_hot_blocks), ATTR_LIST(min_ssr_sections), ATTR_LIST(max_victim_search), @@ -516,7 +522,8 @@ static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; -static int segment_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -543,7 +550,8 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_bits_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -567,7 +575,8 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } -static int iostat_info_seq_show(struct seq_file *seq, void *offset) +static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset) { struct super_block *sb = seq->private; struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -609,6 +618,28 @@ static int iostat_info_seq_show(struct seq_file *seq, void *offset) return 0; } +static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + int i; + + seq_puts(seq, "format: victim_secmap bitmaps\n"); + + for (i = 0; i < MAIN_SECS(sbi); i++) { + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0); + if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + return 0; +} + int __init f2fs_init_sysfs(void) { int ret; @@ -658,6 +689,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) segment_bits_seq_show, sb); proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, iostat_info_seq_show, sb); + proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, + victim_bits_seq_show, sb); } return 0; } @@ -668,6 +701,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry("victim_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 708271871f94..77a010e625f5 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -37,9 +37,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: @@ -62,9 +59,6 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; case F2FS_XATTR_INDEX_SECURITY: break; default: @@ -100,12 +94,22 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { + unsigned char old_advise = F2FS_I(inode)->i_advise; + unsigned char new_advise; + if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) return -EINVAL; - F2FS_I(inode)->i_advise |= *(char *)value; + new_advise = *(char *)value; + if (new_advise & ~FADVISE_MODIFIABLE_BITS) + return -EINVAL; + + new_advise = new_advise & FADVISE_MODIFIABLE_BITS; + new_advise |= old_advise & ~FADVISE_MODIFIABLE_BITS; + + F2FS_I(inode)->i_advise = new_advise; f2fs_mark_inode_dirty_sync(inode, true); return 0; } diff --git a/fs/fat/cache.c b/fs/fat/cache.c index e9bed49df6b7..78d501c1fb65 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -225,7 +225,8 @@ static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) { struct super_block *sb = inode->i_sb; - const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const int limit = sb->s_maxbytes >> sbi->cluster_bits; struct fat_entry fatent; struct fat_cache_id cid; int nr; @@ -234,6 +235,12 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) *fclus = 0; *dclus = MSDOS_I(inode)->i_start; + if (!fat_valid_entry(sbi, *dclus)) { + fat_fs_error_ratelimit(sb, + "%s: invalid start cluster (i_pos %lld, start %08x)", + __func__, MSDOS_I(inode)->i_pos, *dclus); + return -EIO; + } if (cluster == 0) return 0; @@ -250,9 +257,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_error_ratelimit(sb, - "%s: detected the cluster chain loop" - " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + "%s: detected the cluster chain loop (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -262,9 +268,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) goto out; else if (nr == FAT_ENT_FREE) { fat_fs_error_ratelimit(sb, - "%s: invalid cluster chain (i_pos %lld)", - __func__, - MSDOS_I(inode)->i_pos); + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8e100c3bf72c..7f5f3699fc6c 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1130,7 +1130,7 @@ error: return err; } -int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) +int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 8fc1093da47d..9d7d2d5da28b 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -304,7 +304,7 @@ extern int fat_scan_logstart(struct inode *dir, int i_logstart, struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); -extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); +extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts); extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo); extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); @@ -348,6 +348,11 @@ static inline void fatent_brelse(struct fat_entry *fatent) fatent->fat_inode = NULL; } +static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry) +{ + return FAT_START_ENT <= entry && entry < sbi->max_cluster; +} + extern void fat_ent_access_init(struct super_block *sb); extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry); @@ -357,6 +362,7 @@ extern int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster); extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); +extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range); /* fat/file.c */ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, @@ -406,9 +412,9 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); } while (0) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); -extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, +extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); -extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, +extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index bac10de678cc..defc2168de91 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -4,6 +4,7 @@ */ #include <linux/blkdev.h> +#include <linux/sched/signal.h> #include "fat.h" struct fatent_operations { @@ -23,7 +24,7 @@ static void fat12_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -33,7 +34,7 @@ static void fat_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -353,7 +354,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) int err, offset; sector_t blocknr; - if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; @@ -690,3 +691,104 @@ out: unlock_fat(sbi); return err; } + +static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus), + nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); +} + +int fat_trim_fs(struct inode *inode, struct fstrim_range *range) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + u64 ent_start, ent_end, minlen, trimmed = 0; + u32 free = 0; + unsigned long reada_blocks, reada_mask, cur_block = 0; + int err = 0; + + /* + * FAT data is organized as clusters, trim at the granulary of cluster. + * + * fstrim_range is in byte, convert vaules to cluster index. + * Treat sectors before data region as all used, not to trim them. + */ + ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); + ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1; + minlen = range->minlen >> sbi->cluster_bits; + + if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size) + return -EINVAL; + if (ent_end >= sbi->max_cluster) + ent_end = sbi->max_cluster - 1; + + reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; + reada_mask = reada_blocks - 1; + + fatent_init(&fatent); + lock_fat(sbi); + fatent_set_entry(&fatent, ent_start); + while (fatent.entry <= ent_end) { + /* readahead of fat blocks */ + if ((cur_block & reada_mask) == 0) { + unsigned long rest = sbi->fat_length - cur_block; + fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); + } + cur_block++; + + err = fat_ent_read_block(sb, &fatent); + if (err) + goto error; + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) { + free++; + } else if (free) { + if (free >= minlen) { + u32 clus = fatent.entry - free; + + err = fat_trim_clusters(sb, clus, free); + if (err && err != -EOPNOTSUPP) + goto error; + if (!err) + trimmed += free; + err = 0; + } + free = 0; + } + } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end); + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto error; + } + + if (need_resched()) { + fatent_brelse(&fatent); + unlock_fat(sbi); + cond_resched(); + lock_fat(sbi); + } + } + /* handle scenario when tail entries are all free */ + if (free && free >= minlen) { + u32 clus = fatent.entry - free; + + err = fat_trim_clusters(sb, clus, free); + if (err && err != -EOPNOTSUPP) + goto error; + if (!err) + trimmed += free; + err = 0; + } + +error: + fatent_brelse(&fatent); + unlock_fat(sbi); + + range->len = trimmed << sbi->cluster_bits; + + return err; +} diff --git a/fs/fat/file.c b/fs/fat/file.c index 4724cc9ad650..4f3d72fb1e60 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -121,6 +121,37 @@ static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) return put_user(sbi->vol_id, user_attr); } +static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) +{ + struct super_block *sb = inode->i_sb; + struct fstrim_range __user *user_range; + struct fstrim_range range; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + user_range = (struct fstrim_range __user *)arg; + if (copy_from_user(&range, user_range, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + err = fat_trim_fs(inode, &range); + if (err < 0) + return err; + + if (copy_to_user(user_range, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -133,6 +164,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_set_attributes(filp, user_attr); case FAT_IOCTL_GET_VOLUME_ID: return fat_ioctl_get_volume_id(inode, user_attr); + case FITRIM: + return fat_ioctl_fitrim(inode, arg); default: return -ENOTTY; /* Inappropriate ioctl for device */ } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bfd589ea74c0..d6b81e31f9f5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -508,7 +508,6 @@ static int fat_validate_dir(struct inode *dir) /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { - struct timespec ts; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -559,14 +558,11 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &ts, de->time, de->date, 0); - inode->i_mtime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &ts, de->ctime, + fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, de->cdate, de->ctime_cs); - inode->i_ctime = timespec_to_timespec64(ts); - fat_time_fat2unix(sbi, &ts, 0, de->adate, 0); - inode->i_atime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -843,7 +839,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) static int __fat_write_inode(struct inode *inode, int wait) { - struct timespec ts; struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; @@ -881,16 +876,13 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - ts = timespec64_to_timespec(inode->i_mtime); - fat_time_unix2fat(sbi, &ts, &raw_entry->time, + fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - ts = timespec64_to_timespec(inode->i_ctime); - fat_time_unix2fat(sbi, &ts, &raw_entry->ctime, + fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); - ts = timespec64_to_timespec(inode->i_atime); - fat_time_unix2fat(sbi, &ts, &atime, + fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); } spin_unlock(&sbi->inode_hash_lock); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index f9bdc1e01c98..573836dcaefc 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -180,17 +180,18 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) #define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) /* Linear day numbers of the respective 1sts in non-leap years. */ -static time_t days_in_year[] = { +static long days_in_year[] = { /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). */ -void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, +void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) { u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); - time_t second, day, leap_day, month, year; + time64_t second; + long day, leap_day, month, year; year = date >> 9; month = max(1, (date >> 5) & 0xf); @@ -205,7 +206,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, second = (time & 0x1f) << 1; second += ((time >> 5) & 0x3f) * SECS_PER_MIN; second += (time >> 11) * SECS_PER_HOUR; - second += (year * 365 + leap_day + second += (time64_t)(year * 365 + leap_day + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; @@ -224,11 +225,11 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, } /* Convert linear UNIX date to a FAT time/date pair. */ -void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, +void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; - time_to_tm(ts->tv_sec, + time64_to_tm(ts->tv_sec, (sbi->options.tz_set ? sbi->options.time_offset : -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 16a832c37d66..efb8c40c9d27 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -225,7 +225,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, /***** Creates a directory entry (name is already formatted). */ static int msdos_add_entry(struct inode *dir, const unsigned char *name, int is_dir, int is_hid, int cluster, - struct timespec *ts, struct fat_slot_info *sinfo) + struct timespec64 *ts, struct fat_slot_info *sinfo) { struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); struct msdos_dir_entry de; @@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts); + dir->i_ctime = dir->i_mtime = *ts; if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -267,7 +267,6 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode = NULL; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; @@ -286,8 +285,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, } ts = current_time(dir); - t = timespec64_to_timespec(ts); - err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo); + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); if (err) goto out; inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); @@ -347,7 +345,6 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; unsigned char msdos_name[MSDOS_NAME]; struct timespec64 ts; - struct timespec t; int err, is_hid, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); @@ -365,13 +362,12 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } ts = current_time(dir); - t = timespec64_to_timespec(ts); - cluster = fat_alloc_new_dir(dir, &t); + cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; goto out; } - err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo); + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); if (err) goto out_free; inc_nlink(dir); @@ -503,9 +499,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { - struct timespec t = timespec64_to_timespec(ts); err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, - &t, &sinfo); + &ts, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 9a5469120caa..82cd1e69cbdf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -577,7 +577,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, static int vfat_build_slots(struct inode *dir, const unsigned char *name, int len, int is_dir, int cluster, - struct timespec *ts, + struct timespec64 *ts, struct msdos_dir_slot *slots, int *nr_slots) { struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); @@ -653,7 +653,7 @@ out_free: } static int vfat_add_entry(struct inode *dir, const struct qstr *qname, - int is_dir, int cluster, struct timespec *ts, + int is_dir, int cluster, struct timespec64 *ts, struct fat_slot_info *sinfo) { struct msdos_dir_slot *slots; @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts); + dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -762,14 +762,12 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; int err; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - t = timespec64_to_timespec(ts); - err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); if (err) goto out; inode_inc_iversion(dir); @@ -853,19 +851,17 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; int err, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - t = timespec64_to_timespec(ts); - cluster = fat_alloc_new_dir(dir, &t); + cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; goto out; } - err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); if (err) goto out_free; inode_inc_iversion(dir); @@ -904,7 +900,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; - struct timespec t; loff_t new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; @@ -939,9 +934,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { - t = timespec64_to_timespec(ts); err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, - &t, &sinfo); + &ts, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 7cc8b4acf66a..a63371815aab 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -11,18 +11,3 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. - -config HFSPLUS_FS_POSIX_ACL - bool "HFS+ POSIX Access Control Lists" - depends on HFSPLUS_FS - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - It needs to understand that POSIX ACLs are treated only under - Linux. POSIX ACLs doesn't mean something under Mac OS X. - Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, - which are part of the NFSv4 standard. - - If you don't know what Access Control Lists are, say N diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index f6a56542f8d7..9ed20e64b983 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -8,5 +8,3 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o - -hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h deleted file mode 100644 index 488c2b75cf41..000000000000 --- a/fs/hfsplus/acl.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/fs/hfsplus/acl.h - * - * Vyacheslav Dubeyko <slava@dubeyko.com> - * - * Handler for Posix Access Control Lists (ACLs) support. - */ - -#include <linux/posix_acl_xattr.h> - -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - -/* posix_acl.c */ -struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type); -extern int hfsplus_init_posix_acl(struct inode *, struct inode *); - -#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ -#define hfsplus_get_posix_acl NULL -#define hfsplus_set_posix_acl NULL - -static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index b5254378f011..c5a70f83dbe7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -18,7 +18,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" -#include "acl.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -455,7 +454,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, if (res) goto out_err; - res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. */ else if (res) { @@ -496,7 +495,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, if (res) goto failed_mknod; - res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. */ else if (res) { @@ -567,10 +566,6 @@ const struct inode_operations hfsplus_dir_inode_operations = { .mknod = hfsplus_mknod, .rename = hfsplus_rename, .listxattr = hfsplus_listxattr, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - .get_acl = hfsplus_get_posix_acl, - .set_acl = hfsplus_set_posix_acl, -#endif }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index e8770935ce6d..8e0f59767694 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -336,6 +336,9 @@ static int hfsplus_free_extents(struct super_block *sb, int i; int err = 0; + /* Mapping the allocation file may lock the extent tree */ + WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock)); + hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { count = be32_to_cpu(extent->block_count); @@ -415,11 +418,13 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, if (res) break; start = be32_to_cpu(fd.key->ext.start_block); - hfsplus_free_extents(sb, ext_entry, - total_blocks - start, - total_blocks); hfs_brec_remove(&fd); + + mutex_unlock(&fd.tree->tree_lock); + hfsplus_free_extents(sb, ext_entry, total_blocks - start, + total_blocks); total_blocks = start; + mutex_lock(&fd.tree->tree_lock); } while (total_blocks > blocks); hfs_find_exit(&fd); @@ -576,15 +581,20 @@ void hfsplus_file_truncate(struct inode *inode) } while (1) { if (alloc_cnt == hip->first_blocks) { + mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->first_extents, alloc_cnt, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->first_extents); hip->first_blocks = blk_cnt; + mutex_lock(&fd.tree->tree_lock); break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; + hfs_brec_remove(&fd); + + mutex_unlock(&fd.tree->tree_lock); start = hip->cached_start; hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); @@ -596,7 +606,7 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); - hfs_brec_remove(&fd); + mutex_lock(&fd.tree->tree_lock); } hfs_find_exit(&fd); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d9255abafb81..8e039435958a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -31,7 +31,6 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 #define DBG_ATTR_MOD 0x00000080 -#define DBG_ACL_MOD 0x00000100 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c824f702feec..8e9427a42b81 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -21,7 +21,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" -#include "acl.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -267,12 +266,6 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); - if (attr->ia_valid & ATTR_MODE) { - error = posix_acl_chmod(inode, inode->i_mode); - if (unlikely(error)) - return error; - } - return 0; } @@ -336,10 +329,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, .listxattr = hfsplus_listxattr, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - .get_acl = hfsplus_get_posix_acl, - .set_acl = hfsplus_set_posix_acl, -#endif }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c deleted file mode 100644 index 066114dcc3a2..000000000000 --- a/fs/hfsplus/posix_acl.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/hfsplus/posix_acl.c - * - * Vyacheslav Dubeyko <slava@dubeyko.com> - * - * Handler for Posix Access Control Lists (ACLs) support. - */ - -#include "hfsplus_fs.h" -#include "xattr.h" -#include "acl.h" - -struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) -{ - struct posix_acl *acl; - char *xattr_name; - char *value = NULL; - ssize_t size; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - switch (type) { - case ACL_TYPE_ACCESS: - xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; - break; - default: - return ERR_PTR(-EINVAL); - } - - size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); - - if (size > 0) { - value = (char *)hfsplus_alloc_attr_entry(); - if (unlikely(!value)) - return ERR_PTR(-ENOMEM); - size = __hfsplus_getxattr(inode, xattr_name, value, size); - } - - if (size > 0) - acl = posix_acl_from_xattr(&init_user_ns, value, size); - else if (size == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(size); - - hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); - - return acl; -} - -static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type) -{ - int err; - char *xattr_name; - size_t size = 0; - char *value = NULL; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - switch (type) { - case ACL_TYPE_ACCESS: - xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - - default: - return -EINVAL; - } - - if (acl) { - size = posix_acl_xattr_size(acl->a_count); - if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) - return -ENOMEM; - value = (char *)hfsplus_alloc_attr_entry(); - if (unlikely(!value)) - return -ENOMEM; - err = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (unlikely(err < 0)) - goto end_set_acl; - } - - err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); - -end_set_acl: - hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); - - if (!err) - set_cached_acl(inode, type, acl); - - return err; -} - -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - int err; - - if (type == ACL_TYPE_ACCESS && acl) { - err = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (err) - return err; - } - return __hfsplus_set_posix_acl(inode, acl, type); -} - -int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) -{ - int err = 0; - struct posix_acl *default_acl, *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, dir->ino %lu\n", - __func__, inode->i_ino, dir->i_ino); - - if (S_ISLNK(inode->i_mode)) - return 0; - - err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (err) - return err; - - if (default_acl) { - err = __hfsplus_set_posix_acl(inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!err) - err = __hfsplus_set_posix_acl(inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - return err; -} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index a6c0f54c48c3..eb4535eba95d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -524,8 +524,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); - if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { + err = -EINVAL; goto out_put_root; + } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -562,8 +564,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) goto out_put_hidden_dir; } - err = hfsplus_init_inode_security(sbi->hidden_dir, - root, &str); + err = hfsplus_init_security(sbi->hidden_dir, + root, &str); if (err == -EOPNOTSUPP) err = 0; /* Operation is not supported. */ else if (err) { diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index dfa90c21948f..c8d1b2be7854 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -272,8 +272,8 @@ static inline int asc2unichar(struct super_block *sb, const char *astr, int len, return size; } -/* Decomposes a single unicode character. */ -static inline u16 *decompose_unichar(wchar_t uc, int *size) +/* Decomposes a non-Hangul unicode character. */ +static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size) { int off; @@ -296,6 +296,51 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size) return hfsplus_decompose_table + (off / 4); } +/* + * Try to decompose a unicode character as Hangul. Return 0 if @uc is not + * precomposed Hangul, otherwise return the length of the decomposition. + * + * This function was adapted from sample code from the Unicode Standard + * Annex #15: Unicode Normalization Forms, version 3.2.0. + * + * Copyright (C) 1991-2018 Unicode, Inc. All rights reserved. Distributed + * under the Terms of Use in http://www.unicode.org/copyright.html. + */ +static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result) +{ + int index; + int l, v, t; + + index = uc - Hangul_SBase; + if (index < 0 || index >= Hangul_SCount) + return 0; + + l = Hangul_LBase + index / Hangul_NCount; + v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount; + t = Hangul_TBase + index % Hangul_TCount; + + result[0] = l; + result[1] = v; + if (t != Hangul_TBase) { + result[2] = t; + return 3; + } + return 2; +} + +/* Decomposes a single unicode character. */ +static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer) +{ + u16 *result; + + /* Hangul is handled separately */ + result = hangul_buffer; + *size = hfsplus_try_decompose_hangul(uc, result); + if (*size == 0) + result = hfsplus_decompose_nonhangul(uc, size); + return result; +} + int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len) @@ -303,13 +348,14 @@ int hfsplus_asc2uni(struct super_block *sb, int size, dsize, decompose; u16 *dstr, outlen = 0; wchar_t c; + u16 dhangul[3]; decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (outlen < max_unistr_len && len > 0) { size = asc2unichar(sb, astr, len, &c); if (decompose) - dstr = decompose_unichar(c, &dsize); + dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { @@ -344,6 +390,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) unsigned long hash; wchar_t c; u16 c2; + u16 dhangul[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); @@ -357,7 +404,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) len -= size; if (decompose) - dstr = decompose_unichar(c, &dsize); + dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { @@ -396,6 +443,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry, const char *astr1, *astr2; u16 c1, c2; wchar_t c; + u16 dhangul_1[3], dhangul_2[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); @@ -413,7 +461,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry, len1 -= size; if (decompose) - dstr1 = decompose_unichar(c, &dsize1); + dstr1 = decompose_unichar(c, &dsize1, + dhangul_1); if (!decompose || !dstr1) { c1 = c; dstr1 = &c1; @@ -427,7 +476,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry, len2 -= size; if (decompose) - dstr2 = decompose_unichar(c, &dsize2); + dstr2 = decompose_unichar(c, &dsize2, + dhangul_2); if (!decompose || !dstr2) { c2 = c; dstr2 = &c2; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e538b758c448..d5403b4004c9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,10 +8,8 @@ */ #include "hfsplus_fs.h" -#include <linux/posix_acl_xattr.h> #include <linux/nls.h> #include "xattr.h" -#include "acl.h" static int hfsplus_removexattr(struct inode *inode, const char *name); @@ -19,10 +17,6 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &hfsplus_xattr_security_handler, NULL }; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index a4e611d69710..d14e362b3eba 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -38,7 +38,4 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr); - #endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index f5550b006e88..cfbe6a3bfb1e 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -12,7 +12,6 @@ #include "hfsplus_fs.h" #include "xattr.h" -#include "acl.h" static int hfsplus_security_getxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, @@ -72,18 +71,6 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir, &hfsplus_initxattrs, NULL); } -int hfsplus_init_inode_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = hfsplus_init_posix_acl(inode, dir); - if (!err) - err = hfsplus_init_security(inode, dir, qstr); - return err; -} - const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = hfsplus_security_getxattr, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 346a146c7617..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); @@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * allocation routines. If NUMA is configured, use page index * as input to create an allocation policy. */ - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index c5fa3dee72fc..7da0fac71dc2 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -51,7 +51,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return err; } -static int nilfs_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6ffeca84d7c3..1b9067cf4511 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -834,7 +834,7 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount) sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); - sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds()); skip_mount_setup: sbp[0]->s_state = diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ec350d4d921c..276914ae3c60 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -603,6 +603,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, if (!inode) goto out_drop_write; + spin_lock(&inode->i_lock); + inode->i_state |= I_CREATING; + spin_unlock(&inode->i_lock); + inode_init_owner(inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 0eaeb41453f5..817c02b13b1d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -31,6 +31,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + select CRASH_CORE help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. No modifications can be diff --git a/fs/proc/base.c b/fs/proc/base.c index aaffc0c30216..ccf86f16d9f0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -463,7 +463,7 @@ static int lstats_show_proc(struct seq_file *m, void *v) if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); - for (i = 0; i < 32; i++) { + for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; @@ -1366,10 +1366,8 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); - len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); - - return len; + return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { @@ -2519,47 +2517,47 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); + struct task_struct *task; void *page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; + int rv; + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } /* A task may only write its own attributes. */ - length = -EACCES; - if (current != task) - goto out; + if (current != task) { + rcu_read_unlock(); + return -EACCES; + } + rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. */ - length = -EINVAL; if (*ppos != 0) - goto out; + return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { - length = PTR_ERR(page); + rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); - if (length < 0) + rv = mutex_lock_interruptible(¤t->signal->cred_guard_mutex); + if (rv < 0) goto out_free; - length = security_setprocattr(file->f_path.dentry->d_name.name, - page, count); + rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count); mutex_unlock(¤t->signal->cred_guard_mutex); out_free: kfree(page); out: - put_task_struct(task); -out_no_task: - return length; + return rv; } static const struct file_operations proc_pid_attr_operations = { @@ -3309,12 +3307,12 @@ static const struct pid_entry tid_base_stuff[] = { REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_tid_maps_operations), + REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -3324,7 +3322,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bb1c1625b158..8ae109429a88 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -286,9 +286,9 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) return 0; + i = ctx->pos - 2; read_lock(&proc_subdir_lock); de = pde_subdir_first(de); - i = ctx->pos - 2; for (;;) { if (!de) { read_unlock(&proc_subdir_lock); @@ -309,8 +309,8 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, pde_put(de); return 0; } - read_lock(&proc_subdir_lock); ctx->pos++; + read_lock(&proc_subdir_lock); next = pde_subdir_next(de); pde_put(de); de = next; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 85ffbd27f288..fc5306a31a1d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -105,8 +105,10 @@ void __init proc_init_kmemcache(void) kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( - "proc_dir_entry", SIZEOF_PDE_SLOT, 0, SLAB_PANIC, - OFFSETOF_PDE_NAME, SIZEOF_PDE_INLINE_NAME, NULL); + "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + SIZEOF_PDE_INLINE_NAME, NULL); + BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } static int proc_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index da3dbfa09e79..5185d7f6a51e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,16 +65,13 @@ struct proc_dir_entry { char inline_name[]; } __randomize_layout; -#define OFFSETOF_PDE_NAME offsetof(struct proc_dir_entry, inline_name) -#define SIZEOF_PDE_SLOT \ - (OFFSETOF_PDE_NAME + 34 <= 64 ? 64 : \ - OFFSETOF_PDE_NAME + 34 <= 128 ? 128 : \ - OFFSETOF_PDE_NAME + 34 <= 192 ? 192 : \ - OFFSETOF_PDE_NAME + 34 <= 256 ? 256 : \ - OFFSETOF_PDE_NAME + 34 <= 512 ? 512 : \ - 0) - -#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE_SLOT - OFFSETOF_PDE_NAME) +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ -116,12 +113,12 @@ static inline void *__PDE_DATA(const struct inode *inode) return PDE(inode)->data; } -static inline struct pid *proc_pid(struct inode *inode) +static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } -static inline struct task_struct *get_proc_task(struct inode *inode) +static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } @@ -285,7 +282,6 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; - struct mem_size_stats *rollup; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif @@ -297,12 +293,9 @@ struct proc_maps_private { struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; -extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e64ecb9f2720..80464432dfe6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,6 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com> */ +#include <linux/crash_core.h> #include <linux/mm.h> #include <linux/proc_fs.h> #include <linux/kcore.h> @@ -49,32 +50,23 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -/* An ELF note in memory */ -struct memelfnote -{ - const char *name; - int type; - unsigned int datasz; - void *data; -}; - static LIST_HEAD(kclist_head); -static DEFINE_RWLOCK(kclist_lock); +static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; -void -kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +/* This doesn't grab kclist_lock, so it should only be used at init time. */ +void __init kclist_add(struct kcore_list *new, void *addr, size_t size, + int type) { new->addr = (unsigned long)addr; new->size = size; new->type = type; - write_lock(&kclist_lock); list_add_tail(&new->list, &kclist_head); - write_unlock(&kclist_lock); } -static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, + size_t *data_offset) { size_t try, size; struct kcore_list *m; @@ -88,53 +80,19 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) size = try; *nphdr = *nphdr + 1; } - *elf_buflen = sizeof(struct elfhdr) + - (*nphdr + 2)*sizeof(struct elf_phdr) + - 3 * ((sizeof(struct elf_note)) + - roundup(sizeof(CORE_STR), 4)) + - roundup(sizeof(struct elf_prstatus), 4) + - roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(arch_task_struct_size, 4); - *elf_buflen = PAGE_ALIGN(*elf_buflen); - return size + *elf_buflen; -} - -static void free_kclist_ents(struct list_head *head) -{ - struct kcore_list *tmp, *pos; - list_for_each_entry_safe(pos, tmp, head, list) { - list_del(&pos->list); - kfree(pos); - } + *phdrs_len = *nphdr * sizeof(struct elf_phdr); + *notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); + *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + + *notes_len); + return *data_offset + size; } -/* - * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. - */ -static void __kcore_update_ram(struct list_head *list) -{ - int nphdr; - size_t size; - struct kcore_list *tmp, *pos; - LIST_HEAD(garbage); - - write_lock(&kclist_lock); - if (kcore_need_update) { - list_for_each_entry_safe(pos, tmp, &kclist_head, list) { - if (pos->type == KCORE_RAM - || pos->type == KCORE_VMEMMAP) - list_move(&pos->list, &garbage); - } - list_splice_tail(list, &kclist_head); - } else - list_splice(list, &garbage); - kcore_need_update = 0; - proc_root_kcore->size = get_kcore_size(&nphdr, &size); - write_unlock(&kclist_lock); - - free_kclist_ents(&garbage); -} - #ifdef CONFIG_HIGHMEM /* @@ -142,11 +100,9 @@ static void __kcore_update_ram(struct list_head *list) * because memory hole is not as big as !HIGHMEM case. * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) */ -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *head) { - LIST_HEAD(head); struct kcore_list *ent; - int ret = 0; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) @@ -154,9 +110,8 @@ static int kcore_update_ram(void) ent->addr = (unsigned long)__va(0); ent->size = max_low_pfn << PAGE_SHIFT; ent->type = KCORE_RAM; - list_add(&ent->list, &head); - __kcore_update_ram(&head); - return ret; + list_add(&ent->list, head); + return 0; } #else /* !CONFIG_HIGHMEM */ @@ -255,11 +210,10 @@ free_out: return 1; } -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *list) { int nid, ret; unsigned long end_pfn; - LIST_HEAD(head); /* Not inialized....update now */ /* find out "max pfn" */ @@ -271,258 +225,255 @@ static int kcore_update_ram(void) end_pfn = node_end; } /* scan 0 to max_pfn */ - ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); - if (ret) { - free_kclist_ents(&head); + ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private); + if (ret) return -ENOMEM; - } - __kcore_update_ram(&head); - return ret; + return 0; } #endif /* CONFIG_HIGHMEM */ -/*****************************************************************************/ -/* - * determine size of ELF note - */ -static int notesize(struct memelfnote *en) -{ - int sz; - - sz = sizeof(struct elf_note); - sz += roundup((strlen(en->name) + 1), 4); - sz += roundup(en->datasz, 4); - - return sz; -} /* end notesize() */ - -/*****************************************************************************/ -/* - * store a note in the header buffer - */ -static char *storenote(struct memelfnote *men, char *bufp) +static int kcore_update_ram(void) { - struct elf_note en; - -#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) - - en.n_namesz = strlen(men->name) + 1; - en.n_descsz = men->datasz; - en.n_type = men->type; - - DUMP_WRITE(&en, sizeof(en)); - DUMP_WRITE(men->name, en.n_namesz); - - /* XXX - cast from long long to long to avoid need for libgcc.a */ - bufp = (char*) roundup((unsigned long)bufp,4); - DUMP_WRITE(men->data, men->datasz); - bufp = (char*) roundup((unsigned long)bufp,4); - -#undef DUMP_WRITE - - return bufp; -} /* end storenote() */ + LIST_HEAD(list); + LIST_HEAD(garbage); + int nphdr; + size_t phdrs_len, notes_len, data_offset; + struct kcore_list *tmp, *pos; + int ret = 0; -/* - * store an ELF coredump header in the supplied buffer - * nphdr is the number of elf_phdr to insert - */ -static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) -{ - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ - struct elf_phdr *nhdr, *phdr; - struct elfhdr *elf; - struct memelfnote notes[3]; - off_t offset = 0; - struct kcore_list *m; + down_write(&kclist_lock); + if (!xchg(&kcore_need_update, 0)) + goto out; - /* setup ELF header */ - elf = (struct elfhdr *) bufp; - bufp += sizeof(struct elfhdr); - offset += sizeof(struct elfhdr); - memcpy(elf->e_ident, ELFMAG, SELFMAG); - elf->e_ident[EI_CLASS] = ELF_CLASS; - elf->e_ident[EI_DATA] = ELF_DATA; - elf->e_ident[EI_VERSION]= EV_CURRENT; - elf->e_ident[EI_OSABI] = ELF_OSABI; - memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); - elf->e_type = ET_CORE; - elf->e_machine = ELF_ARCH; - elf->e_version = EV_CURRENT; - elf->e_entry = 0; - elf->e_phoff = sizeof(struct elfhdr); - elf->e_shoff = 0; - elf->e_flags = ELF_CORE_EFLAGS; - elf->e_ehsize = sizeof(struct elfhdr); - elf->e_phentsize= sizeof(struct elf_phdr); - elf->e_phnum = nphdr; - elf->e_shentsize= 0; - elf->e_shnum = 0; - elf->e_shstrndx = 0; - - /* setup ELF PT_NOTE program header */ - nhdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - nhdr->p_type = PT_NOTE; - nhdr->p_offset = 0; - nhdr->p_vaddr = 0; - nhdr->p_paddr = 0; - nhdr->p_filesz = 0; - nhdr->p_memsz = 0; - nhdr->p_flags = 0; - nhdr->p_align = 0; - - /* setup ELF PT_LOAD program header for every area */ - list_for_each_entry(m, &kclist_head, list) { - phdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; - phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM || m->type == KCORE_TEXT) - phdr->p_paddr = __pa(m->addr); - else - phdr->p_paddr = (elf_addr_t)-1; - phdr->p_filesz = phdr->p_memsz = m->size; - phdr->p_align = PAGE_SIZE; + ret = kcore_ram_list(&list); + if (ret) { + /* Couldn't get the RAM list, try again next time. */ + WRITE_ONCE(kcore_need_update, 1); + list_splice_tail(&list, &garbage); + goto out; } - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - nhdr->p_offset = offset; - - /* set up the process status */ - notes[0].name = CORE_STR; - notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(struct elf_prstatus); - notes[0].data = &prstatus; - - memset(&prstatus, 0, sizeof(struct elf_prstatus)); - - nhdr->p_filesz = notesize(¬es[0]); - bufp = storenote(¬es[0], bufp); - - /* set up the process info */ - notes[1].name = CORE_STR; - notes[1].type = NT_PRPSINFO; - notes[1].datasz = sizeof(struct elf_prpsinfo); - notes[1].data = &prpsinfo; - - memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo)); - prpsinfo.pr_state = 0; - prpsinfo.pr_sname = 'R'; - prpsinfo.pr_zomb = 0; - - strcpy(prpsinfo.pr_fname, "vmlinux"); - strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); - - nhdr->p_filesz += notesize(¬es[1]); - bufp = storenote(¬es[1], bufp); + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(&list, &kclist_head); - /* set up the task structure */ - notes[2].name = CORE_STR; - notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = arch_task_struct_size; - notes[2].data = current; + proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, + &data_offset); - nhdr->p_filesz += notesize(¬es[2]); - bufp = storenote(¬es[2], bufp); +out: + up_write(&kclist_lock); + list_for_each_entry_safe(pos, tmp, &garbage, list) { + list_del(&pos->list); + kfree(pos); + } + return ret; +} -} /* end elf_kcore_store_hdr() */ +static void append_kcore_note(char *notes, size_t *i, const char *name, + unsigned int type, const void *desc, + size_t descsz) +{ + struct elf_note *note = (struct elf_note *)¬es[*i]; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = descsz; + note->n_type = type; + *i += sizeof(*note); + memcpy(¬es[*i], name, note->n_namesz); + *i = ALIGN(*i + note->n_namesz, 4); + memcpy(¬es[*i], desc, descsz); + *i = ALIGN(*i + descsz, 4); +} -/*****************************************************************************/ -/* - * read from the ELF header and then kernel memory - */ static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { char *buf = file->private_data; - ssize_t acc = 0; - size_t size, tsz; - size_t elf_buflen; + size_t phdrs_offset, notes_offset, data_offset; + size_t phdrs_len, notes_len; + struct kcore_list *m; + size_t tsz; int nphdr; unsigned long start; + size_t orig_buflen = buflen; + int ret = 0; - read_lock(&kclist_lock); - size = get_kcore_size(&nphdr, &elf_buflen); + down_read(&kclist_lock); + + get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); + phdrs_offset = sizeof(struct elfhdr); + notes_offset = phdrs_offset + phdrs_len; + + /* ELF file header. */ + if (buflen && *fpos < sizeof(struct elfhdr)) { + struct elfhdr ehdr = { + .e_ident = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELF_CLASS, + [EI_DATA] = ELF_DATA, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELF_OSABI, + }, + .e_type = ET_CORE, + .e_machine = ELF_ARCH, + .e_version = EV_CURRENT, + .e_phoff = sizeof(struct elfhdr), + .e_flags = ELF_CORE_EFLAGS, + .e_ehsize = sizeof(struct elfhdr), + .e_phentsize = sizeof(struct elf_phdr), + .e_phnum = nphdr, + }; + + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); + if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + ret = -EFAULT; + goto out; + } - if (buflen == 0 || *fpos >= size) { - read_unlock(&kclist_lock); - return 0; + buffer += tsz; + buflen -= tsz; + *fpos += tsz; } - /* trim buflen to not go beyond EOF */ - if (buflen > size - *fpos) - buflen = size - *fpos; - - /* construct an ELF core header if we'll need some of it */ - if (*fpos < elf_buflen) { - char * elf_buf; - - tsz = elf_buflen - *fpos; - if (buflen < tsz) - tsz = buflen; - elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); - if (!elf_buf) { - read_unlock(&kclist_lock); - return -ENOMEM; + /* ELF program headers. */ + if (buflen && *fpos < phdrs_offset + phdrs_len) { + struct elf_phdr *phdrs, *phdr; + + phdrs = kzalloc(phdrs_len, GFP_KERNEL); + if (!phdrs) { + ret = -ENOMEM; + goto out; } - elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); - read_unlock(&kclist_lock); - if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { - kfree(elf_buf); - return -EFAULT; + + phdrs[0].p_type = PT_NOTE; + phdrs[0].p_offset = notes_offset; + phdrs[0].p_filesz = notes_len; + + phdr = &phdrs[1]; + list_for_each_entry(m, &kclist_head, list) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) + phdr->p_paddr = __pa(m->addr); + else if (m->type == KCORE_TEXT) + phdr->p_paddr = __pa_symbol(m->addr); + else + phdr->p_paddr = (elf_addr_t)-1; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + phdr++; } - kfree(elf_buf); + + tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); + if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, + tsz)) { + kfree(phdrs); + ret = -EFAULT; + goto out; + } + kfree(phdrs); + + buffer += tsz; buflen -= tsz; *fpos += tsz; - buffer += tsz; - acc += tsz; + } + + /* ELF note segment. */ + if (buflen && *fpos < notes_offset + notes_len) { + struct elf_prstatus prstatus = {}; + struct elf_prpsinfo prpsinfo = { + .pr_sname = 'R', + .pr_fname = "vmlinux", + }; + char *notes; + size_t i = 0; + + strlcpy(prpsinfo.pr_psargs, saved_command_line, + sizeof(prpsinfo.pr_psargs)); + + notes = kzalloc(notes_len, GFP_KERNEL); + if (!notes) { + ret = -ENOMEM; + goto out; + } + + append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + sizeof(prpsinfo)); + append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. + */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, notes_len - i)); + + tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); + if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + kfree(notes); + ret = -EFAULT; + goto out; + } + kfree(notes); - /* leave now if filled buffer already */ - if (buflen == 0) - return acc; - } else - read_unlock(&kclist_lock); + buffer += tsz; + buflen -= tsz; + *fpos += tsz; + } /* * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. */ - start = kc_offset_to_vaddr(*fpos - elf_buflen); + start = kc_offset_to_vaddr(*fpos - data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; - - while (buflen) { - struct kcore_list *m; - read_lock(&kclist_lock); - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && start < (m->addr+m->size)) - break; + m = NULL; + while (buflen) { + /* + * If this is the first iteration or the address is not within + * the previous entry, search for a matching entry. + */ + if (!m || start < m->addr || start >= m->addr + m->size) { + list_for_each_entry(m, &kclist_head, list) { + if (start >= m->addr && + start < m->addr + m->size) + break; + } } - read_unlock(&kclist_lock); if (&m->list == &kclist_head) { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if (copy_to_user(buffer, buf, tsz)) - return -EFAULT; + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_USER) { /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) - return -EFAULT; + if (copy_to_user(buffer, (char *)start, tsz)) { + ret = -EFAULT; + goto out; + } } else { if (kern_addr_valid(start)) { /* @@ -530,29 +481,37 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * hardened user copy kernel text checks. */ if (probe_kernel_read(buf, (void *) start, tsz)) { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else { - if (copy_to_user(buffer, buf, tsz)) - return -EFAULT; + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } } } else { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } } buflen -= tsz; *fpos += tsz; buffer += tsz; - acc += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } - return acc; +out: + up_read(&kclist_lock); + if (ret) + return ret; + return orig_buflen - buflen; } - static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) @@ -592,9 +551,8 @@ static int __meminit kcore_callback(struct notifier_block *self, switch (action) { case MEM_ONLINE: case MEM_OFFLINE: - write_lock(&kclist_lock); kcore_need_update = 1; - write_unlock(&kclist_lock); + break; } return NOTIFY_OK; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2fb04846ed11..edda898714eb 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -7,6 +7,7 @@ #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/proc_fs.h> +#include <linux/percpu.h> #include <linux/quicklist.h> #include <linux/seq_file.h> #include <linux/swap.h> @@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", 0ul); show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 59749dfaef67..535eda7857cf 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -183,7 +183,7 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_online_cpus(); + unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dfd73a4616ce..5ea1d64cb0b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,7 +247,6 @@ static int proc_map_release(struct inode *inode, struct file *file) if (priv->mm) mmdrop(priv->mm); - kfree(priv->rollup); return seq_release_private(inode, file); } @@ -294,7 +293,7 @@ static void show_vma_header_prefix(struct seq_file *m, } static void -show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) +show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -357,35 +356,18 @@ done: seq_putc(m, '\n'); } -static int show_map(struct seq_file *m, void *v, int is_pid) +static int show_map(struct seq_file *m, void *v) { - show_map_vma(m, v, is_pid); + show_map_vma(m, v); m_cache_vma(m, v); return 0; } -static int show_pid_map(struct seq_file *m, void *v) -{ - return show_map(m, v, 1); -} - -static int show_tid_map(struct seq_file *m, void *v) -{ - return show_map(m, v, 0); -} - static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_map -}; - -static const struct seq_operations proc_tid_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map + .show = show_map }; static int pid_maps_open(struct inode *inode, struct file *file) @@ -393,11 +375,6 @@ static int pid_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } -static int tid_maps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_tid_maps_op); -} - const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, @@ -405,13 +382,6 @@ const struct file_operations proc_pid_maps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, -}; - /* * Proportional Set Size(PSS): my share of RSS. * @@ -433,7 +403,6 @@ const struct file_operations proc_tid_maps_operations = { #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { - bool first; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -447,7 +416,6 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; - unsigned long first_vma_start; u64 pss; u64 pss_locked; u64 swap_pss; @@ -731,14 +699,9 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } #endif /* HUGETLB_PAGE */ -#define SEQ_PUT_DEC(str, val) \ - seq_put_decimal_ull_width(m, str, (val) >> 10, 8) -static int show_smap(struct seq_file *m, void *v, int is_pid) +static void smap_gather_stats(struct vm_area_struct *vma, + struct mem_size_stats *mss) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct mem_size_stats mss_stack; - struct mem_size_stats *mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, #ifdef CONFIG_HUGETLB_PAGE @@ -746,23 +709,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) #endif .mm = vma->vm_mm, }; - int ret = 0; - bool rollup_mode; - bool last_vma; - - if (priv->rollup) { - rollup_mode = true; - mss = priv->rollup; - if (mss->first) { - mss->first_vma_start = vma->vm_start; - mss->first = false; - } - last_vma = !m_next_vma(priv, vma); - } else { - rollup_mode = false; - memset(&mss_stack, 0, sizeof(mss_stack)); - mss = &mss_stack; - } smaps_walk.private = mss; @@ -794,79 +740,116 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) walk_page_vma(vma, &smaps_walk); if (vma->vm_flags & VM_LOCKED) mss->pss_locked += mss->pss; +} - if (!rollup_mode) { - show_map_vma(m, vma, is_pid); - } else if (last_vma) { - show_vma_header_prefix( - m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); - seq_pad(m, ' '); - seq_puts(m, "[rollup]\n"); - } else { - ret = SEQ_SKIP; - } - - if (!rollup_mode) { - SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); - SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); - SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); - seq_puts(m, " kB\n"); - } +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) - if (!rollup_mode || last_vma) { - SEQ_PUT_DEC("Rss: ", mss->resident); - SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); - SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); - SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); - SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); - SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); - SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); - SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); - SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); - SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); - SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); - seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", - mss->private_hugetlb >> 10, 7); - SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); - SEQ_PUT_DEC(" kB\nSwapPss: ", - mss->swap_pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nLocked: ", - mss->pss_locked >> PSS_SHIFT); - seq_puts(m, " kB\n"); - } - if (!rollup_mode) { - if (arch_pkeys_enabled()) - seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); - show_smap_vma_flags(m, vma); - } - m_cache_vma(m, vma); - return ret; +/* Show the contents common for smaps and smaps_rollup */ +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss) +{ + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); + seq_puts(m, " kB\n"); } -#undef SEQ_PUT_DEC -static int show_pid_smap(struct seq_file *m, void *v) +static int show_smap(struct seq_file *m, void *v) { - return show_smap(m, v, 1); + struct vm_area_struct *vma = v; + struct mem_size_stats mss; + + memset(&mss, 0, sizeof(mss)); + + smap_gather_stats(vma, &mss); + + show_map_vma(m, vma); + + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + + __show_smap(m, &mss); + + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); + show_smap_vma_flags(m, vma); + + m_cache_vma(m, vma); + + return 0; } -static int show_tid_smap(struct seq_file *m, void *v) +static int show_smaps_rollup(struct seq_file *m, void *v) { - return show_smap(m, v, 0); + struct proc_maps_private *priv = m->private; + struct mem_size_stats mss; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long last_vma_end = 0; + int ret = 0; + + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return -ESRCH; + + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out_put_task; + } + + memset(&mss, 0, sizeof(mss)); + + down_read(&mm->mmap_sem); + hold_task_mempolicy(priv); + + for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { + smap_gather_stats(vma, &mss); + last_vma_end = vma->vm_end; + } + + show_vma_header_prefix(m, priv->mm->mmap->vm_start, + last_vma_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + + __show_smap(m, &mss); + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); + +out_put_task: + put_task_struct(priv->task); + priv->task = NULL; + + return ret; } +#undef SEQ_PUT_DEC static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_smap -}; - -static const struct seq_operations proc_tid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_smap + .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) @@ -874,28 +857,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } -static int pid_smaps_rollup_open(struct inode *inode, struct file *file) +static int smaps_rollup_open(struct inode *inode, struct file *file) { - struct seq_file *seq; + int ret; struct proc_maps_private *priv; - int ret = do_maps_open(inode, file, &proc_pid_smaps_op); - - if (ret < 0) - return ret; - seq = file->private_data; - priv = seq->private; - priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL); - if (!priv->rollup) { - proc_map_release(inode, file); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); + if (!priv) return -ENOMEM; + + ret = single_open(file, show_smaps_rollup, priv); + if (ret) + goto out_free; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + ret = PTR_ERR(priv->mm); + + single_release(inode, file); + goto out_free; } - priv->rollup->first = true; + return 0; + +out_free: + kfree(priv); + return ret; } -static int tid_smaps_open(struct inode *inode, struct file *file) +static int smaps_rollup_release(struct inode *inode, struct file *file) { - return do_maps_open(inode, file, &proc_tid_smaps_op); + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + kfree(priv); + return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { @@ -906,17 +906,10 @@ const struct file_operations proc_pid_smaps_operations = { }; const struct file_operations proc_pid_smaps_rollup_operations = { - .open = pid_smaps_rollup_open, + .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, - .release = proc_map_release, -}; - -const struct file_operations proc_tid_smaps_operations = { - .open = tid_smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, + .release = smaps_rollup_release, }; enum clear_refs_types { @@ -1728,7 +1721,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, /* * Display pages allocated per node and memory policy via /proc. */ -static int show_numa_map(struct seq_file *m, void *v, int is_pid) +static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; @@ -1812,45 +1805,17 @@ out: return 0; } -static int show_pid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 1); -} - -static int show_tid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 0); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_numa_map, + .show = show_numa_map, }; -static const struct seq_operations proc_tid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_numa_map, -}; - -static int numa_maps_open(struct inode *inode, struct file *file, - const struct seq_operations *ops) -{ - return proc_maps_open(inode, file, ops, - sizeof(struct numa_maps_private)); -} - static int pid_numa_maps_open(struct inode *inode, struct file *file) { - return numa_maps_open(inode, file, &proc_pid_numa_maps_op); -} - -static int tid_numa_maps_open(struct inode *inode, struct file *file) -{ - return numa_maps_open(inode, file, &proc_tid_numa_maps_op); + return proc_maps_open(inode, file, &proc_pid_numa_maps_op, + sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { @@ -1860,10 +1825,4 @@ const struct file_operations proc_pid_numa_maps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_numa_maps_operations = { - .open = tid_numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, -}; #endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 5b62f57bd9bc..0b63d68dedb2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -142,8 +142,7 @@ static int is_stack(struct vm_area_struct *vma) /* * display a single VMA to a sequenced file */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, - int is_pid) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; unsigned long ino = 0; @@ -189,22 +188,11 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_p, int is_pid) +static int show_map(struct seq_file *m, void *_p) { struct rb_node *p = _p; - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), - is_pid); -} - -static int show_pid_map(struct seq_file *m, void *_p) -{ - return show_map(m, _p, 1); -} - -static int show_tid_map(struct seq_file *m, void *_p) -{ - return show_map(m, _p, 0); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) @@ -260,14 +248,7 @@ static const struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_map -}; - -static const struct seq_operations proc_tid_maps_ops = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map + .show = show_map }; static int maps_open(struct inode *inode, struct file *file, @@ -308,11 +289,6 @@ static int pid_maps_open(struct inode *inode, struct file *file) return maps_open(inode, file, &proc_pid_maps_ops); } -static int tid_maps_open(struct inode *inode, struct file *file) -{ - return maps_open(inode, file, &proc_tid_maps_ops); -} - const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, @@ -320,10 +296,3 @@ const struct file_operations proc_pid_maps_operations = { .release = map_release, }; -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = map_release, -}; - diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 3f723cb478af..a4c2791ab70b 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -9,7 +9,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) { - struct timespec uptime; + struct timespec64 uptime; struct timespec64 idle; u64 nsec; u32 rem; @@ -19,7 +19,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) for_each_possible_cpu(i) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - get_monotonic_boottime(&uptime); + ktime_get_boottime_ts64(&uptime); idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cfb6674331fd..6c1c2607e9e4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -379,7 +379,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). */ -static int mmap_vmcore_fault(struct vm_fault *vmf) +static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index e3c558d1b78c..3a5a752d96c7 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -33,30 +33,22 @@ static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) return 0; } -static char *print_time(time_t t) -{ - static char timebuf[256]; - - sprintf(timebuf, "%ld", t); - return timebuf; -} - static void sd_print_item(struct item_head *ih, char *item) { printk("\tmode | size | nlinks | first direct | mtime\n"); if (stat_data_v1(ih)) { struct stat_data_v1 *sd = (struct stat_data_v1 *)item; - printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd), sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd), - print_time(sd_v1_mtime(sd))); + sd_v1_mtime(sd)); } else { struct stat_data *sd = (struct stat_data *)item; - printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), + printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), - sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + sd_v2_rdev(sd), sd_v2_mtime(sd)); } } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 52eb5d293a34..8a76f9d14bc6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2381,7 +2381,7 @@ static int journal_read(struct super_block *sb) struct reiserfs_journal_desc *desc; unsigned int oldest_trans_id = 0; unsigned int oldest_invalid_trans_id = 0; - time_t start; + time64_t start; unsigned long oldest_start = 0; unsigned long cur_dblock = 0; unsigned long newest_mount_id = 9; @@ -2395,7 +2395,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", journal->j_dev_bd); - start = get_seconds(); + start = ktime_get_seconds(); /* * step 1, read in the journal header block. Check the transaction @@ -2556,7 +2556,7 @@ start_log_replay: if (replay_count > 0) { reiserfs_info(sb, "replayed %d transactions in %lu seconds\n", - replay_count, get_seconds() - start); + replay_count, ktime_get_seconds() - start); } /* needed to satisfy the locking in _update_journal_header_block */ reiserfs_write_lock(sb); @@ -2914,7 +2914,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); - time_t now = get_seconds(); + time64_t now = ktime_get_seconds(); /* cannot restart while nested */ BUG_ON(!th->t_trans_id); if (th->t_refcount > 1) @@ -3023,7 +3023,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join) { - time_t now = get_seconds(); + time64_t now = ktime_get_seconds(); unsigned int old_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_transaction_handle myth; @@ -3056,7 +3056,7 @@ relock: PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } - now = get_seconds(); + now = ktime_get_seconds(); /* * if there is no room in the journal OR @@ -3119,7 +3119,7 @@ relock: } /* we are the first writer, set trans_id */ if (journal->j_trans_start_time == 0) { - journal->j_trans_start_time = get_seconds(); + journal->j_trans_start_time = ktime_get_seconds(); } atomic_inc(&journal->j_wcount); journal->j_len_alloc += nblocks; @@ -3559,11 +3559,11 @@ static void flush_async_commits(struct work_struct *work) */ void reiserfs_flush_old_commits(struct super_block *sb) { - time_t now; + time64_t now; struct reiserfs_transaction_handle th; struct reiserfs_journal *journal = SB_JOURNAL(sb); - now = get_seconds(); + now = ktime_get_seconds(); /* * safety check so we don't flush while we are replaying the log during * mount @@ -3613,7 +3613,7 @@ void reiserfs_flush_old_commits(struct super_block *sb) static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { - time_t now; + time64_t now; int flush = flags & FLUSH_ALL; int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; @@ -3694,7 +3694,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) } /* deal with old transactions where we are the last writers */ - now = get_seconds(); + now = ktime_get_seconds(); if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) { commit_now = 1; journal->j_next_async_flush = 1; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index e39b3910d24d..f2cf3441fdfc 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -297,6 +297,13 @@ static int show_oidmap(struct seq_file *m, void *unused) return 0; } +static time64_t ktime_mono_to_real_seconds(time64_t mono) +{ + ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2); + + return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC); +} + static int show_journal(struct seq_file *m, void *unused) { struct super_block *sb = m->private; @@ -325,7 +332,7 @@ static int show_journal(struct seq_file *m, void *unused) "j_bcount: \t%lu\n" "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%u\n" - "j_trans_start_time: \t%li\n" + "j_trans_start_time: \t%lli\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" @@ -366,7 +373,7 @@ static int show_journal(struct seq_file *m, void *unused) JF(j_bcount), JF(j_first_unflushed_offset), JF(j_last_flush_trans_id), - JF(j_trans_start_time), + ktime_mono_to_real_seconds(JF(j_trans_start_time)), JF(j_list_bitmap_index), JF(j_must_wait), JF(j_next_full_flush), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index ae4811fecc1f..e5ca9ed79e54 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -271,7 +271,7 @@ struct reiserfs_journal_list { struct mutex j_commit_mutex; unsigned int j_trans_id; - time_t j_timestamp; + time64_t j_timestamp; /* write-only but useful for crash dump analysis */ struct reiserfs_list_bitmap *j_list_bitmap; struct buffer_head *j_commit_bh; /* commit buffer head */ struct reiserfs_journal_cnode *j_realblock; @@ -331,7 +331,7 @@ struct reiserfs_journal { struct buffer_head *j_header_bh; - time_t j_trans_start_time; /* time this transaction started */ + time64_t j_trans_start_time; /* time this transaction started */ struct mutex j_mutex; struct mutex j_flush_mutex; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ff94fad477e4..48cdfc81fe10 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -792,8 +792,10 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, return 0; size = namelen + 1; if (b->buf) { - if (size > b->size) + if (b->pos + size > b->size) { + b->pos = -ERANGE; return -ERANGE; + } memcpy(b->buf + b->pos, name, namelen); b->buf[b->pos + namelen] = 0; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index bec9f79adb25..499a20a5a010 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -35,7 +35,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait) { struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned long time = get_seconds(), old_time; + u32 time = (u32)ktime_get_real_seconds(), old_time; mutex_lock(&sbi->s_lock); @@ -46,8 +46,8 @@ static int sysv_sync_fs(struct super_block *sb, int wait) */ old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); if (sbi->s_type == FSTYPE_SYSV4) { - if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) - *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time); + if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time)) + *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time); *sbi->s_sb_time = cpu_to_fs32(sbi, time); mark_buffer_dirty(sbi->s_bh2); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15c265d450bf..f649023b19b5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -910,7 +910,7 @@ wakeup: */ spin_lock(&ctx->fault_pending_wqh.lock); __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); spin_unlock(&ctx->fault_pending_wqh.lock); /* Flush pending events that may still wait on event_wqh */ @@ -1066,7 +1066,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * anyway. */ list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1215,7 +1215,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); spin_unlock(&ctx->fault_pending_wqh.lock); } diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index 012c55cb22ba..e6964e97acdd 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -89,7 +89,7 @@ /* Maximum object reference count (detects object deletion issues) */ -#define ACPI_MAX_REFERENCE_COUNT 0x1000 +#define ACPI_MAX_REFERENCE_COUNT 0x4000 /* Default page size for use in mapping memory for operation regions */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 226e5aeba6c2..856c56ef0143 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -59,6 +59,12 @@ struct acpi_exception_info { #define AE_OK (acpi_status) 0x0000 +#define ACPI_ENV_EXCEPTION(status) (status & AE_CODE_ENVIRONMENTAL) +#define ACPI_AML_EXCEPTION(status) (status & AE_CODE_AML) +#define ACPI_PROG_EXCEPTION(status) (status & AE_CODE_PROGRAMMER) +#define ACPI_TABLE_EXCEPTION(status) (status & AE_CODE_ACPI_TABLES) +#define ACPI_CNTL_EXCEPTION(status) (status & AE_CODE_CONTROL) + /* * Environmental exceptions */ diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 88072c92ace2..9566f99cc3c0 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -12,7 +12,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20180629 +#define ACPI_CA_VERSION 0x20180810 #include <acpi/acconfig.h> #include <acpi/actypes.h> diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index a7613e1b0c87..20561a60db9c 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -75,9 +75,19 @@ struct bug_entry { /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report - * significant issues that need prompt attention if they should ever - * appear at runtime. Use the versions with printk format strings - * to provide better diagnostics. + * significant kernel issues that need prompt attention if they should ever + * appear at runtime. + * + * Do not use these macros when checking for invalid external inputs + * (e.g. invalid system call arguments, or invalid data coming from + * network/devices), and on transient conditions like ENOMEM or EAGAIN. + * These macros should be used for recoverable kernel issues only. + * For invalid external inputs, transient conditions, etc use + * pr_err[_once/_ratelimited]() followed by dump_stack(), if necessary. + * Do not include "BUG"/"WARNING" in format strings manually to make these + * conditions distinguishable from kernel issues. + * + * Use the versions with printk format strings to provide better diagnostics. */ #ifndef __WARN_TAINT extern __printf(3, 4) diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h index 68efb950a918..4d73e6e3c66c 100644 --- a/include/asm-generic/export.h +++ b/include/asm-generic/export.h @@ -5,12 +5,10 @@ #define KSYM_FUNC(x) x #endif #ifdef CONFIG_64BIT -#define __put .quad #ifndef KSYM_ALIGN #define KSYM_ALIGN 8 #endif #else -#define __put .long #ifndef KSYM_ALIGN #define KSYM_ALIGN 4 #endif @@ -19,6 +17,16 @@ #define KCRC_ALIGN 4 #endif +.macro __put, val, name +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + .long \val - ., \name - . +#elif defined(CONFIG_64BIT) + .quad \val, \name +#else + .long \val, \name +#endif +.endm + /* * note on .section use: @progbits vs %progbits nastiness doesn't matter, * since we immediately emit into those sections anyway. diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 24251762c20c..9a6bc0951cfa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -12,6 +12,7 @@ #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/refcount.h> struct page; struct device; @@ -75,7 +76,7 @@ enum wb_reason { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ - atomic_t refcnt; /* nr of attached wb's and blkg */ + refcount_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *__bdi; /* the associated bdi, set to NULL diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 72ca0f3d39f3..c28a47cbe355 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - atomic_inc(&bdi->wb_congested->refcnt); + refcount_inc(&bdi->wb_congested->refcnt); return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { - if (atomic_dec_and_test(&congested->refcnt)) + if (refcount_dec_and_test(&congested->refcnt)) kfree(congested); } diff --git a/include/linux/bitops.h b/include/linux/bitops.h index af419012d77d..7ddb1349394d 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,7 +4,8 @@ #include <asm/types.h> #include <linux/bits.h> -#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 42506e4d1f53..681d866efb1e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -280,6 +280,25 @@ unsigned long read_word_at_a_time(const void *addr) #endif /* __KERNEL__ */ +/* + * Force the compiler to emit 'sym' as a symbol, so that we can reference + * it from inline assembler. Necessary in case 'sym' could be inlined + * otherwise, or eliminated entirely due to lack of references that are + * visible to the compiler. + */ +#define __ADDRESSABLE(sym) \ + static void * __attribute__((section(".discard.addressable"), used)) \ + __PASTE(__addressable_##sym, __LINE__) = (void *)&sym; + +/** + * offset_to_ptr - convert a relative memory offset to an absolute pointer + * @off: the address of the 32-bit offset value + */ +static inline void *offset_to_ptr(const int *off) +{ + return (void *)((unsigned long)off + *off); +} + #endif /* __ASSEMBLY__ */ #ifndef __optimize @@ -313,7 +332,7 @@ unsigned long read_word_at_a_time(const void *addr) #ifdef __OPTIMIZE__ # define __compiletime_assert(condition, msg, prefix, suffix) \ do { \ - bool __cond = !(condition); \ + int __cond = !(condition); \ extern void prefix ## suffix(void) __compiletime_error(msg); \ if (__cond) \ prefix ## suffix(); \ diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..525510a9f965 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, diff --git a/include/linux/crc64.h b/include/linux/crc64.h new file mode 100644 index 000000000000..c756e65a1b58 --- /dev/null +++ b/include/linux/crc64.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * See lib/crc64.c for the related specification and polynomial arithmetic. + */ +#ifndef _LINUX_CRC64_H +#define _LINUX_CRC64_H + +#include <linux/types.h> + +u64 __pure crc64_be(u64 crc, const void *p, size_t len); +#endif /* _LINUX_CRC64_H */ diff --git a/include/linux/export.h b/include/linux/export.h index b768d6dd3c90..ae072bc5aacf 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -18,12 +18,6 @@ #define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) #ifndef __ASSEMBLY__ -struct kernel_symbol -{ - unsigned long value; - const char *name; -}; - #ifdef MODULE extern struct module __this_module; #define THIS_MODULE (&__this_module) @@ -54,19 +48,58 @@ extern struct module __this_module; #define __CRC_SYMBOL(sym, sec) #endif +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#include <linux/compiler.h> +/* + * Emit the ksymtab entry as a pair of relative references: this reduces + * the size by half on 64-bit architectures, and eliminates the need for + * absolute relocations that require runtime processing on relocatable + * kernels. + */ +#define __KSYMTAB_ENTRY(sym, sec) \ + __ADDRESSABLE(sym) \ + asm(" .section \"___ksymtab" sec "+" #sym "\", \"a\" \n" \ + " .balign 8 \n" \ + "__ksymtab_" #sym ": \n" \ + " .long " #sym "- . \n" \ + " .long __kstrtab_" #sym "- . \n" \ + " .previous \n") + +struct kernel_symbol { + int value_offset; + int name_offset; +}; +#else +#define __KSYMTAB_ENTRY(sym, sec) \ + static const struct kernel_symbol __ksymtab_##sym \ + __attribute__((section("___ksymtab" sec "+" #sym), used)) \ + = { (unsigned long)&sym, __kstrtab_##sym } + +struct kernel_symbol { + unsigned long value; + const char *name; +}; +#endif + /* For every exported symbol, place a struct in the __ksymtab section */ #define ___EXPORT_SYMBOL(sym, sec) \ extern typeof(sym) sym; \ __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ - __attribute__((section("__ksymtab_strings"), aligned(1))) \ + __attribute__((section("__ksymtab_strings"), used, aligned(1))) \ = #sym; \ - static const struct kernel_symbol __ksymtab_##sym \ - __used \ - __attribute__((section("___ksymtab" sec "+" #sym), used)) \ - = { (unsigned long)&sym, __kstrtab_##sym } + __KSYMTAB_ENTRY(sym, sec) + +#if defined(__DISABLE_EXPORTS) + +/* + * Allow symbol exports to be disabled completely so that C code may + * be reused in other execution contexts such as the UEFI stub or the + * decompressor. + */ +#define __EXPORT_SYMBOL(sym, sec) -#if defined(__KSYM_DEPS__) +#elif defined(__KSYM_DEPS__) /* * For fine grained build dependencies, we want to tell the build system diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index aa5db8b5521a..f70f8ac9c4f4 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -304,11 +304,6 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) -#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) -#define NAT_ENTRY_BITMAP_SIZE_ALIGNED \ - ((NAT_ENTRY_BITMAP_SIZE + BITS_PER_LONG - 1) / \ - BITS_PER_LONG * BITS_PER_LONG) - struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ diff --git a/include/linux/init.h b/include/linux/init.h index bc27cf03c41e..2538d176dd1f 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -116,8 +116,24 @@ typedef int (*initcall_t)(void); typedef void (*exitcall_t)(void); -extern initcall_t __con_initcall_start[], __con_initcall_end[]; -extern initcall_t __security_initcall_start[], __security_initcall_end[]; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +typedef int initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return offset_to_ptr(entry); +} +#else +typedef initcall_t initcall_entry_t; + +static inline initcall_t initcall_from_entry(initcall_entry_t *entry) +{ + return *entry; +} +#endif + +extern initcall_entry_t __con_initcall_start[], __con_initcall_end[]; +extern initcall_entry_t __security_initcall_start[], __security_initcall_end[]; /* Used for contructor calls. */ typedef void (*ctor_fn_t)(void); @@ -167,9 +183,20 @@ extern bool initcall_debug; * as KEEP() in the linker script. */ -#define __define_initcall(fn, id) \ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define ___define_initcall(fn, id, __sec) \ + __ADDRESSABLE(fn) \ + asm(".section \"" #__sec ".init\", \"a\" \n" \ + "__initcall_" #fn #id ": \n" \ + ".long " #fn " - . \n" \ + ".previous \n"); +#else +#define ___define_initcall(fn, id, __sec) \ static initcall_t __initcall_##fn##id __used \ - __attribute__((__section__(".initcall" #id ".init"))) = fn; + __attribute__((__section__(#__sec ".init"))) = fn; +#endif + +#define __define_initcall(fn, id) ___define_initcall(fn, id, .initcall##id) /* * Early initcalls run before initializing SMP. @@ -208,13 +235,8 @@ extern bool initcall_debug; #define __exitcall(fn) \ static exitcall_t __exitcall_##fn __exit_call = fn -#define console_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.con_initcall.init) = fn - -#define security_initcall(fn) \ - static initcall_t __initcall_##fn \ - __used __section(.security_initcall.init) = fn +#define console_initcall(fn) ___define_initcall(fn,, .con_initcall) +#define security_initcall(fn) ___define_initcall(fn,, .security_initcall) struct obs_kernel_param { const char *str; diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 6cea726612b7..6ab8c1bada3f 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -16,10 +16,9 @@ struct user_namespace; struct ipc_ids { int in_use; unsigned short seq; - bool tables_initialized; struct rw_semaphore rwsem; struct idr ipcs_idr; - int max_id; + int max_idx; #ifdef CONFIG_CHECKPOINT_RESTORE int next_id; #endif diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8de55e4b5ee9..c20f296438fb 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -35,7 +35,7 @@ struct vmcoredd_node { }; #ifdef CONFIG_PROC_KCORE -extern void kclist_add(struct kcore_list *, void *, size_t, int type); +void __init kclist_add(struct kcore_list *, void *, size_t, int type); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 941dc0a5a877..d6aac75b51ba 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -85,7 +85,23 @@ * arguments just once each. */ #define __round_mask(x, y) ((__typeof__(x))((y)-1)) +/** + * round_up - round up to next specified power of 2 + * @x: the value to round + * @y: multiple to round up to (must be a power of 2) + * + * Rounds @x up to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding up, use roundup() below. + */ #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +/** + * round_down - round down to next specified power of 2 + * @x: the value to round + * @y: multiple to round down to (must be a power of 2) + * + * Rounds @x down to next multiple of @y (which must be a power of 2). + * To perform arbitrary rounding down, use rounddown() below. + */ #define round_down(x, y) ((x) & ~__round_mask(x, y)) /** @@ -110,13 +126,30 @@ # define DIV_ROUND_UP_SECTOR_T(ll,d) DIV_ROUND_UP(ll,d) #endif -/* The `const' in roundup() prevents gcc-3.3 from calling __divdi3 */ +/** + * roundup - round up to the next specified multiple + * @x: the value to up + * @y: multiple to round up to + * + * Rounds @x up to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_up(). + * + * The `const' here prevents gcc-3.3 from calling __divdi3 + */ #define roundup(x, y) ( \ { \ const typeof(y) __y = y; \ (((x) + (__y - 1)) / __y) * __y; \ } \ ) +/** + * rounddown - round down to next specified multiple + * @x: the value to round + * @y: multiple to round down to + * + * Rounds @x down to next multiple of @y. If @y will always be a power + * of 2, consider using the faster round_down(). + */ #define rounddown(x, y) ( \ { \ typeof(x) __x = (x); \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c7362dd2faa..0205aee44ded 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e6c515fb698..652f602167df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -225,6 +225,11 @@ struct mem_cgroup { */ bool use_hierarchy; + /* + * Should the OOM killer kill all belonging tasks, had it kill one? + */ + bool oom_group; + /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; @@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p) } bool mem_cgroup_oom_synchronize(bool wait); +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain); +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline struct mem_cgroup *mem_cgroup_get_oom_group( + struct task_struct *victim, struct mem_cgroup *oom_domain) +{ + return NULL; +} + +static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4e9828cda7a2..34a28227068d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) static inline void remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void __ref free_area_init_core_hotplug(int nid); extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..a9e733b5fb76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); @@ -959,15 +960,6 @@ static inline int page_zone_id(struct page *page) return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline int zone_to_nid(struct zone *zone) -{ -#ifdef CONFIG_NUMA - return zone->node; -#else - return 0; -#endif -} - #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else @@ -2023,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, unsigned long * zones_size, +extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 392e6af82701..133ba78820ee 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -151,13 +151,15 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. * - * If both of these callbacks cannot block, and invalidate_range - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. + * If blockable argument is set to false then the callback cannot + * sleep and has to return with -EAGAIN. 0 should be returned + * otherwise. + * */ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool blockable); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end, bool only_end); @@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + __mmu_notifier_invalidate_range_start(mm, start, end, true); +} + +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_invalidate_range_start(mm, start, end, false); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, { } +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + return 0; +} + static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 32699b2dc52a..1e22d96734e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -755,25 +755,6 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } -static inline int zone_id(const struct zone *zone) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - - return zone - pgdat->node_zones; -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_dev_zone(const struct zone *zone) -{ - return zone_id(zone) == ZONE_DEVICE; -} -#else -static inline bool is_dev_zone(const struct zone *zone) -{ - return false; -} -#endif - #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); @@ -824,6 +805,18 @@ static inline int local_memory_node(int node_id) { return node_id; }; */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_dev_zone(const struct zone *zone) +{ + return zone_idx(zone) == ZONE_DEVICE; +} +#else +static inline bool is_dev_zone(const struct zone *zone) +{ + return false; +} +#endif + /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than @@ -841,6 +834,25 @@ static inline bool populated_zone(struct zone *zone) return zone->present_pages; } +#ifdef CONFIG_NUMA +static inline int zone_to_nid(struct zone *zone) +{ + return zone->node; +} + +static inline void zone_set_nid(struct zone *zone, int nid) +{ + zone->node = nid; +} +#else +static inline int zone_to_nid(struct zone *zone) +{ + return 0; +} + +static inline void zone_set_nid(struct zone *zone, int nid) {} +#endif + extern int movable_zone; #ifdef CONFIG_HIGHMEM @@ -956,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref) static inline int zonelist_node_idx(struct zoneref *zoneref) { -#ifdef CONFIG_NUMA - /* zone_to_nid not available in this context */ - return zoneref->zone->node; -#else - return 0; -#endif /* CONFIG_NUMA */ + return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, diff --git a/include/linux/net_dim.h b/include/linux/net_dim.h index db99240d00bd..c79e859408e6 100644 --- a/include/linux/net_dim.h +++ b/include/linux/net_dim.h @@ -363,7 +363,6 @@ static inline void net_dim_sample(u16 event_ctr, } #define NET_DIM_NEVENTS 64 -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1)) static inline void net_dim_calc_stats(struct net_dim_sample *start, diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 1fbde8a880d9..5a30ad594ccc 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -518,7 +518,7 @@ static inline int node_random(const nodemask_t *mask) * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ -#if NODES_SHIFT > 8 /* nodemask_t > 256 bytes */ +#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6adac113e96d..92f70e4c6252 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm) return 0; } -void __oom_reap_task_mm(struct mm_struct *mm); +bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b87f1936906..e72ca8dd6241 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1809,7 +1809,11 @@ struct pci_fixup { u16 device; /* Or PCI_ANY_ID */ u32 class; /* Or PCI_ANY_ID */ unsigned int class_shift; /* should be 0, 8, 16 */ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + int hook_offset; +#else void (*hook)(struct pci_dev *dev); +#endif }; enum pci_fixup_pass { @@ -1823,12 +1827,28 @@ enum pci_fixup_pass { pci_fixup_suspend_late, /* pci_device_suspend_late() */ }; +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __ADDRESSABLE(hook) \ + asm(".section " #sec ", \"a\" \n" \ + ".balign 16 \n" \ + ".short " #vendor ", " #device " \n" \ + ".long " #class ", " #class_shift " \n" \ + ".long " #hook " - . \n" \ + ".previous \n"); +#define DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) \ + __DECLARE_PCI_FIXUP_SECTION(sec, name, vendor, device, class, \ + class_shift, hook) +#else /* Anonymous variables would be nice... */ #define DECLARE_PCI_FIXUP_SECTION(section, name, vendor, device, class, \ class_shift, hook) \ static const struct pci_fixup __PASTE(__pci_fixup_##name,__LINE__) __used \ __attribute__((__section__(#section), aligned((sizeof(void *))))) \ = { vendor, device, class, class_shift, hook }; +#endif #define DECLARE_PCI_FIXUP_CLASS_EARLY(vendor, device, class, \ class_shift, hook) \ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 296bbe49d5d1..70b7123f38c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern unsigned long pcpu_nr_pages(void); + #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 626fc65c4336..d0e1f1522a78 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) { return inode->i_sb->s_fs_info; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 00de3e950dd4..977cb57d7bc9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,7 @@ struct task_struct { #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; + unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 4e9b77fb702d..1be35729c2c5 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -323,7 +323,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct siginfo *, struct task_struct *); -extern int force_sigsegv(int, struct task_struct *); +extern void force_sigsegv(int sig, struct task_struct *p); extern int force_sig_info(int, struct siginfo *, struct task_struct *); extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 913488d828cb..a9c32daeb9d8 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -10,6 +10,7 @@ struct ctl_table; extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; +extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, void __user *buffer, diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 96fe289c4c6e..39ad98c09c58 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -4,6 +4,7 @@ #include <linux/uidgid.h> #include <linux/atomic.h> +#include <linux/refcount.h> #include <linux/ratelimit.h> struct key; @@ -12,7 +13,7 @@ struct key; * Some day this will be a full-fledged user tracking system.. */ struct user_struct { - atomic_t __count; /* reference count */ + refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY @@ -59,7 +60,7 @@ extern struct user_struct root_user; extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) { - atomic_inc(&u->__count); + refcount_inc(&u->__count); return u; } extern void free_uid(struct user_struct *); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index b154fd2b084c..9443cafd1969 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -12,6 +12,9 @@ struct shrink_control { gfp_t gfp_mask; + /* current node being shrunk (for NUMA aware shrinkers) */ + int nid; + /* * How many objects scan_objects should scan and try to reclaim. * This is reset before every call, so it is safe for callees @@ -26,9 +29,6 @@ struct shrink_control { */ unsigned long nr_scanned; - /* current node being shrunk (for NUMA aware shrinkers) */ - int nid; - /* current memcg being shrunk (for memcg aware shrinkers) */ struct mem_cgroup *memcg; }; @@ -63,9 +63,9 @@ struct shrinker { unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); - int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ - unsigned long flags; + int seeks; /* seeks to recreate an obj */ + unsigned flags; /* These are for internal use */ struct list_head list; diff --git a/include/linux/signal.h b/include/linux/signal.h index fe125b0335f7..3d4cd5db30a9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -267,7 +267,7 @@ extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; -extern int get_signal(struct ksignal *ksig); +extern bool get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); @@ -289,7 +289,7 @@ static inline void disallow_signal(int sig) extern struct kmem_cache *sighand_cachep; -int unhandled_signal(struct task_struct *tsk, int sig); +extern bool unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1a8bd05a335e..8e2c11e692ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,7 +447,7 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d9a084c72541..7f2e16e76ac4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -249,6 +249,19 @@ extern void syscall_unregfunc(void); return static_key_false(&__tracepoint_##name.key); \ } +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS +#define __TRACEPOINT_ENTRY(name) \ + asm(" .section \"__tracepoints_ptrs\", \"a\" \n" \ + " .balign 4 \n" \ + " .long __tracepoint_" #name " - . \n" \ + " .previous \n") +#else +#define __TRACEPOINT_ENTRY(name) \ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ + &__tracepoint_##name +#endif + /* * We have no guarantee that gcc and the linker won't up-align the tracepoint * structures, so we create an array of pointers that will be used for iteration @@ -258,11 +271,9 @@ extern void syscall_unregfunc(void); static const char __tpstrtab_##name[] \ __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ + __attribute__((section("__tracepoints"), used)) = \ { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ - &__tracepoint_##name; + __TRACEPOINT_ENTRY(name); #define DEFINE_TRACE(name) \ DEFINE_TRACE_FN(name, NULL, NULL); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 6a17f856f841..381cdf5a9bd1 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, */ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 end, - umem_call_back cb, void *cookie); + umem_call_back cb, + bool blockable, void *cookie); /* * Find first region intersecting with address range. diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index e13eec3dfb2f..df31aa9c9a8c 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 2 +#define AUTOFS_PROTO_SUBVERSION 3 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed @@ -90,8 +90,10 @@ enum { /* autofs version 4 and later definitions */ /* Mask for expire behaviour */ -#define AUTOFS_EXP_IMMEDIATE 1 -#define AUTOFS_EXP_LEAVES 2 +#define AUTOFS_EXP_NORMAL 0x00 +#define AUTOFS_EXP_IMMEDIATE 0x01 +#define AUTOFS_EXP_LEAVES 0x02 +#define AUTOFS_EXP_FORCED 0x04 #define AUTOFS_TYPE_ANY 0U #define AUTOFS_TYPE_INDIRECT 1U diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 8d19e02d752a..5d4f58e059fd 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -30,10 +30,10 @@ struct bkey { BITMASK(name, struct bkey, field, offset, size) #define PTR_FIELD(name, offset, size) \ -static inline __u64 name(const struct bkey *k, unsigned i) \ +static inline __u64 name(const struct bkey *k, unsigned int i) \ { return (k->ptr[i] >> offset) & ~(~0ULL << size); } \ \ -static inline void SET_##name(struct bkey *k, unsigned i, __u64 v) \ +static inline void SET_##name(struct bkey *k, unsigned int i, __u64 v) \ { \ k->ptr[i] &= ~(~(~0ULL << size) << offset); \ k->ptr[i] |= (v & ~(~0ULL << size)) << offset; \ @@ -117,12 +117,14 @@ static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) static inline struct bkey *bkey_next(const struct bkey *k) { __u64 *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); } -static inline struct bkey *bkey_idx(const struct bkey *k, unsigned nr_keys) +static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) { __u64 *d = (void *) k; + return (struct bkey *) (d + nr_keys); } /* Enough for a key with 6 pointers */ diff --git a/init/Kconfig b/init/Kconfig index 9bd50ba8253f..641dd7dd7c8a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -436,7 +436,7 @@ config BSD_PROCESS_ACCT_V3 help If you say Y here, the process accounting information is written in a new file format that also logs the process IDs of each - process and it's parent. Note that this file format is incompatible + process and its parent. Note that this file format is incompatible with previous v0/v1/v2 file formats, so you will need updated tools for processing it. A preliminary version of these tools is available at <http://www.gnu.org/software/acct/>. @@ -967,6 +967,18 @@ config NET_NS endif # NAMESPACES +config CHECKPOINT_RESTORE + bool "Checkpoint/restore support" + select PROC_CHILDREN + default n + help + Enables additional kernel features in a sake of checkpoint/restore. + In particular it adds auxiliary prctl codes to setup process text, + data and heap segment sizes, and a few additional /proc filesystem + entries. + + If unsure, say N here. + config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS @@ -1383,18 +1395,6 @@ config MEMBARRIER If unsure, say Y. -config CHECKPOINT_RESTORE - bool "Checkpoint/restore support" if EXPERT - select PROC_CHILDREN - default n - help - Enables additional kernel features in a sake of checkpoint/restore. - In particular it adds auxiliary prctl codes to setup process text, - data and heap segment sizes, and a few additional /proc filesystem - entries. - - If unsure, say N here. - config KALLSYMS bool "Load all symbols for debugging/ksymoops" if EXPERT default y @@ -1702,7 +1702,7 @@ config MMAP_ALLOW_UNINITIALIZED default n help Normally, and according to the Linux spec, anonymous memory obtained - from mmap() has it's contents cleared before it is passed to + from mmap() has its contents cleared before it is passed to userspace. Enabling this config option allows you to request that mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus providing a huge performance boost. If this option is not enabled, diff --git a/init/do_mounts.c b/init/do_mounts.c index 2c71dabe5626..e1c9afa9d8c9 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -1,13 +1,3 @@ -/* - * Many of the syscalls used in this file expect some of the arguments - * to be __user pointers not __kernel pointers. To limit the sparse - * noise, turn off sparse checking for this file. - */ -#ifdef __CHECKER__ -#undef __CHECKER__ -#warning "Sparse checking disabled for this file" -#endif - #include <linux/module.h> #include <linux/sched.h> #include <linux/ctype.h> diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 5a91aefa7305..d1a5d885ce13 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -1,14 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Many of the syscalls used in this file expect some of the arguments - * to be __user pointers not __kernel pointers. To limit the sparse - * noise, turn off sparse checking for this file. - */ -#ifdef __CHECKER__ -#undef __CHECKER__ -#warning "Sparse checking disabled for this file" -#endif - #include <linux/unistd.h> #include <linux/kernel.h> #include <linux/fs.h> diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c index 7d85d172bc7e..b84031528dd4 100644 --- a/init/do_mounts_md.c +++ b/init/do_mounts_md.c @@ -1,14 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Many of the syscalls used in this file expect some of the arguments - * to be __user pointers not __kernel pointers. To limit the sparse - * noise, turn off sparse checking for this file. - */ -#ifdef __CHECKER__ -#undef __CHECKER__ -#warning "Sparse checking disabled for this file" -#endif - #include <linux/delay.h> #include <linux/raid/md_u.h> #include <linux/raid/md_p.h> diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 035a5f0ab26b..32fb049d18f9 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -1,14 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Many of the syscalls used in this file expect some of the arguments - * to be __user pointers not __kernel pointers. To limit the sparse - * noise, turn off sparse checking for this file. - */ -#ifdef __CHECKER__ -#undef __CHECKER__ -#warning "Sparse checking disabled for this file" -#endif - #include <linux/kernel.h> #include <linux/fs.h> #include <linux/minix_fs.h> diff --git a/init/initramfs.c b/init/initramfs.c index 13643c46ebab..640557788026 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -1,14 +1,4 @@ // SPDX-License-Identifier: GPL-2.0 -/* - * Many of the syscalls used in this file expect some of the arguments - * to be __user pointers not __kernel pointers. To limit the sparse - * noise, turn off sparse checking for this file. - */ -#ifdef __CHECKER__ -#undef __CHECKER__ -#warning "Sparse checking disabled for this file" -#endif - #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> diff --git a/init/main.c b/init/main.c index b729e1f22838..18f8f0140fa0 100644 --- a/init/main.c +++ b/init/main.c @@ -902,18 +902,18 @@ int __init_or_module do_one_initcall(initcall_t fn) } -extern initcall_t __initcall_start[]; -extern initcall_t __initcall0_start[]; -extern initcall_t __initcall1_start[]; -extern initcall_t __initcall2_start[]; -extern initcall_t __initcall3_start[]; -extern initcall_t __initcall4_start[]; -extern initcall_t __initcall5_start[]; -extern initcall_t __initcall6_start[]; -extern initcall_t __initcall7_start[]; -extern initcall_t __initcall_end[]; - -static initcall_t *initcall_levels[] __initdata = { +extern initcall_entry_t __initcall_start[]; +extern initcall_entry_t __initcall0_start[]; +extern initcall_entry_t __initcall1_start[]; +extern initcall_entry_t __initcall2_start[]; +extern initcall_entry_t __initcall3_start[]; +extern initcall_entry_t __initcall4_start[]; +extern initcall_entry_t __initcall5_start[]; +extern initcall_entry_t __initcall6_start[]; +extern initcall_entry_t __initcall7_start[]; +extern initcall_entry_t __initcall_end[]; + +static initcall_entry_t *initcall_levels[] __initdata = { __initcall0_start, __initcall1_start, __initcall2_start, @@ -939,7 +939,7 @@ static char *initcall_level_names[] __initdata = { static void __init do_initcall_level(int level) { - initcall_t *fn; + initcall_entry_t *fn; strcpy(initcall_command_line, saved_command_line); parse_args(initcall_level_names[level], @@ -950,7 +950,7 @@ static void __init do_initcall_level(int level) trace_initcall_level(initcall_level_names[level]); for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } static void __init do_initcalls(void) @@ -981,11 +981,11 @@ static void __init do_basic_setup(void) static void __init do_pre_smp_initcalls(void) { - initcall_t *fn; + initcall_entry_t *fn; trace_initcall_level("early"); for (fn = __initcall_start; fn < __initcall0_start; fn++) - do_one_initcall(*fn); + do_one_initcall(initcall_from_entry(fn)); } /* @@ -1002,6 +1002,7 @@ void __init load_default_modules(void) static int run_init_process(const char *init_filename) { argv_init[0] = init_filename; + pr_info("Run %s as init process\n", init_filename); return do_execve(getname_kernel(init_filename), (const char __user *const __user *)argv_init, (const char __user *const __user *)envp_init); diff --git a/ipc/msg.c b/ipc/msg.c index 203281198079..883642cf2b27 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -163,7 +163,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) /* ipc_addid() locks msq upon success. */ retval = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (retval < 0) { - call_rcu(&msq->q_perm.rcu, msg_rcu_free); + ipc_rcu_putref(&msq->q_perm, msg_rcu_free); return retval; } @@ -386,7 +386,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, down_write(&msg_ids(ns).rwsem); rcu_read_lock(); - ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, + ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, &msqid64->msg_perm, msqid64->msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); @@ -456,7 +456,7 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, int cmd, struct msginfo *msginfo) { int err; - int max_id; + int max_idx; /* * We must not return kernel stack data. @@ -483,16 +483,15 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_id = ipc_get_maxid(&msg_ids(ns)); + max_idx = ipc_get_maxidx(&msg_ids(ns)); up_read(&msg_ids(ns).rwsem); - return (max_id < 0) ? 0 : max_id; + return (max_idx < 0) ? 0 : max_idx; } static int msgctl_stat(struct ipc_namespace *ns, int msqid, int cmd, struct msqid64_ds *p) { struct msg_queue *msq; - int id = 0; int err; memset(p, 0, sizeof(*p)); @@ -504,7 +503,6 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, err = PTR_ERR(msq); goto out_unlock; } - id = msq->q_perm.id; } else { /* IPC_STAT */ msq = msq_obtain_object_check(ns, msqid); if (IS_ERR(msq)) { @@ -549,10 +547,21 @@ static int msgctl_stat(struct ipc_namespace *ns, int msqid, p->msg_lspid = pid_vnr(msq->q_lspid); p->msg_lrpid = pid_vnr(msq->q_lrpid); - ipc_unlock_object(&msq->q_perm); - rcu_read_unlock(); - return id; + if (cmd == IPC_STAT) { + /* + * As defined in SUS: + * Return 0 on success + */ + err = 0; + } else { + /* + * MSG_STAT and MSG_STAT_ANY (both Linux specific) + * Return the full id, including the sequence number + */ + err = msq->q_perm.id; + } + ipc_unlock_object(&msq->q_perm); out_unlock: rcu_read_unlock(); return err; @@ -1229,7 +1238,7 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -int msg_init_ns(struct ipc_namespace *ns) +void msg_init_ns(struct ipc_namespace *ns) { ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; @@ -1237,7 +1246,7 @@ int msg_init_ns(struct ipc_namespace *ns) atomic_set(&ns->msg_bytes, 0); atomic_set(&ns->msg_hdrs, 0); - return ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + ipc_init_ids(&ns->ids[IPC_MSG_IDS]); } #ifdef CONFIG_IPC_NS @@ -1278,12 +1287,11 @@ static int sysvipc_msg_proc_show(struct seq_file *s, void *it) } #endif -int __init msg_init(void) +void __init msg_init(void) { - const int err = msg_init_ns(&init_ipc_ns); + msg_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", IPC_MSG_IDS, sysvipc_msg_proc_show); - return err; } diff --git a/ipc/namespace.c b/ipc/namespace.c index f59a89966f92..21607791d62c 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -55,28 +55,16 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; - err = sem_init_ns(ns); + err = mq_init_ns(ns); if (err) goto fail_put; - err = msg_init_ns(ns); - if (err) - goto fail_destroy_sem; - err = shm_init_ns(ns); - if (err) - goto fail_destroy_msg; - err = mq_init_ns(ns); - if (err) - goto fail_destroy_shm; + sem_init_ns(ns); + msg_init_ns(ns); + shm_init_ns(ns); return ns; -fail_destroy_shm: - shm_exit_ns(ns); -fail_destroy_msg: - msg_exit_ns(ns); -fail_destroy_sem: - sem_exit_ns(ns); fail_put: put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); diff --git a/ipc/sem.c b/ipc/sem.c index 00ef2f743a62..26f8e37fcdcb 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -221,14 +221,14 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #define sc_semopm sem_ctls[2] #define sc_semmni sem_ctls[3] -int sem_init_ns(struct ipc_namespace *ns) +void sem_init_ns(struct ipc_namespace *ns) { ns->sc_semmsl = SEMMSL; ns->sc_semmns = SEMMNS; ns->sc_semopm = SEMOPM; ns->sc_semmni = SEMMNI; ns->used_sems = 0; - return ipc_init_ids(&ns->ids[IPC_SEM_IDS]); + ipc_init_ids(&ns->ids[IPC_SEM_IDS]); } #ifdef CONFIG_IPC_NS @@ -240,14 +240,12 @@ void sem_exit_ns(struct ipc_namespace *ns) } #endif -int __init sem_init(void) +void __init sem_init(void) { - const int err = sem_init_ns(&init_ipc_ns); - + sem_init_ns(&init_ipc_ns); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", IPC_SEM_IDS, sysvipc_sem_proc_show); - return err; } /** @@ -557,7 +555,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* ipc_addid() locks sma upon success. */ retval = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if (retval < 0) { - call_rcu(&sma->sem_perm.rcu, sem_rcu_free); + ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return retval; } ns->used_sems += nsems; @@ -1223,7 +1221,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, { struct sem_array *sma; time64_t semotime; - int id = 0; int err; memset(semid64, 0, sizeof(*semid64)); @@ -1235,7 +1232,6 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, err = PTR_ERR(sma); goto out_unlock; } - id = sma->sem_perm.id; } else { /* IPC_STAT */ sma = sem_obtain_object_check(ns, semid); if (IS_ERR(sma)) { @@ -1275,10 +1271,20 @@ static int semctl_stat(struct ipc_namespace *ns, int semid, #endif semid64->sem_nsems = sma->sem_nsems; + if (cmd == IPC_STAT) { + /* + * As defined in SUS: + * Return 0 on success + */ + err = 0; + } else { + /* + * SEM_STAT and SEM_STAT_ANY (both Linux specific) + * Return the full id, including the sequence number + */ + err = sma->sem_perm.id; + } ipc_unlock_object(&sma->sem_perm); - rcu_read_unlock(); - return id; - out_unlock: rcu_read_unlock(); return err; @@ -1288,7 +1294,7 @@ static int semctl_info(struct ipc_namespace *ns, int semid, int cmd, void __user *p) { struct seminfo seminfo; - int max_id; + int max_idx; int err; err = security_sem_semctl(NULL, cmd); @@ -1312,11 +1318,11 @@ static int semctl_info(struct ipc_namespace *ns, int semid, seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } - max_id = ipc_get_maxid(&sem_ids(ns)); + max_idx = ipc_get_maxidx(&sem_ids(ns)); up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; - return (max_id < 0) ? 0 : max_id; + return (max_idx < 0) ? 0 : max_idx; } static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, @@ -1588,7 +1594,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, down_write(&sem_ids(ns).rwsem); rcu_read_lock(); - ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, + ipcp = ipcctl_obtain_check(ns, &sem_ids(ns), semid, cmd, &semid64->sem_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); diff --git a/ipc/shm.c b/ipc/shm.c index b204feb38274..b0eb3757ab89 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -96,14 +96,14 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp); static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif -int shm_init_ns(struct ipc_namespace *ns) +void shm_init_ns(struct ipc_namespace *ns) { ns->shm_ctlmax = SHMMAX; ns->shm_ctlall = SHMALL; ns->shm_ctlmni = SHMMNI; ns->shm_rmid_forced = 0; ns->shm_tot = 0; - return ipc_init_ids(&shm_ids(ns)); + ipc_init_ids(&shm_ids(ns)); } /* @@ -136,9 +136,8 @@ void shm_exit_ns(struct ipc_namespace *ns) static int __init ipc_ns_init(void) { - const int err = shm_init_ns(&init_ipc_ns); - WARN(err, "ipc: sysv shm_init_ns failed: %d\n", err); - return err; + shm_init_ns(&init_ipc_ns); + return 0; } pure_initcall(ipc_ns_init); @@ -180,16 +179,33 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { - struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id); + struct kern_ipc_perm *ipcp; + + rcu_read_lock(); + ipcp = ipc_obtain_object_idr(&shm_ids(ns), id); + if (IS_ERR(ipcp)) + goto err; + ipc_lock_object(ipcp); + /* + * ipc_rmid() may have already freed the ID while ipc_lock_object() + * was spinning: here verify that the structure is still valid. + * Upon races with RMID, return -EIDRM, thus indicating that + * the ID points to a removed identifier. + */ + if (ipc_valid_object(ipcp)) { + /* return a locked ipc object upon success */ + return container_of(ipcp, struct shmid_kernel, shm_perm); + } + + ipc_unlock_object(ipcp); +err: + rcu_read_unlock(); /* * Callers of shm_lock() must validate the status of the returned ipc - * object pointer (as returned by ipc_lock()), and error out as - * appropriate. + * object pointer and error out as appropriate. */ - if (IS_ERR(ipcp)) - return (void *)ipcp; - return container_of(ipcp, struct shmid_kernel, shm_perm); + return (void *)ipcp; } static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) @@ -684,6 +700,8 @@ no_id: if (is_file_hugepages(file) && shp->mlock_user) user_shm_unlock(size, shp->mlock_user); fput(file); + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + return error; no_file: call_rcu(&shp->shm_perm.rcu, shm_rcu_free); return error; @@ -879,7 +897,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, down_write(&shm_ids(ns).rwsem); rcu_read_lock(); - ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + ipcp = ipcctl_obtain_check(ns, &shm_ids(ns), shmid, cmd, &shmid64->shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); @@ -930,7 +948,7 @@ static int shmctl_ipc_info(struct ipc_namespace *ns, shminfo->shmall = ns->shm_ctlall; shminfo->shmmin = SHMMIN; down_read(&shm_ids(ns).rwsem); - err = ipc_get_maxid(&shm_ids(ns)); + err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; @@ -950,7 +968,7 @@ static int shmctl_shm_info(struct ipc_namespace *ns, shm_info->shm_tot = ns->shm_tot; shm_info->swap_attempts = 0; shm_info->swap_successes = 0; - err = ipc_get_maxid(&shm_ids(ns)); + err = ipc_get_maxidx(&shm_ids(ns)); up_read(&shm_ids(ns).rwsem); if (err < 0) err = 0; @@ -962,7 +980,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, int cmd, struct shmid64_ds *tbuf) { struct shmid_kernel *shp; - int id = 0; int err; memset(tbuf, 0, sizeof(*tbuf)); @@ -974,7 +991,6 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, err = PTR_ERR(shp); goto out_unlock; } - id = shp->shm_perm.id; } else { /* IPC_STAT */ shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { @@ -1024,10 +1040,21 @@ static int shmctl_stat(struct ipc_namespace *ns, int shmid, tbuf->shm_lpid = pid_vnr(shp->shm_lprid); tbuf->shm_nattch = shp->shm_nattch; - ipc_unlock_object(&shp->shm_perm); - rcu_read_unlock(); - return id; + if (cmd == IPC_STAT) { + /* + * As defined in SUS: + * Return 0 on success + */ + err = 0; + } else { + /* + * SHM_STAT and SHM_STAT_ANY (both Linux specific) + * Return the full id, including the sequence number + */ + err = shp->shm_perm.id; + } + ipc_unlock_object(&shp->shm_perm); out_unlock: rcu_read_unlock(); return err; diff --git a/ipc/util.c b/ipc/util.c index fdffff41f65b..0af05752969f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -88,16 +88,12 @@ struct ipc_proc_iface { */ static int __init ipc_init(void) { - int err_sem, err_msg; - proc_mkdir("sysvipc", NULL); - err_sem = sem_init(); - WARN(err_sem, "ipc: sysv sem_init failed: %d\n", err_sem); - err_msg = msg_init(); - WARN(err_msg, "ipc: sysv msg_init failed: %d\n", err_msg); + sem_init(); + msg_init(); shm_init(); - return err_msg ? err_msg : err_sem; + return 0; } device_initcall(ipc_init); @@ -116,22 +112,17 @@ static const struct rhashtable_params ipc_kht_params = { * Set up the sequence range to use for the ipc identifier range (limited * below IPCMNI) then initialise the keys hashtable and ids idr. */ -int ipc_init_ids(struct ipc_ids *ids) +void ipc_init_ids(struct ipc_ids *ids) { - int err; ids->in_use = 0; ids->seq = 0; init_rwsem(&ids->rwsem); - err = rhashtable_init(&ids->key_ht, &ipc_kht_params); - if (err) - return err; + rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); - ids->tables_initialized = true; - ids->max_id = -1; + ids->max_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif - return 0; } #ifdef CONFIG_PROC_FS @@ -179,61 +170,66 @@ void __init ipc_init_proc_interface(const char *path, const char *header, */ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) { - struct kern_ipc_perm *ipcp = NULL; + struct kern_ipc_perm *ipcp; - if (likely(ids->tables_initialized)) - ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, + ipcp = rhashtable_lookup_fast(&ids->key_ht, &key, ipc_kht_params); + if (!ipcp) + return NULL; - if (ipcp) { - rcu_read_lock(); - ipc_lock_object(ipcp); - return ipcp; - } - - return NULL; + rcu_read_lock(); + ipc_lock_object(ipcp); + return ipcp; } -#ifdef CONFIG_CHECKPOINT_RESTORE /* - * Specify desired id for next allocated IPC object. + * Insert new IPC object into idr tree, and set sequence number and id + * in the correct order. + * Especially: + * - the sequence number must be set before inserting the object into the idr, + * because the sequence number is accessed without a lock. + * - the id can/must be set after inserting the object into the idr. + * All accesses must be done after getting kern_ipc_perm.lock. + * + * The caller must own kern_ipc_perm.lock.of the new object. + * On error, the function returns a (negative) error code. */ -#define ipc_idr_alloc(ids, new) \ - idr_alloc(&(ids)->ipcs_idr, (new), \ - (ids)->next_id < 0 ? 0 : ipcid_to_idx((ids)->next_id),\ - 0, GFP_NOWAIT) - -static inline int ipc_buildid(int id, struct ipc_ids *ids, - struct kern_ipc_perm *new) +static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { - if (ids->next_id < 0) { /* default, behave as !CHECKPOINT_RESTORE */ + int idx, next_id = -1; + +#ifdef CONFIG_CHECKPOINT_RESTORE + next_id = ids->next_id; + ids->next_id = -1; +#endif + + /* + * As soon as a new object is inserted into the idr, + * ipc_obtain_object_idr() or ipc_obtain_object_check() can find it, + * and the lockless preparations for ipc operations can start. + * This means especially: permission checks, audit calls, allocation + * of undo structures, ... + * + * Thus the object must be fully initialized, and if something fails, + * then the full tear-down sequence must be followed. + * (i.e.: set new->deleted, reduce refcount, call_rcu()) + */ + + if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ new->seq = ids->seq++; if (ids->seq > IPCID_SEQ_MAX) ids->seq = 0; + idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT); } else { - new->seq = ipcid_to_seqx(ids->next_id); - ids->next_id = -1; + new->seq = ipcid_to_seqx(next_id); + idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), + 0, GFP_NOWAIT); } - - return SEQ_MULTIPLIER * new->seq + id; + if (idx >= 0) + new->id = SEQ_MULTIPLIER * new->seq + idx; + return idx; } -#else -#define ipc_idr_alloc(ids, new) \ - idr_alloc(&(ids)->ipcs_idr, (new), 0, 0, GFP_NOWAIT) - -static inline int ipc_buildid(int id, struct ipc_ids *ids, - struct kern_ipc_perm *new) -{ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - - return SEQ_MULTIPLIER * new->seq + id; -} - -#endif /* CONFIG_CHECKPOINT_RESTORE */ - /** * ipc_addid - add an ipc identifier * @ids: ipc identifier set @@ -241,9 +237,11 @@ static inline int ipc_buildid(int id, struct ipc_ids *ids, * @limit: limit for the number of used ids * * Add an entry 'new' to the ipc ids idr. The permissions object is - * initialised and the first free entry is set up and the id assigned + * initialised and the first free entry is set up and the index assigned * is returned. The 'new' entry is returned in a locked state on success. + * * On failure the entry is not locked and a negative err-code is returned. + * The caller must use ipc_rcu_putref() to free the identifier. * * Called with writer ipc_ids.rwsem held. */ @@ -251,19 +249,20 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) { kuid_t euid; kgid_t egid; - int id, err; + int idx, err; + + /* 1) Initialize the refcount so that ipc_rcu_putref works */ + refcount_set(&new->refcount, 1); if (limit > IPCMNI) limit = IPCMNI; - if (!ids->tables_initialized || ids->in_use >= limit) + if (ids->in_use >= limit) return -ENOSPC; idr_preload(GFP_KERNEL); - refcount_set(&new->refcount, 1); spin_lock_init(&new->lock); - new->deleted = false; rcu_read_lock(); spin_lock(&new->lock); @@ -271,30 +270,30 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) new->cuid = new->uid = euid; new->gid = new->cgid = egid; - id = ipc_idr_alloc(ids, new); + new->deleted = false; + + idx = ipc_idr_alloc(ids, new); idr_preload_end(); - if (id >= 0 && new->key != IPC_PRIVATE) { + if (idx >= 0 && new->key != IPC_PRIVATE) { err = rhashtable_insert_fast(&ids->key_ht, &new->khtnode, ipc_kht_params); if (err < 0) { - idr_remove(&ids->ipcs_idr, id); - id = err; + idr_remove(&ids->ipcs_idr, idx); + idx = err; } } - if (id < 0) { + if (idx < 0) { + new->deleted = true; spin_unlock(&new->lock); rcu_read_unlock(); - return id; + return idx; } ids->in_use++; - if (id > ids->max_id) - ids->max_id = id; - - new->id = ipc_buildid(id, ids, new); - - return id; + if (idx > ids->max_idx) + ids->max_idx = idx; + return idx; } /** @@ -432,20 +431,20 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) */ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { - int lid = ipcid_to_idx(ipcp->id); + int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, lid); + idr_remove(&ids->ipcs_idr, idx); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; - if (unlikely(lid == ids->max_id)) { + if (unlikely(idx == ids->max_idx)) { do { - lid--; - if (lid == -1) + idx--; + if (idx == -1) break; - } while (!idr_find(&ids->ipcs_idr, lid)); - ids->max_id = lid; + } while (!idr_find(&ids->ipcs_idr, idx)); + ids->max_idx = idx; } } @@ -463,7 +462,7 @@ void ipc_set_key_private(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) ipcp->key = IPC_PRIVATE; } -int ipc_rcu_getref(struct kern_ipc_perm *ptr) +bool ipc_rcu_getref(struct kern_ipc_perm *ptr) { return refcount_inc_not_zero(&ptr->refcount); } @@ -565,12 +564,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; - int lid = ipcid_to_idx(id); - - if (unlikely(!ids->tables_initialized)) - return ERR_PTR(-EINVAL); + int idx = ipcid_to_idx(id); - out = idr_find(&ids->ipcs_idr, lid); + out = idr_find(&ids->ipcs_idr, idx); if (!out) return ERR_PTR(-EINVAL); @@ -578,48 +574,12 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) } /** - * ipc_lock - lock an ipc structure without rwsem held - * @ids: ipc identifier set - * @id: ipc id to look for - * - * Look for an id in the ipc ids idr and lock the associated ipc object. - * - * The ipc object is locked on successful exit. - */ -struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - rcu_read_lock(); - out = ipc_obtain_object_idr(ids, id); - if (IS_ERR(out)) - goto err; - - spin_lock(&out->lock); - - /* - * ipc_rmid() may have already freed the ID while ipc_lock() - * was spinning: here verify that the structure is still valid. - * Upon races with RMID, return -EIDRM, thus indicating that - * the ID points to a removed identifier. - */ - if (ipc_valid_object(out)) - return out; - - spin_unlock(&out->lock); - out = ERR_PTR(-EIDRM); -err: - rcu_read_unlock(); - return out; -} - -/** * ipc_obtain_object_check * @ids: ipc identifier set * @id: ipc id to look for * - * Similar to ipc_obtain_object_idr() but also checks - * the ipc object reference counter. + * Similar to ipc_obtain_object_idr() but also checks the ipc object + * sequence number. * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. @@ -677,7 +637,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_obtain_check - retrieve an ipc object and check permissions * @ns: ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -687,16 +647,16 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * * This function does some common audit and permissions check for some IPC_XXX * cmd and is called from semctl_down, shmctl_down and msgctl_down. - * It must be called without any lock held and: * - * - retrieves the ipc with the given id in the given table. + * It: + * - retrieves the ipc object with the given id in the given table. * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding * error. * * Call holding the both the rwsem and the rcu read lock. */ -struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, +struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm) { diff --git a/ipc/util.h b/ipc/util.h index 0aba3230d007..0a159f69b3bb 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -18,8 +18,8 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ #define SEQ_MULTIPLIER (IPCMNI) -int sem_init(void); -int msg_init(void); +void sem_init(void); +void msg_init(void); void shm_init(void); struct ipc_namespace; @@ -34,17 +34,17 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC -int sem_init_ns(struct ipc_namespace *ns); -int msg_init_ns(struct ipc_namespace *ns); -int shm_init_ns(struct ipc_namespace *ns); +void sem_init_ns(struct ipc_namespace *ns); +void msg_init_ns(struct ipc_namespace *ns); +void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else -static inline int sem_init_ns(struct ipc_namespace *ns) { return 0; } -static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } -static inline int shm_init_ns(struct ipc_namespace *ns) { return 0; } +static inline void sem_init_ns(struct ipc_namespace *ns) { } +static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } static inline void msg_exit_ns(struct ipc_namespace *ns) { } @@ -83,7 +83,7 @@ struct ipc_ops { struct seq_file; struct ipc_ids; -int ipc_init_ids(struct ipc_ids *); +void ipc_init_ids(struct ipc_ids *ids); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, int ids, int (*show)(struct seq_file *, void *)); @@ -113,12 +113,12 @@ void ipc_set_key_private(struct ipc_ids *, struct kern_ipc_perm *); int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); /** - * ipc_get_maxid - get the last assigned id + * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * * Called with ipc_ids.rwsem held for reading. */ -static inline int ipc_get_maxid(struct ipc_ids *ids) +static inline int ipc_get_maxidx(struct ipc_ids *ids) { if (ids->in_use == 0) return -1; @@ -126,7 +126,7 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) if (ids->in_use == IPCMNI) return IPCMNI - 1; - return ids->max_id; + return ids->max_idx; } /* @@ -138,17 +138,16 @@ static inline int ipc_get_maxid(struct ipc_ids *ids) * refcount is initialized by ipc_addid(), before that point call_rcu() * must be used. */ -int ipc_rcu_getref(struct kern_ipc_perm *ptr); +bool ipc_rcu_getref(struct kern_ipc_perm *ptr); void ipc_rcu_putref(struct kern_ipc_perm *ptr, void (*func)(struct rcu_head *head)); -struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); -struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, +struct kern_ipc_perm *ipcctl_obtain_check(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); @@ -173,9 +172,9 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int uid) +static inline int ipc_checkid(struct kern_ipc_perm *ipcp, int id) { - return uid / SEQ_MULTIPLIER != ipcp->seq; + return ipcid_to_seqx(id) != ipcp->seq; } static inline void ipc_lock_object(struct kern_ipc_perm *perm) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b66aced5e8c2..933cb3e45b98 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include <asm/sections.h> /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ @@ -344,7 +344,7 @@ void crash_save_vmcoreinfo(void) if (vmcoreinfo_data_safecopy) vmcoreinfo_data = vmcoreinfo_data_safecopy; - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%lld\n", ktime_get_real_seconds()); update_vmcoreinfo_note(); } @@ -401,7 +401,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); + VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmap_area_list); diff --git a/kernel/fork.c b/kernel/fork.c index ff5037be7771..d896e9ca38b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; @@ -1301,6 +1302,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; + tsk->last_switch_time = 0; #endif tsk->mm = NULL; @@ -1425,7 +1427,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; atomic_set(&sig->count, 1); + spin_lock_irq(¤t->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); + spin_unlock_irq(¤t->sighand->siglock); return 0; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..b9132d1269ef 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -40,6 +40,11 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +/* + * Zero (default value) means use sysctl_hung_task_timeout_secs: + */ +unsigned long __read_mostly sysctl_hung_task_check_interval_secs; + int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; @@ -98,8 +103,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) if (switch_count != t->last_switch_count) { t->last_switch_count = switch_count; + t->last_switch_time = jiffies; return; } + if (time_is_after_jiffies(t->last_switch_time + timeout * HZ)) + return; trace_sched_process_hang(t); @@ -245,8 +253,13 @@ static int watchdog(void *dummy) for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; - long t = hung_timeout_jiffies(hung_last_checked, timeout); + unsigned long interval = sysctl_hung_task_check_interval_secs; + long t; + if (interval == 0) + interval = timeout; + interval = min_t(unsigned long, interval, timeout); + t = hung_timeout_jiffies(hung_last_checked, interval); if (t <= 0) { if (!atomic_xchg(&reset_hung_task, 0)) check_hung_uninterruptible_tasks(timeout); diff --git a/kernel/module.c b/kernel/module.c index b046a32520d8..6746c85511fe 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -529,12 +529,30 @@ static bool check_symbol(const struct symsearch *syms, return true; } +static unsigned long kernel_symbol_value(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return (unsigned long)offset_to_ptr(&sym->value_offset); +#else + return sym->value; +#endif +} + +static const char *kernel_symbol_name(const struct kernel_symbol *sym) +{ +#ifdef CONFIG_HAVE_ARCH_PREL32_RELOCATIONS + return offset_to_ptr(&sym->name_offset); +#else + return sym->name; +#endif +} + static int cmp_name(const void *va, const void *vb) { const char *a; const struct kernel_symbol *b; a = va; b = vb; - return strcmp(a, b->name); + return strcmp(a, kernel_symbol_name(b)); } static bool find_symbol_in_section(const struct symsearch *syms, @@ -2170,7 +2188,7 @@ void *__symbol_get(const char *symbol) sym = NULL; preempt_enable(); - return sym ? (void *)sym->value : NULL; + return sym ? (void *)kernel_symbol_value(sym) : NULL; } EXPORT_SYMBOL_GPL(__symbol_get); @@ -2200,10 +2218,12 @@ static int verify_export_symbols(struct module *mod) for (i = 0; i < ARRAY_SIZE(arr); i++) { for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { - if (find_symbol(s->name, &owner, NULL, true, false)) { + if (find_symbol(kernel_symbol_name(s), &owner, NULL, + true, false)) { pr_err("%s: exports duplicate symbol %s" " (owned by %s)\n", - mod->name, s->name, module_name(owner)); + mod->name, kernel_symbol_name(s), + module_name(owner)); return -ENOEXEC; } } @@ -2252,7 +2272,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { - sym[i].st_value = ksym->value; + sym[i].st_value = kernel_symbol_value(ksym); break; } @@ -2516,7 +2536,7 @@ static int is_exported(const char *name, unsigned long value, ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); else ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); - return ks != NULL && ks->value == value; + return ks != NULL && kernel_symbol_value(ks) == value; } /* As per nm */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index e880ca22c5a5..3a6c2f87699e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -105,6 +105,7 @@ config PM_SLEEP def_bool y depends on SUSPEND || HIBERNATE_CALLBACKS select PM + select SRCU config PM_SLEEP_SMP def_bool y diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 90b6ab01db59..918f386b2f6e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2788,7 +2788,8 @@ EXPORT_SYMBOL(unregister_console); void __init console_init(void) { int ret; - initcall_t *call; + initcall_t call; + initcall_entry_t *ce; /* Setup the default TTY line discipline. */ n_tty_init(); @@ -2797,13 +2798,14 @@ void __init console_init(void) * set up the console device so that later boot sequences can * inform about problems etc.. */ - call = __con_initcall_start; + ce = __con_initcall_start; trace_initcall_level("console"); - while (call < __con_initcall_end) { - trace_initcall_start((*call)); - ret = (*call)(); - trace_initcall_finish((*call), ret); - call++; + while (ce < __con_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1a3e9bddd17b..16f84142f2f4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -190,7 +190,7 @@ static void cpuidle_idle_call(void) */ next_state = cpuidle_select(drv, dev, &stop_tick); - if (stop_tick) + if (stop_tick || tick_nohz_tick_stopped()) tick_nohz_idle_stop_tick(); else tick_nohz_idle_retain_tick(); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 870f97b313e3..5dd47f1103d1 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -69,6 +69,8 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, wait_queue_entry_t *curr, *next; int cnt = 0; + lockdep_assert_held(&wq_head->lock); + if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { curr = list_next_entry(bookmark, entry); diff --git a/kernel/signal.c b/kernel/signal.c index cfa9d10e731a..5843c541fda9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -65,14 +65,14 @@ static void __user *sig_handler(struct task_struct *t, int sig) return t->sighand->action[sig - 1].sa.sa_handler; } -static int sig_handler_ignored(void __user *handler, int sig) +static inline bool sig_handler_ignored(void __user *handler, int sig) { /* Is it explicitly or implicitly ignored? */ return handler == SIG_IGN || - (handler == SIG_DFL && sig_kernel_ignore(sig)); + (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, bool force) +static bool sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; @@ -80,12 +80,12 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL && !(force && sig_kernel_only(sig))) - return 1; + return true; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, bool force) +static bool sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -93,7 +93,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * unblocked. */ if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) - return 0; + return false; /* * Tracers may want to know about even ignored signal unless it @@ -101,7 +101,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * by SIGNAL_UNKILLABLE task. */ if (t->ptrace && sig != SIGKILL) - return 0; + return false; return sig_task_ignored(t, sig, force); } @@ -110,7 +110,7 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) * Re-calculate pending state from the set of locally pending * signals, globally pending signals, and blocked signals. */ -static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked) { unsigned long ready; long i; @@ -138,20 +138,21 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) #define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) -static int recalc_sigpending_tsk(struct task_struct *t) +static bool recalc_sigpending_tsk(struct task_struct *t) { if ((t->jobctl & JOBCTL_PENDING_MASK) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); - return 1; + return true; } + /* * We must never clear the flag in another thread, or in current * when it's possible the current syscall is returning -ERESTART*. * So we don't clear it here, and only callers who know they should do. */ - return 0; + return false; } /* @@ -529,13 +530,15 @@ flush_signal_handlers(struct task_struct *t, int force_default) } } -int unhandled_signal(struct task_struct *tsk, int sig) +bool unhandled_signal(struct task_struct *tsk, int sig) { void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; if (is_global_init(tsk)) - return 1; + return true; + if (handler != SIG_IGN && handler != SIG_DFL) - return 0; + return false; + /* if ptraced, let the tracer determine */ return !tsk->ptrace; } @@ -709,14 +712,14 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) * * All callers must be holding the siglock. */ -static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) +static void flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) { struct sigqueue *q, *n; sigset_t m; sigandsets(&m, mask, &s->signal); if (sigisemptyset(&m)) - return 0; + return; sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { @@ -725,7 +728,6 @@ static int flush_sigqueue_mask(sigset_t *mask, struct sigpending *s) __sigqueue_free(q); } } - return 1; } static inline int is_si_special(const struct siginfo *info) @@ -742,21 +744,16 @@ static inline bool si_fromuser(const struct siginfo *info) /* * called with RCU read lock from check_kill_permission() */ -static int kill_ok_by_cred(struct task_struct *t) +static bool kill_ok_by_cred(struct task_struct *t) { const struct cred *cred = current_cred(); const struct cred *tcred = __task_cred(t); - if (uid_eq(cred->euid, tcred->suid) || - uid_eq(cred->euid, tcred->uid) || - uid_eq(cred->uid, tcred->suid) || - uid_eq(cred->uid, tcred->uid)) - return 1; - - if (ns_capable(tcred->user_ns, CAP_KILL)) - return 1; - - return 0; + return uid_eq(cred->euid, tcred->suid) || + uid_eq(cred->euid, tcred->uid) || + uid_eq(cred->uid, tcred->suid) || + uid_eq(cred->uid, tcred->uid) || + ns_capable(tcred->user_ns, CAP_KILL); } /* @@ -907,16 +904,20 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * as soon as they're available, so putting the signal on the shared queue * will be equivalent to sending it to one such thread. */ -static inline int wants_signal(int sig, struct task_struct *p) +static inline bool wants_signal(int sig, struct task_struct *p) { if (sigismember(&p->blocked, sig)) - return 0; + return false; + if (p->flags & PF_EXITING) - return 0; + return false; + if (sig == SIGKILL) - return 1; + return true; + if (task_is_stopped_or_traced(p)) - return 0; + return false; + return task_curr(p) || !signal_pending(p); } @@ -996,7 +997,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) return; } -static inline int legacy_queue(struct sigpending *signals, int sig) +static inline bool legacy_queue(struct sigpending *signals, int sig) { return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } @@ -1380,14 +1381,15 @@ static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -static int kill_as_cred_perm(const struct cred *cred, - struct task_struct *target) +static inline bool kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) { const struct cred *pcred = __task_cred(target); - if (!uid_eq(cred->euid, pcred->suid) && !uid_eq(cred->euid, pcred->uid) && - !uid_eq(cred->uid, pcred->suid) && !uid_eq(cred->uid, pcred->uid)) - return 0; - return 1; + + return uid_eq(cred->euid, pcred->suid) || + uid_eq(cred->euid, pcred->uid) || + uid_eq(cred->uid, pcred->suid) || + uid_eq(cred->uid, pcred->uid); } /* like kill_pid_info(), but doesn't use uid/euid of "current" */ @@ -1500,8 +1502,7 @@ send_sig(int sig, struct task_struct *p, int priv) return send_sig_info(sig, __si_special(priv), p); } -void -force_sig(int sig, struct task_struct *p) +void force_sig(int sig, struct task_struct *p) { force_sig_info(sig, SEND_SIG_PRIV, p); } @@ -1512,8 +1513,7 @@ force_sig(int sig, struct task_struct *p) * the problem was already a SIGSEGV, we'll want to * make sure we don't even try to deliver the signal.. */ -int -force_sigsegv(int sig, struct task_struct *p) +void force_sigsegv(int sig, struct task_struct *p) { if (sig == SIGSEGV) { unsigned long flags; @@ -1522,7 +1522,6 @@ force_sigsegv(int sig, struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } force_sig(SIGSEGV, p); - return 0; } int force_sig_fault(int sig, int code, void __user *addr @@ -1923,10 +1922,10 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, spin_unlock_irqrestore(&sighand->siglock, flags); } -static inline int may_ptrace_stop(void) +static inline bool may_ptrace_stop(void) { if (!likely(current->ptrace)) - return 0; + return false; /* * Are we in the middle of do_coredump? * If so and our tracer is also part of the coredump stopping @@ -1942,19 +1941,19 @@ static inline int may_ptrace_stop(void) */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) - return 0; + return false; - return 1; + return true; } /* * Return non-zero if there is a SIGKILL that should be waking us up. * Called with the siglock held. */ -static int sigkill_pending(struct task_struct *tsk) +static bool sigkill_pending(struct task_struct *tsk) { - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); + return sigismember(&tsk->pending.signal, SIGKILL) || + sigismember(&tsk->signal->shared_pending.signal, SIGKILL); } /* @@ -2334,7 +2333,7 @@ static int ptrace_signal(int signr, siginfo_t *info) return signr; } -int get_signal(struct ksignal *ksig) +bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; struct signal_struct *signal = current->signal; @@ -2344,7 +2343,7 @@ int get_signal(struct ksignal *ksig) task_work_run(); if (unlikely(uprobe_deny_signal())) - return 0; + return false; /* * Do this once, we can't return to user-mode if freezing() == T. @@ -2801,7 +2800,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset, } #endif -static int do_sigpending(sigset_t *set) +static void do_sigpending(sigset_t *set) { spin_lock_irq(¤t->sighand->siglock); sigorsets(set, ¤t->pending.signal, @@ -2810,7 +2809,6 @@ static int do_sigpending(sigset_t *set) /* Outside the lock because only this thread touches it. */ sigandsets(set, ¤t->blocked, set); - return 0; } /** @@ -2822,15 +2820,16 @@ static int do_sigpending(sigset_t *set) SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sigsetsize)) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sigsetsize)) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT @@ -2838,15 +2837,13 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, compat_size_t, sigsetsize) { sigset_t set; - int err; if (sigsetsize > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err) - err = put_compat_sigset(uset, &set, sigsetsize); - return err; + do_sigpending(&set); + + return put_compat_sigset(uset, &set, sigsetsize); } #endif @@ -3608,25 +3605,26 @@ int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp) SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset) { sigset_t set; - int err; if (sizeof(old_sigset_t) > sizeof(*uset)) return -EINVAL; - err = do_sigpending(&set); - if (!err && copy_to_user(uset, &set, sizeof(old_sigset_t))) - err = -EFAULT; - return err; + do_sigpending(&set); + + if (copy_to_user(uset, &set, sizeof(old_sigset_t))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { sigset_t set; - int err = do_sigpending(&set); - if (!err) - err = put_user(set.sig[0], set32); - return err; + + do_sigpending(&set); + + return put_user(set.sig[0], set32); } #endif @@ -3697,25 +3695,23 @@ SYSCALL_DEFINE4(rt_sigaction, int, sig, size_t, sigsetsize) { struct k_sigaction new_sa, old_sa; - int ret = -EINVAL; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (act) { - if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) - return -EFAULT; - } + if (act && copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + if (ret) + return ret; - if (!ret && oact) { - if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) - return -EFAULT; - } -out: - return ret; + if (oact && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + + return 0; } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f22f76b7a138..71ceb6c13c1a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -145,7 +145,10 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ #ifdef CONFIG_DETECT_HUNG_TASK static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #endif @@ -222,7 +225,7 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ -/* Note: sysrq code uses it's own private copy */ +/* Note: sysrq code uses its own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(struct ctl_table *table, int write, @@ -1091,6 +1094,14 @@ static struct ctl_table kern_table[] = { .extra2 = &hung_task_timeout_max, }, { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, + }, + { .procname = "hung_task_warnings", .data = &sysctl_hung_task_warnings, .maxlen = sizeof(int), @@ -1965,13 +1976,13 @@ static void warn_sysctl_write(struct ctl_table *table) } /** - * proc_first_pos_non_zero_ignore - check if firs position is allowed + * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc - * hadlers can ignore the return value. + * handlers can ignore the return value. */ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, struct ctl_table *table) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2a24a59e99c5..2868d85f1fb1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1829,6 +1829,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, mutex_lock(&q->blk_trace_mutex); if (attr == &dev_attr_enable) { + if (!!value == !!q->blk_trace) { + ret = 0; + goto out_unlock_bdev; + } if (value) ret = blk_trace_setup_queue(q, bdev); else diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 96db841bf0fc..bf2c06ef9afc 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -371,6 +371,27 @@ int tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static void for_each_tracepoint_range(struct tracepoint * const *begin, + struct tracepoint * const *end, + void (*fct)(struct tracepoint *tp, void *priv), + void *priv) +{ + if (!begin) + return; + + if (IS_ENABLED(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)) { + const int *iter; + + for (iter = (const int *)begin; iter < (const int *)end; iter++) + fct(offset_to_ptr(iter), priv); + } else { + struct tracepoint * const *iter; + + for (iter = begin; iter < end; iter++) + fct(*iter, priv); + } +} + #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { @@ -435,15 +456,9 @@ EXPORT_SYMBOL_GPL(unregister_tracepoint_module_notifier); * Ensure the tracer unregistered the module's probes before the module * teardown is performed. Prevents leaks of probe and data pointers. */ -static void tp_module_going_check_quiescent(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) { - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - WARN_ON_ONCE((*iter)->funcs); + WARN_ON_ONCE(tp->funcs); } static int tracepoint_module_coming(struct module *mod) @@ -494,8 +509,9 @@ static void tracepoint_module_going(struct module *mod) * Called the going notifier before checking for * quiescence. */ - tp_module_going_check_quiescent(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + for_each_tracepoint_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints, + tp_module_going_check_quiescent, NULL); break; } } @@ -547,19 +563,6 @@ static __init int init_tracepoints(void) __initcall(init_tracepoints); #endif /* CONFIG_MODULES */ -static void for_each_tracepoint_range(struct tracepoint * const *begin, - struct tracepoint * const *end, - void (*fct)(struct tracepoint *tp, void *priv), - void *priv) -{ - struct tracepoint * const *iter; - - if (!begin) - return; - for (iter = begin; iter < end; iter++) - fct(*iter, priv); -} - /** * for_each_kernel_tracepoint - iteration on all kernel tracepoints * @fct: callback diff --git a/kernel/user.c b/kernel/user.c index 36288d840675..0df9b1640b2a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -96,7 +96,7 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { - .__count = ATOMIC_INIT(1), + .__count = REFCOUNT_INIT(1), .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, @@ -123,7 +123,7 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent) hlist_for_each_entry(user, hashent, uidhash_node) { if (uid_eq(user->uid, uid)) { - atomic_inc(&user->__count); + refcount_inc(&user->__count); return user; } } @@ -169,11 +169,8 @@ void free_uid(struct user_struct *up) if (!up) return; - local_irq_save(flags); - if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + if (refcount_dec_and_lock_irqsave(&up->__count, &uidhash_lock, &flags)) free_user(up, flags); - else - local_irq_restore(flags); } struct user_struct *alloc_uid(kuid_t uid) @@ -191,7 +188,7 @@ struct user_struct *alloc_uid(kuid_t uid) goto out_unlock; new->uid = uid; - atomic_set(&new->__count, 1); + refcount_set(&new->__count, 1); ratelimit_state_init(&new->ratelimit, HZ, 100); ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE); diff --git a/lib/.gitignore b/lib/.gitignore index 09aae85418ab..f2a39c9e5485 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -2,5 +2,7 @@ # Generated files # gen_crc32table +gen_crc64table crc32table.h +crc64table.h oid_registry_data.c diff --git a/lib/Kconfig b/lib/Kconfig index 706836ec314d..a3928d4438b5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -170,6 +170,14 @@ config CRC32_BIT endchoice +config CRC64 + tristate "CRC64 functions" + help + This option is provided for the case where no in-kernel-tree + modules require CRC64 functions, but a module built outside + the kernel tree does. Such modules that use library CRC64 + functions require M here. + config CRC4 tristate "CRC4 functions" help @@ -223,7 +231,6 @@ config AUDIT_COMPAT_GENERIC config RANDOM32_SELFTEST bool "PRNG perform self test on init" - default n help This option enables the 32 bit PRNG library functions to perform a self test on initialization. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ab1b599202bc..3589765141a8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1220,7 +1220,6 @@ config LOCK_TORTURE_TEST tristate "torture tests for locking" depends on DEBUG_KERNEL select TORTURE_TEST - default n help This option provides a kernel module that runs torture tests on kernel locking primitives. The kernel module may be built @@ -1687,7 +1686,6 @@ config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_FS depends on BLOCK - default n help This module enables testing of the different dumping mechanisms by inducing system failures at predefined crash points. @@ -1721,7 +1719,6 @@ config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL depends on KPROBES - default n help This option provides for testing basic kprobes functionality on boot. Samples of kprobe and kretprobe are inserted and @@ -1732,7 +1729,6 @@ config KPROBES_SANITY_TEST config BACKTRACE_SELF_TEST tristate "Self test for the backtrace code" depends on DEBUG_KERNEL - default n help This option provides a kernel module that can be used to test the kernel stack backtrace code. This option is not useful @@ -1802,7 +1798,6 @@ config TEST_PRINTF config TEST_BITMAP tristate "Test bitmap_*() family of functions at runtime" - default n help Enable this option to test the bitmap functions at boot. @@ -1823,7 +1818,6 @@ config TEST_OVERFLOW config TEST_RHASHTABLE tristate "Perform selftest on resizable hash table" - default n help Enable this option to test the rhashtable functions at boot. @@ -1831,7 +1825,6 @@ config TEST_RHASHTABLE config TEST_HASH tristate "Perform selftest on hash functions" - default n help Enable this option to test the kernel's integer (<linux/hash.h>), string (<linux/stringhash.h>), and siphash (<linux/siphash.h>) @@ -1842,7 +1835,6 @@ config TEST_HASH config TEST_PARMAN tristate "Perform selftest on priority array manager" - default n depends on PARMAN help Enable this option to test priority array manager on boot @@ -1852,7 +1844,6 @@ config TEST_PARMAN config TEST_LKM tristate "Test module loading with 'hello world' module" - default n depends on m help This builds the "test_module" module that emits "Hello, world" @@ -1866,7 +1857,6 @@ config TEST_LKM config TEST_USER_COPY tristate "Test user/kernel boundary protections" - default n depends on m help This builds the "test_user_copy" module that runs sanity checks @@ -1879,7 +1869,6 @@ config TEST_USER_COPY config TEST_BPF tristate "Test BPF filter functionality" - default n depends on m && NET help This builds the "test_bpf" module that runs various test vectors @@ -1893,7 +1882,6 @@ config TEST_BPF config FIND_BIT_BENCHMARK tristate "Test find_bit functions" - default n help This builds the "test_find_bit" module that measure find_*_bit() functions performance. @@ -1902,7 +1890,6 @@ config FIND_BIT_BENCHMARK config TEST_FIRMWARE tristate "Test firmware loading via userspace interface" - default n depends on FW_LOADER help This builds the "test_firmware" module that creates a userspace @@ -1915,7 +1902,6 @@ config TEST_FIRMWARE config TEST_SYSCTL tristate "sysctl test driver" - default n depends on PROC_SYSCTL help This builds the "test_sysctl" module. This driver enables to test the @@ -1926,7 +1912,6 @@ config TEST_SYSCTL config TEST_UDELAY tristate "udelay test driver" - default n help This builds the "udelay_test" module that helps to make sure that udelay() is working properly. @@ -1935,7 +1920,6 @@ config TEST_UDELAY config TEST_STATIC_KEYS tristate "Test static keys" - default n depends on m help Test the static key interfaces. @@ -1944,7 +1928,6 @@ config TEST_STATIC_KEYS config TEST_KMOD tristate "kmod stress tester" - default n depends on m depends on BLOCK && (64BIT || LBDAF) # for XFS, BTRFS depends on NETDEVICES && NET_CORE && INET # for TUN diff --git a/lib/Makefile b/lib/Makefile index d95bb2525101..9baefb6cb1a1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -103,6 +103,7 @@ obj-$(CONFIG_CRC16) += crc16.o obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o obj-$(CONFIG_CRC_ITU_T) += crc-itu-t.o obj-$(CONFIG_CRC32) += crc32.o +obj-$(CONFIG_CRC64) += crc64.o obj-$(CONFIG_CRC32_SELFTEST) += crc32test.o obj-$(CONFIG_CRC4) += crc4.o obj-$(CONFIG_CRC7) += crc7.o @@ -217,7 +218,9 @@ obj-$(CONFIG_FONT_SUPPORT) += fonts/ obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o hostprogs-y := gen_crc32table +hostprogs-y += gen_crc64table clean-files := crc32table.h +clean-files += crc64table.h $(obj)/crc32.o: $(obj)/crc32table.h @@ -227,6 +230,14 @@ quiet_cmd_crc32 = GEN $@ $(obj)/crc32table.h: $(obj)/gen_crc32table $(call cmd,crc32) +$(obj)/crc64.o: $(obj)/crc64table.h + +quiet_cmd_crc64 = GEN $@ + cmd_crc64 = $< > $@ + +$(obj)/crc64table.h: $(obj)/gen_crc64table + $(call cmd,crc64) + # # Build a fast OID lookip registry from include/linux/oid_registry.h # diff --git a/lib/bitmap.c b/lib/bitmap.c index 730969c681cb..2fd07f6df0b8 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -1152,14 +1152,10 @@ EXPORT_SYMBOL(bitmap_free); * @buf: array of u32 (in host byte order), the source bitmap * @nbits: number of bits in @bitmap */ -void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, - unsigned int nbits) +void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits) { unsigned int i, halfwords; - if (!nbits) - return; - halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { bitmap[i/2] = (unsigned long) buf[i]; @@ -1183,9 +1179,6 @@ void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits) { unsigned int i, halfwords; - if (!nbits) - return; - halfwords = DIV_ROUND_UP(nbits, 32); for (i = 0; i < halfwords; i++) { buf[i] = (u32) (bitmap[i/2] & UINT_MAX); diff --git a/lib/crc64.c b/lib/crc64.c new file mode 100644 index 000000000000..0ef8ae6ac047 --- /dev/null +++ b/lib/crc64.c @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Normal 64-bit CRC calculation. + * + * This is a basic crc64 implementation following ECMA-182 specification, + * which can be found from, + * http://www.ecma-international.org/publications/standards/Ecma-182.htm + * + * Dr. Ross N. Williams has a great document to introduce the idea of CRC + * algorithm, here the CRC64 code is also inspired by the table-driven + * algorithm and detail example from this paper. This paper can be found + * from, + * http://www.ross.net/crc/download/crc_v3.txt + * + * crc64table[256] is the lookup table of a table-driven 64-bit CRC + * calculation, which is generated by gen_crc64table.c in kernel build + * time. The polynomial of crc64 arithmetic is from ECMA-182 specification + * as well, which is defined as, + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li <colyli@suse.de> + */ + +#include <linux/module.h> +#include <linux/types.h> +#include "crc64table.h" + +MODULE_DESCRIPTION("CRC64 calculations"); +MODULE_LICENSE("GPL v2"); + +/** + * crc64_be - Calculate bitwise big-endian ECMA-182 CRC64 + * @crc: seed value for computation. 0 or (u64)~0 for a new CRC calculation, + or the previous crc64 value if computing incrementally. + * @p: pointer to buffer over which CRC64 is run + * @len: length of buffer @p + */ +u64 __pure crc64_be(u64 crc, const void *p, size_t len) +{ + size_t i, t; + + const unsigned char *_p = p; + + for (i = 0; i < len; i++) { + t = ((crc >> 56) ^ (*_p++)) & 0xFF; + crc = crc64table[t] ^ (crc << 8); + } + + return crc; +} +EXPORT_SYMBOL_GPL(crc64_be); diff --git a/lib/gen_crc64table.c b/lib/gen_crc64table.c new file mode 100644 index 000000000000..9011926e4162 --- /dev/null +++ b/lib/gen_crc64table.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate lookup table for the table-driven CRC64 calculation. + * + * gen_crc64table is executed in kernel build time and generates + * lib/crc64table.h. This header is included by lib/crc64.c for + * the table-driven CRC64 calculation. + * + * See lib/crc64.c for more information about which specification + * and polynomial arithmetic that gen_crc64table.c follows to + * generate the lookup table. + * + * Copyright 2018 SUSE Linux. + * Author: Coly Li <colyli@suse.de> + */ +#include <inttypes.h> +#include <stdio.h> + +#include <linux/swab.h> + +#define CRC64_ECMA182_POLY 0x42F0E1EBA9EA3693ULL + +static uint64_t crc64_table[256] = {0}; + +static void generate_crc64_table(void) +{ + uint64_t i, j, c, crc; + + for (i = 0; i < 256; i++) { + crc = 0; + c = i << 56; + + for (j = 0; j < 8; j++) { + if ((crc ^ c) & 0x8000000000000000ULL) + crc = (crc << 1) ^ CRC64_ECMA182_POLY; + else + crc <<= 1; + c <<= 1; + } + + crc64_table[i] = crc; + } +} + +static void print_crc64_table(void) +{ + int i; + + printf("/* this file is generated - do not edit */\n\n"); + printf("#include <linux/types.h>\n"); + printf("#include <linux/cache.h>\n\n"); + printf("static const u64 ____cacheline_aligned crc64table[256] = {\n"); + for (i = 0; i < 256; i++) { + printf("\t0x%016" PRIx64 "ULL", crc64_table[i]); + if (i & 0x1) + printf(",\n"); + else + printf(", "); + } + printf("};\n"); +} + +int main(int argc, char *argv[]) +{ + generate_crc64_table(); + print_crc64_table(); + return 0; +} diff --git a/lib/rhashtable.c b/lib/rhashtable.c index ae4223e0f5bc..310e29b51507 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -174,17 +174,15 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); - if (gfp != GFP_KERNEL) - tbl = kzalloc(size, gfp | __GFP_NOWARN | __GFP_NORETRY); - else - tbl = kvzalloc(size, gfp); + tbl = kvzalloc(size, gfp); size = nbuckets; - if (tbl == NULL && gfp != GFP_KERNEL) { + if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } + if (tbl == NULL) return NULL; @@ -450,7 +448,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht, err = -ENOMEM; - new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); + new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN); if (new_tbl == NULL) goto fail; @@ -1060,9 +1058,16 @@ int rhashtable_init(struct rhashtable *ht, } } + /* + * This is api initialization and thus we need to guarantee the + * initial rhashtable allocation. Upon failure, retry with the + * smallest possible size with __GFP_NOFAIL semantics. + */ tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (tbl == NULL) - return -ENOMEM; + if (unlikely(tbl == NULL)) { + size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); + tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL); + } atomic_set(&ht->nelems, 0); diff --git a/lib/test_debug_virtual.c b/lib/test_debug_virtual.c index b9cdeecc19dc..d5a06addeb27 100644 --- a/lib/test_debug_virtual.c +++ b/lib/test_debug_virtual.c @@ -15,7 +15,7 @@ struct foo { unsigned int bar; }; -struct foo *foo; +static struct foo *foo; static int __init test_debug_virtual_init(void) { diff --git a/lib/test_hexdump.c b/lib/test_hexdump.c index 3f415d8101f3..626f580b4ff7 100644 --- a/lib/test_hexdump.c +++ b/lib/test_hexdump.c @@ -18,7 +18,7 @@ static const unsigned char data_b[] = { static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; -static const char * const test_data_1_le[] __initconst = { +static const char * const test_data_1[] __initconst = { "be", "32", "db", "7b", "0a", "18", "93", "b2", "70", "ba", "c4", "24", "7d", "83", "34", "9b", "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", @@ -32,16 +32,33 @@ static const char * const test_data_2_le[] __initconst = { "d14c", "9919", "b143", "0caf", }; +static const char * const test_data_2_be[] __initconst = { + "be32", "db7b", "0a18", "93b2", + "70ba", "c424", "7d83", "349b", + "a69c", "31ad", "9c0f", "ace9", + "4cd1", "1999", "43b1", "af0c", +}; + static const char * const test_data_4_le[] __initconst = { "7bdb32be", "b293180a", "24c4ba70", "9b34837d", "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", }; +static const char * const test_data_4_be[] __initconst = { + "be32db7b", "0a1893b2", "70bac424", "7d83349b", + "a69c31ad", "9c0face9", "4cd11999", "43b1af0c", +}; + static const char * const test_data_8_le[] __initconst = { "b293180a7bdb32be", "9b34837d24c4ba70", "e9ac0f9cad319ca6", "0cafb1439919d14c", }; +static const char * const test_data_8_be[] __initconst = { + "be32db7b0a1893b2", "70bac4247d83349b", + "a69c31ad9c0face9", "4cd1199943b1af0c", +}; + #define FILL_CHAR '#' static unsigned total_tests __initdata; @@ -56,6 +73,7 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, size_t l = len; int gs = groupsize, rs = rowsize; unsigned int i; + const bool is_be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); if (rs != 16 && rs != 32) rs = 16; @@ -67,13 +85,13 @@ static void __init test_hexdump_prepare_test(size_t len, int rowsize, gs = 1; if (gs == 8) - result = test_data_8_le; + result = is_be ? test_data_8_be : test_data_8_le; else if (gs == 4) - result = test_data_4_le; + result = is_be ? test_data_4_be : test_data_4_le; else if (gs == 2) - result = test_data_2_le; + result = is_be ? test_data_2_be : test_data_2_le; else - result = test_data_1_le; + result = test_data_1; /* hex dump */ p = test; diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e5e606ee5f71..9a7b8b049d04 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -46,7 +46,8 @@ config PAGE_POISONING Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps reduce the risk of information leaks from freed data. This does - have a potential performance impact. + have a potential performance impact if enabled with the + "page_poison=1" kernel boot option. Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. @@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY say N. config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of random data" + bool "Use zero for poisoning instead of debugging value" depends on PAGE_POISONING ---help--- Instead of using the existing poison value, fill the pages with @@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO allocation. If unsure, say N - bool config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2e5d3df0853d..f5981e9d6ae2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,10 +438,10 @@ retry: if (new_congested) { /* !found and storage for new one already allocated, insert */ congested = new_congested; - new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - goto found; + spin_unlock_irqrestore(&cgwb_lock, flags); + return congested; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -451,13 +451,13 @@ retry: if (!new_congested) return NULL; - atomic_set(&new_congested->refcnt, 0); + refcount_set(&new_congested->refcnt, 1); new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; found: - atomic_inc(&congested->refcnt); + refcount_inc(&congested->refcnt); spin_unlock_irqrestore(&cgwb_lock, flags); kfree(new_congested); return congested; @@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested) { unsigned long flags; - local_irq_save(flags); - if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { - local_irq_restore(flags); + if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) return; - } /* bdi might already have been destroyed leaving @congested unlinked */ if (congested->__bdi) { @@ -804,7 +801,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; - atomic_set(&bdi->wb_congested->refcnt, 1); + refcount_set(&bdi->wb_congested->refcnt, 1); err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { @@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) up_write(&hmm->mirrors_sem); } -static void hmm_invalidate_range_start(struct mmu_notifier *mn, +static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); atomic_inc(&hmm->sequence); + + return 0; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, diff --git a/mm/internal.h b/mm/internal.h index 9e3654d70289..dab088cb6937 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter, return iter + 1; } -/* - * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, - * so all functions starting at paging_init should be marked __init - * in those cases. SPARSEMEM, however, allows for memory hotplug, - * and alloc_bootmem_node is not used. - */ -#ifdef CONFIG_SPARSEMEM -#define __paginginit __meminit -#else -#define __paginginit __init -#endif - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, @@ -703,7 +703,7 @@ again: * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; - * however, it might mean that the page is under page_freeze_refs(). + * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * but if page is swapcache in migrate_page_move_mapping(), it might * still be our page, in which case it's essential to keep the node. @@ -714,7 +714,7 @@ again: * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the freeze_refs section of __remove_mapping(); but Anon + * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a921890739f..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1777,6 +1777,62 @@ cleanup: } /** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. + */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain) +{ + struct mem_cgroup *oom_group = NULL; + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return NULL; + + if (!oom_domain) + oom_domain = root_mem_cgroup; + + rcu_read_lock(); + + memcg = mem_cgroup_from_task(victim); + if (memcg == root_mem_cgroup) + goto out; + + /* + * Traverse the memory cgroup hierarchy from the victim task's + * cgroup up to the OOMing cgroup (or root) to find the + * highest-level memory cgroup with oom.group set. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + if (memcg->oom_group) + oom_group = memcg; + + if (memcg == oom_domain) + break; + } + + if (oom_group) + css_get(&oom_group->css); +out: + rcu_read_unlock(); + + return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ + pr_info("Tasks in "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + +/** * lock_page_memcg - lock a page->mem_cgroup binding * @page: the page * @@ -2899,29 +2955,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ - struct mem_cgroup *iter; - int i; - - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += memcg_page_state(iter, i); - } -} +struct accumulated_stats { + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long lru_pages[NR_LRU_LISTS]; + const unsigned int *stats_array; + const unsigned int *events_array; + int stats_size; + int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, + struct accumulated_stats *acc) { - struct mem_cgroup *iter; + struct mem_cgroup *mi; int i; - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); + for_each_mem_cgroup_tree(mi, memcg) { + for (i = 0; i < acc->stats_size; i++) + acc->stat[i] += memcg_page_state(mi, + acc->stats_array ? acc->stats_array[i] : i); - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += memcg_sum_events(iter, i); + for (i = 0; i < acc->events_size; i++) + acc->events[i] += memcg_sum_events(mi, + acc->events_array ? acc->events_array[i] : i); + + for (i = 0; i < NR_LRU_LISTS; i++) + acc->lru_pages[i] += + mem_cgroup_nr_lru_pages(mi, BIT(i)); } } @@ -3332,6 +3393,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3364,32 +3426,27 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - unsigned long long val = 0; + memset(&acc, 0, sizeof(acc)); + acc.stats_size = ARRAY_SIZE(memcg1_stats); + acc.stats_array = memcg1_stats; + acc.events_size = ARRAY_SIZE(memcg1_events); + acc.events_array = memcg1_events; + accumulate_memcg_tree(memcg, &acc); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_page_state(mi, memcg1_stats[i]) * - PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)acc.stat[i] * PAGE_SIZE); } - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { - unsigned long long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_sum_events(mi, memcg1_events[i]); - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); - } - - for (i = 0; i < NR_LRU_LISTS; i++) { - unsigned long long val = 0; + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], + (u64)acc.events[i]); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -5486,8 +5543,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + struct accumulated_stats acc; int i; /* @@ -5501,70 +5557,97 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - tree_stat(memcg, stat); - tree_events(memcg, events); + memset(&acc, 0, sizeof(acc)); + acc.stats_size = MEMCG_NR_STAT; + acc.events_size = NR_VM_EVENT_ITEMS; + accumulate_memcg_tree(memcg, &acc); seq_printf(m, "anon %llu\n", - (u64)stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[NR_SLAB_RECLAIMABLE] + - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[NR_WRITEBACK] * PAGE_SIZE); - - for (i = 0; i < NR_LRU_LISTS; i++) { - struct mem_cgroup *mi; - unsigned long val = 0; + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)); - seq_printf(m, "%s %llu\n", - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + - events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + - events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + + acc.events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + + acc.events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); seq_printf(m, "workingset_refault %lu\n", - stat[WORKINGSET_REFAULT]); + acc.stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[WORKINGSET_ACTIVATE]); + acc.stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[WORKINGSET_NODERECLAIM]); + acc.stat[WORKINGSET_NODERECLAIM]); + + return 0; +} + +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%d\n", memcg->oom_group); return 0; } +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if (oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5606,6 +5689,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { } /* terminate */ }; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d142b9b86dc..c83a1746812f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags) * R/W the page; let's pray that the page has been * used and will be freed some time later. * In fact it's dangerous to directly bump up page count from 0, - * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4eb6e824a80c..9eea6e809a4e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat) static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); @@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ + pgdat->node_id = nid; + pgdat->node_start_pfn = start_pfn; + /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, zones_size, start_pfn, zholes_size); + free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* @@ -1017,18 +1018,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) build_all_zonelists(pgdat); /* - * zone->managed_pages is set to an approximate value in - * free_area_init_core(), which will cause - * /sys/device/system/node/nodeX/meminfo has wrong data. - * So reset it to 0 before any memory is onlined. - */ - reset_node_managed_pages(pgdat); - - /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ + reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..da858f794eb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); - return z->zone ? z->zone->node : node; + return z->zone ? zone_to_nid(z->zone) : node; } default: @@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); - polnid = z->zone->node; + polnid = zone_to_nid(z->zone); break; default: @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/mempool.c b/mm/mempool.c index 44f5fa98c1e7..0ef8cc8d1602 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool + * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..6838a530789b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void) zone->name); /* Iterate the zonelist */ - for_each_zone_zonelist(zone, z, zonelist, zoneid) { -#ifdef CONFIG_NUMA - pr_cont("%d:%s ", zone->node, zone->name); -#else - pr_cont("0:%s ", zone->name); -#endif /* CONFIG_NUMA */ - } + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); pr_cont("\n"); } } diff --git a/mm/mmap.c b/mm/mmap.c index 8d6449e74431..5f2b2b184c60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm) * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ - mutex_lock(&oom_lock); - __oom_reap_task_mm(mm); - mutex_unlock(&oom_lock); + (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index eff6b88a993f..82bb1a939c0e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable) { struct mmu_notifier *mn; + int ret = 0; int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + if (mn->ops->invalidate_range_start) { + int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + mn->ops->invalidate_range_start, _ret, + !blockable ? "non-" : ""); + ret = _ret; + } + } } srcu_read_unlock(&srcu, id); + + return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7c74dcc2d26d..b5b25e4dcbbb 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), mm_pgtables_bytes(task->mm), @@ -487,9 +488,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -void __oom_reap_task_mm(struct mm_struct *mm) +bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + bool ret = true; /* * Tell all users of get_user/copy_from_user etc... that the content @@ -519,50 +521,32 @@ void __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); + if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + ret = false; + continue; + } unmap_page_range(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } } + + return ret; } +/* + * Reaps the address space of the give task. + * + * Returns true on success and false if none or part of the address space + * has been reclaimed and the caller should retry later. + */ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { bool ret = true; - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); - if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; - } - - /* - * If the mm has invalidate_{start,end}() notifiers that could block, - * sleep to give the oom victim some more time. - * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time - */ - if (mm_has_blockable_invalidate_notifiers(mm)) { - up_read(&mm->mmap_sem); - schedule_timeout_idle(HZ); - goto unlock_oom; + return false; } /* @@ -572,25 +556,27 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * down_write();up_write() cycle in exit_mmap(). */ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { - up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + goto out_unlock; } trace_start_task_reaping(tsk->pid); - __oom_reap_task_mm(mm); + /* failed to reap part of the address space. Try again later */ + ret = __oom_reap_task_mm(mm); + if (!ret) + goto out_finish; pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES))); +out_finish: + trace_finish_task_reaping(tsk->pid); +out_unlock: up_read(&mm->mmap_sem); - trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); return ret; } @@ -843,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; - /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly - */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); - return; - } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); - p = find_lock_task_mm(victim); if (!p) { put_task_struct(victim); @@ -979,6 +909,99 @@ static void oom_kill_process(struct oom_control *oc, const char *message) #undef K /* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. + */ +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(task); + __oom_kill_process(task); + } + return 0; +} + +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + struct mem_cgroup *oom_group; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + + __oom_kill_process(victim); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_put(oom_group); + } +} + +/* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ static void check_panic_on_oom(struct oom_control *oc, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15ea511fb41c..c677c1506d73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) if (!static_branch_likely(&vm_numa_stat_key)) return; - if (z->node != numa_node_id()) + if (zone_to_nid(z) != numa_node_id()) local_stat = NUMA_OTHER; - if (z->node == preferred_zone->node) + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) __inc_numa_state(z, NUMA_HIT); else { __inc_numa_state(z, NUMA_MISS); @@ -5278,7 +5278,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return z->zone->node; + return zone_to_nid(z->zone); } #endif @@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __init setup_usemap(struct pglist_data *pgdat, +static void __ref setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) @@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __paginginit set_pageblock_order(void) +void __init set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) +static unsigned long __init calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6194,39 +6194,99 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat) -{ - enum zone_type j; - int nid = pgdat->node_id; - - pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING +static void pgdat_init_numabalancing(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->numabalancing_migrate_lock); pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; +} +#else +static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} #endif + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pgdat_init_split_queue(struct pglist_data *pgdat) +{ spin_lock_init(&pgdat->split_queue_lock); INIT_LIST_HEAD(&pgdat->split_queue); pgdat->split_queue_len = 0; +} +#else +static void pgdat_init_split_queue(struct pglist_data *pgdat) {} #endif - init_waitqueue_head(&pgdat->kswapd_wait); - init_waitqueue_head(&pgdat->pfmemalloc_wait); + #ifdef CONFIG_COMPACTION +static void pgdat_init_kcompactd(struct pglist_data *pgdat) +{ init_waitqueue_head(&pgdat->kcompactd_wait); +} +#else +static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} #endif + +static void __meminit pgdat_init_internals(struct pglist_data *pgdat) +{ + pgdat_resize_init(pgdat); + + pgdat_init_numabalancing(pgdat); + pgdat_init_split_queue(pgdat); + pgdat_init_kcompactd(pgdat); + + init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->pfmemalloc_wait); + pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); +} + +static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, + unsigned long remaining_pages) +{ + zone->managed_pages = remaining_pages; + zone_set_nid(zone, nid); + zone->name = zone_names[idx]; + zone->zone_pgdat = NODE_DATA(nid); + spin_lock_init(&zone->lock); + zone_seqlock_init(zone); + zone_pcp_init(zone); +} + +/* + * Set up the zone data structures + * - init pgdat internals + * - init all zones belonging to this node + * + * NOTE: this function is only called during memory hotplug + */ +#ifdef CONFIG_MEMORY_HOTPLUG +void __ref free_area_init_core_hotplug(int nid) +{ + enum zone_type z; + pg_data_t *pgdat = NODE_DATA(nid); + pgdat_init_internals(pgdat); + for (z = 0; z < MAX_NR_ZONES; z++) + zone_init_internals(&pgdat->node_zones[z], z, nid, 0); +} +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. + * NOTE: this function is only called during early init. + */ +static void __init free_area_init_core(struct pglist_data *pgdat) +{ + enum zone_type j; + int nid = pgdat->node_id; + + pgdat_init_internals(pgdat); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6274,15 +6334,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = freesize; -#ifdef CONFIG_NUMA - zone->node = nid; -#endif - zone->name = zone_names[j]; - zone->zone_pgdat = pgdat; - spin_lock_init(&zone->lock); - zone_seqlock_init(zone); - zone_pcp_init(zone); + zone_init_internals(zone, j, nid, freesize); if (!size) continue; @@ -6342,8 +6394,24 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLAT_NODE_MEM_MAP */ -void __paginginit free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) +{ + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +} +#else +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} +#endif + +void __init free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; @@ -6367,16 +6435,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); + pgdat_set_deferred_range(pgdat); -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. - */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -#endif free_area_init_core(pgdat); } @@ -6388,7 +6448,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. */ -void __paginginit zero_resv_unavail(void) +void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -170,6 +170,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); int pcpu_nr_empty_pop_pages; /* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + +/* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one @@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + pcpu_nr_populated += nr; if (!for_alloc) { chunk->nr_empty_pop_pages += nr; @@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, chunk->nr_populated -= nr; chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; + pcpu_nr_populated -= nr; } /* @@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); @@ -2746,6 +2759,22 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ /* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. + */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + +/* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. diff --git a/mm/shmem.c b/mm/shmem.c index c48c79018a7c..fb04baacc9fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 008ccb22fee6..63a7b4563a57 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, - cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, + cache->slots, 1); return cache->nr; } @@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, true, &entry); + get_swap_pages(1, &entry, HPAGE_PMD_NR); goto out; } @@ -350,7 +350,7 @@ repeat: goto out; } - get_swap_pages(1, false, &entry); + get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 8837b22c848d..d954b71c4f9c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR + +#define swap_entry_size(size) (size) #else #define SWAPFILE_CLUSTER 256 + +/* + * Define swap_entry_size() as constant to let compiler to optimize + * out some code if !CONFIG_THP_SWAP + */ +#define swap_entry_size(size) 1 #endif #define LATENCY_LIMIT 256 @@ -269,7 +277,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info) static inline bool cluster_is_huge(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_HUGE; + if (IS_ENABLED(CONFIG_THP_SWAP)) + return info->flags & CLUSTER_FLAG_HUGE; + return false; } static inline void cluster_clear_huge(struct swap_cluster_info *info) @@ -296,13 +306,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +/* + * Determine the locking method in use for this device. Return + * swap_cluster_info if SSD-style cluster-based locking is in place. + */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, - unsigned long offset) + struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; + /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); + /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); @@ -863,7 +878,6 @@ no_page: return n_ret; } -#ifdef CONFIG_THP_SWAP static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; @@ -871,6 +885,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) unsigned long offset, i; unsigned char *map; + /* + * Should not even be attempting cluster allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) { + VM_WARN_ON_ONCE(1); + return 0; + } + if (cluster_list_empty(&si->free_clusters)) return 0; @@ -901,13 +924,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } -#else -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - VM_WARN_ON_ONCE(1); - return 0; -} -#endif /* CONFIG_THP_SWAP */ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -924,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { - unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1; + unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && cluster); + WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; + avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) goto noswap; @@ -945,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); + atomic_long_sub(n_goal * size, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -972,14 +988,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) { + if (size == SWAPFILE_CLUSTER) { if (!(si->flags & SWP_FILE)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret || cluster) + if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -1005,7 +1021,7 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * nr_pages, + atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; @@ -1107,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, + unsigned long offset, + unsigned char usage) { - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - ci = lock_cluster_or_swap_info(p, offset); - count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; @@ -1144,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + return usage; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); return usage; @@ -1184,19 +1208,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -static void swapcache_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); - } -} - -#ifdef CONFIG_THP_SWAP -static void swapcache_free_cluster(swp_entry_t entry) +void put_swap_page(struct page *page, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1205,42 +1217,48 @@ static void swapcache_free_cluster(swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; + int size = swap_entry_size(hpage_nr_pages(page)); si = _swap_info_get(entry); if (!si) return; - ci = lock_cluster(si, offset); - VM_BUG_ON(!cluster_is_huge(ci)); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (!free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] &= ~SWAP_HAS_CACHE; + ci = lock_cluster_or_swap_info(si, offset); + if (size == SWAPFILE_CLUSTER) { + VM_BUG_ON(!cluster_is_huge(ci)); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + cluster_clear_huge(ci); + if (free_entries == SWAPFILE_CLUSTER) { + unlock_cluster_or_swap_info(si, ci); + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + return; + } } - cluster_clear_huge(ci); - unlock_cluster(ci); - if (free_entries == SWAPFILE_CLUSTER) { - spin_lock(&si->lock); - ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); - unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - } else if (free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { - if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); + for (i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); + if (i == size - 1) + return; + lock_cluster_or_swap_info(si, offset); } } + unlock_cluster_or_swap_info(si, ci); } +#ifdef CONFIG_THP_SWAP int split_swap_cluster(swp_entry_t entry) { struct swap_info_struct *si; @@ -1255,19 +1273,7 @@ int split_swap_cluster(swp_entry_t entry) unlock_cluster(ci); return 0; } -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif /* CONFIG_THP_SWAP */ - -void put_swap_page(struct page *page, swp_entry_t entry) -{ - if (!PageTransHuge(page)) - swapcache_free(entry); - else - swapcache_free_cluster(entry); -} +#endif static int swp_entry_cmp(const void *ent1, const void *ent2) { @@ -1409,7 +1415,6 @@ out: return count; } -#ifdef CONFIG_THP_SWAP static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry) { @@ -1422,12 +1427,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { - if (map[roffset] != SWAP_HAS_CACHE) + if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < SWAPFILE_CLUSTER; i++) { - if (map[offset + i] != SWAP_HAS_CACHE) { + if (swap_count(map[offset + i])) { ret = true; break; } @@ -1442,7 +1447,7 @@ static bool page_swapped(struct page *page) swp_entry_t entry; struct swap_info_struct *si; - if (likely(!PageTransCompound(page))) + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) return page_swapcount(page) != 0; page = compound_head(page); @@ -1466,10 +1471,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { + mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) @@ -1516,26 +1519,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, return map_swapcount; } -#else -#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) -#define page_swapped(page) (page_swapcount(page) != 0) - -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, - int *total_swapcount) -{ - int mapcount, swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - mapcount = page_trans_huge_mapcount(page, total_mapcount); - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return mapcount + swapcount; -} -#endif /* * We can write to an anon page without COW if there are no other references diff --git a/mm/vmscan.c b/mm/vmscan.c index 4375b1e9bd56..7e7d25504651 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); #ifdef CONFIG_MEMCG_KMEM - idr_replace(&shrinker_idr, shrinker, shrinker->id); + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + idr_replace(&shrinker_idr, shrinker, shrinker->id); #endif up_write(&shrinker_rwsem); } @@ -902,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, refcount = 2; if (!page_ref_freeze(page, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ if (unlikely(PageDirty(page))) { page_ref_unfreeze(page, refcount); goto cannot_free; diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 447857ffaf6b..5219280bf7ff 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -13,6 +13,7 @@ use POSIX; use File::Basename; use Cwd 'abs_path'; use Term::ANSIColor qw(:constants); +use Encode qw(decode encode); my $P = $0; my $D = dirname(abs_path($P)); @@ -240,11 +241,11 @@ $check_orig = $check; my $exit = 0; +my $perl_version_ok = 1; if ($^V && $^V lt $minimum_perl_version) { + $perl_version_ok = 0; printf "$P: requires at least perl version %vd\n", $minimum_perl_version; - if (!$ignore_perl_version) { - exit(1); - } + exit(1) if (!$ignore_perl_version); } #if no filenames are given, push '-' to read patch from stdin @@ -346,9 +347,10 @@ our $Sparse = qr{ __force| __iomem| __must_check| - __init_refok| __kprobes| __ref| + __refconst| + __refdata| __rcu| __private }x; @@ -847,6 +849,17 @@ sub is_maintained_obsolete { return $status =~ /obsolete/i; } +sub is_SPDX_License_valid { + my ($license) = @_; + + return 1 if (!$tree || which("python") eq "" || !(-e "$root/scripts/spdxcheck.py") || !(-e "$root/.git")); + + my $root_path = abs_path($root); + my $status = `cd "$root_path"; echo "$license" | python scripts/spdxcheck.py -`; + return 0 if ($status ne ""); + return 1; +} + my $camelcase_seeded = 0; sub seed_camelcase_includes { return if ($camelcase_seeded); @@ -1026,11 +1039,11 @@ if (!$quiet) { hash_show_words(\%use_type, "Used"); hash_show_words(\%ignore_type, "Ignored"); - if ($^V lt 5.10.0) { + if (!$perl_version_ok) { print << "EOM" NOTE: perl $^V is not modern enough to detect all possible issues. - An upgrade to at least perl v5.10.0 is suggested. + An upgrade to at least perl $minimum_perl_version is suggested. EOM } if ($exit) { @@ -2235,10 +2248,14 @@ sub process { our $clean = 1; my $signoff = 0; + my $author = ''; + my $authorsignoff = 0; my $is_patch = 0; + my $is_binding_patch = -1; my $in_header_lines = $file ? 0 : 1; my $in_commit_log = 0; #Scanning lines before patch my $has_commit_log = 0; #Encountered lines before patch + my $commit_log_lines = 0; #Number of commit log lines my $commit_log_possible_stack_dump = 0; my $commit_log_long_line = 0; my $commit_log_has_diff = 0; @@ -2485,6 +2502,19 @@ sub process { $check = $check_orig; } $checklicenseline = 1; + + if ($realfile !~ /^MAINTAINERS/) { + my $last_binding_patch = $is_binding_patch; + + $is_binding_patch = () = $realfile =~ m@^(?:Documentation/devicetree/|include/dt-bindings/)@; + + if (($last_binding_patch != -1) && + ($last_binding_patch ^ $is_binding_patch)) { + WARN("DT_SPLIT_BINDING_PATCH", + "DT binding docs and includes should be a separate patch. See: Documentation/devicetree/bindings/submitting-patches.txt\n"); + } + } + next; } @@ -2496,6 +2526,18 @@ sub process { $cnt_lines++ if ($realcnt != 0); +# Verify the existence of a commit log if appropriate +# 2 is used because a $signature is counted in $commit_log_lines + if ($in_commit_log) { + if ($line !~ /^\s*$/) { + $commit_log_lines++; #could be a $signature + } + } elsif ($has_commit_log && $commit_log_lines < 2) { + WARN("COMMIT_MESSAGE", + "Missing commit description - Add an appropriate one\n"); + $commit_log_lines = 2; #warn only once + } + # Check if the commit log has what seems like a diff which can confuse patch if ($in_commit_log && !$commit_log_has_diff && (($line =~ m@^\s+diff\b.*a/[\w/]+@ && @@ -2517,10 +2559,24 @@ sub process { } } +# Check the patch for a From: + if (decode("MIME-Header", $line) =~ /^From:\s*(.*)/) { + $author = $1; + $author = encode("utf8", $author) if ($line =~ /=\?utf-8\?/i); + $author =~ s/"//g; + } + # Check the patch for a signoff: if ($line =~ /^\s*signed-off-by:/i) { $signoff++; $in_commit_log = 0; + if ($author ne '') { + my $l = $line; + $l =~ s/"//g; + if ($l =~ /^\s*signed-off-by:\s*\Q$author\E/i) { + $authorsignoff = 1; + } + } } # Check if MAINTAINERS is being updated. If so, there's probably no need to @@ -2960,8 +3016,14 @@ sub process { if ($comment !~ /^$/ && $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { - WARN("SPDX_LICENSE_TAG", - "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + WARN("SPDX_LICENSE_TAG", + "Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + } elsif ($rawline =~ /(SPDX-License-Identifier: .*)/) { + my $spdx_license = $1; + if (!is_SPDX_License_valid($spdx_license)) { + WARN("SPDX_LICENSE_TAG", + "'$spdx_license' is not supported in LICENSES/...\n" . $herecurr); + } } } } @@ -3079,7 +3141,7 @@ sub process { } # check indentation starts on a tab stop - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $sline =~ /^\+\t+( +)(?:$c90_Keywords\b|\{\s*$|\}\s*(?:else\b|while\b|\s*$)|$Declare\s*$Ident\s*[;=])/) { my $indent = length($1); if ($indent % 8) { @@ -3092,7 +3154,7 @@ sub process { } # check multi-line statement indentation matches previous line - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $prevline =~ /^\+([ \t]*)((?:$c90_Keywords(?:\s+if)\s*)|(?:$Declare\s*)?(?:$Ident|\(\s*\*\s*$Ident\s*\))\s*|(?:\*\s*)*$Lval\s*=\s*$Ident\s*)\(.*(\&\&|\|\||,)\s*$/) { $prevline =~ /^\+(\t*)(.*)$/; my $oldindent = $1; @@ -3781,6 +3843,26 @@ sub process { "type '$tmp' should be specified in [[un]signed] [short|int|long|long long] order\n" . $herecurr); } +# check for unnecessary <signed> int declarations of short/long/long long + while ($sline =~ m{\b($TypeMisordered(\s*\*)*|$C90_int_types)\b}g) { + my $type = trim($1); + next if ($type !~ /\bint\b/); + next if ($type !~ /\b(?:short|long\s+long|long)\b/); + my $new_type = $type; + $new_type =~ s/\b\s*int\s*\b/ /; + $new_type =~ s/\b\s*(?:un)?signed\b\s*/ /; + $new_type =~ s/^const\s+//; + $new_type = "unsigned $new_type" if ($type =~ /\bunsigned\b/); + $new_type = "const $new_type" if ($type =~ /^const\b/); + $new_type =~ s/\s+/ /g; + $new_type = trim($new_type); + if (WARN("UNNECESSARY_INT", + "Prefer '$new_type' over '$type' as the int is unnecessary\n" . $herecurr) && + $fix) { + $fixed[$fixlinenr] =~ s/\b\Q$type\E\b/$new_type/; + } + } + # check for static const char * arrays. if ($line =~ /\bstatic\s+const\s+char\s*\*\s*(\w+)\s*\[\s*\]\s*=\s*/) { WARN("STATIC_CONST_CHAR_ARRAY", @@ -3967,7 +4049,7 @@ sub process { # function brace can't be on same line, except for #defines of do while, # or if closed on same line - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $sline =~ /$Type\s*$Ident\s*$balanced_parens\s*\{/ && $sline !~ /\#\s*define\b.*do\s*\{/ && $sline !~ /}/) { @@ -4483,11 +4565,11 @@ sub process { #need space before brace following if, while, etc if (($line =~ /\(.*\)\{/ && $line !~ /\($Type\)\{/) || - $line =~ /do\{/) { + $line =~ /\b(?:else|do)\{/) { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$fixlinenr] =~ s/^(\+.*(?:do|\)))\{/$1 {/; + $fixed[$fixlinenr] =~ s/^(\+.*(?:do|else|\)))\{/$1 {/; } } @@ -4578,7 +4660,7 @@ sub process { # check for unnecessary parentheses around comparisons in if uses # when !drivers/staging or command-line uses --strict if (($realfile !~ m@^(?:drivers/staging/)@ || $check_orig) && - $^V && $^V ge 5.10.0 && defined($stat) && + $perl_version_ok && defined($stat) && $stat =~ /(^.\s*if\s*($balanced_parens))/) { my $if_stat = $1; my $test = substr($2, 1, -1); @@ -4615,7 +4697,7 @@ sub process { # return is not a function if (defined($stat) && $stat =~ /^.\s*return(\s*)\(/s) { my $spacing = $1; - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $stat =~ /^.\s*return\s*($balanced_parens)\s*;\s*$/) { my $value = $1; $value = deparenthesize($value); @@ -4642,7 +4724,7 @@ sub process { } # if statements using unnecessary parentheses - ie: if ((foo == bar)) - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /\bif\s*((?:\(\s*){2,})/) { my $openparens = $1; my $count = $openparens =~ tr@\(@\(@; @@ -4659,7 +4741,7 @@ sub process { # avoid cases like "foo + BAR < baz" # only fix matches surrounded by parentheses to avoid incorrect # conversions like "FOO < baz() + 5" being "misfixed" to "baz() > FOO + 5" - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /^\+(.*)\b($Constant|[A-Z_][A-Z0-9_]*)\s*($Compare)\s*($LvalOrFunc)/) { my $lead = $1; my $const = $2; @@ -4949,6 +5031,7 @@ sub process { if (defined $define_args && $define_args ne "") { $define_args = substr($define_args, 1, length($define_args) - 2); $define_args =~ s/\s*//g; + $define_args =~ s/\\\+?//g; @def_args = split(",", $define_args); } @@ -5084,7 +5167,7 @@ sub process { # do {} while (0) macro tests: # single-statement macros do not need to be enclosed in do while (0) loop, # macro should not end with a semicolon - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $realfile !~ m@/vmlinux.lds.h$@ && $line =~ /^.\s*\#\s*define\s+$Ident(\()?/) { my $ln = $linenr; @@ -5330,15 +5413,28 @@ sub process { } # concatenated string without spaces between elements - if ($line =~ /$String[A-Z_]/ || $line =~ /[A-Za-z0-9_]$String/) { - CHK("CONCATENATED_STRING", - "Concatenated strings should use spaces between elements\n" . $herecurr); + if ($line =~ /$String[A-Za-z0-9_]/ || $line =~ /[A-Za-z0-9_]$String/) { + if (CHK("CONCATENATED_STRING", + "Concatenated strings should use spaces between elements\n" . $herecurr) && + $fix) { + while ($line =~ /($String)/g) { + my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]); + $fixed[$fixlinenr] =~ s/\Q$extracted_string\E([A-Za-z0-9_])/$extracted_string $1/; + $fixed[$fixlinenr] =~ s/([A-Za-z0-9_])\Q$extracted_string\E/$1 $extracted_string/; + } + } } # uncoalesced string fragments if ($line =~ /$String\s*"/) { - WARN("STRING_FRAGMENTS", - "Consecutive strings are generally better as a single string\n" . $herecurr); + if (WARN("STRING_FRAGMENTS", + "Consecutive strings are generally better as a single string\n" . $herecurr) && + $fix) { + while ($line =~ /($String)(?=\s*")/g) { + my $extracted_string = substr($rawline, $-[0], $+[0] - $-[0]); + $fixed[$fixlinenr] =~ s/\Q$extracted_string\E\s*"/substr($extracted_string, 0, -1)/e; + } + } } # check for non-standard and hex prefixed decimal printf formats @@ -5374,9 +5470,14 @@ sub process { # warn about #if 0 if ($line =~ /^.\s*\#\s*if\s+0\b/) { - CHK("REDUNDANT_CODE", - "if this code is redundant consider removing it\n" . - $herecurr); + WARN("IF_0", + "Consider removing the code enclosed by this #if 0 and its #endif\n" . $herecurr); + } + +# warn about #if 1 + if ($line =~ /^.\s*\#\s*if\s+1\b/) { + WARN("IF_1", + "Consider removing the #if 1 and its #endif\n" . $herecurr); } # check for needless "if (<foo>) fn(<foo>)" uses @@ -5447,7 +5548,7 @@ sub process { } # check for mask then right shift without a parentheses - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /$LvalOrFunc\s*\&\s*($LvalOrFunc)\s*>>/ && $4 !~ /^\&/) { # $LvalOrFunc may be &foo, ignore if so WARN("MASK_THEN_SHIFT", @@ -5455,7 +5556,7 @@ sub process { } # check for pointer comparisons to NULL - if ($^V && $^V ge 5.10.0) { + if ($perl_version_ok) { while ($line =~ /\b$LvalOrFunc\s*(==|\!=)\s*NULL\b/g) { my $val = $1; my $equal = "!"; @@ -5727,7 +5828,7 @@ sub process { } # Check for __attribute__ weak, or __weak declarations (may have link issues) - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /(?:$Declare|$DeclareMisordered)\s*$Ident\s*$balanced_parens\s*(?:$Attribute)?\s*;/ && ($line =~ /\b__attribute__\s*\(\s*\(.*\bweak\b/ || $line =~ /\b__weak\b/)) { @@ -5809,7 +5910,7 @@ sub process { } # check for vsprintf extension %p<foo> misuses - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+(?![^\{]*\{\s*).*\b(\w+)\s*\(.*$String\s*,/s && $1 !~ /^_*volatile_*$/) { @@ -5856,7 +5957,7 @@ sub process { } # Check for misused memsets - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/) { @@ -5874,7 +5975,7 @@ sub process { } # Check for memcpy(foo, bar, ETH_ALEN) that could be ether_addr_copy(foo, bar) -# if ($^V && $^V ge 5.10.0 && +# if ($perl_version_ok && # defined $stat && # $stat =~ /^\+(?:.*?)\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { # if (WARN("PREFER_ETHER_ADDR_COPY", @@ -5885,7 +5986,7 @@ sub process { # } # Check for memcmp(foo, bar, ETH_ALEN) that could be ether_addr_equal*(foo, bar) -# if ($^V && $^V ge 5.10.0 && +# if ($perl_version_ok && # defined $stat && # $stat =~ /^\+(?:.*?)\bmemcmp\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { # WARN("PREFER_ETHER_ADDR_EQUAL", @@ -5894,7 +5995,7 @@ sub process { # check for memset(foo, 0x0, ETH_ALEN) that could be eth_zero_addr # check for memset(foo, 0xFF, ETH_ALEN) that could be eth_broadcast_addr -# if ($^V && $^V ge 5.10.0 && +# if ($perl_version_ok && # defined $stat && # $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/) { # @@ -5916,7 +6017,7 @@ sub process { # } # typecasts on min/max could be min_t/max_t - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) { if (defined $2 || defined $7) { @@ -5940,7 +6041,7 @@ sub process { } # check usleep_range arguments - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+(?:.*?)\busleep_range\s*\(\s*($FuncArg)\s*,\s*($FuncArg)\s*\)/) { my $min = $1; @@ -5956,7 +6057,7 @@ sub process { } # check for naked sscanf - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $line =~ /\bsscanf\b/ && ($stat !~ /$Ident\s*=\s*sscanf\s*$balanced_parens/ && @@ -5970,7 +6071,7 @@ sub process { } # check for simple sscanf that should be kstrto<foo> - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $line =~ /\bsscanf\b/) { my $lc = $stat =~ tr@\n@@; @@ -6042,7 +6143,7 @@ sub process { } # check for function definitions - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^.\s*(?:$Storage\s+)?$Type\s*($Ident)\s*$balanced_parens\s*{/s) { $context_function = $1; @@ -6082,14 +6183,14 @@ sub process { # alloc style # p = alloc(sizeof(struct foo), ...) should be p = alloc(sizeof(*p), ...) - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*([kv][mz]alloc(?:_node)?)\s*\(\s*(sizeof\s*\(\s*struct\s+$Lval\s*\))/) { CHK("ALLOC_SIZEOF_STRUCT", "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr); } # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+\s*($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) { my $oldfunc = $3; @@ -6118,8 +6219,9 @@ sub process { } # check for krealloc arg reuse - if ($^V && $^V ge 5.10.0 && - $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) { + if ($perl_version_ok && + $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*($Lval)\s*,/ && + $1 eq $3) { WARN("KREALLOC_ARG_REUSE", "Reusing the krealloc arg is almost always a bug\n" . $herecurr); } @@ -6187,7 +6289,7 @@ sub process { } # check for switch/default statements without a break; - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /^\+[$;\s]*(?:case[$;\s]+\w+[$;\s]*:[$;\s]*|)*[$;\s]*\bdefault[$;\s]*:[$;\s]*;/g) { my $cnt = statement_rawlines($stat); @@ -6251,6 +6353,13 @@ sub process { "Avoid using bool as bitfield. Prefer bool bitfields as unsigned int or u<8|16|32>\n" . $herecurr); } +# check for bool use in .h files + if ($realfile =~ /\.h$/ && + $sline =~ /^.\s+bool\s*$Ident\s*(?::\s*d+\s*)?;/) { + CHK("BOOL_MEMBER", + "Avoid using bool structure members because of possible alignment issues - see: https://lkml.org/lkml/2017/11/21/384\n" . $herecurr); + } + # check for semaphores initialized locked if ($line =~ /^.\s*sema_init.+,\W?0\W?\)/) { WARN("CONSIDER_COMPLETION", @@ -6297,7 +6406,7 @@ sub process { } # likely/unlikely comparisons similar to "(likely(foo) > 0)" - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && $line =~ /\b((?:un)?likely)\s*\(\s*$FuncArg\s*\)\s*$Compare/) { WARN("LIKELY_MISUSE", "Using $1 should generally have parentheses around the comparison\n" . $herecurr); @@ -6340,7 +6449,7 @@ sub process { # check for DEVICE_ATTR uses that could be DEVICE_ATTR_<FOO> # and whether or not function naming is typical and if # DEVICE_ATTR permissions uses are unusual too - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $stat =~ /\bDEVICE_ATTR\s*\(\s*(\w+)\s*,\s*\(?\s*(\s*(?:${multi_mode_perms_string_search}|0[0-7]{3,3})\s*)\s*\)?\s*,\s*(\w+)\s*,\s*(\w+)\s*\)/) { my $var = $1; @@ -6400,7 +6509,7 @@ sub process { # specific definition of not visible in sysfs. # o Ignore proc_create*(...) uses with a decimal 0 permission as that means # use the default permissions - if ($^V && $^V ge 5.10.0 && + if ($perl_version_ok && defined $stat && $line =~ /$mode_perms_search/) { foreach my $entry (@mode_permission_funcs) { @@ -6486,9 +6595,14 @@ sub process { ERROR("NOT_UNIFIED_DIFF", "Does not appear to be a unified-diff format patch\n"); } - if ($is_patch && $has_commit_log && $chk_signoff && $signoff == 0) { - ERROR("MISSING_SIGN_OFF", - "Missing Signed-off-by: line(s)\n"); + if ($is_patch && $has_commit_log && $chk_signoff) { + if ($signoff == 0) { + ERROR("MISSING_SIGN_OFF", + "Missing Signed-off-by: line(s)\n"); + } elsif (!$authorsignoff) { + WARN("NO_AUTHOR_SIGN_OFF", + "Missing Signed-off-by: line by nominal patch author '$author'\n"); + } } print report_dump(); diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index c87fa734e3e1..c1c088ef1420 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -48,6 +48,7 @@ my $output_roles = 0; my $output_rolestats = 1; my $output_section_maxlen = 50; my $scm = 0; +my $tree = 1; my $web = 0; my $subsystem = 0; my $status = 0; @@ -61,7 +62,7 @@ my $self_test = undef; my $version = 0; my $help = 0; my $find_maintainer_files = 0; - +my $maintainer_path; my $vcs_used = 0; my $exit = 0; @@ -255,6 +256,7 @@ if (!GetOptions( 'subsystem!' => \$subsystem, 'status!' => \$status, 'scm!' => \$scm, + 'tree!' => \$tree, 'web!' => \$web, 'letters=s' => \$letters, 'pattern-depth=i' => \$pattern_depth, @@ -263,6 +265,7 @@ if (!GetOptions( 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, 'find-maintainer-files' => \$find_maintainer_files, + 'mpath|maintainer-path=s' => \$maintainer_path, 'self-test:s' => \$self_test, 'v|version' => \$version, 'h|help|usage' => \$help, @@ -319,7 +322,7 @@ if ($email && die "$P: Please select at least 1 email option\n"; } -if (!top_of_kernel_tree($lk_path)) { +if ($tree && !top_of_kernel_tree($lk_path)) { die "$P: The current directory does not appear to be " . "a linux kernel source tree.\n"; } @@ -384,26 +387,36 @@ sub find_ignore_git { read_all_maintainer_files(); sub read_all_maintainer_files { - if (-d "${lk_path}MAINTAINERS") { - opendir(DIR, "${lk_path}MAINTAINERS") or die $!; - my @files = readdir(DIR); - closedir(DIR); - foreach my $file (@files) { - push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); - } - } - - if ($find_maintainer_files) { - find( { wanted => \&find_is_maintainer_file, - preprocess => \&find_ignore_git, - no_chdir => 1, - }, "${lk_path}"); + my $path = "${lk_path}MAINTAINERS"; + if (defined $maintainer_path) { + $path = $maintainer_path; + # Perl Cookbook tilde expansion if necessary + $path =~ s@^~([^/]*)@ $1 ? (getpwnam($1))[7] : ( $ENV{HOME} || $ENV{LOGDIR} || (getpwuid($<))[7])@ex; + } + + if (-d $path) { + $path .= '/' if ($path !~ m@/$@); + if ($find_maintainer_files) { + find( { wanted => \&find_is_maintainer_file, + preprocess => \&find_ignore_git, + no_chdir => 1, + }, "$path"); + } else { + opendir(DIR, "$path") or die $!; + my @files = readdir(DIR); + closedir(DIR); + foreach my $file (@files) { + push(@mfiles, "$path$file") if ($file !~ /^\./); + } + } + } elsif (-f "$path") { + push(@mfiles, "$path"); } else { - push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; + die "$P: MAINTAINER file not found '$path'\n"; } - + die "$P: No MAINTAINER files found in '$path'\n" if (scalar(@mfiles) == 0); foreach my $file (@mfiles) { - read_maintainer_file("$file"); + read_maintainer_file("$file"); } } @@ -1031,13 +1044,14 @@ Other options: --sections => print all of the subsystem sections with pattern matches --letters => print all matching 'letter' types from all matching sections --mailmap => use .mailmap file (default: $email_use_mailmap) + --no-tree => run without a kernel tree --self-test => show potential issues with MAINTAINERS file content --version => show version --help => show this help information Default options: - [--email --nogit --git-fallback --m --r --n --l --multiline --pattern-depth=0 - --remove-duplicates --rolestats] + [--email --tree --nogit --git-fallback --m --r --n --l --multiline + --pattern-depth=0 --remove-duplicates --rolestats] Notes: Using "-f directory" may give unexpected results: diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 9a058cff49d4..d2688d55c34b 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -10,6 +10,8 @@ abandonning||abandoning abigious||ambiguous abitrate||arbitrate +abord||abort +aboslute||absolute abov||above abreviated||abbreviated absense||absence @@ -25,6 +27,7 @@ accessable||accessible accesss||access accidentaly||accidentally accidentually||accidentally +acclerated||accelerated accoding||according accomodate||accommodate accomodates||accommodates @@ -58,12 +61,15 @@ addres||address adddress||address addreses||addresses addresss||address +addrress||address aditional||additional aditionally||additionally aditionaly||additionally adminstrative||administrative adress||address adresses||addresses +adrresses||addresses +advertisment||advertisement adviced||advised afecting||affecting againt||against @@ -100,6 +106,7 @@ alue||value ambigious||ambiguous amoung||among amout||amount +amplifer||amplifier an union||a union an user||a user an userspace||a userspace @@ -145,11 +152,15 @@ assistent||assistant assocation||association associcated||associated assotiated||associated +asssert||assert assum||assume assumtpion||assumption asuming||assuming asycronous||asynchronous asynchnous||asynchronous +asynchromous||asynchronous +asymetric||asymmetric +asymmeric||asymmetric atomatically||automatically atomicly||atomically atempt||attempt @@ -172,6 +183,7 @@ avaible||available availabe||available availabled||available availablity||availability +availaible||available availale||available availavility||availability availble||available @@ -206,8 +218,11 @@ borad||board boundry||boundary brievely||briefly broadcat||broadcast +bufufer||buffer cacluated||calculated +caculate||calculate caculation||calculation +cadidate||candidate calender||calendar calescing||coalescing calle||called @@ -221,12 +236,14 @@ capabilty||capability capabitilies||capabilities capatibilities||capabilities capapbilities||capabilities +caputure||capture carefuly||carefully cariage||carriage catagory||category cehck||check challange||challenge challanges||challenges +chache||cache chanell||channel changable||changeable chanined||chained @@ -240,6 +257,7 @@ charaters||characters charcter||character chcek||check chck||check +checksumed||checksummed checksuming||checksumming childern||children childs||children @@ -292,8 +310,10 @@ comunication||communication conbination||combination conditionaly||conditionally conected||connected +conector||connector connecetd||connected configuartion||configuration +configuation||configuration configuratoin||configuration configuraton||configuration configuretion||configuration @@ -315,6 +335,7 @@ continous||continuous continously||continuously continueing||continuing contraints||constraints +contruct||construct contol||control contoller||controller controled||controlled @@ -343,6 +364,7 @@ dafault||default deafult||default deamon||daemon decompres||decompress +decsribed||described decription||description dectected||detected defailt||default @@ -379,6 +401,7 @@ desctiptor||descriptor desriptor||descriptor desriptors||descriptors destionation||destination +destoried||destroyed destory||destroy destoryed||destroyed destorys||destroys @@ -404,18 +427,25 @@ diffrentiate||differentiate difinition||definition dimesions||dimensions diplay||display +directon||direction direectly||directly +diregard||disregard disassocation||disassociation disapear||disappear disapeared||disappeared disappared||disappeared +disbale||disable +disbaled||disabled disble||disable disbled||disabled disconnet||disconnect discontinous||discontinuous +disharge||discharge dispertion||dispersion dissapears||disappears distiction||distinction +divisable||divisible +divsiors||divisors docuentation||documentation documantation||documentation documentaion||documentation @@ -427,6 +457,7 @@ downlad||download downlads||downloads druing||during dynmaic||dynamic +eanable||enable easilly||easily ecspecially||especially edditable||editable @@ -484,9 +515,12 @@ exprimental||experimental extened||extended extensability||extensibility extention||extension +extenstion||extension extracter||extractor +faield||failed falied||failed faild||failed +failer||failure faill||fail failied||failed faillure||failure @@ -520,6 +554,7 @@ forseeable||foreseeable forse||force fortan||fortran forwardig||forwarding +frambuffer||framebuffer framming||framing framwork||framework frequncy||frequency @@ -527,6 +562,7 @@ frome||from fucntion||function fuction||function fuctions||functions +funcation||function funcion||function functionallity||functionality functionaly||functionally @@ -540,6 +576,7 @@ futrue||future gaurenteed||guaranteed generiously||generously genereate||generate +genereted||generated genric||generic globel||global grabing||grabbing @@ -553,6 +590,7 @@ guarentee||guarantee halfs||halves hander||handler handfull||handful +hanlde||handle hanled||handled happend||happened harware||hardware @@ -561,6 +599,7 @@ helpfull||helpful hybernate||hibernate hierachy||hierarchy hierarchie||hierarchy +homogenous||homogeneous howver||however hsould||should hypervior||hypervisor @@ -568,6 +607,8 @@ hypter||hyper identidier||identifier iligal||illegal illigal||illegal +illgal||illegal +iomaped||iomapped imblance||imbalance immeadiately||immediately immedaite||immediate @@ -618,10 +659,12 @@ initation||initiation initators||initiators initialiazation||initialization initializiation||initialization +initialze||initialize initialzed||initialized initilization||initialization initilize||initialize inofficial||unofficial +inrerface||interface insititute||institute instal||install instanciated||instantiated @@ -657,6 +700,7 @@ intregral||integral intrrupt||interrupt intterrupt||interrupt intuative||intuitive +inavlid||invalid invaid||invalid invald||invalid invalde||invalid @@ -683,6 +727,7 @@ langauge||language langugage||language lauch||launch layed||laid +legnth||length leightweight||lightweight lengh||length lenght||length @@ -696,6 +741,7 @@ licenceing||licencing loggging||logging loggin||login logile||logfile +loobpack||loopback loosing||losing losted||lost machinary||machinery @@ -703,6 +749,7 @@ maintainance||maintenance maintainence||maintenance maintan||maintain makeing||making +mailformed||malformed malplaced||misplaced malplace||misplace managable||manageable @@ -710,6 +757,7 @@ managment||management mangement||management manoeuvering||maneuvering mappping||mapping +matchs||matches mathimatical||mathematical mathimatic||mathematic mathimatics||mathematics @@ -725,6 +773,7 @@ messsage||message messsages||messages micropone||microphone microprocesspr||microprocessor +migrateable||migratable milliseonds||milliseconds minium||minimum minimam||minimum @@ -741,6 +790,7 @@ missmatch||mismatch miximum||maximum mmnemonic||mnemonic mnay||many +modfiy||modify modulues||modules momery||memory memomry||memory @@ -777,6 +827,7 @@ notifed||notified numebr||number numner||number obtaion||obtain +obusing||abusing occassionally||occasionally occationally||occasionally occurance||occurrence @@ -787,6 +838,7 @@ occure||occurred occured||occurred occuring||occurring offet||offset +offloded||offloaded omited||omitted omiting||omitting omitt||omit @@ -829,6 +881,7 @@ parametes||parameters parametised||parametrised paramter||parameter paramters||parameters +parmaters||parameters particuarly||particularly particularily||particularly partiton||partition @@ -837,6 +890,7 @@ passin||passing pathes||paths pecularities||peculiarities peformance||performance +peforming||performing peice||piece pendantic||pedantic peprocessor||preprocessor @@ -846,6 +900,7 @@ peroid||period persistance||persistence persistant||persistent plalform||platform +platfoem||platform platfrom||platform plattform||platform pleaes||please @@ -858,6 +913,7 @@ posible||possible positon||position possibilites||possibilities powerfull||powerful +preamle||preamble preample||preamble preapre||prepare preceeded||preceded @@ -870,6 +926,7 @@ prefered||preferred prefferably||preferably premption||preemption prepaired||prepared +preperation||preparation pressre||pressure primative||primitive princliple||principle @@ -935,6 +992,7 @@ recommanded||recommended recyle||recycle redircet||redirect redirectrion||redirection +redundacy||redundancy reename||rename refcounf||refcount refence||reference @@ -945,6 +1003,7 @@ refernces||references refernnce||reference refrence||reference registerd||registered +registeration||registration registeresd||registered registerred||registered registes||registers @@ -973,7 +1032,9 @@ requirment||requirement requred||required requried||required requst||request +reregisteration||reregistration reseting||resetting +reseverd||reserved resizeable||resizable resouce||resource resouces||resources @@ -982,6 +1043,7 @@ responce||response ressizes||resizes ressource||resource ressources||resources +restesting||retesting retransmited||retransmitted retreived||retrieved retreive||retrieve @@ -1006,6 +1068,7 @@ sacrifying||sacrificing safly||safely safty||safety savable||saveable +scaleing||scaling scaned||scanned scaning||scanning scarch||search @@ -1014,6 +1077,8 @@ searchs||searches secquence||sequence secund||second segement||segment +semaphone||semaphore +senario||scenario senarios||scenarios sentivite||sensitive separatly||separately @@ -1025,9 +1090,13 @@ seperate||separate seperatly||separately seperator||separator sepperate||separate +seqeunce||sequence +seqeuncer||sequencer +seqeuencer||sequencer sequece||sequence sequencial||sequential serveral||several +servive||service setts||sets settting||setting shotdown||shutdown @@ -1073,6 +1142,7 @@ standartization||standardization standart||standard staticly||statically stoped||stopped +stoping||stopping stoppped||stopped straming||streaming struc||struct @@ -1085,6 +1155,7 @@ subdirectoires||subdirectories suble||subtle substract||subtract submition||submission +suceed||succeed succesfully||successfully succesful||successful successed||succeeded @@ -1108,6 +1179,7 @@ surpressed||suppressed surpresses||suppresses susbsystem||subsystem suspeneded||suspended +suspsend||suspend suspicously||suspiciously swaping||swapping switchs||switches @@ -1122,6 +1194,7 @@ swtich||switch symetric||symmetric synax||syntax synchonized||synchronized +synchronuously||synchronously syncronize||synchronize syncronized||synchronized syncronizing||synchronizing @@ -1130,11 +1203,14 @@ syste||system sytem||system sythesis||synthesis taht||that +tansmit||transmit targetted||targeted targetting||targeting +taskelt||tasklet teh||the temorary||temporary temproarily||temporarily +thead||thread therfore||therefore thier||their threds||threads @@ -1143,11 +1219,14 @@ thresold||threshold throught||through troughput||throughput thses||these +tiggers||triggers tiggered||triggered tipically||typically +timeing||timing timout||timeout tmis||this torerable||tolerable +traking||tracking tramsmitted||transmitted tramsmit||transmit tranasction||transaction @@ -1162,6 +1241,7 @@ transormed||transformed trasfer||transfer trasmission||transmission treshold||threshold +trigerred||triggered trigerring||triggering trun||turn tunning||tuning @@ -1169,6 +1249,8 @@ ture||true tyep||type udpate||update uesd||used +uknown||unknown +usupported||unsupported uncommited||uncommitted unconditionaly||unconditionally underun||underrun @@ -1181,11 +1263,14 @@ unexpeted||unexpected unexpexted||unexpected unfortunatelly||unfortunately unifiy||unify +uniterrupted||uninterrupted unintialized||uninitialized unkmown||unknown unknonw||unknown unknow||unknown unkown||unknown +unamed||unnamed +uneeded||unneeded unneded||unneeded unneccecary||unnecessary unneccesary||unnecessary @@ -1210,6 +1295,7 @@ usefull||useful usege||usage usera||users usualy||usually +usupported||unsupported utilites||utilities utillities||utilities utilties||utilities @@ -1233,7 +1319,9 @@ virtaul||virtual virtiual||virtual visiters||visitors vitual||virtual +vunerable||vulnerable wakeus||wakeups +wathdog||watchdog wating||waiting wiat||wait wether||whether diff --git a/security/security.c b/security/security.c index 47cfff01d7ec..736e78da1ab9 100644 --- a/security/security.c +++ b/security/security.c @@ -48,14 +48,17 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] = static void __init do_security_initcalls(void) { int ret; - initcall_t *call; - call = __security_initcall_start; + initcall_t call; + initcall_entry_t *ce; + + ce = __security_initcall_start; trace_initcall_level("security"); - while (call < __security_initcall_end) { - trace_initcall_start((*call)); - ret = (*call) (); - trace_initcall_finish((*call), ret); - call++; + while (ce < __security_initcall_end) { + call = initcall_from_entry(ce); + trace_initcall_start(call); + ret = call(); + trace_initcall_finish(call, ret); + ce++; } } diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 74e5912e9f2e..82121a81681f 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -9,3 +9,5 @@ /proc-uptime-001 /proc-uptime-002 /read +/self +/thread-self diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index db310eedc268..1c12c34cf85d 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,4 +1,5 @@ CFLAGS += -Wall -O2 -Wno-unused-function +CFLAGS += -D_GNU_SOURCE TEST_GEN_PROGS := TEST_GEN_PROGS += fd-001-lookup @@ -12,5 +13,7 @@ TEST_GEN_PROGS += proc-self-wchan TEST_GEN_PROGS += proc-uptime-001 TEST_GEN_PROGS += proc-uptime-002 TEST_GEN_PROGS += read +TEST_GEN_PROGS += self +TEST_GEN_PROGS += thread-self include ../lib.mk diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h index 4e178166fd84..b7d57ea40237 100644 --- a/tools/testing/selftests/proc/proc.h +++ b/tools/testing/selftests/proc/proc.h @@ -6,6 +6,18 @@ #include <stdbool.h> #include <stdlib.h> #include <string.h> +#include <unistd.h> +#include <sys/syscall.h> + +static inline pid_t sys_getpid(void) +{ + return syscall(SYS_getpid); +} + +static inline pid_t sys_gettid(void) +{ + return syscall(SYS_gettid); +} static inline bool streq(const char *s1, const char *s2) { diff --git a/tools/testing/selftests/proc/self.c b/tools/testing/selftests/proc/self.c new file mode 100644 index 000000000000..21c15a1ffefb --- /dev/null +++ b/tools/testing/selftests/proc/self.c @@ -0,0 +1,39 @@ +/* + * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that /proc/self gives correct TGID. +#undef NDEBUG +#include <assert.h> +#include <stdio.h> +#include <unistd.h> + +#include "proc.h" + +int main(void) +{ + char buf1[64], buf2[64]; + pid_t pid; + ssize_t rv; + + pid = sys_getpid(); + snprintf(buf1, sizeof(buf1), "%u", pid); + + rv = readlink("/proc/self", buf2, sizeof(buf2)); + assert(rv == strlen(buf1)); + buf2[rv] = '\0'; + assert(streq(buf1, buf2)); + + return 0; +} diff --git a/tools/testing/selftests/proc/thread-self.c b/tools/testing/selftests/proc/thread-self.c new file mode 100644 index 000000000000..4b23b39b7ae0 --- /dev/null +++ b/tools/testing/selftests/proc/thread-self.c @@ -0,0 +1,64 @@ +/* + * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that /proc/thread-self gives correct TGID/PID. +#undef NDEBUG +#include <assert.h> +#include <sched.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/wait.h> + +#include "proc.h" + +int f(void *arg) +{ + char buf1[64], buf2[64]; + pid_t pid, tid; + ssize_t rv; + + pid = sys_getpid(); + tid = sys_gettid(); + snprintf(buf1, sizeof(buf1), "%u/task/%u", pid, tid); + + rv = readlink("/proc/thread-self", buf2, sizeof(buf2)); + assert(rv == strlen(buf1)); + buf2[rv] = '\0'; + assert(streq(buf1, buf2)); + + if (arg) + exit(0); + return 0; +} + +int main(void) +{ + const int PAGE_SIZE = sysconf(_SC_PAGESIZE); + pid_t pid; + void *stack; + + /* main thread */ + f((void *)0); + + stack = mmap(NULL, 2 * PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + assert(stack != MAP_FAILED); + /* side thread */ + pid = clone(f, stack + PAGE_SIZE, CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, (void *)1); + assert(pid > 0); + pause(); + + return 0; +} diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 342c7bc9dc8c..af5ff83f6d7f 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,6 +1,7 @@ hugepage-mmap hugepage-shm map_hugetlb +map_populate thuge-gen compaction_test mlock2-tests diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index fdefa2295ddc..9881876d2aa0 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -12,6 +12,7 @@ TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += on-fault-limit diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c new file mode 100644 index 000000000000..6b8aeaa0bf7a --- /dev/null +++ b/tools/testing/selftests/vm/map_populate.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Dmitry Safonov, Arista Networks + * + * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. + */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifndef MMAP_SZ +#define MMAP_SZ 4096 +#endif + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + exit(1); \ + } \ + } while (0) + +static int parent_f(int sock, unsigned long *smap, int child) +{ + int status, ret; + + ret = read(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + *smap = 0x22222BAD; + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = write(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + waitpid(child, &status, 0); + BUG_ON(!WIFEXITED(status), "child in unexpected state"); + + return WEXITSTATUS(status); +} + +static int child_f(int sock, unsigned long *smap, int fd) +{ + int ret, buf = 0; + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_POPULATE, fd, 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); + + ret = write(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + ret = read(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); + BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + + return 0; +} + +int main(int argc, char **argv) +{ + int sock[2], child, ret; + FILE *ftmp; + unsigned long *smap; + + ftmp = tmpfile(); + BUG_ON(ftmp == 0, "tmpfile()"); + + ret = ftruncate(fileno(ftmp), MMAP_SZ); + BUG_ON(ret, "ftruncate()"); + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(ftmp), 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + *smap = 0xdeadbabe; + /* Probably unnecessary, but let it be. */ + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); + BUG_ON(ret, "socketpair()"); + + child = fork(); + BUG_ON(child == -1, "fork()"); + + if (child) { + ret = close(sock[0]); + BUG_ON(ret, "close()"); + + return parent_f(sock[1], smap, child); + } + + ret = close(sock[1]); + BUG_ON(ret, "close()"); + + return child_f(sock[0], smap, fileno(ftmp)); +} diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 88cbe5575f0c..584a91ae4a8f 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -168,6 +168,17 @@ else fi echo "--------------------" +echo "running map_populate" +echo "--------------------" +./map_populate +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + +echo "--------------------" echo "running mlock2-tests" echo "--------------------" ./mlock2-tests diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0df592c4f09f..f986e31fa68c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable) { + return 0; } bool kvm_is_reserved_pfn(kvm_pfn_t pfn) @@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); } -static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; + int ret; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); - kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); srcu_read_unlock(&kvm->srcu, idx); + + return ret; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, |